# Requirements
We'll only need the [Python Elasticsearch Client](https://elasticsearch-py.readthedocs.io/en/master/). 

`pip install elasticsearch`

In [6]:
from elasticsearch import Elasticsearch
# Client used by every analyze call in this notebook. The node URL is given
# explicitly: newer client releases refuse to construct a client without a
# host, while older ones silently fall back to this same local address.
# Adjust the URL if your Elasticsearch node runs elsewhere.
es = Elasticsearch("http://localhost:9200")

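That's it for the setup. Creating the client does not open a connection yet, but the `analyze` calls below are executed by Elasticsearch itself, so you do need a running instance that the client can reach (by default on `localhost:9200`).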

# Using elasticsearch analyzers as tokenizers

In [7]:
# Sample German sentence fed to the analyzer examples in this notebook.
# The two literals are concatenated as-is, so the resulting string has two
# spaces between "Aufnahme" and "eines"; the character offsets reported by
# the analyze API refer to this exact string.
text = (
    "Die junge Informatikerin Katie Bouman machte die historische Aufnahme "
    " eines schwarzen Lochs möglich."
)

## Using the German analyzer
The German analyzer lowercases the text, removes German stop words, normalizes special characters (e.g. `möglich` → `moglich`) and applies stemming.
See [here](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-lang-analyzer.html#german-analyzer) for more details.

In [8]:
# Analyze the sample sentence with the built-in "german" analyzer and print
# one token dict per line (token text, character offsets, type, position).
german_request = {
    "analyzer": "german",
    "text": text,
}
tokens = es.indices.analyze(body=german_request)['tokens']
for token in tokens:
    print(token)

{'token': 'jung', 'start_offset': 4, 'end_offset': 9, 'type': '<ALPHANUM>', 'position': 1}
{'token': 'informatikerin', 'start_offset': 10, 'end_offset': 24, 'type': '<ALPHANUM>', 'position': 2}
{'token': 'kati', 'start_offset': 25, 'end_offset': 30, 'type': '<ALPHANUM>', 'position': 3}
{'token': 'bouman', 'start_offset': 31, 'end_offset': 37, 'type': '<ALPHANUM>', 'position': 4}
{'token': 'macht', 'start_offset': 38, 'end_offset': 44, 'type': '<ALPHANUM>', 'position': 5}
{'token': 'historisch', 'start_offset': 49, 'end_offset': 60, 'type': '<ALPHANUM>', 'position': 7}
{'token': 'aufnahm', 'start_offset': 61, 'end_offset': 69, 'type': '<ALPHANUM>', 'position': 8}
{'token': 'schwarz', 'start_offset': 77, 'end_offset': 86, 'type': '<ALPHANUM>', 'position': 10}
{'token': 'loch', 'start_offset': 87, 'end_offset': 92, 'type': '<ALPHANUM>', 'position': 11}
{'token': 'moglich', 'start_offset': 93, 'end_offset': 100, 'type': '<ALPHANUM>', 'position': 12}


## Using the German analyzer w/o stemming
Since stemming is usually rather aggressive and we may lose a lot of semantic interpretability, we can exclude it like this:

In [9]:
# Ad-hoc analysis chain instead of a named analyzer: the standard tokenizer
# followed by lowercasing and German stop word removal. No stemmer is listed
# in the filters, so tokens keep their inflected surface form.
no_stemming_request = {
    "tokenizer": "standard",
    "filter": [
        "lowercase",
        {"type": "stop", "stopwords": "_german_"},
    ],
    "text": text,
}
tokens = es.indices.analyze(body=no_stemming_request)['tokens']
for token in tokens:
    print(token)

{'token': 'junge', 'start_offset': 4, 'end_offset': 9, 'type': '<ALPHANUM>', 'position': 1}
{'token': 'informatikerin', 'start_offset': 10, 'end_offset': 24, 'type': '<ALPHANUM>', 'position': 2}
{'token': 'katie', 'start_offset': 25, 'end_offset': 30, 'type': '<ALPHANUM>', 'position': 3}
{'token': 'bouman', 'start_offset': 31, 'end_offset': 37, 'type': '<ALPHANUM>', 'position': 4}
{'token': 'machte', 'start_offset': 38, 'end_offset': 44, 'type': '<ALPHANUM>', 'position': 5}
{'token': 'historische', 'start_offset': 49, 'end_offset': 60, 'type': '<ALPHANUM>', 'position': 7}
{'token': 'aufnahme', 'start_offset': 61, 'end_offset': 69, 'type': '<ALPHANUM>', 'position': 8}
{'token': 'schwarzen', 'start_offset': 77, 'end_offset': 86, 'type': '<ALPHANUM>', 'position': 10}
{'token': 'lochs', 'start_offset': 87, 'end_offset': 92, 'type': '<ALPHANUM>', 'position': 11}
{'token': 'möglich', 'start_offset': 93, 'end_offset': 100, 'type': '<ALPHANUM>', 'position': 12}
