**Connect to Elasticsearch**

In [46]:
from pprint import pprint 
from elasticsearch import Elasticsearch

# Create a client for the local node and request the cluster metadata.
# The success message is printed only after info() has returned.
es = Elasticsearch(hosts="http://localhost:9200")
client_info = es.info()

print("Connected to Elasticsearch!")
pprint(client_info.body)

Connected to Elasticsearch!
{'cluster_name': 'docker-cluster',
 'cluster_uuid': 'T1HeaWnRTOqX_BBgREVVbA',
 'name': '64c49e436740',
 'tagline': 'You Know, for Search',
 'version': {'build_date': '2025-10-21T10:06:21.288851013Z',
             'build_flavor': 'default',
             'build_hash': '25d88452371273dd27356c98598287b669a03eae',
             'build_snapshot': False,
             'build_type': 'docker',
             'lucene_version': '10.3.1',
             'minimum_index_compatibility_version': '8.0.0',
             'minimum_wire_compatibility_version': '8.19.0',
             'number': '9.2.0'}}


**1. Character filters**

1.1. HTML Strip Character Filter

In [47]:
from pprint import pprint 

# Apply only the `html_strip` character filter: HTML tags are removed and
# entities such as &apos; are decoded.
# The `keyword` tokenizer is named explicitly (as in the mapping example in
# 1.2) so the filtered text comes back as one token instead of relying on the
# API's implicit behavior when no tokenizer is given.
response = es.indices.analyze(
    tokenizer="keyword",
    char_filter=[
        "html_strip"
    ],
    text="I&apos;m so happy</b>!</p>",
)

pprint(response.body)

{'tokens': [{'end_offset': 26,
             'position': 0,
             'start_offset': 0,
             'token': "I'm so happy!\n",
             'type': 'word'}]}


1.2. Mapping character filter

In [48]:
# Inline `mapping` character filter that rewrites each Arabic-Indic digit
# to its ASCII counterpart before tokenization.
arabic_digit_filter = {
    "type": "mapping",
    "mappings": [
        "٠ => 0",
        "١ => 1",
        "٢ => 2",
        "٣ => 3",
        "٤ => 4",
        "٥ => 5",
        "٦ => 6",
        "٧ => 7",
        "٨ => 8",
        "٩ => 9",
    ],
}

# The `keyword` tokenizer emits the whole (filtered) input as a single token,
# which makes the effect of the character filter easy to see.
response = es.indices.analyze(
    text="I saw comet Tsuchinshan Atlas in ٢٠٢٤",
    char_filter=[arabic_digit_filter],
    tokenizer="keyword",
)

pprint(response.body)

{'tokens': [{'end_offset': 37,
             'position': 0,
             'start_offset': 0,
             'token': 'I saw comet Tsuchinshan Atlas in 2024',
             'type': 'word'}]}


**2. Tokenizers**

2.1. Standard

In [49]:
# Run only the `standard` tokenizer (no character or token filters) over a
# sample sentence.
sample_sentence = "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
response = es.indices.analyze(text=sample_sentence, tokenizer="standard")

# Keep the token list; the following cell prints it.
tokens = response.body["tokens"]

In [50]:
# Print every token together with the type reported by the analyze API.
# The subscripts use single quotes: reusing the f-string's own double quotes
# inside the braces is a SyntaxError on Python versions before 3.12.
for token in tokens:
    print(f"Token: {token['token']}, Type: {token['type']}")

Token: The, Type: <ALPHANUM>
Token: 2, Type: <NUM>
Token: QUICK, Type: <ALPHANUM>
Token: Brown, Type: <ALPHANUM>
Token: Foxes, Type: <ALPHANUM>
Token: jumped, Type: <ALPHANUM>
Token: over, Type: <ALPHANUM>
Token: the, Type: <ALPHANUM>
Token: lazy, Type: <ALPHANUM>
Token: dog's, Type: <ALPHANUM>
Token: bone, Type: <ALPHANUM>


2.2. Lowercase

In [51]:
# Run only the `lowercase` tokenizer over the sample sentence.
response = es.indices.analyze(
    tokenizer="lowercase",
    text="The 2 QUICK Brown-Foxes jumped over the lazy dog's bone.",
)

tokens = response.body["tokens"]

# Single quotes inside the braces keep the f-string valid on Python < 3.12,
# where reusing the enclosing double quotes is a SyntaxError.
for token in tokens:
    print(f"Token: {token['token']}, Type: {token['type']}")

Token: the, Type: word
Token: quick, Type: word
Token: brown, Type: word
Token: foxes, Type: word
Token: jumped, Type: word
Token: over, Type: word
Token: the, Type: word
Token: lazy, Type: word
Token: dog, Type: word
Token: s, Type: word
Token: bone, Type: word


**3. Token filter**

3.1. Apostrophe

In [52]:
# Tokenize with `standard`, then apply the `apostrophe` token filter.
response = es.indices.analyze(
    tokenizer="standard",
    text="The 2 QUICK Brown-Foxes jumped over the lazy dog's bone.",
    filter=[
        "apostrophe"
    ],
)

tokens = response.body["tokens"]

# Single quotes inside the braces keep the f-string valid on Python < 3.12,
# where reusing the enclosing double quotes is a SyntaxError.
for token in tokens:
    print(f"Token: {token['token']}, Type: {token['type']}")

Token: The, Type: <ALPHANUM>
Token: 2, Type: <NUM>
Token: QUICK, Type: <ALPHANUM>
Token: Brown, Type: <ALPHANUM>
Token: Foxes, Type: <ALPHANUM>
Token: jumped, Type: <ALPHANUM>
Token: over, Type: <ALPHANUM>
Token: the, Type: <ALPHANUM>
Token: lazy, Type: <ALPHANUM>
Token: dog, Type: <ALPHANUM>
Token: bone, Type: <ALPHANUM>


3.2. Decimal digit

In [53]:
# Tokenize with `standard`, then pass the tokens through the `decimal_digit`
# token filter and list the resulting token texts.
digit_sample = "I saw comet Tsuchinshan Atlas in ٢٠٢٤"
response = es.indices.analyze(
    text=digit_sample,
    tokenizer="standard",
    filter=["decimal_digit"],
)

tokens = response.body["tokens"]

for token in tokens:
    print("Token:", token["token"])

Token: I
Token: saw
Token: comet
Token: Tsuchinshan
Token: Atlas
Token: in
Token: 2024


3.3. Reverse

In [54]:
# Tokenize with `standard`, then apply the `reverse` token filter and list
# the resulting token texts.
reverse_sample = "I saw comet Tsuchinshan Atlas in ٢٠٢٤"
result = es.indices.analyze(
    text=reverse_sample,
    tokenizer="standard",
    filter=["reverse"],
)

tokens = result.body["tokens"]

for token in tokens:
    print("Token:", token["token"])

Token: I
Token: was
Token: temoc
Token: nahsnihcusT
Token: saltA
Token: ni
Token: ٤٢٠٢


**4. Built-in analyzers**
# 
4.1. Standard

In [55]:
# Run the complete built-in `standard` analyzer and print each token in
# quotes so the token boundaries are visible.
response = es.indices.analyze(
    text="I saw comet Tsuchinshan Atlas in ٢٠٢٤",
    analyzer="standard",
)

tokens = response.body["tokens"]

for token in tokens:
    print("Token: '%s'" % token["token"])

Token: 'i'
Token: 'saw'
Token: 'comet'
Token: 'tsuchinshan'
Token: 'atlas'
Token: 'in'
Token: '٢٠٢٤'


4.2. Stop

In [56]:
# Run the built-in `stop` analyzer and print each token in quotes so the
# token boundaries are visible.
response = es.indices.analyze(
    text="I saw comet. Tsuchinshan Atlas in ٢٠٢٤",
    analyzer="stop",
)

tokens = response.body["tokens"]

for token in tokens:
    print("Token: '%s'" % token["token"])

Token: 'i'
Token: 'saw'
Token: 'comet'
Token: 'tsuchinshan'
Token: 'atlas'


4.3. Keyword

In [57]:
# Run the built-in `keyword` analyzer and print each token in quotes so the
# token boundaries are visible.
response = es.indices.analyze(
    text="I saw comet Tsuchinshan Atlas in ٢٠٢٤",
    analyzer="keyword",
)

tokens = response.body["tokens"]

for token in tokens:
    print("Token: '%s'" % token["token"])

Token: 'I saw comet Tsuchinshan Atlas in ٢٠٢٤'


**5. Index time VS Search time analysis**
#
5.1. Index time
#
Index-time analysis transforms text before it's stored in the index. In this example, let's create an index with an analyzer that lowercases text, removes HTML tags, and replaces ampersands (&) with the word "and."

In [72]:
index_name = "index_time_example"

# Character filter that rewrites every ampersand to the word "and".
ampersand_replacement = {
    "type": "mapping",
    "mappings": ["& => and"],
}

# Analyzer pipeline: strip HTML, replace "&", tokenize with `standard`,
# then lowercase the tokens.
custom_index_analyzer = {
    "type": "custom",
    "char_filter": ["html_strip", "ampersand_replacement"],
    "tokenizer": "standard",
    "filter": ["lowercase"],
}

# Full index definition: the analysis settings plus a mapping that attaches
# the custom analyzer to the `content` text field.
settings = {
    "settings": {
        "analysis": {
            "char_filter": {"ampersand_replacement": ampersand_replacement},
            "analyzer": {"custom_index_analyzer": custom_index_analyzer},
        }
    },
    "mappings": {
        "properties": {
            "content": {
                "type": "text",
                "analyzer": "custom_index_analyzer",
            }
        }
    },
}

# Delete the index first (a missing index is ignored), then create it, so the
# cell can be re-run.
es.indices.delete(index=index_name, ignore_unavailable=True)
response = es.indices.create(index=index_name, body=settings)

pprint(response.body)

{'acknowledged': True,
 'index': 'index_time_example',
 'shards_acknowledged': True}


**Index Document**

In [73]:
# Sample document; `content` is analyzed with the custom index-time analyzer.
document = {
    "content": "Visit my website https://myuniversehub.com/ & like some images!"
}

# refresh="wait_for" makes the call return only once the document is visible
# to search. Without it, a search executed immediately afterwards (e.g. on
# "Restart & Run All") can run before the next index refresh and find nothing.
response = es.index(
    index=index_name,
    document=document,
    refresh="wait_for",
)

pprint(response.body)

{'_id': 'x8BgZJoBlg0ijCqcq7PV',
 '_index': 'index_time_example',
 '_primary_term': 1,
 '_seq_no': 0,
 '_shards': {'failed': 0, 'successful': 1, 'total': 2},
 '_version': 1,
 'result': 'created'}


When searching for the document, you'll notice that the content appears unchanged. This is expected because Elasticsearch stores the transformed tokens in an inverted index for searching purposes, while keeping the original document intact in the _source field.

In [74]:
# Fetch every document in the index to inspect what is stored in `_source`.
result = es.search(
    index=index_name,
    body={
        "query": {
            "match_all": {}
        }
    }
)

hits = result.body["hits"]["hits"]

# Pretty-print the `_source` dict itself. Wrapping it in an f-string first
# made pprint show a quoted string, and reusing double quotes inside the
# f-string is a SyntaxError on Python < 3.12.
for hit in hits:
    pprint(hit["_source"])

"{'content': 'Visit my website https://myuniversehub.com/ & like some images!'}"


We can verify that the custom analyzer is working by applying it to the document like this.

In [75]:
# Analyze the text with whatever analyzer is mapped to the `content` field of
# the index, i.e. the custom index-time analyzer defined above.
# `field` and `text` are passed as keyword arguments, like the other analyze
# calls in this notebook.
response = es.indices.analyze(
    index=index_name,
    field="content",
    text="Visit my website https://myuniversehub.com/ & like some images!",
)

tokens = response.body["tokens"]

# Single quotes inside the braces keep the f-string valid on Python < 3.12.
for token in tokens:
    print(f"Token: {token['token']}")

Token: visit
Token: my
Token: website
Token: https
Token: myuniversehub.com
Token: and
Token: like
Token: some
Token: images


**5.2. Search time**
#
Search-time analysis is applied to the query text when a search is executed, not to the documents when they are indexed. In this example, we'll run a `match` query with a search-time analyzer that differs from the index-time one: the built-in `standard` analyzer, which tokenizes and lowercases the query text (it does not remove stop words unless it is configured to).

In [85]:
result = es.search(
    index=index_name,
    body={
        "query": {
            "match": {  # full-text query: the query string is analyzed before matching
                "content": {
                    "query": "myuniversehub.com",
                    "analyzer": "standard"  # search-time analyzer, different from the index-time one
                }
            }
        }
    }
)

# Read the hits through `.body`, as the match_all search cell above does.
hits = result.body["hits"]["hits"]

# Single quotes inside the braces keep the f-string valid on Python < 3.12.
for hit in hits:
    print(f"{hit['_source']}")

{'content': 'Visit my website https://myuniversehub.com/ & like some images!'}


You can also use a `term` query, which does not analyze the search value and matches it exactly against the indexed tokens. Since `myuniversehub.com` is one of the tokens produced at index time (see the analyze output above), this query returns the document.

In [86]:
result = es.search(
    index=index_name,
    body={
        "query": {
            "term": {  # term query: the value is used as-is for an exact match
                "content": {
                    "value": "myuniversehub.com"
                }
            }
        }
    }
)

# Read the hits through `.body`, as the match_all search cell above does.
hits = result.body["hits"]["hits"]

# Single quotes inside the braces keep the f-string valid on Python < 3.12.
for hit in hits:
    print(f"{hit['_source']}")

{'content': 'Visit my website https://myuniversehub.com/ & like some images!'}


The `term` query is case-sensitive: the value is not analyzed, so it must match an indexed token exactly.

In [87]:
result = es.search(
    index=index_name,
    body={
        "query": {
            "term": {  # term query: the value is used as-is for an exact match
                "content": {
                    "value": "Myuniversehub.com"  # capital "M" on purpose
                }
            }
        }
    }
)

# Read the hits through `.body`, as the match_all search cell above does.
hits = result.body["hits"]["hits"]

# Make the empty result visible instead of printing nothing at all.
if not hits:
    print("No documents matched.")

# Single quotes inside the braces keep the f-string valid on Python < 3.12.
for hit in hits:
    print(f"{hit['_source']}")

No results are returned because the analyzer lowercased the tokens at index time, so the index contains `myuniversehub.com` but not `Myuniversehub.com`.