# Synonyms

This notebook shows how to define synonyms.

Several options are tried (see below). It turns out that we want to apply the synonyms only to the search query, not to the text stored in the database itself (option 3).

In [27]:
import subprocess
import json

In [70]:
# Delete the pdf_documents_test index (the next cell creates it again with
# fresh settings).
query = """
curl -X DELETE "http://a3557701c4b3211e88f8a060fa4fdbf3-427558466.eu-west-3.elb.amazonaws.com/elasticsearch/pdf_documents_test" -u guest:teradata
"""

# getoutput() returns stdout and stderr combined, so anything curl prints
# before the JSON body has to be skipped by jumping to the first "{".
raw_output = subprocess.getoutput(query)
json_start = raw_output.find("{")
if json_start == -1:
    # find() returns -1 when there is no JSON at all (e.g. curl could not
    # connect); slicing with -1 would hand json.loads only the last character
    # and produce a misleading decode error.
    raise ValueError("curl returned no JSON:\n" + raw_output)
res = json.loads(raw_output[json_start:])
res

{'acknowledged': True}

In [71]:
# OPTION 1
# --------
#
# "analyzer": "test_analyzer"
#        
# "fruit => apple, banana",
# "vegetable => onion, garlic",
# "grocery => fruit, apple, banana, vegetable, onion, garlic"
#
# Searching for apple will give apple and fruit.
# Searching for fruit will give fruit, apple and banana.
#
# Note:
#   "grocery => fruit, vegetable" will not work!
#   All synonyms have to be specified, i.e. "grocery => fruit, apple, banana, vegetable, onion, garlic"
#
#
# OPTION 2
# --------
#
# "analyzer": "test_analyzer"
#
# "apple, banana => fruit, grocery",
# "onion, garlic => vegetable, grocery",
# "fruit, vegetable => grocery"
#
# Searching for apple will give fruit, apple and banana.
# Searching for fruit will give fruit, apple and banana.
#
# Note:
#   "apple, banana => fruit" will not work!
#   All synonyms have to be specified, i.e. "apple, banana => fruit, grocery"
#
#
# OPTION 3
# --------
#
# "search_analyzer": "test_analyzer",
# "analyzer": "test_analyzer0"
#
# "fruit => fruit, apple, banana",
# "vegetable => vegetable, onion, garlic",
# "grocery => grocery, fruit, apple, banana, vegetable, onion, garlic"
#
# Searching for apple will give apple.
# Searching for fruit will give fruit, apple and banana.
#
# Note:
#   If I understand it correctly, the synonyms will only be applied to the search query,
#   not to the text stored in the database.


# Create pdf_documents_test with two custom analyzers:
#   - test_analyzer  : standard tokenizer + standard, lowercase and synonym filters
#   - test_analyzer0 : the same chain without the synonym filter
# The "summary" field is indexed with test_analyzer0 and searched with
# test_analyzer (OPTION 3 above).
query = """
curl -X PUT "http://a3557701c4b3211e88f8a060fa4fdbf3-427558466.eu-west-3.elb.amazonaws.com/elasticsearch/pdf_documents_test" -H 'Content-Type: application/json' -d'
{
  "settings": {
    "analysis": {
      "analyzer": {
        "test_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": ["standard", "lowercase", "synonym"]
        },
        "test_analyzer0": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": ["standard", "lowercase"]
        }
      },
      "filter": {
        "synonym" : {
          "type" : "synonym",
          "synonyms" : [
            "fruit => fruit, apple, banana",
            "vegetable => vegetable, onion, garlic",
            "grocery => grocery, fruit, apple, banana, vegetable, onion, garlic"
          ]
        }
      }
    }
  },
  "mappings": {
    "document": {
      "properties": {
        "summary": {
          "type": "text",
          "search_analyzer": "test_analyzer",
          "analyzer": "test_analyzer0"
        }
      }
    }
  }
}
' -u guest:teradata
"""

# getoutput() returns stdout and stderr combined, so anything curl prints
# before the JSON body has to be skipped by jumping to the first "{".
raw_output = subprocess.getoutput(query)
json_start = raw_output.find("{")
if json_start == -1:
    # No JSON in the output (e.g. curl could not connect): fail with the raw
    # text instead of letting json.loads choke on the last character.
    raise ValueError("curl returned no JSON:\n" + raw_output)
res = json.loads(raw_output[json_start:])
res

{'acknowledged': True,
 'index': 'pdf_documents_test',
 'shards_acknowledged': True}

In [72]:
# Run the text "apple" through test_analyzer using the index's _analyze
# endpoint to see which tokens it produces.
query = """
curl -X GET "http://a3557701c4b3211e88f8a060fa4fdbf3-427558466.eu-west-3.elb.amazonaws.com/elasticsearch/pdf_documents_test/_analyze" -H 'Content-Type: application/json' -d'
{
  "analyzer": "test_analyzer",
  "text":     "apple"
}
' -u guest:teradata
"""

# getoutput() returns stdout and stderr combined, so anything curl prints
# before the JSON body has to be skipped by jumping to the first "{".
raw_output = subprocess.getoutput(query)
json_start = raw_output.find("{")
if json_start == -1:
    # No JSON in the output (e.g. curl could not connect): fail with the raw
    # text instead of letting json.loads choke on the last character.
    raise ValueError("curl returned no JSON:\n" + raw_output)
res = json.loads(raw_output[json_start:])
res

{'tokens': [{'end_offset': 5,
   'position': 0,
   'start_offset': 0,
   'token': 'apple',
   'type': '<ALPHANUM>'}]}

In [73]:
# Run the text "fruit" through test_analyzer using the index's _analyze
# endpoint to see which tokens it produces.
query = """
curl -X GET "http://a3557701c4b3211e88f8a060fa4fdbf3-427558466.eu-west-3.elb.amazonaws.com/elasticsearch/pdf_documents_test/_analyze" -H 'Content-Type: application/json' -d'
{
  "analyzer": "test_analyzer",
  "text":     "fruit"
}
' -u guest:teradata
"""

# getoutput() returns stdout and stderr combined, so anything curl prints
# before the JSON body has to be skipped by jumping to the first "{".
raw_output = subprocess.getoutput(query)
json_start = raw_output.find("{")
if json_start == -1:
    # No JSON in the output (e.g. curl could not connect): fail with the raw
    # text instead of letting json.loads choke on the last character.
    raise ValueError("curl returned no JSON:\n" + raw_output)
res = json.loads(raw_output[json_start:])
res

{'tokens': [{'end_offset': 5,
   'position': 0,
   'start_offset': 0,
   'token': 'fruit',
   'type': 'SYNONYM'},
  {'end_offset': 5,
   'position': 0,
   'start_offset': 0,
   'token': 'apple',
   'type': 'SYNONYM'},
  {'end_offset': 5,
   'position': 0,
   'start_offset': 0,
   'token': 'banana',
   'type': 'SYNONYM'}]}

In [74]:
# Run the text "grocery" through test_analyzer using the index's _analyze
# endpoint to see which tokens it produces.
query = """
curl -X GET "http://a3557701c4b3211e88f8a060fa4fdbf3-427558466.eu-west-3.elb.amazonaws.com/elasticsearch/pdf_documents_test/_analyze" -H 'Content-Type: application/json' -d'
{
  "analyzer": "test_analyzer",
  "text":     "grocery"
}
' -u guest:teradata
"""

# getoutput() returns stdout and stderr combined, so anything curl prints
# before the JSON body has to be skipped by jumping to the first "{".
raw_output = subprocess.getoutput(query)
json_start = raw_output.find("{")
if json_start == -1:
    # No JSON in the output (e.g. curl could not connect): fail with the raw
    # text instead of letting json.loads choke on the last character.
    raise ValueError("curl returned no JSON:\n" + raw_output)
res = json.loads(raw_output[json_start:])
res

{'tokens': [{'end_offset': 7,
   'position': 0,
   'start_offset': 0,
   'token': 'grocery',
   'type': 'SYNONYM'},
  {'end_offset': 7,
   'position': 0,
   'start_offset': 0,
   'token': 'fruit',
   'type': 'SYNONYM'},
  {'end_offset': 7,
   'position': 0,
   'start_offset': 0,
   'token': 'apple',
   'type': 'SYNONYM'},
  {'end_offset': 7,
   'position': 0,
   'start_offset': 0,
   'token': 'banana',
   'type': 'SYNONYM'},
  {'end_offset': 7,
   'position': 0,
   'start_offset': 0,
   'token': 'vegetable',
   'type': 'SYNONYM'},
  {'end_offset': 7,
   'position': 0,
   'start_offset': 0,
   'token': 'onion',
   'type': 'SYNONYM'},
  {'end_offset': 7,
   'position': 0,
   'start_offset': 0,
   'token': 'garlic',
   'type': 'SYNONYM'}]}

In [75]:
# Request for the _reindex endpoint with "pdf_documents" as the source index
# and "pdf_documents_test" as the destination index.
# NOTE(review): the three lines that would send the request are commented out
# below, so running this cell only assigns `query` and contacts nothing.
# Presumably the reindex was skipped in favour of the hand-made test documents
# indexed further down -- confirm, and delete this cell if it is no longer needed.
query = """
curl -X POST "http://a3557701c4b3211e88f8a060fa4fdbf3-427558466.eu-west-3.elb.amazonaws.com/elasticsearch/_reindex" -H 'Content-Type: application/json' -d'
{
  "source": {
    "index": "pdf_documents"
  },
  "dest": {
    "index": "pdf_documents_test"
  }
}
' -u guest:teradata
"""

#res = subprocess.getoutput(query)
#res = json.loads(res[res.find("{"):])
#res

In [76]:
import pandas as pd

# Six toy documents as (title, summary) pairs: three mention apple, banana or
# fruit (terms that appear in the "fruit" synonym rule) and three do not.
documents = [
    ("test1", "This document is about an apple"),
    ("test2", "This document is about a banana"),
    ("test3", "This document is about a fruit"),
    ("test4", "This document is about spider man"),
    ("test5", "This document is about Batman"),
    ("test6", "This document is about an action hero"),
]

# Build the frame in a single call instead of enlarging an empty frame one
# row at a time with df.loc[i] = ..., which re-allocates on every insert.
df = pd.DataFrame(documents, columns=["title", "summary"])

# head() shows the first five of the six rows.
df.head()

Unnamed: 0,title,summary
0,test1,This document is about an apple
1,test2,This document is about a banana
2,test3,This document is about a fruit
3,test4,This document is about spider man
4,test5,This document is about Batman


In [77]:
from elasticsearch import Elasticsearch

# NOTE(review): the endpoint and the credentials are hardcoded here (and in the
# curl commands of the other cells); consider reading them from environment
# variables before sharing this notebook.
url = "http://a3557701c4b3211e88f8a060fa4fdbf3-427558466.eu-west-3.elb.amazonaws.com/elasticsearch"
user = "guest"
secret = "teradata"

es = Elasticsearch(url, port=80, http_auth=(user, secret))

# One {"title": ..., "summary": ...} dict per DataFrame row, in row order.
records = df.to_dict(orient="records")

# Index each row into pdf_documents_test, using its position as the document
# id, and print the response returned for each call.
for idx, row in enumerate(records):
    res = es.index(index="pdf_documents_test", doc_type='document', id=idx, body=row)
    print(res)

{'_version': 1, '_primary_term': 1, 'result': 'created', '_seq_no': 0, '_id': '0', '_type': 'document', '_shards': {'successful': 2, 'total': 2, 'failed': 0}, '_index': 'pdf_documents_test'}
{'_version': 1, '_primary_term': 1, 'result': 'created', '_seq_no': 0, '_id': '1', '_type': 'document', '_shards': {'successful': 2, 'total': 2, 'failed': 0}, '_index': 'pdf_documents_test'}
{'_version': 1, '_primary_term': 1, 'result': 'created', '_seq_no': 0, '_id': '2', '_type': 'document', '_shards': {'successful': 2, 'total': 2, 'failed': 0}, '_index': 'pdf_documents_test'}
{'_version': 1, '_primary_term': 1, 'result': 'created', '_seq_no': 0, '_id': '3', '_type': 'document', '_shards': {'successful': 2, 'total': 2, 'failed': 0}, '_index': 'pdf_documents_test'}
{'_version': 1, '_primary_term': 1, 'result': 'created', '_seq_no': 1, '_id': '4', '_type': 'document', '_shards': {'successful': 2, 'total': 2, 'failed': 0}, '_index': 'pdf_documents_test'}
{'_version': 1, '_primary_term': 1, 'result':

In [78]:
# Match query for "apple" on the summary field, returning only the title and
# summary of each hit.
query = """
curl -X GET "http://a3557701c4b3211e88f8a060fa4fdbf3-427558466.eu-west-3.elb.amazonaws.com/elasticsearch/pdf_documents_test/document/_search" -H 'Content-Type: application/json' -d'
{
    "_source": ["title", "summary"],
    "query": {
        "match": {
            "summary": "apple"
        }
    }
}
' -u guest:teradata
"""

# getoutput() returns stdout and stderr combined, so anything curl prints
# before the JSON body has to be skipped by jumping to the first "{".
raw_output = subprocess.getoutput(query)
json_start = raw_output.find("{")
if json_start == -1:
    # No JSON in the output (e.g. curl could not connect): fail with the raw
    # text instead of letting json.loads choke on the last character.
    raise ValueError("curl returned no JSON:\n" + raw_output)
# strict=False lets json.loads accept control characters inside strings.
res = json.loads(raw_output[json_start:], strict=False)
res

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 5, 'total': 5},
 'hits': {'hits': [{'_id': '0',
    '_index': 'pdf_documents_test',
    '_score': 0.2876821,
    '_source': {'summary': 'This document is about an apple',
     'title': 'test1'},
    '_type': 'document'}],
  'max_score': 0.2876821,
  'total': 1},
 'timed_out': False,
 'took': 89}

In [79]:
# Match query for "fruit" on the summary field, returning only the title and
# summary of each hit.
query = """
curl -X GET "http://a3557701c4b3211e88f8a060fa4fdbf3-427558466.eu-west-3.elb.amazonaws.com/elasticsearch/pdf_documents_test/document/_search" -H 'Content-Type: application/json' -d'
{
    "_source": ["title", "summary"],
    "query": {
        "match": {
            "summary": "fruit"
        }
    }
}
' -u guest:teradata
"""

# getoutput() returns stdout and stderr combined, so anything curl prints
# before the JSON body has to be skipped by jumping to the first "{".
raw_output = subprocess.getoutput(query)
json_start = raw_output.find("{")
if json_start == -1:
    # No JSON in the output (e.g. curl could not connect): fail with the raw
    # text instead of letting json.loads choke on the last character.
    raise ValueError("curl returned no JSON:\n" + raw_output)
# strict=False lets json.loads accept control characters inside strings.
res = json.loads(raw_output[json_start:], strict=False)
res

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 5, 'total': 5},
 'hits': {'hits': [{'_id': '2',
    '_index': 'pdf_documents_test',
    '_score': 0.6682933,
    '_source': {'summary': 'This document is about a fruit', 'title': 'test3'},
    '_type': 'document'},
   {'_id': '0',
    '_index': 'pdf_documents_test',
    '_score': 0.2876821,
    '_source': {'summary': 'This document is about an apple',
     'title': 'test1'},
    '_type': 'document'},
   {'_id': '1',
    '_index': 'pdf_documents_test',
    '_score': 0.2876821,
    '_source': {'summary': 'This document is about a banana',
     'title': 'test2'},
    '_type': 'document'}],
  'max_score': 0.6682933,
  'total': 3},
 'timed_out': False,
 'took': 72}