In [1]:
# python 3.7

# you can install the ES library using pip
import elasticsearch 

# elasticsearch also have a higher level api (elasticsearch_dsl)
# its documentation is lacking and doesn't support all ES features

In [2]:
# The schema I used to index wikipedia text is somewhat like this json below,
# each doc has one title, many sections(each with a header and a body)
# doc:{
#   title: Text
#   sections:[ {header: Text, body: Text}, ... ]
#}

In [3]:
# init connection to nlp12
client=elasticsearch.Elasticsearch([{'host':'nlp12','port':9200}],timeout=20)

In [4]:
# We'll use spacy for sentence segmentation
import spacy
nlp = spacy.load('en')

#and some regexs
import re

from collections import Counter

In [5]:
# get documents containing "french" accroding to relevence
response = client.search(
    index="wiki", # This is somewhat like the SQL "table" 
    params={'from':0,'size':5}, # get 5 highest scroed
    body={'query':
              {"nested": { # search within document sections
                  'path':'sections',
                  'inner_hits':{},# return which section matched
                  "query": {"match": {"sections.body": 'french'}}
                
              }}
    }
)

print(f"Total hits: {response['hits']['total']['value']}")
print('(10k is the default max.)')
print()
print('The following pages were scored according to "french" relevence in their section (ranking algorithm called bs25):')
for hit in response['hits']['hits']:
    page_title=hit['_source']['title']
    matched_section=hit['inner_hits']['sections']['hits']['hits'][0]['_source']
    position=matched_section['body'].lower().index('french')
    text=matched_section['body'][max(0,position-20):position+30].replace('\n',' ').strip()
    print(f'\t{page_title}: {matched_section["header"]}   -   ...{text}...')

Total hits: 10000
(10k is the default max.)

The following pages were scored according to "french" relevence in their section (ranking algorithm called bs25):
	American French: Introduction   -   ...'''American French''' (French: le '''''fra...
	Section d'Or: Notable members   -   ...garian, naturalized French *Alexandra Exter, 1882–...
	Co-ordinated Organisation: Linguistic rules of the coordinated organisations   -   ...llon, member of the French National Assembly....
	French music: Introduction   -   ...'''''French music''''' may refer to...
	Édouard: Introduction   -   ...douard''' is both a French given name and a surnam...


In [6]:
#now a pharse: "french fries"
response = client.search(
    index="wiki", 
    params={'from':0,'size':5}, # get 5 highest scroed
    body={'query':
              {"nested": { # search within document sections
                  'path':'sections',
                  'inner_hits':{},# return which section matched
                  "query": {"match_phrase": {"sections.body": 'french fries'}}
              }}
    }
)

print(f"Total hits: {response['hits']['total']['value']}")
print()
print('hits:')
for hit in response['hits']['hits']:
    page_title=hit['_source']['title']
    matched_section=hit['inner_hits']['sections']['hits']['hits'][0]['_source']
    position=matched_section['body'].lower().index('french fries')
    text=matched_section['body'][max(0,position-20):position+30].replace('\n',' ')
    print(f'\t{page_title}: {matched_section["header"]} - ...{text}...')

Total hits: 870

hits:
	Sir Kensington's: Fries of New York - ...bition featured 100 French fries from New York Cit...
	Superdawg: The food - ...The hot dog and French fries are served togeth...
	Beverly Hills Caviar Automated Boutique: See also - ...* French fries vending machine ...
	Blooming onion: See also - ... *French fries * List of hors d'...
	List of cultural icons of the Netherlands: Food and Drink - ... *Heineken *Gouda  *French fries *Stroopwafel ...


In [7]:
#same pharse "french fries", but let's get a random sentence containing it
some_random_seed=5

response = client.search(
    index="wiki", 
    params={'from':0,'size':1}, # get 1
    body={'query':
              {"nested": { # search within document sections
                  'path':'sections',
                  'inner_hits':{},# return which section matched
                  "query": {"bool": {# these "bool" queres allows combining mutiple queries in various ways
                      "filter": [
                        {"match_phrase": {"sections.body": 'french fries'}}
                      ],
                      "must": {
                        "function_score": {
                            "functions": [
                                {
                                    "random_score": {"seed": some_random_seed}
                                }
                            ],
                            "boost_mode": "replace"
                          }
                        }
                  }
                } 
              }}
    }
)

print(f"Total hits: {response['hits']['total']['value']}")
print()

for hit in response['hits']['hits']:
    page_title=hit['_source']['title']
    matched_section=hit['inner_hits']['sections']['hits']['hits'][0]['_source']
    section_body=matched_section['body']
    for paragraph in section_body.split('\n'):
        parsed=nlp(paragraph)
        for sentence in parsed.sents:
            if 'french fries' in sentence.lower_:
                print(sentence)



Total hits: 870

The refined peanut oil has a smoke point of 450 °F/232 °C is commonly used for frying volume batches foods like french fries.


In [8]:
# find words between occurences of  "schwarzenegger" and "california"
response = client.search(
    index="wiki", 
    params={'from': 0, 'size': 200}, # first 100 hits 
    body={'query':
        {"nested": {
            'path': 'sections',
            'inner_hits': {},
            "query": {
                "intervals": { # find text with both tokens: schwarzenegger and california 
                    "sections.body": {
                        "all_of": {
                            "max_gaps" : 2, # max. 2 words between tokens
                            "intervals" : [
                                {
                                  "match" : {
                                    "query" : "schwarzenegger",
                                  }},
                                 {
                                  "match" : {
                                    "query" : "california",
                                  }}
                            ]
                            },

                        }
                    }
                }
            }
        }}
)

print(f"Total hits {response['hits']['total']['value']}\n")
foundin=[]
words_inbetween=Counter()
for hit in response['hits']['hits']:
    page_title=hit['_source']['title']
    matched_section=hit['inner_hits']['sections']['hits']['hits'][0]['_source']
    matches=re.findall('(california|schwarzenegger)([a-z ]{0,100})(california|schwarzenegger)',matched_section['body'].lower() )
    if matches:
        mathced_mid = matches[0][1].strip()
        if mathced_mid:
            words_inbetween.update(mathced_mid.split())
            foundin.append(f'{page_title}: {matched_section["header"]}')
print('found in: \n\t'+'\n\t'.join(foundin[:5])+'\n\t...')
print()
print('common words between: '+', '.join( w for w,_ in words_inbetween.most_common(5)))


Total hits 405

found in: 
	Plata v. Schwarzenegger: Reactions
	2004 in LGBT rights: Events
	List of Republicans who opposed the 2016 Donald Trump presidential campaign: Government officials
	First term of Arnold Schwarzenegger as Governor of California: First term
	Kevin Cooper (prisoner): Assessment by courts, governors, and independent groups
	...

common words between: arnold, governor, of, the, schwarzenegger


In [9]:
# find term freqency in documents containg 'acquisition' - i.e. probably related to economics
response = client.search(
    index="wiki", 
    params={'from':0,'size':1},
    body={'query':
              {"nested": {
                  'path':'sections',
                  "query": {"match": {"sections.body": 'acquisition'}}
                
              }},
          'aggs': {'wf': {'nested': {'path': 'sections'},
           'aggs': {'wf2': {'terms': {'field': 'sections.body', 'size': 1000}}}}}
          
    }
)

print(f"Aggregated over {response['aggregations']['wf']['wf2']['sum_other_doc_count']} documents")
print('We have full term freq. in the matched docs:')
print(', '.join([x['key'] for x in response['aggregations']['wf']['wf2']['buckets'][:10] ])+' ... '+ \
', '.join([x['key'] for x in response['aggregations']['wf']['wf2']['buckets'][100:110] ])+' ...')


Aggregated over 21260869 documents
We have full term freq. in the matched docs:
the, of, and, in, a, to, for, by, as, on ... state, several, high, until, 3, service, before, system, second, business ...


In [10]:
# for refernce this is the full schema
# Kibana is also running on nlp12:5601
    
#     {
#   "mapping": {
#     "properties": {
#       "hyperlinks": {
#         "type": "text",
#         "fields": {
#           "keyword": {
#             "type": "keyword",
#             "ignore_above": 256
#           }
#         }
#       },
#       "sections": {
#         "type": "nested",
#         "properties": {
#           "body": {
#             "type": "text",
#             "fields": {
#               "stemmed": {
#                 "type": "text",
#                 "analyzer": "snowball"
#               }
#             },
#             "fielddata": true
#           },
#           "header": {
#             "type": "text"
#           },
#           "hyperlinks": {
#             "type": "text"
#           }
#         }
#       },
#       "title": {
#         "type": "text"
#       }
#     }
#   }
# }