### Imports

In [60]:
import json
import os
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q

### Elasticsearch Setup

In [61]:
def index(article, search_engine, index_name, article_id):
    """Store one article document through ``search_engine``.

    Forwards to ``search_engine.index`` with ``article`` as the body,
    ``article_id`` as the document id, ``index_name`` as the target index and
    the document type fixed to ``"article"``. Returns nothing.
    """
    request = {
        "index": index_name,
        "doc_type": "article",
        "id": article_id,
        "body": article,
    }
    search_engine.index(**request)

def file_index(file_name):
    """Return, as an int, the part of ``file_name`` before its first '-'.

    Raises ``ValueError`` when that leading part is not a valid integer.
    """
    prefix, _, _ = file_name.partition('-')
    return int(prefix)

def build_index(root_path, n, search_engine):
    """Index JSON article files found under ``root_path``.

    Walks ``root_path`` recursively. In every directory visited, up to ``n``
    files whose names end in ``.json`` are parsed and handed to ``index``
    under the "scrapped" index, with the document id taken from
    ``file_index`` applied to the file name.

    Args:
        root_path: Directory to walk.
        n: Maximum number of JSON files to index per visited directory.
        search_engine: Client object passed through to ``index``.
    """
    for root, _dirs, files in os.walk(root_path):
        # Filter before slicing so that non-JSON files do not use up the
        # quota of ``n`` articles.
        json_files = [name for name in files if name.endswith('.json')]
        for name in json_files[:n]:
            # JSON text is UTF-8; do not depend on the platform's default
            # text encoding when reading it.
            with open(os.path.join(root, name), 'r', encoding='utf-8') as f:
                article = json.load(f)
            index(article, search_engine, "scrapped", file_index(name))



# Create the client with no explicit settings, then index the files under
# the "articles" directory.
es = Elasticsearch()
build_index(root_path = "articles", n = 500, search_engine = es)

### Print a sample index entry.

In [62]:
# Fetch the document stored under id 0 and pretty-print its source fields.
sample_article = es.get(index = "scrapped", id = 0)
sample_source = sample_article['_source']
print(json.dumps(sample_source, indent = 2))

{
  "title": "Employees of Big French Cleaning Company Win Sexual Harassment Case",
  "authors": [
    "Alissa J. Rubin",
    "Elian Peltier"
  ],
  "url": "https://www.nytimes.com/2017/11/10/world/europe/sexual-harassment-france-h-reinier.html",
  "body": "The court concurred that the four women had been sexually harassed by a man who was their \u201cteam leader\u201d at H. Reinier, a subsidiary of ONET, one of France\u2019s largest cleaning companies. The harassment started with uncomfortable kisses and inappropriate touching but became more intrusive.\n\n\u201cHe would come up behind me in the bathroom when I was leaning over to clean and rub himself against me,\u201d said Karima Emtir, another of the plaintiffs, bursting into tears a few days before the award as she described in an interview the treatment she had endured.\n\nThe harassment escalated, the women said, after they supported their co-worker, Rachid Lakhal, a whistle-blower who accused one of his superiors of a kickback 

## Query Examples

### 1. Search for articles containing a word in their body.

In [63]:
# Match query against the "body" field of the indexed articles.
body_query = Q("match", body = "supporters")
s = Search(using = es, index = "scrapped").query(body_query)
response = s.execute()

print("#Hits", len(response))
for hit in response:
    print(hit.meta.id + ".", hit.title)

#Hits 10
299. Court in Turkey Acquits Military Officers of Trying to Overthrow Government
28. In Crimea, Russian Land Grab Feeds Cries of ‘Carpetbaggers!’
291. Syrian Refugee Tripped in Hungary Fights Unfounded Accusations of Extremist Ties
219. Erdogan’s Formula for Consolidating Clout in Turkey
117. Setback for Angela Merkel as Far Right Makes Gains in Germany
190. Emmanuel Macron, French Economy Minister, Hints at Presidential Run
214. BBC Journalist Turned Away From Flight to U.S. Because She Was Born in Iran
295. Bronislaw Komorowski, Poland’s President, Concedes Defeat to Right-Wing Challenger Andrzej Duda
186. Hollande’s Romances Turn Into a Political Spectacle in France
104. London Attack Near Mosque Investigated as Terrorism


### 2. Search for articles containing a word in their title.

In [64]:
# Match query against the "title" field of the indexed articles.
title_query = Q("match", title = "Twitter")
s = Search(using = es, index = "scrapped").query(title_query)
response = s.execute()

print("#Hits", len(response))
for hit in response:
    print(hit.meta.id + ".", hit.title)

#Hits 1
201. Murdoch and Fox News Mocked on Twitter for Claims About Muslims


### 3. Search for articles written by a specific author.

In [65]:
# Match query against the "authors" field; each hit is printed with its
# author list.
author_query = Q("match", authors = "Adam Nossiter")
s = Search(using = es, index = "scrapped").query(author_query)
response = s.execute()

print("#Hits", len(response))
for hit in response:
    print(hit.meta.id + ".", hit.title, "--by--" , hit.authors)

#Hits 10
180. As Terrorists Cross Borders, Europe Sees Anew That Its Intelligence Does Not --by-- ['Adam Nossiter']
221. Marine Le Pen’s Anti-Islam Message Gains Influence in France --by-- ['Adam Nossiter']
140. In France, the Mood Darkens as a Harsh Reality Sets In --by-- ['Adam Nossiter']
205. Brussels Attacks Underscore Vulnerability of an Open European Society --by-- ['Adam Nossiter']
228. Some Calais ‘Jungle’ Camp Migrants Get Eviction Reprieve --by-- ['Adam Nossiter']
234. Experts Question Whether Salah Abdeslam Will Provide Answers on Attacks --by-- ['Adam Nossiter']
31. Le Pen and Macron Clash in Vicious Presidential Debate in France --by-- ['Adam Nossiter']
126. François Hollande Cancels Plan to Strip French Citizenship in Terrorism Cases --by-- ['Adam Nossiter']
129. French Terrorism Suspects Appeared Anything But --by-- ['Adam Nossiter']
190. Emmanuel Macron, French Economy Minister, Hints at Presidential Run --by-- ['Adam Nossiter']


### 4. Search using a fuzzy query.

In [66]:
# Fuzzy query: search the "body" field for the misspelled term "sapporters".
s = Search(using = es, index = "scrapped") \
    .query("fuzzy", body = "sapporters")
response = s.execute()

print("Fuzzy Query")
print("#Hits", len(response))
for hit in response:
    print(hit.meta.id + ".", hit.title)

# Compare it with a normal match query for the same misspelled term.
# Both searches must target the same index ("scrapped", the only one this
# notebook builds) for the comparison to be meaningful.
s = Search(using = es, index = "scrapped") \
    .query("match", body = "sapporters")
response = s.execute()

print("\nMatch Query")
print("#Hits", len(response))
for hit in response:
    print(hit.meta.id + ".", hit.title)

Fuzzy Query
#Hits 10
291. Syrian Refugee Tripped in Hungary Fights Unfounded Accusations of Extremist Ties
299. Court in Turkey Acquits Military Officers of Trying to Overthrow Government
28. In Crimea, Russian Land Grab Feeds Cries of ‘Carpetbaggers!’
219. Erdogan’s Formula for Consolidating Clout in Turkey
117. Setback for Angela Merkel as Far Right Makes Gains in Germany
190. Emmanuel Macron, French Economy Minister, Hints at Presidential Run
121. Right-Wing Extremist Convicted of Murdering Jo Cox, a U.K. Lawmaker
214. BBC Journalist Turned Away From Flight to U.S. Because She Was Born in Iran
295. Bronislaw Komorowski, Poland’s President, Concedes Defeat to Right-Wing Challenger Andrzej Duda
186. Hollande’s Romances Turn Into a Political Spectacle in France

Match Query
#Hits 0


### 5. Search with a query on two fields.

In [67]:
# Require both conditions: a match on "body" AND a match on "authors".
body_query = Q("match", body = "supporters")
author_query = Q("match", authors = "Adam Nossiter")
s = Search(using = es, index = "scrapped").query(body_query & author_query)
response = s.execute()

print("#Hits", len(response))
for hit in response:
    print(hit.meta.id + ".", hit.title, "--by--" , hit.authors)

#Hits 1
190. Emmanuel Macron, French Economy Minister, Hints at Presidential Run --by-- ['Adam Nossiter']
