# GitHub Repository: [Elasticsearch-Signal-AI-1-Million-News-Articles-Example](https://github.com/abullard1/Elasticsearch-Signal-AI-1-Million-News-Articles-Example)

#### **<u>Helpful Resources</u>**
#### [Elasticsearch Regular Documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/index.html)
#### [Elasticsearch Python Library Documentation](https://elasticsearch-py.readthedocs.io/en/v8.14.0/)

In [1]:
# Imports
import requests
import json

from elasticsearch import Elasticsearch
from elasticsearch import helpers
from elasticsearch.helpers import bulk
from elasticsearch.client import IngestClient

from tqdm import tqdm

  from elasticsearch.client import IngestClient


In [2]:
# Command to reset elasticsearch password: docker exec -it es01 /usr/share/elasticsearch/bin/elasticsearch-reset-password -u elastic
# Elasticsearch configuration
config = {
    "ELASTICSEARCH_URL": "http://localhost:9200/",
    "KIBANA_URL": "http://localhost:5601/",
    "FILE_PATH": "data/Signal-1M.jsonl",
    "DOC_INDEX_AMOUNT": 1000000
}

In [3]:
# Connecting to Elasticsearch instance and checking the connection
es = Elasticsearch(
    [config["ELASTICSEARCH_URL"]],
    request_timeout=60,
    verify_certs=True)

if es.ping():
    print("Connected to Elasticsearch.")
else:
    print("Failed to connect to Elasticsearch.")
    

Connected to Elasticsearch.


In [4]:
# Checking the cluster health to verify the connection (Tested with 4 elasticsearch nodes)
cluster_health = es.cluster.health()
print(cluster_health)

{'cluster_name': 'es-docker-cluster', 'status': 'green', 'timed_out': False, 'number_of_nodes': 4, 'number_of_data_nodes': 4, 'active_primary_shards': 36, 'active_shards': 72, 'relocating_shards': 0, 'initializing_shards': 0, 'unassigned_shards': 0, 'delayed_unassigned_shards': 0, 'number_of_pending_tasks': 0, 'number_of_in_flight_fetch': 0, 'task_max_waiting_in_queue_millis': 0, 'active_shards_percent_as_number': 100.0}


In [5]:
# Connecting to Kibana to verify the connection
try:
    response = requests.get(config["KIBANA_URL"])
    if response.status_code == 200:
        print("Connected to Kibana.")
    else:
        print(f"Failed to connect to Kibana. Status code: {response.status_code}")
except requests.exceptions.RequestException as e:
    print(f"Error connecting to Kibana: {e}")

Connected to Kibana.


In [6]:
# Function to create an index with the specified mappings
def create_index(index_name, mappings):
    try:
        es.indices.create(index=index_name, body=mappings)
        print(f"Index {index_name} created successfully.")
    except Exception as e:
        print(f"Failed to create index {index_name}. Error: {str(e)}")


In [7]:
# Example of the Signal AI 1 Million News Articles Dataset Data
# {
#     "id": "a080f99a-07d9-47d1-8244-26a540017b7a",
#     "content": "KUALA LUMPUR, Sept 15 (MySinchew) -- The Kuala Lumpur City Hall today issued ...",
#     "title": "Pay up or face legal action: DBKL",
#     "media-type": "News",
#     "source": "My Sinchew",
#     "published": "2015-09-15T10:17:53Z"
# }

In [8]:
# Mappings corresponding to the fields in the Signal AI 1 Million News Articles Dataset Data
mappings = {
    "mappings": {
        "properties": {
            "id": {"type": "keyword"},
            "content": {"type": "text"},
            "title": {"type": "text"},
            "media-type": {"type": "keyword"},
            "source": {"type": "keyword"},
            "published": {"type": "date"}
        }
    }
}

In [9]:
# Deletes a specified index if it exists
def delete_index(index_name):
    try:
        response = es.indices.delete(index=index_name)
        print(f"Index {index_name} deleted successfully: {response}")
    except Exception as e:
        print(f"Failed to delete index {index_name}. Error: {str(e)}")

In [10]:
# Deletes the "news_data" index if it exists
delete_index("news_data")

Index news_data deleted successfully: {'acknowledged': True}


In [11]:
# Creates the "news_data" index with the specified mappings
create_index(index_name="news_data", mappings=mappings)

Index news_data created successfully.


In [12]:
# Indexes documents in bulk
# Takes an op_type, index_name, documents, index_amount, chunk_size, and pipeline as input
# op_type: "index", "create", "update", "delete"
# index_name: Name of the index
# documents: List of documents to be indexed
# index_amount: Amount of documents to be indexed
# chunk_size: Number of documents to be indexed in each chunk
# pipeline: Name/ID of the ingest pipeline to be used if any
def index_docs_bulk(es, op_type, index_name, documents, index_amount, chunk_size, pipeline=None):
    # Defines the indexing actions to be performed
    actions = [
        {
            "_op_type": op_type,
            "_index": index_name,
            "_id": doc["id"],
            "_source": doc,
            "pipeline": pipeline,
            "chunk_size": chunk_size,
        }
        for doc in documents[:index_amount]
    ]
    
    # Configures the progress bar for the indexing process
    progress = tqdm(total=len(actions), desc="Indexing documents", unit="doc")
    progress.bar_format = "{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]"
    progress.set_postfix_str("Indexing...")

    for success, info in helpers.streaming_bulk(es, actions):
        progress.update(1)  # Updates the progress bar
        if not success:
            print(f"Failed to index document: {info}")

    progress.close()

In [13]:
# Loads the documents from the specified file path
def load_documents(file_path, num_lines):
    file_data = []
    with open(file_path, "r") as json_file:
        for i, line in enumerate(json_file):
            if i >= num_lines:
                break
            file_data.append(json.loads(line))
    print(f"Loaded {len(file_data)} documents from {file_path}")
    return file_data

In [14]:
# Loads the documents from the documents from the Signal AI 1 Million News Articles Dataset
file_data = load_documents(config["FILE_PATH"], config["DOC_INDEX_AMOUNT"])
# Prints example document
print(file_data[0])

Loaded 1000000 documents from data/Signal-1M.jsonl
{'id': 'f7ca322d-c3e8-40d2-841f-9d7250ac72ca', 'content': 'VETERANS saluted Worcester\'s first ever breakfast club for ex-soldiers which won over hearts, minds and bellies. \n \nThe Worcester Breakfast Club for HM Forces Veterans met at the Postal Order in Foregate Street at 10am on Saturday. \n \nThe club is designed to allow veterans a place to meet, socialise, eat and drink, giving hunger and loneliness their marching orders. \n \nFather-of-two Dave Carney, aged 43, of Merrimans Hill, Worcester, set up the club after being inspired by other similar clubs across the country. \n \nHe said: "As you can see from the picture, we had a good response. Five out of the 10 that attended said they saw the article in the newspaper and turned up. \n \n"We even had an old chap travel from Droitwich and he was late on parade by three hours. \n \n"It\'s generated a lot of interest and I estimate (from other veterans who saw the article) that next m

In [15]:
# Filters the data and ensures, that only the specified fields are included in the document
def filter_data(data):
    filtered_data = []
    for line in data:
        try:
            filtered_data.append({
                "id": line["id"],
                "content": line["content"],
                "title": line["title"],
                "media-type": line["media-type"],
                "source": line["source"],
                "published": line["published"]
            })
        except Exception as e:
            print(f"Failed to filter document: {str(e)}")
            continue
    return filtered_data

In [16]:
index_docs_bulk(es=es , op_type="index", index_name="news_data", documents=filter_data(file_data), index_amount=config["DOC_INDEX_AMOUNT"], chunk_size=1000, pipeline=None)

Indexing documents: 100%|██████████| 1000000/1000000 [11:10<00:00, 1490.97doc/s]


In [17]:
# Counts the number of documents in the "news_data" index and prints the count
doc_count = es.count(index="news_data")["count"]
print(f"Document count in 'news_data': {doc_count}")

Document count in 'news_data': 1000000


In [18]:
# Prints the first document in the "news_data" index to verify the successful indexing
example_document_news_data = es.get(index="news_data", id=file_data[0]["id"])
print(example_document_news_data["_source"])

{'id': 'f7ca322d-c3e8-40d2-841f-9d7250ac72ca', 'content': 'VETERANS saluted Worcester\'s first ever breakfast club for ex-soldiers which won over hearts, minds and bellies. \n \nThe Worcester Breakfast Club for HM Forces Veterans met at the Postal Order in Foregate Street at 10am on Saturday. \n \nThe club is designed to allow veterans a place to meet, socialise, eat and drink, giving hunger and loneliness their marching orders. \n \nFather-of-two Dave Carney, aged 43, of Merrimans Hill, Worcester, set up the club after being inspired by other similar clubs across the country. \n \nHe said: "As you can see from the picture, we had a good response. Five out of the 10 that attended said they saw the article in the newspaper and turned up. \n \n"We even had an old chap travel from Droitwich and he was late on parade by three hours. \n \n"It\'s generated a lot of interest and I estimate (from other veterans who saw the article) that next month\'s meeting will attract about 20 people. Onwar

In [19]:
# Deletes the "news_data_custom" index if it exists
delete_index("news_data_custom")

Index news_data_custom deleted successfully: {'acknowledged': True}


In [20]:
# Creates the "news_data_custom" index with the specified mappings
create_index(index_name="news_data_custom", mappings=mappings)

Index news_data_custom created successfully.


In [21]:
# Creating an IngestClient to manage ingest processing pipelines
ingest_client = IngestClient(es)

In [22]:
# Creates a pipeline that is fed into the reindexing operation and which makes the title uppercase and the content lowercase
# https://www.elastic.co/guide/en/elasticsearch/reference/current/ingest.html#create-manage-ingest-pipelines

# API Approach (not used)
# def create_preprocessing_pipeline():
#     pipeline = {
#         "description": "Preprocessing pipeline",
#         "processors": [
#             {
#                 "lowercase": {
#                     "description": "Convert title to uppercase",
#                     "field": "title"
#                 }
#             },
#             {
#                 "uppercase": {
#                     "description": "Convert content to lowercase",
#                     "field": "content"
#                 }
#             }
#         ]
#     }
# 
#     try:
#         response = es.ingest.put_pipeline(id="preprocessing_pipeline", body=pipeline)
#         print("Preprocessing pipeline created successfully.")
#     except Exception as e:
#         print(f"Failed to create preprocessing pipeline. Error: {str(e)}")
# 

# Elasticsearch Python Library Approach
# https://elasticsearch-py.readthedocs.io/en/v8.14.0/api/ingest-pipelines.html

# Creates a pipeline that is later fed into the reindexing operation and which makes a field uppercase and another lowercase
# Also removes newline characters from the content field and the backslash escape character in front of the single quote character
def create_preprocessing_pipeline(es, ingest_client, uppercase_field, lowercase_field):
    pipeline = {
        "description": "Preprocessing pipeline",
        "processors": [
            {
                "uppercase": {
                    "description": "Convert title to uppercase",
                    "field": uppercase_field
                }
            },
            {
                "lowercase": {
                    "description": "Convert content to lowercase",
                    "field": lowercase_field
                }
            },
            {
                "gsub": {
                    "description": "Removes newline characters from content field",
                    "field": "content",
                    "pattern": "\\n",
                    "replacement": ""
                }
            },
            {
                "gsub": {
                    "description": "Removes the backslash escape character in front of the single quote character",
                    "field": "content",
                    "pattern": "\\'",
                    "replacement": "'"
                }
            }
        ]
    }

    try:
        ingest_client.put_pipeline(id="preprocessing_pipeline", body=pipeline)
        print("Preprocessing pipeline created successfully.")
    except Exception as e:
        print(f"Failed to create preprocessing pipeline. Error: {str(e)}")

In [23]:
# Creates the preprocessing pipeline with the specified fields to be transformed
create_preprocessing_pipeline(es, ingest_client=ingest_client, uppercase_field="title", lowercase_field="content")

Preprocessing pipeline created successfully.


In [24]:
# https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-reindex.html
# https://elasticsearch-py.readthedocs.io/en/v8.14.0/helpers.html#reindex
# Function to reindex with transformation pipeline. This will copy documents from the source index to the target index and apply the specified pipeline transformation.
# Uses the API instead of the Python library helper function, as this is the recommended approach and only this approach allows for the use of a pipeline during the reindexing process
def reindex_with_transformation(es, source_index, target_index, pipeline_id, batch_size=500, requests_per_second=2000, request_timeout=3000):
    body = {
        "source": {
            "index": source_index,
            "size": batch_size
        },
        "dest": {
            "index": target_index,
            "pipeline": pipeline_id
        },
        "conflicts": "proceed"
    }

    response = es.options(request_timeout=request_timeout).reindex(
        body=body,
        wait_for_completion=True,
        slices="auto",
        refresh=True,
        requests_per_second=requests_per_second
    )
    return response

In [25]:
# Fetches documents from the first index
reindex_response = reindex_with_transformation(es=es, source_index="news_data", target_index="news_data_custom", pipeline_id="preprocessing_pipeline", batch_size=500, requests_per_second=2000, request_timeout=3000)

In [26]:
# Counts the number of documents in the "news_data_custom" index and prints the count
doc_count_custom = es.count(index="news_data_custom")["count"]
print(f"Document count in 'news_data_custom': {doc_count_custom}")

Document count in 'news_data_custom': 1000000


In [27]:
# Prints the second document in the "news_data" index to verify the successful preprocessing using the pipeline
example_document_news_data_custom = es.get(index="news_data_custom", id=file_data[0]["id"])
print(example_document_news_data_custom["_source"])

{'id': 'f7ca322d-c3e8-40d2-841f-9d7250ac72ca', 'source': 'Redditch Advertiser', 'published': '2015-09-07T10:16:14Z', 'title': 'WORCESTER BREAKFAST CLUB FOR VETERANS GIVES HUNGER ITS MARCHING ORDERS', 'media-type': 'News', 'content': 'veterans saluted worcester\'s first ever breakfast club for ex-soldiers which won over hearts, minds and bellies.  the worcester breakfast club for hm forces veterans met at the postal order in foregate street at 10am on saturday.  the club is designed to allow veterans a place to meet, socialise, eat and drink, giving hunger and loneliness their marching orders.  father-of-two dave carney, aged 43, of merrimans hill, worcester, set up the club after being inspired by other similar clubs across the country.  he said: "as you can see from the picture, we had a good response. five out of the 10 that attended said they saw the article in the newspaper and turned up.  "we even had an old chap travel from droitwich and he was late on parade by three hours.  "it

In [28]:
delete_index("news_data_1000_1")
create_index(index_name="news_data_1000_1", mappings=mappings)

Index news_data_1000_1 deleted successfully: {'acknowledged': True}
Index news_data_1000_1 created successfully.


In [29]:
index_docs_bulk(es=es, op_type="index", index_name="news_data_1000_1", documents=filter_data(file_data), index_amount=1000, chunk_size=1000, pipeline="preprocessing_pipeline")

Indexing documents: 100%|██████████| 1000/1000 [00:00<00:00, 1206.41doc/s]


In [30]:
doc_count_1000_1 = es.count(index="news_data_1000_1")["count"]
print(f"Document count in 'news_data_1000_1': {doc_count_1000_1}")

Document count in 'news_data_1000_1': 1000


In [31]:
# Prints the second document in the "news_data_1000_1" index to verify the preprocessing pipeline
example_document_news_data_1000_1 = es.get(index="news_data_1000_1", id=file_data[1]["id"])
print(example_document_news_data_1000_1["_source"])

{'id': '609772bc-0672-4db5-8516-4c025cfd54ca', 'source': 'Virtualization Conference & Expo', 'published': '2015-09-17T15:00:00Z', 'title': 'JUMPSHOT GIVES MARKETERS RENEWED VISIBILITY INTO PAID AND ORGANIC KEYWORDS WITH LAUNCH OF JUMPSHOT ELITE', 'media-type': 'News', 'content': 'new product gives marketers access to real keywords, conversions and results along with 13 months of historical data san francisco, ca -- (marketwired) -- 09/17/15 -- jumpshot, a marketing analytics company that uses distinctive data sources to paint a complete picture of the online customer journey, today announced the launch of jumpshot elite, giving marketers insight into what their customers are doing the 99% of the time they\'re not on your site. for years, marketers have been unable to see what organic and paid search terms users were entering, much less tie those searches to purchases. jumpshot not only injects that user search visibility back into the market, but also makes it possible to tie those key

In [32]:
delete_index("news_data_custom_1000_2")
create_index(index_name="news_data_custom_1000_2", mappings=mappings)

Index news_data_custom_1000_2 deleted successfully: {'acknowledged': True}
Index news_data_custom_1000_2 created successfully.


In [33]:
index_docs_bulk(es=es, op_type="index", index_name="news_data_1000_2", documents=filter_data(file_data), index_amount=1000, chunk_size=1000, pipeline="preprocessing_pipeline")

Indexing documents: 100%|██████████| 1000/1000 [00:01<00:00, 783.19doc/s]


In [34]:
doc_count_1000_2 = es.count(index="news_data_1000_2")["count"]
print(f"Document count in 'news_data_1000_2': {doc_count_1000_2}")

Document count in 'news_data_1000_2': 1000


In [35]:
# Printing the second document in the "news_data_1000_2" index to verify the preprocessing pipeline
example_document_news_data_1000_2 = es.get(index="news_data_1000_2", id=file_data[1]["id"])
print(example_document_news_data_1000_2["_source"])

{'id': '609772bc-0672-4db5-8516-4c025cfd54ca', 'source': 'Virtualization Conference & Expo', 'published': '2015-09-17T15:00:00Z', 'title': 'JUMPSHOT GIVES MARKETERS RENEWED VISIBILITY INTO PAID AND ORGANIC KEYWORDS WITH LAUNCH OF JUMPSHOT ELITE', 'media-type': 'News', 'content': 'new product gives marketers access to real keywords, conversions and results along with 13 months of historical data san francisco, ca -- (marketwired) -- 09/17/15 -- jumpshot, a marketing analytics company that uses distinctive data sources to paint a complete picture of the online customer journey, today announced the launch of jumpshot elite, giving marketers insight into what their customers are doing the 99% of the time they\'re not on your site. for years, marketers have been unable to see what organic and paid search terms users were entering, much less tie those searches to purchases. jumpshot not only injects that user search visibility back into the market, but also makes it possible to tie those key

In [36]:
# Helper function to print the search response in a more readable and organized format
def print_response(response):
    print("Search Metadata:")
    print(f" - Took: {response['took']} ms")
    print(f" - Timed Out: {response['timed_out']}")
    print(f" - Shards: Total - {response['_shards']['total']}, Successful - {response['_shards']['successful']}, Skipped - {response['_shards']['skipped']}, Failed - {response['_shards']['failed']}")
    print(f" - Hits: Total - {response['hits']['total']['value']} (Relation: {response['hits']['total']['relation']})")
    print(f" - Max Score: {response['hits']['max_score']}")

    print("\nHits:")
    for hit in response['hits']['hits']:
        print(f" - Index: {hit['_index']}")
        print(f"   ID: {hit['_id']}")
        print(f"   Score: {hit['_score']}")
        print(f"   Source: {hit['_source']}\n")

In [37]:
# Function to search an index with a specified query
def search_index(query, index_name):
    try:
        response = es.search(index=index_name, body=query)
        return response
    except Exception as e:
        print(f"Search failed. Error: {str(e)}")
        return None

In [38]:
# Function to search for a value in a specified field of an index using a specified search type
def search_field(field, search_type, value, index_name):
    query = {
        "query": {
            search_type: {
                field: value
            }
        }
    }
    return search_index(query, index_name)

In [39]:
# Searches for documents with the "University" keyword in the "title" field of the "news_data" index
field_search_result = search_field(field="title", search_type="match", value="University", index_name="news_data")

# Prints the field query search response
print_response(field_search_result)

Search Metadata:
 - Took: 278 ms
 - Timed Out: False
 - Shards: Total - 1, Successful - 1, Skipped - 0, Failed - 0
 - Hits: Total - 2896 (Relation: eq)
 - Max Score: 9.594052

Hits:
 - Index: news_data
   ID: 9d7e8ab2-970d-4628-9022-7612c59f7c5f
   Score: 9.594052
   Source: {'id': '9d7e8ab2-970d-4628-9022-7612c59f7c5f', 'content': 'Comments:\r10.09.2015 Pamula:\nan essay until it can be bearable, cowper you will d university essayhwestern university find oneself inside bruising.', 'title': 'D university essayhwestern university', 'media-type': 'Blog', 'source': 'Все здесь!', 'published': '2015-09-18T08:36:11Z'}

 - Index: news_data
   ID: 79435fde-ce56-40e3-ae9e-fb1e92b4aed4
   Score: 8.628289
   Source: {'id': '79435fde-ce56-40e3-ae9e-fb1e92b4aed4', 'content': 'Hey guys! Yet again, I am seriously sorry for the lack of posts recently. Everything’s still a little bit crazy at the moment, but I really hope things will start to become clearer in the next couple of weeks, so that should m

In [40]:
boolean_query = {
    "query": {
        "bool": {
            "must": [
                {"match": {"title": "University"}},
                {"match_phrase_prefix": {"content": "German"}}
            ],
            "must_not": [
                {"match": {"source": "The Sun"}},
                {"range": {"published": {"gte": "2010-01-01T00:00:00Z", "lte": "2015-06-20T23:59:59Z"}}}
            ]
        }
    }
}

In [41]:
# Similar query as above but as a boolean query with additional constraints
boolean_query_result = search_index(boolean_query, "news_data")

# Prints the boolean query search response
print_response(boolean_query_result)

Search Metadata:
 - Took: 380 ms
 - Timed Out: False
 - Shards: Total - 1, Successful - 1, Skipped - 0, Failed - 0
 - Hits: Total - 74 (Relation: eq)
 - Max Score: 18.951775

Hits:
 - Index: news_data
   ID: dad47c11-472c-48fc-bc13-c50b99bffa31
   Score: 18.951775
   Source: {'id': 'dad47c11-472c-48fc-bc13-c50b99bffa31', 'content': 'Cuban academicians of the Civil Construction are receiving an International summer course taught at the University of Camaguey, which deals with the life cycle management of buildings. \nThe postgraduate course is taught by schoolteachers of the Munster University of Applied Sciences (MUAS), in Germany, together with professors of the "Jose Antonio Echeverria" Higher Polytechnic Institute (ISPJAE), in Havana, and encompasses topics as the feasibility studies and the investment evaluation, among others. \n\nClara Mustelier, Doctor of Sciences in the IPSJAE, declared to the university website www.reduc.edu.cu that the course studies a working mode that is rei

In [42]:
# Ten search queries around the time of September 2015
# 1. Migration Crisis in Europe
# 2. Volkswagen Emisson Scandal
# 3. Syria Conflict and Russian Military Intervention
# 4. Terror Attacks in Paris
# 5. Earthquake in Nepal
# 6. Greece Crisis
# 7. Pope Visit in the USA
# 8. FIFA Corruption Scandal
# 9. Iran Nuclear Deal
# 10. Apple introduces the iPhone 6s and iPhone 6s Plus

In [43]:
# 1. European Refugee Crisis
query_one = {
    "query": {
        "bool": {
            "should": [
                {"match": {"media-type": "News"}},
                {"match": {"content": "Migration crisis"}},
                {"match": {"content": "Refugee crisis"}},
                {"match": {"content": "European migrant crisis"}},
                {"match": {"content": "European refugee crisis"}},
                {"match": {"content": "Migrant crisis"}},
                {"match": {"content": "Europe"}},
                {"match": {"content": "European"}},
                {"match": {"content": "refugee"}},
                {"match": {"content": "migrant"}},
                {"match": {"content": {"query": "German", "boost": 2}}},
                {"prefix": {"content": {"value": "German", "boost": 2}}}
            ],
            "must_not": [
                {"match": {"source": "Al Jazeera"}},
                {"match": {"content": "Turkey"}},
                {"match": {"content": "Turkish"}},
                {"match": {"content": "Erdogan"}},
                {"match": {"content": "President"}},
                {"match": {"content": "Istanbul"}},
                {"match": {"content": "Turkye"}}
            ]
        }
    }
}

# 2. VW Emission Scandal
query_two = {
    "query": {
        "bool": {
            "should": [
                {"match": {"media-type": "News"}},
                {"match": {"content": "Volkswagen scandal"}},
                {"match": {"content": "Volkswagen emissions scandal"}},
                {"match": {"content": "VW scandal"}},
                {"match": {"content": "VW emissions scandal"}},
                {"match": {"content": "Volkswagen emissions"}}
            ],
            "must_not": [
                {"match": {"content": "Government"}},
                {"match": {"content": "Legislation"}},
                {"match": {"content": "Regulation"}},
                {"match": {"content": "Law"}},
                {"match": {"content": "Legal"}}
            ]
        }
    }
}

# 3. Syria Conflict and Russian Military Intervention
query_three = {
    "query": {
        "bool": {
            "should": [
                {"match": {"media-type": "News"}},
                {"match": {"content": "Syria conflict"}},
                {"match": {"content": "Russian military intervention"}},
                {"match": {"content": "Syrian civil war"}},
                {"match": {"content": "Russian intervention in Syria"}},
                {"match": {"content": "Syrian conflict"}},
                {"match": {"content": "Russian military in Syria"}},
                {"match": {"content": "Syrian war"}}
            ],
            "must_not": [
                {"match": {"source": "Al Jazeera"}},
                {"match": {"content": "Afghanistan"}},
                {"match_phrase_prefix": {"content": "US"}}
            ]
        }
    }
}

# 4. Terror Attacks in Paris
query_four = {
    "query": {
        "bool": {
            "should": [
                {"match": {"media-type": "News"}},
                {"match": {"content": "Paris terror attacks"}},
                {"match": {"content": "Paris attacks"}},
                {"match": {"content": "Paris bombings"}},
                {"match": {"content": "Paris shootings"}},
                {"match": {"content": "Paris massacre"}},
                {"match": {"content": "Victim"}},
                {"match": {"content": "Victims"}},
                {"match": {"content": "Weapon"}},
                {"match": {"content": "Weapons"}},
                {"match": {"content": "Explosion"}},
                {"match": {"content": "Explosions"}},
                {"match": {"content": "Dead"}},
                {"match": {"content": "Death"}},
                {"match": {"content": "Injured"}},
                {"match": {"content": "Injury"}},
                {"match": {"content": "Terror"}}
            ],
            "must_not": [
                {"match": {"source": "Reuters"}},
                {"match": {"source": "Al Jazeera"}},
                {"match": {"content": "Politics"}},
                {"match": {"content": "Political"}},
                {"match": {"content": "Right-wing"}},
                {"match": {"content": "Left-wing"}},
                {"match": {"content": "Government"}},
                {"match": {"content": "Right"}},
                {"match": {"content": "Left"}}
            ]
        }
    }
}

# 5. Earthquake in Nepal
query_five = {
    "query": {
        "bool": {
            "should": [
                {"match": {"media-type": "News"}},
                {"match": {"content": "Nepal earthquake"}},
                {"match": {"content": "earthquake in Nepal"}},
                {"match": {"content": "Nepal quake"}},
                {"match": {"content": "quake in Nepal"}},
                {"match": {"content": "Nepal disaster"}},
                {"match": {"content": "disaster in Nepal"}},
                {"match": {"content": "Nepal natural disaster"}},
                {"match": {"content": "natural disaster in Nepal"}},
                {"match": {"content": "Nepal catastrophe"}},
                {"match": {"content": "catastrophe in Nepal"}},
                {"match": {"content": {"query": "Homeless", "boost": 2}}},
                {"match": {"content": "Magnitude"}}
            ],
            "must_not": [
                {"match": {"source": "Reuters"}},
                {"match": {"content": "Politics"}},
                {"match": {"content": "Political"}},
                {"match": {"content": "Government"}},
                {"match": {"content": "Political"}},
                {"match": {"content": "Right-wing"}},
                {"match": {"content": "Left-wing"}}
            ]
        }
    }
}

# 6. Greece Crisis
query_six = {
    "query": {
        "bool": {
            "should": [
                {"match": {"media-type": "News"}},
                {"match": {"content": "Greece crisis"}},
                {"match": {"content": "Greek financial crisis"}},
                {"match": {"content": "Greek debt crisis"}},
                {"match": {"content": "Greece debt crisis"}},
                {"match": {"content": "Greek economic crisis"}}
            ],
            "must_not": [
                {"match": {"source": "Reuters"}},
                {"match": {"content": "Sociopolitical"}},
                {"match": {"content": "Beach"}},
                {"match": {"content": "Sunny"}},
                {"match": {"content": "Ocean"}}
            ]
        }
    }
}

# 7. Pope's Visit to the USA
query_seven = {
    "query": {
        "bool": {
            "should": [
                {"match": {"content": "Pope visit USA"}},
                {"match": {"content": "Pope Francis visit USA"}},
                {"match": {"content": "Pope Francis visit America"}},
                {"match": {"content": "Pope visit America"}},
                {"match": {"content": "Pope Francis visit United States"}},
                {"match": {"content": "Pope visit United States"}},
                {"match": {"content": "Christian"}},
                {"match": {"content": "Catholic"}},
                {"match": {"content": "Religion"}},
                {"match": {"content": "Church"}},
                {"match": {"content": "Christianity"}},
                {"match": {"content": "Catholicism"}}
            ],
            "must_not": [
                {"match": {"source": "Reuters"}},
                {"match": {"content": "Obama"}},
                {"match": {"content": "POTUS"}},
                {"match": {"content": "White House"}},
                {"match": {"content": "Congress"}},
                {"match": {"content": "Senate"}}
            ]
        }
    }
}

# 8. FIFA Corruption Scandal
query_eight = {
    "query": {
        "bool": {
            "should": [
                {"match": {"media-type": "News"}},
                {"match": {"content": "FIFA corruption scandal"}},
                {"match": {"content": "FIFA corruption"}},
                {"match": {"content": "FIFA scandal"}},
                {"match": {"content": "FIFA corruption probe"}},
                {"match": {"content": "FIFA corruption investigation"}},
                {"match": {"content": "FIFA corruption charges"}},
                {"match": {"content": "FIFA corruption allegations"}},
                {"match": {"content": "FIFA corruption case"}},
                {"match": {"content": "FIFA corruption trial"}},
                {"match": {"content": "FIFA corruption scandal"}},
                {"match": {"content": "Fédération Internationale de Football Association"}},
                {"match": {"content": "bribery"}},
                {"match": {"content": "corruption"}},
                {"match": {"content": "scandal"}},
                {"match": {"content": "investigation"}},
                {"match": {"content": "officials"}}
            ],
            "must_not": [
                {"match": {"source": "Reuters"}},
                {"match": {"content": "Soccer"}},
                {"match": {"content": "World Cup"}},
                {"match": {"content": "Match"}},
                {"match": {"content": "Goal"}},
                {"match": {"content": "Score"}}
            ]
        }
    }
}

# 9. Iran Nuclear Deal
query_nine = {
    "query": {
        "bool": {
            "should": [
                {"match": {"media-type": "News"}},
                {"match": {"content": "Iran"}},
                {"match": {"content": "Iran nuclear deal"}},
                {"match": {"content": "Iran nuclear agreement"}},
                {"match": {"content": "Iran nuclear pact"}},
                {"match": {"content": "Iran nuclear accord"}},
                {"match": {"content": "Iran nuclear treaty"}},
                {"match": {"content": "Iran nuclear program"}},
                {"match": {"content": "Iran nuclear talks"}},
                {"match": {"content": "Iran nuclear negotiations"}},
                {"match": {"content": "Iran nuclear sanctions"}},
                {"match": {"content": "Nuclear weapons"}},
                {"match": {"content": "Nuclear capabilities"}},
                {"match": {"content": "Nuclear capabilities"}},
                {"match": {"content": "Nuclear development"}},
                {"match": {"content": "Nuclear research"}},
                {"match": {"content": "Nuclear facilities"}},
                {"match": {"content": "Nuclear enrichment"}}
            ],
            "must_not": [
                {"match": {"source": "Al Jazeera"}},
                {"match": {"content": "WMD"}},
                {"match": {"content": "Weapons of mass destruction"}}
            ]
        }
    }
}

# 10. Apple's Introduction of iPhone 6s and iPhone 6s Plus
query_ten = {
    "query": {
        "bool": {
            "should": [
                {"match": {"media-type": "Blog"}},
                {"match": {"content": "Apple introduces iPhone 6s"}},
                {"match": {"content": "Apple introduces iPhone 6s Plus"}},
                {"match": {"content": "Apple unveils iPhone 6s"}},
                {"match": {"content": "Apple Keynote"}},
                {"match": {"content": "Apple presentation"}},
                {"match": {"content": "Apple reveal"}},
                {"match": {"content": "IPhone"}},
                {"match": {"content": "IPhone 6s"}},
                {"match": {"content": "IPhone 6s Plus"}},
                {"match": {"content": "IPhone 6s release"}},
                {"match": {"content": "IPhone 6s Plus release"}},
                {"match": {"content": "Announcement"}}
            ],
            "must_not": [
                {"match": {"content": "Samsung"}},
                {"match": {"content": "LG"}},
                {"match": {"content": "Google"}},
                {"match": {"content": "Galaxy"}},
                {"match": {"content": "Android"}},
                {"match": {"content": "Huawei"}},
                {"match": {"content": "Xiaomi"}},
                {"match": {"content": "Blackberry"}},
                {"match": {"content": "Sony"}},
                {"match": {"content": "Nokia"}},
                {"match": {"content": "Fairphone"}},
                {"match": {"content": "Microsoft"}}
            ]
        }
    }
}

In [44]:
# 1st Search Query: Migration Crisis in Europe
migration_crisis_result = search_index(query_one, "news_data_custom")
print(print_response(migration_crisis_result))

Search Metadata:
 - Took: 222 ms
 - Timed Out: False
 - Shards: Total - 1, Successful - 1, Skipped - 0, Failed - 0
 - Hits: Total - 10000 (Relation: gte)
 - Max Score: 103.26129

Hits:
 - Index: news_data_custom
   ID: 8ef7589f-2975-4a0a-91fb-848293e58d30
   Score: 103.26129
   Source: {'id': '8ef7589f-2975-4a0a-91fb-848293e58d30', 'source': 'Kenly News', 'published': '2015-09-06T12:26:45Z', 'title': "POPE TELLS EUROPE'S CHURCHES TO HOST REFUGEES - CNN", 'media-type': 'News', 'content': 'story highlights pope implores catholic churches to host refugee families 11,000 migrants had crossed into austria by midday sunday, interior ministry says report: germany\'s kindness on saturday should be seen as exception, german foreign minister says "every parish, every religious community, every monastery, every sanctuary of europe has to host a family, starting from my diocese of rome," francis said at the end of his angelus prayers in rome on sunday. "the two parishes in the vatican these days w

In [45]:
# 2nd Search Query: Volkswagen Emission Scandal
vw_emission_scandal_result = search_index(query_two, "news_data_custom")
print_response(vw_emission_scandal_result)

Search Metadata:
 - Took: 145 ms
 - Timed Out: False
 - Shards: Total - 1, Successful - 1, Skipped - 0, Failed - 0
 - Hits: Total - 10000 (Relation: gte)
 - Max Score: 111.35358

Hits:
 - Index: news_data_custom
   ID: 0ef6a0ee-6b16-45b2-ac20-70dd1bac37b2
   Score: 111.35358
   Source: {'id': '0ef6a0ee-6b16-45b2-ac20-70dd1bac37b2', 'source': 'Autocar.co.uk', 'published': '2015-09-28T11:22:38Z', 'title': 'VW EMISSIONS SCANDAL: PROSECUTORS TO INVESTIGATE MARTIN WINTERKORN', 'media-type': 'News', 'content': 'reports suggest that volkswagen\'s former chief executive subjected to fraud probe; several other executives are suspended from the company german prosecutors have opened an investigation into former volkswagen group boss martin winterkorn as a result of the emissions scandal that has engulfed the company. the prosecutor’s office in germany has said that winterkorn is being investigated over “allegations of fraud in the sale of cars with manipulated emissions data.” in germany, anyone

In [46]:
# 3rd Search Query: Syria Conflict and Russian Military Intervention
syria_conflict_result = search_index(query_three, "news_data_custom")
print_response(syria_conflict_result)

Search Metadata:
 - Took: 169 ms
 - Timed Out: False
 - Shards: Total - 1, Successful - 1, Skipped - 0, Failed - 0
 - Hits: Total - 10000 (Relation: gte)
 - Max Score: 115.40952

Hits:
 - Index: news_data_custom
   ID: 5f23bd62-fd18-401b-9ce5-cbaa53ef3cd3
   Score: 115.40952
   Source: {'id': '5f23bd62-fd18-401b-9ce5-cbaa53ef3cd3', 'source': 'MyInforms', 'published': '2015-09-05T21:00:52Z', 'title': 'KERRY CONFRONTS RUSSIA OVER REPORTS OF ESCALATING INVOLVEMENT IN SYRIA', 'media-type': 'News', 'content': "further russian intervention in syria’s civil war could make things worse, kerry s… further russian intervention in syria’s civil war could make things worse, secretary of state john kerry told russian foreign minister sergey lavrov in a phone call today. washington — further russian intervention in syria’s civil war could make things worse for civilians and the coalition battling islamic state fighters there, secretary of state john kerry warned russian foreign minister sergey lavrov

In [47]:
# 4th Search Query: Terror Attacks in Paris
paris_terror_attacks_result = search_index(query_four, "news_data_custom")
print_response(paris_terror_attacks_result)

Search Metadata:
 - Took: 148 ms
 - Timed Out: False
 - Shards: Total - 1, Successful - 1, Skipped - 0, Failed - 0
 - Hits: Total - 10000 (Relation: gte)
 - Max Score: 66.50238

Hits:
 - Index: news_data_custom
   ID: 0d025613-a36f-40c6-8fff-6548705c6f8a
   Score: 66.50238

 - Index: news_data_custom
   ID: dece581e-7f4d-4494-b24a-a8879ef4402c
   Score: 65.14121
   Source: {'id': 'dece581e-7f4d-4494-b24a-a8879ef4402c', 'source': 'KCRA.com', 'published': '2015-09-11T19:04:04Z', 'title': 'WATCH LIVE: 3 MEN HONORED IN SACRAMENTO HOMETOWN HEROES PARADE', 'media-type': 'News', 'content': 'thousands of people turned out on sacramento\'s capitol mall to honor three men credited with thwarting a suspected terror attack on a paris-bound train (sept. 11, 2015) thousands of people turned out on sacramento\'s capitol mall to honor three men credited with thwarting a suspected terror attack on a paris-bound train (sept. 11, 2015) the and festivities will kick off at noon friday on capitol mall to h

In [48]:
# 5th Search Query: Earthquake in Nepal
nepal_earthquake_result = search_index(query_five, "news_data_custom")
print_response(nepal_earthquake_result)

Search Metadata:
 - Took: 84 ms
 - Timed Out: False
 - Shards: Total - 1, Successful - 1, Skipped - 0, Failed - 0
 - Hits: Total - 10000 (Relation: gte)
 - Max Score: 173.55121

Hits:
 - Index: news_data_custom
   ID: 4eb2ef55-0bd6-47d4-b2b3-94f72177a526
   Score: 173.55121
   Source: {'id': '4eb2ef55-0bd6-47d4-b2b3-94f72177a526', 'source': 'Kathmandu Post', 'published': '2015-09-25T13:27:59Z', 'title': '[INFOGRAPHIC] HOW MUCH DOES NEPAL NEED FOR RECONSTRUCTION?', 'media-type': 'News', 'content': 'last week the international community called for finalisation of reconstruction bill immediately so that the work on reconstruction can begin.\xa0 so what are the needs according to sectors?\xa0 according to the post disaster a post disaster needs assessment (pdna) report reveals. nepal has faced cumulative losses worth rs700 billion ($7 billion) due to the april 25 earthquake and its aftershocks. the economic losses were estimated at rs187.08 billion. the loss means the impact on the economy

In [49]:
# 6th Search Query: Greece Crisis
greece_crisis_result = search_index(query_six, "news_data_custom")
print_response(greece_crisis_result)

Search Metadata:
 - Took: 51 ms
 - Timed Out: False
 - Shards: Total - 1, Successful - 1, Skipped - 0, Failed - 0
 - Hits: Total - 10000 (Relation: gte)
 - Max Score: 89.815254

Hits:
 - Index: news_data_custom
   ID: 23dbe1f8-2d3e-4909-99b4-bd7c8eaef8da
   Score: 89.815254
   Source: {'id': '23dbe1f8-2d3e-4909-99b4-bd7c8eaef8da', 'source': 'Latest Nigerian News.com', 'published': '2015-09-03T04:02:56Z', 'title': 'GREEK AMERICANS CONTINUE FIGHT FOR GREECE AFTER DEBT DEAL', 'media-type': 'News', 'content': "greece may be out of the headlines, but it is not off the greek-american communitys agenda.that was the message of a briefing call conducted by former california state treasurer and renowned finance expertphil angelides on tuesday for members of the hellenic american leadership council, a chicago-based greek-american political action group.'angelides offered analysis of the latest events and discussed ideas for how greek americans can mobilize to help greece.angelides, who chaired th

In [50]:
# 7th Search Query: Pope Visit in the USA
pope_visit_result = search_index(query_seven, "news_data_custom")
print_response(pope_visit_result)

Search Metadata:
 - Took: 100 ms
 - Timed Out: False
 - Shards: Total - 1, Successful - 1, Skipped - 0, Failed - 0
 - Hits: Total - 10000 (Relation: gte)
 - Max Score: 153.76437

Hits:
 - Index: news_data_custom
   ID: 46e1c12c-0c30-46ba-8630-772a0b50021a
   Score: 153.76437
   Source: {'id': '46e1c12c-0c30-46ba-8630-772a0b50021a', 'source': 'Daily Vidette', 'published': '2015-09-28T17:17:20Z', 'title': 'POPE’S VISIT SYMBOLIZES CHANGING CHURCH', 'media-type': 'News', 'content': 'nathan penn / columnist  pope francis’ visit to the united states last week symbolizes not only a changing roman catholic church, but a changing religious culture in the united states.  in a nation where religion has always been a significant cultural value, the pope’s visit comes during a time when a growing number of americans claim no religion at all. despite his popularity, pope francis will have to work hard to make the catholic church appear welcoming, and in step with modern values.  the u.s. still remai

In [51]:
# 8th Search Query: FIFA Corruption Scandal
fifa_corruption_result = search_index(query_eight, "news_data_custom")
print_response(fifa_corruption_result)

Search Metadata:
 - Took: 163 ms
 - Timed Out: False
 - Shards: Total - 1, Successful - 1, Skipped - 0, Failed - 0
 - Hits: Total - 10000 (Relation: gte)
 - Max Score: 239.29791

Hits:
 - Index: news_data_custom
   ID: 51c4afc4-973d-4e94-99bb-9cb0af5d29f5
   Score: 239.29791
   Source: {'id': '51c4afc4-973d-4e94-99bb-9cb0af5d29f5', 'source': 'MyInforms', 'published': '2015-09-15T11:06:33Z', 'title': 'TROUBLED SOUTH AMERICAN CONFEDERATION APPROVES REFORM PLAN', 'media-type': 'News', 'content': 'south america\'s troubled football confederation conmebol has approved a reform plan after it was caught up in the corruption scandal that has swept the sport and plunged governing body fifa into… \r asuncion: south america\'s troubled football confederation conmebol has approved a reform plan after it was caught up in the corruption scandal that has swept the sport and plunged governing body fifa into crisis. conmebol said its executive committee had approved a "comprehensive review of the manag

In [52]:
# 9th Search Query: Iran Nuclear Deal
iran_nuclear_deal_result = search_index(query_nine, "news_data_custom")
print_response(iran_nuclear_deal_result)

Search Metadata:
 - Took: 116 ms
 - Timed Out: False
 - Shards: Total - 1, Successful - 1, Skipped - 0, Failed - 0
 - Hits: Total - 10000 (Relation: gte)
 - Max Score: 247.84875

Hits:
 - Index: news_data_custom
   ID: ac0840c2-74d3-4226-8222-2caeb9c9efc9
   Score: 247.84875
   Source: {'id': 'ac0840c2-74d3-4226-8222-2caeb9c9efc9', 'source': 'Keep It Fresh', 'published': '2015-09-05T15:40:16Z', 'title': 'JAPAN, IRAN TO START INVESTMENT TALKS NEXT WEEK – FINANCIAL EXPRESS', 'media-type': 'Blog', 'content': 'the news international japan, iran to start investment talks next week financial express japan and iran will start talks next week to negotiate a bilateral investment treaty, as washington moves to ease sanctions against tehran and tokyo looks to step up its interests in the resource rich nation. by: afp | tokyo | september 5, 2015 9:10 pm\xa0... three lessons from the iran deal newsday ernest istook: blame congress, not just obama, for awful iran nuclear deal washington times iran m

In [53]:
# 10th Search Query: Apple introduces the iPhone 6s and iPhone 6s Plus
apple_iphone_result = search_index(query_ten, "news_data_custom")
print_response(apple_iphone_result)

Search Metadata:
 - Took: 85 ms
 - Timed Out: False
 - Shards: Total - 1, Successful - 1, Skipped - 0, Failed - 0
 - Hits: Total - 10000 (Relation: gte)
 - Max Score: 241.92131

Hits:
 - Index: news_data_custom
   ID: 47ae6962-dca9-41e8-8439-2979f0e5846d
   Score: 241.92131
   Source: {'id': '47ae6962-dca9-41e8-8439-2979f0e5846d', 'source': 'EHabari.com', 'published': '2015-09-10T14:10:00Z', 'title': 'BREAKING NEWS : SERENA WILLIAMS STANDING ON THE BRINK OF HISTORY – NEWS TODAY', 'media-type': 'News', 'content': 'late show with stephen colbert, colbert, cbs, colbert late show, the late show with stephen colbert, george clooney, stephen colbert late show, colbert late show date, the late show, the late show with stephen colbert episode 1, late show iphone 6s  sep 8, 2015 apple, apple event, ipad pro, apple pencil, iphone 7, new iphone, iphone, apple tv, ios 9, apple iphone 6s, iphone 6s plus, aapl, apple announcement, apple event september 2015, iphone 6, iphone 6s release, ios 9 releas