# Basic Setup.

In [11]:
#-Importing the modules-#
import warnings
from time import sleep
from typing import Any, Dict, Generator
from elasticsearch import helpers, Elasticsearch, ElasticsearchWarning

#-Silencing every warning of the ElasticsearchWarning category for this session-#
warnings.filterwarnings(action = "ignore", category = ElasticsearchWarning)


In [12]:
#-Connection settings for the Elasticsearch instance running on localhost-#
host = "http://localhost:9200/"
REQUEST_TIMEOUT = 600  # presumably seconds - confirm against the client documentation

#-Creating the client object that the rest of the notebook uses as `es`-#
es = Elasticsearch(hosts = host, request_timeout = REQUEST_TIMEOUT)


In [13]:
#-Pinging the instance and displaying the result of the call-#
is_reachable = es.ping()
is_reachable


True

In [14]:
#-Deleting the tbl_reporters index; ignore_unavailable keeps a missing index from raising-#
response = es.indices.delete(index = "tbl_reporters", ignore_unavailable = True)
print(response)

#-Confirming that it is deleted with an existence check on this one index,-#
#-instead of downloading the metadata of every index just to test one name-#
index_exists = bool(es.indices.exists(index = "tbl_reporters"))
print(index_exists)


{'acknowledged': True}
False


In [15]:
#-This approach is for large dataset files with limited preprocessing-#
import csv

def doc_generator(
    filename: str,
    index: str = "tbl_reporters",
    id_field: str = "reporter_id",
) -> Generator[Dict[str, Any], None, None]:
    """Lazily turn the rows of a CSV file into bulk-indexing actions.

    The file is read one row at a time, so the whole dataset is never held
    in memory. Every value stays a string exactly as the CSV reader returns it.

    Args:
        filename: Path of a UTF-8 encoded CSV file whose first line is the header.
        index: Value placed in the "_index" key of every action.
        id_field: Header name of the column used as "_id"; that column is
            removed from the row, so it does not appear in "_source".

    Yields:
        One dict per CSV row with the keys "_index", "_id" and "_source".

    Raises:
        KeyError: If the CSV header has no column named ``id_field``.
        OSError: If the file cannot be opened.
    """

    #-Opening the file; newline = "" hands line endings to the csv module untouched,-#
    #-so line breaks inside quoted fields are not rewritten on the way in-#
    with open(filename, "r", encoding = "utf-8", newline = "") as csv_file:

        #-Creating a CSV reader object that maps each row to its header names-#
        reader = csv.DictReader(csv_file)

        #-Iterating the records-#
        for row in reader:

            #-Taking the id out of the row first, so _source holds only the other columns-#
            doc_id = row.pop(id_field)

            #-Yielding the processed document-#
            yield {
                "_index": index,
                "_id": doc_id,
                "_source": row
            }

#-Building the lazy stream of actions from the CSV file-#
actions = doc_generator("reporters.csv")

#-Sending the actions through the bulk helper and printing what it returns-#
response = helpers.bulk(es, actions)
print(response)


(1000, [])


In [16]:
#-Refreshing the index so the bulk-loaded documents are visible to search;-#
#-an explicit refresh replaces guessing a wait time with sleep-#
es.indices.refresh(index = "tbl_reporters")

#-Checking the ingested data using search (one document is enough)-#
dict(es.search(index = "tbl_reporters", size = 1))


{'took': 1,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 1000, 'relation': 'eq'},
  'max_score': 1.0,
  'hits': [{'_index': 'tbl_reporters',
    '_id': '85382',
    '_score': 1.0,
    '_source': {'name': 'Michael Griffiths',
     'outlet_id': '16039',
     'city': '',
     'state': '',
     'country_code': 'GB',
     'associations': '111071, 18745, 57818',
     'topics': 'Crime, Law, News, Crime And Justice, European Union',
     'twitter_description': 'News editor and covering sanctions for Global Investigations Review.  in . Also runs @FARAupdates. Still some UK corporate crime stuff. DMs open.',
     'pitch': '',
     'last_updated': '2024-08-22 07:16:54.000',
     'active': 'false'}}]}}

# Filtering queries.

In [17]:
#-Restricting _source to the listed attributes and limiting the result to one document-#
wanted_fields = ["name", "associations"]
dict(es.search(index = "tbl_reporters", size = 1, source = wanted_fields))


{'took': 2,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 1000, 'relation': 'eq'},
  'max_score': 1.0,
  'hits': [{'_index': 'tbl_reporters',
    '_id': '85382',
    '_score': 1.0,
    '_source': {'name': 'Michael Griffiths',
     'associations': '111071, 18745, 57818'}}]}}

In [18]:
#-source_includes with the same field list returns the same document as the source parameter-#
included_fields = ["name", "associations"]
dict(es.search(index = "tbl_reporters", size = 1, source_includes = included_fields))


{'took': 1,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 1000, 'relation': 'eq'},
  'max_score': 1.0,
  'hits': [{'_index': 'tbl_reporters',
    '_id': '85382',
    '_score': 1.0,
    '_source': {'name': 'Michael Griffiths',
     'associations': '111071, 18745, 57818'}}]}}

In [19]:
#-Query object: match_phrase on the twitter_description field-#
phrase = "global investigations"
query = {"match_phrase": {"twitter_description": phrase}}

#-Running the match_phrase search and displaying the response as a plain dict-#
phrase_result = es.search(query = query, index = "tbl_reporters")
dict(phrase_result)


{'took': 1,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 1, 'relation': 'eq'},
  'max_score': 7.8862267,
  'hits': [{'_index': 'tbl_reporters',
    '_id': '85382',
    '_score': 7.8862267,
    '_source': {'name': 'Michael Griffiths',
     'outlet_id': '16039',
     'city': '',
     'state': '',
     'country_code': 'GB',
     'associations': '111071, 18745, 57818',
     'topics': 'Crime, Law, News, Crime And Justice, European Union',
     'twitter_description': 'News editor and covering sanctions for Global Investigations Review.  in . Also runs @FARAupdates. Still some UK corporate crime stuff. DMs open.',
     'pitch': '',
     'last_updated': '2024-08-22 07:16:54.000',
     'active': 'false'}}]}}

In [23]:
#-Query object: match on twitter_description, with the two words given in reverse order-#
search_terms = "investigations global"
query = {"match": {"twitter_description": search_terms}}

#-Running the match search and displaying the response as a plain dict-#
match_result = es.search(query = query, index = "tbl_reporters")
dict(match_result)


{'took': 2,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 6, 'relation': 'eq'},
  'max_score': 7.8862267,
  'hits': [{'_index': 'tbl_reporters',
    '_id': '85382',
    '_score': 7.8862267,
    '_source': {'name': 'Michael Griffiths',
     'outlet_id': '16039',
     'city': '',
     'state': '',
     'country_code': 'GB',
     'associations': '111071, 18745, 57818',
     'topics': 'Crime, Law, News, Crime And Justice, European Union',
     'twitter_description': 'News editor and covering sanctions for Global Investigations Review.  in . Also runs @FARAupdates. Still some UK corporate crime stuff. DMs open.',
     'pitch': '',
     'last_updated': '2024-08-22 07:16:54.000',
     'active': 'false'}},
   {'_index': 'tbl_reporters',
    '_id': '352134',
    '_score': 5.4511576,
    '_source': {'name': 'Elizabeth Hlavinka',
     'outlet_id': '461482',
     'city': '',
     'state': '',
     'country_code': 'US',
     

# Aggregation Queries.

In [26]:
#-Aggregation object: up to 10 terms buckets on associations.keyword, highest count first-#
associations_terms = {
    "field": "associations.keyword",
    "size": 10,
    "order": {"_count": "desc"},
}
aggs = {"associations": {"terms": associations_terms}}

#-Running the single aggregation; size = 0 leaves the hits list empty so only buckets come back-#
single_agg_result = es.search(index = "tbl_reporters", size = 0, aggregations = aggs)
dict(single_agg_result)



{'took': 12,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 1000, 'relation': 'eq'},
  'max_score': None,
  'hits': []},
 'aggregations': {'associations': {'doc_count_error_upper_bound': 0,
   'sum_other_doc_count': 396,
   'buckets': [{'key': '', 'doc_count': 595},
    {'key': '#ACSM Certified Clinical Exercise Physiologist,RRCA Run Coach, Runner, yoga, twin mama. #FitFluential #SweatPink Saucony #FindYourStrongTeam #StonyfieldBlogger',
     'doc_count': 1},
    {'key': '#FoodStudies & #PublicCommunications student @SyracuseU all opinions are my own',
     'doc_count': 1},
    {'key': '#lovespokane', 'doc_count': 1},
    {'key': '#rva word nerd. writes about music, beer, things south. @nekocase for president.',
     'doc_count': 1},
    {'key': '(Bee-taa Ba-gu-lee/zaa-deh) Asst Prof of History & Africana @BucknellU. Writing a book on race+slavery in Iran, making a @diasporaletters film. Editor @ajammc.',
     'do

In [34]:
#-Aggregation object: up to 10 terms buckets on twitter_description.keyword, highest count first-#
description_terms = {
    "field": "twitter_description.keyword",
    "size": 10,
    "order": {"_count": "desc"},
}
aggs = {"tweet_desc": {"terms": description_terms}}

#-Running the single aggregation on the description text; each bucket key is a whole description, commas included-#
tweet_desc_result = es.search(index = "tbl_reporters", size = 0, aggregations = aggs)
dict(tweet_desc_result)



{'took': 1,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 1000, 'relation': 'eq'},
  'max_score': None,
  'hits': []},
 'aggregations': {'tweet_desc': {'doc_count_error_upper_bound': 0,
   'sum_other_doc_count': 396,
   'buckets': [{'key': '', 'doc_count': 595},
    {'key': '#ACSM Certified Clinical Exercise Physiologist,RRCA Run Coach, Runner, yoga, twin mama. #FitFluential #SweatPink Saucony #FindYourStrongTeam #StonyfieldBlogger',
     'doc_count': 1},
    {'key': '#FoodStudies & #PublicCommunications student @SyracuseU all opinions are my own',
     'doc_count': 1},
    {'key': '#lovespokane', 'doc_count': 1},
    {'key': '#rva word nerd. writes about music, beer, things south. @nekocase for president.',
     'doc_count': 1},
    {'key': '(Bee-taa Ba-gu-lee/zaa-deh) Asst Prof of History & Africana @BucknellU. Writing a book on race+slavery in Iran, making a @diasporaletters film. Editor @ajammc.',
     'doc_c

In [31]:
#-Aggregation object: one terms bucket list per (name, keyword field) pair,-#
#-each limited to 10 buckets and ordered by descending count-#
bucket_fields = (
    ("associations", "associations.keyword"),
    ("countries", "country_code.keyword"),
)
aggs = {
    bucket_name: {
        "terms": {
            "field": keyword_field,
            "size": 10,
            "order": {"_count": "desc"},
        }
    }
    for bucket_name, keyword_field in bucket_fields
}


#-Running both aggregations in one request; size = 0 leaves the hits list empty-#
multi_agg_result = es.search(index = "tbl_reporters", size = 0, aggregations = aggs)
dict(multi_agg_result)


{'took': 2,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 1000, 'relation': 'eq'},
  'max_score': None,
  'hits': []},
 'aggregations': {'associations': {'doc_count_error_upper_bound': 0,
   'sum_other_doc_count': 955,
   'buckets': [{'key': '383132', 'doc_count': 6},
    {'key': '37310', 'doc_count': 5},
    {'key': '111071', 'doc_count': 4},
    {'key': '14838', 'doc_count': 4},
    {'key': '347437', 'doc_count': 4},
    {'key': '479366', 'doc_count': 4},
    {'key': '32742, 37310', 'doc_count': 3},
    {'key': '342960', 'doc_count': 3},
    {'key': '37310, 32742', 'doc_count': 3},
    {'key': '483509', 'doc_count': 3}]},
  'countries': {'doc_count_error_upper_bound': 0,
   'sum_other_doc_count': 52,
   'buckets': [{'key': '', 'doc_count': 346},
    {'key': 'US', 'doc_count': 257},
    {'key': 'GB', 'doc_count': 243},
    {'key': 'CA', 'doc_count': 52},
    {'key': 'AU', 'doc_count': 15},
    {'key': 'IN', 'doc

# Filtering out Empty Strings.

In [35]:
#-Query object: the wildcard pattern ?* needs at least one character,-#
#-so documents with an empty country_code are left out-#
non_empty_pattern = {"value": "?*"}
query = {"wildcard": {"country_code": non_empty_pattern}}

#-Aggregation object: one terms bucket list per (name, keyword field) pair,-#
#-each limited to 10 buckets and ordered by descending count-#
bucket_fields = (
    ("associations", "associations.keyword"),
    ("countries", "country_code.keyword"),
)
aggs = {
    bucket_name: {
        "terms": {
            "field": keyword_field,
            "size": 10,
            "order": {"_count": "desc"},
        }
    }
    for bucket_name, keyword_field in bucket_fields
}

#-Running both aggregations over only the documents matched by the wildcard query-#
filtered_agg_result = es.search(index = "tbl_reporters", size = 0, query = query, aggregations = aggs)
dict(filtered_agg_result)

{'took': 12,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 654, 'relation': 'eq'},
  'max_score': None,
  'hits': []},
 'aggregations': {'associations': {'doc_count_error_upper_bound': 0,
   'sum_other_doc_count': 618,
   'buckets': [{'key': '383132', 'doc_count': 6},
    {'key': '111071', 'doc_count': 3},
    {'key': '14838', 'doc_count': 3},
    {'key': '342960', 'doc_count': 3},
    {'key': '347437', 'doc_count': 3},
    {'key': '37310', 'doc_count': 3},
    {'key': '479366', 'doc_count': 3},
    {'key': '483509', 'doc_count': 3},
    {'key': '483781', 'doc_count': 3},
    {'key': '5744', 'doc_count': 3}]},
  'countries': {'doc_count_error_upper_bound': 0,
   'sum_other_doc_count': 47,
   'buckets': [{'key': 'US', 'doc_count': 257},
    {'key': 'GB', 'doc_count': 243},
    {'key': 'CA', 'doc_count': 52},
    {'key': 'AU', 'doc_count': 15},
    {'key': 'IN', 'doc_count': 13},
    {'key': 'CN', 'doc_count': 6},
