In [2]:
!pip install elasticsearch



In [3]:
!pip install python-dotenv



In [11]:
try:
    import elasticsearch
    from elasticsearch import Elasticsearch
    
    import pandas as pd
    import json
    import datetime
    import os
    import sys
    import numpy as np
    from dotenv import load_dotenv
    from elasticsearch import helpers
    from pathlib import Path
    print("Loaded!")
except Exception as e:
    print("Some Modules are Missing {}".format(e))

Loaded!


#### Reading the dataset

In [13]:
FILE_PATH = Path("../Data", "books.csv")
df = pd.read_csv(FILE_PATH, error_bad_lines=False)
df.head()

b'Skipping line 3350: expected 12 fields, saw 13\nSkipping line 4704: expected 12 fields, saw 13\nSkipping line 5879: expected 12 fields, saw 13\nSkipping line 8981: expected 12 fields, saw 13\n'


Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,0439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,0439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,0439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/2004,Scholastic Inc.
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,0439682584,9780439682589,eng,2690,41428,164,9/13/2004,Scholastic


In [14]:
df.shape

(11123, 12)

In [20]:
df.columns

Index(['bookID', 'title', 'authors', 'average_rating', 'isbn', 'isbn13',
       'language_code', '  num_pages', 'ratings_count', 'text_reviews_count',
       'publication_date', 'publisher'],
      dtype='object')

#### Connecting into Elasticsearch

In [15]:
load_dotenv()

True

In [16]:
ENDPOINT = os.getenv('ENDPOINT')  #credential host is in .env file
es = Elasticsearch(timeout=600, hosts=ENDPOINT)

In [17]:
es.ping() #if true, we are connected!

True

#### Converting the data into the format that Elasticsearch can understand 

In [18]:
dic = df.to_dict("records")
dic[0]  #look to the first element of the list of dictionaries

{'bookID': 1,
 'title': 'Harry Potter and the Half-Blood Prince (Harry Potter  #6)',
 'authors': 'J.K. Rowling/Mary GrandPré',
 'average_rating': 4.57,
 'isbn': '0439785960',
 'isbn13': 9780439785969,
 'language_code': 'eng',
 '  num_pages': 652,
 'ratings_count': 2095690,
 'text_reviews_count': 27591,
 'publication_date': '9/16/2006',
 'publisher': 'Scholastic Inc.'}

In [19]:
len(dic)

11123

#### Converting the data into ELK format 

In [21]:
def generator(dictionary):
    for i, line in enumerate(dictionary):
        yield {
            '_index': 'books',
            '_type': '_doc',
            '_id': line.get("bookID", ""),
            '_source': {
                'title': line.get("title", ""),
                'authors': line.get("authors", ""),
                'average_rating': line.get("average_rating", ""),
                'isbn': line.get("isbn", ""),
                'language_code': line.get("language_code", ""),
                'num_pages': line.get("  num_pages", ""),
                'ratings_count': line.get("ratings_count", ""),
                'text_reviews_count': line.get("text_reviews_count", ""),
                'publication_date': line.get("publication_date", ""),
                'publisher': line.get("publisher", "")
            }
        }

In [22]:
convert = generator(dic)

In [23]:
next(convert)

{'_index': 'books',
 '_type': '_doc',
 '_id': 1,
 '_source': {'title': 'Harry Potter and the Half-Blood Prince (Harry Potter  #6)',
  'authors': 'J.K. Rowling/Mary GrandPré',
  'average_rating': 4.57,
  'isbn': '0439785960',
  'language_code': 'eng',
  'num_pages': 652,
  'ratings_count': 2095690,
  'text_reviews_count': 27591,
  'publication_date': '9/16/2006',
  'publisher': 'Scholastic Inc.'}}

#### Mappings

In [28]:
Map = {
    "mappings" : {
      "properties" : {
        "average_rating" : {
          "type" : "float"
        },
        "num_pages" : {
          "type" : "long"
        },
        "ratings_count" : {
          "type" : "long"
        },
        "text_reviews_count" : {
          "type" : "long"
        },
        "title" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword"
            }
          }
        },
        "authors" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "isbn" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "language_code" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "publication_date" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "publisher" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        }
      }
    }
  }

In [29]:
ma = es.indices.create(index='books', ignore=[400,404], body=Map)

In [30]:
ma

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'books'}

#### Uploading the data into Elasticsearch 

In [31]:
try:
    res = helpers.bulk(es, generator(dic))
    print("Uploading")
except Exception as e:
    print(e)
    pass

Uploading
