In [1]:
# Import every dependency of the notebook in one place.
# A failure here is only printed, so a missing package will surface later as a
# NameError in whichever cell first uses the unbound name.
try:
    import elasticsearch
    from elasticsearch import Elasticsearch
    import pandas as pd
    import json
    from ast import literal_eval
    from tqdm import tqdm
    import datetime
    import os
    import sys
    import numpy as np
    from elasticsearch import helpers
    print('All libraries loaded successfully')
    
except Exception as e:
    # NOTE(review): printing instead of re-raising lets the notebook continue
    # after a failed import; consider letting the ImportError propagate.
    print(e)

All libraries loaded successfully


In [2]:
# Load the Netflix titles catalogue from a CSV file located relative to the
# current working directory, then preview the first rows.
df = pd.read_csv('netflix_titles.csv')
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",2020.0,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,07:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",2016.0,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",2011.0,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"November 16, 2017",2009.0,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,"January 1, 2020",2008.0,PG-13,123 min,Dramas,A brilliant group of students become card-coun...


In [3]:
# Compare the row count with the number of distinct show_id values to check
# whether show_id can be used as a unique key for each title.
print(df.shape)
df['show_id'].nunique()

(7787, 12)


7787

* `show_id` is unique: there are 7,787 distinct values across 7,787 rows, so it can be used as the document `_id`.

### Create elasticsearch instance

In [4]:
# Connect to the Elasticsearch node expected at localhost:9200.
# A timeout of 600 is passed to the client (presumably seconds - confirm
# against the documentation of the installed elasticsearch-py version).
web_address = 'http://localhost:9200/'
es = Elasticsearch(timeout = 600, hosts = web_address)

In [5]:
es.ping()

True

### Data cleaning

In [6]:
# check for number of null values in each column
df.isna().sum()

show_id            0
type               0
title              3
director        2389
cast             718
country          508
date_added        10
release_year       2
rating             7
duration           3
listed_in          0
description        0
dtype: int64

In [7]:
# Drop every row that has a missing value in ANY column, then confirm that no
# nulls remain.
# NOTE(review): this also discards rows whose only gaps are in columns that are
# never sent to Elasticsearch (e.g. date_added, rating, duration). If that is
# not intended, restrict the check with dropna(subset=[...]) - confirm intent.
df = df.dropna()
df.isna().sum()

show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64

In [8]:
df.shape

(4804, 12)

### Convert data into a format compatible with the Elasticsearch bulk API

In [9]:
# Convert the cleaned frame into a list of per-row dicts (column name -> value).
# Despite its name, df_dict is a list; it is the input of generator().
df_dict = df.to_dict('records')

In [10]:
df.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

In [28]:
def generator(df):
    """Yield one Elasticsearch bulk action per input record.

    Parameters
    ----------
    df : iterable of dict
        Row records keyed by column name (for example the result of
        ``DataFrame.to_dict('records')``). Despite the parameter name this is
        not a DataFrame; each record is only read through ``.get()``.

    Yields
    ------
    dict
        An action for the ``first_elk`` index whose ``_id`` is the record's
        ``show_id`` (``None`` when absent) and whose ``_source`` holds the
        ``title``, ``director``, ``cast``, ``country`` and ``release_year``
        fields, each defaulting to ``''`` when the key is missing.
    """
    for line in df:
        yield {
            '_index': 'first_elk',
            '_type': '_doc',
            '_id': line.get('show_id', None),
            '_source': {
                'title': line.get('title', ''),
                'director': line.get('director', ''),
                'cast': line.get('cast', ''),
                'country': line.get('country', ''),
                'release_year': line.get('release_year', ''),
            },
        }
    # Falling off the end finishes the generator. An explicit
    # `raise StopIteration` here would be converted into RuntimeError on
    # Python 3.7+ (PEP 479) and break any consumer that exhausts the generator.

In [22]:
my_custom = generator(df_dict)

In [23]:
my_custom

<generator object generator at 0x0000027FB35DD938>

In [24]:
next(my_custom)

{'_index': 'first_elk',
 '_type': '_doc',
 '_id': 's2',
 '_source': {'title': '07:19',
  'director': 'Jorge Michel Grau',
  'cast': 'Demián Bichir, Héctor Bonilla, Oscar Serrano, Azalia Ortiz, Octavio Michel, Carmen Beato',
  'country': 'Mexico',
  'release_year': 2016.0}}

### Index settings and mappings

In [29]:
# Index definition sent when creating the index: one shard, zero replicas, and
# an explicit 'text' mapping for the director and cast fields.
settings = {
    'settings': {'number_of_shards': 1, 'number_of_replicas': 0},
    'mappings': {
        'properties': {
            field: {'type': 'text'} for field in ('director', 'cast')
        },
    },
}

In [30]:
# Create the 'first_elk' index using the `settings` body.
# NOTE(review): ignore=[404, 400] asks the client not to raise for those HTTP
# statuses (e.g. a 400 when the index already exists), so a rejected request is
# not an exception here - inspect `my` before assuming the mappings were applied.
my = es.indices.create(index = 'first_elk', 
                       ignore = [404, 400], 
                       body = settings)

In [31]:
my

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'first_elk'}

### Upload data to Elasticsearch

In [32]:
# Stream every record to Elasticsearch through the bulk helper.
# `res` keeps whatever helpers.bulk returns so it can be inspected afterwards.
try:
    res = helpers.bulk(client = es,
                       actions = generator(df_dict))
    print('completed')
except Exception as e:
    # Do not swallow the failure: a silent `pass` makes a failed or partial
    # upload indistinguishable from a cell that printed nothing. Report the
    # error and re-raise so the notebook stops with a full traceback.
    print('Bulk upload failed:', repr(e))
    raise



completed
