## Indexing

In [1]:
import zipfile
import json
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook
from datetime import datetime
from elasticsearch import Elasticsearch
from elasticsearch import helpers
import datetime
from pprint import pprint


import boto3
from elasticsearch import RequestsHttpConnection
from requests_aws4auth import AWS4Auth

### Elasticsearch mapping, bulk file, AWS endpoint

In [2]:
# Identify the mapping, the index and the AWS ES endpoint (the endpoint only matters when running on AWS)

# The mapping name and the index name are the same string.
mapping_name = index_name = 'movies'
# Placeholder value: replace it with the real AWS Elasticsearch endpoint.
es_endpoint = 'put your aws elasticsearch endpoint here'

In [3]:
# Load the bulk CSV file into a DataFrame.
# `now` is only referenced by the commented-out `date` assignment below
# (within the cells visible here).
now = datetime.datetime.now()
index_csv = pd.read_csv('panda_movies_r1.csv')
# Disabled: would stamp every row with today's date in YYYY-MM-DD form.
#index_csv['date'] = now.strftime("%Y-%m-%d")
# Preview the first rows of the loaded frame.
index_csv.head()

Unnamed: 0,title,year,rating,rank,genres,plot,directors,actors,release_date,running_time_secs,image_url
0,(500) Days of Summer,2009,7.8,269,"['Comedy', 'Drama', 'Romance']",An offbeat romantic comedy about a woman who d...,['Marc Webb'],"['Zooey Deschanel', 'Joseph Gordon-Levitt', 'G...",2009-01-17,5700,http://ia.media-imdb.com/images/M/MV5BMTk5MjM4...
1,+1,2013,5.6,401,"['Sci-Fi', 'Thriller']",Three college friends hit the biggest party of...,['Dennis Iliadis'],"['Rhys Wakefield', 'Logan Miller', 'Ashley Hin...",2013-03-10,5700,http://ia.media-imdb.com/images/M/MV5BMTQwOTA5...
2,10,1979,5.9,2862,"['Comedy', 'Romance']",A Hollywood songwriter goes through a mid-life...,['Blake Edwards'],"['Dudley Moore', 'Bo Derek', 'Julie Andrews']",1979-10-05,7320,http://ia.media-imdb.com/images/M/MV5BMTg1NDQ1...
3,10 Items or Less,2006,6.6,4401,"['Comedy', 'Drama', 'Romance']",An actor (Freeman) prepping for an upcoming ro...,['Brad Silberling'],"['Morgan Freeman', 'Paz Vega', 'Jonah Hill']",2006-09-11,4920,http://ia.media-imdb.com/images/M/MV5BMTI1MTU4...
4,10 Rillington Place,1971,7.5,2605,"['Biography', 'Crime', 'Drama', 'Horror', 'Thr...",,['Richard Fleischer'],"['Richard Attenborough', 'Judy Geeson', 'John ...",1971-02-10,6660,http://ia.media-imdb.com/images/M/MV5BMTc4MzM5...


In [4]:
# Parse the index definition stored in "<mapping_name>_mapping.json"
with open(mapping_name + '_mapping.json') as f:
    data = json.load(f)

# Serialise the parsed definition back into a JSON string and display it
mapping = json.dumps(data)
pprint(mapping)

('{"settings": {"index.mapping.ignore_malformed": true}, "mappings": '
 '{"movies": {"properties": {"title": {"type": "text", "fields": {"keyword": '
 '{"type": "keyword", "ignore_above": 256}}}, "release_date": {"type": "date", '
 '"format": "YYYY-MM-dd"}, "genres": {"type": "text", "fields": {"keyword": '
 '{"type": "keyword", "ignore_above": 256}}}, "plot": {"type": "text", '
 '"fields": {"keyword": {"type": "keyword", "ignore_above": 256}}}, '
 '"directors": {"type": "text", "fields": {"keyword": {"type": "keyword", '
 '"ignore_above": 256}}}, "actors": {"type": "text", "fields": {"keyword": '
 '{"type": "keyword", "ignore_above": 256}}}, "rating": {"type": "float"}, '
 '"year": {"type": "integer"}, "rank": {"type": "integer"}, '
 '"running_time_secs": {"type": "integer"}, "image_url": {"type": "text", '
 '"fields": {"keyword": {"type": "keyword", "ignore_above": 256}}}}}}}')


### Create Elasticsearch connection

In [5]:
# Local Elasticsearch
# Use this cell or the AWS cell below, not both.
# NOTE(review): with no arguments the Python client is expected to target the
# HTTP endpoint localhost:9200 (9300 is the node-to-node transport port) —
# confirm against the installed client version.
es = Elasticsearch()

In [9]:
# AWS Elasticsearch
# Don't run this cell if the local version is being used.
# Authorization
# If the AWS ES domain has an open access policy, this signing block is not
# needed and http_auth must be commented out below.
region = 'us-west-2'
service = 'es'
# Credentials are resolved by boto3, e.g. the user keys stored by the aws-cli
# command `aws configure`.
credentials = boto3.Session().get_credentials()
if credentials is None:
    # get_credentials() yields None when nothing is configured; fail with a
    # clear message instead of an AttributeError on `credentials.access_key`.
    raise RuntimeError(
        'No AWS credentials found; run `aws configure` or set the AWS '
        'credential environment variables before running this cell.'
    )
awsauth = AWS4Auth(credentials.access_key, credentials.secret_key, region, service, session_token=credentials.token)

# Create the ES connection with AWS
# AWS ES endpoint link
host = es_endpoint

es = Elasticsearch(
    hosts=[{'host': host, 'port': 443}],
    http_auth=awsauth,  # signs every request with the AWS SigV4 credentials above
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection
)
# Round-trip to the cluster to confirm the connection works.
print(es.info())

{'name': 'MJtp04b', 'cluster_name': '634195893235:movies', 'cluster_uuid': 'ymqDfLcWQkGDXG-otApe-A', 'version': {'number': '6.3.1', 'build_flavor': 'oss', 'build_type': 'zip', 'build_hash': 'eb782d0', 'build_date': '2018-09-11T14:05:25.216906Z', 'build_snapshot': False, 'lucene_version': '7.3.1', 'minimum_wire_compatibility_version': '5.6.0', 'minimum_index_compatibility_version': '5.0.0'}, 'tagline': 'You Know, for Search'}


## Check cluster and index 

In [6]:
# Check cluster health: the response (shown below) includes the cluster status
# and the shard counts.
es.cluster.health()

{'cluster_name': 'elasticsearch',
 'status': 'yellow',
 'timed_out': False,
 'number_of_nodes': 1,
 'number_of_data_nodes': 1,
 'active_primary_shards': 33,
 'active_shards': 33,
 'relocating_shards': 0,
 'initializing_shards': 0,
 'unassigned_shards': 30,
 'delayed_unassigned_shards': 0,
 'number_of_pending_tasks': 0,
 'number_of_in_flight_fetch': 0,
 'task_max_waiting_in_queue_millis': 0,
 'active_shards_percent_as_number': 52.38095238095239}

In [7]:
# Index info: list every index on the cluster before the target index is touched.
# The cat API answers with one plain-text line per index.
es.cat.indices()

'yellow open  28xx             fvC2d0ctQDiW721x98Sj3Q 5 1 12043 0 33.6mb 33.6mb\ngreen  open  .tasks           4d4J387tQUeJVwEBQtEZtQ 1 0     3 0 18.1kb 18.1kb\nyellow open  skyline          xU-aeMljRyyPGeHXT54q3A 5 1  8282 0 24.7mb 24.7mb\nyellow open  tesla_procedures RH4YIj_rSD-gXXX0uCuYxQ 5 1  4849 0  4.8mb  4.8mb\ngreen  open  .kibana_2        hawyYA63Q2KlN6MxJrg33g 1 0    18 2 72.1kb 72.1kb\n       close equality         6jlUmliMRNqIGZA6owgWwQ                          \nyellow open  29xx             8CGKhZ4WSk21gV8hF0wQKQ 5 1 15355 0 44.8mb 44.8mb\nyellow open  movies           g5HxrnaOTSi7xUjxVEkn3g 5 1  5000 0    6mb    6mb\nyellow open  rapid            GtqrufLiQDGgeiFv0GIL0g 5 1 11759 0 30.9mb 30.9mb\ngreen  open  .kibana_1        c1TjhiZdRgabetvFbEQmAg 1 0    13 0 30.6kb 30.6kb\n'

In [8]:
# Drop the target index when it is already present on the cluster
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)

In [9]:
# Index info: list the indices again after the delete-if-exists step.
es.cat.indices()

'yellow open  28xx             fvC2d0ctQDiW721x98Sj3Q 5 1 12043 0 33.6mb 33.6mb\ngreen  open  .tasks           4d4J387tQUeJVwEBQtEZtQ 1 0     3 0 18.1kb 18.1kb\nyellow open  skyline          xU-aeMljRyyPGeHXT54q3A 5 1  8282 0 24.7mb 24.7mb\nyellow open  tesla_procedures RH4YIj_rSD-gXXX0uCuYxQ 5 1  4849 0  4.8mb  4.8mb\ngreen  open  .kibana_2        hawyYA63Q2KlN6MxJrg33g 1 0    18 2 72.1kb 72.1kb\n       close equality         6jlUmliMRNqIGZA6owgWwQ                          \nyellow open  29xx             8CGKhZ4WSk21gV8hF0wQKQ 5 1 15355 0 44.8mb 44.8mb\nyellow open  rapid            GtqrufLiQDGgeiFv0GIL0g 5 1 11759 0 30.9mb 30.9mb\ngreen  open  .kibana_1        c1TjhiZdRgabetvFbEQmAg 1 0    13 0 30.6kb 30.6kb\n'

In [10]:
# Create the index, sending the JSON mapping string as the request body.
# NOTE(review): ignore=400 presumably makes the client return the error body
# instead of raising on an HTTP 400 — confirm, and check this cell's output.
es.indices.create(index=index_name, body=mapping, ignore=400)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'movies'}

In [11]:
# Index info: list the indices again after the create call.
es.cat.indices()

'yellow open  28xx             fvC2d0ctQDiW721x98Sj3Q 5 1 12043 0 33.6mb 33.6mb\ngreen  open  .tasks           4d4J387tQUeJVwEBQtEZtQ 1 0     3 0 18.1kb 18.1kb\nyellow open  skyline          xU-aeMljRyyPGeHXT54q3A 5 1  8282 0 24.7mb 24.7mb\nyellow open  tesla_procedures RH4YIj_rSD-gXXX0uCuYxQ 5 1  4849 0  4.8mb  4.8mb\ngreen  open  .kibana_2        hawyYA63Q2KlN6MxJrg33g 1 0    18 2 72.1kb 72.1kb\nyellow open  movies           DfVHWk4pQm6oxgq6HFEp2g 5 1     0 0   460b   460b\n       close equality         6jlUmliMRNqIGZA6owgWwQ                          \nyellow open  29xx             8CGKhZ4WSk21gV8hF0wQKQ 5 1 15355 0 44.8mb 44.8mb\nyellow open  rapid            GtqrufLiQDGgeiFv0GIL0g 5 1 11759 0 30.9mb 30.9mb\ngreen  open  .kibana_1        c1TjhiZdRgabetvFbEQmAg 1 0    13 0 30.6kb 30.6kb\n'

In [12]:
# Build the bulk index actions and send them through ONE helpers.bulk call.
# The previous version called helpers.bulk once per row with a single-element
# list, i.e. one HTTP request per document, which defeats the bulk API.
def _bulk_actions(frame, index, doc_type):
    """Yield one bulk-index action per row of ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Rows to index; each row becomes one document.
    index : str
        Name of the target index.
    doc_type : str
        Value written to the ``_type`` field of every action.

    Yields
    ------
    dict
        Action with ``_index``, ``_type``, ``_id`` (the row's index label) and
        ``_source`` (the row serialised to a JSON string).
    """
    # total= gives the progress bar a known length; iterrows() is a generator
    # and has no len() of its own.
    for doc_id, row in tqdm_notebook(frame.iterrows(), total=len(frame)):
        yield {
            "_index": index,
            "_type": doc_type,
            "_id": doc_id,
            # Serialise the row already in hand instead of re-fetching it with
            # iloc, which treated the index label as a position.
            "_source": row.to_json(orient="index"),
        }


# helpers.bulk batches the streamed actions into chunked requests and returns
# (number of successfully executed actions, list of errors).
indexed_count, bulk_errors = helpers.bulk(
    es,
    _bulk_actions(index_csv, index_name, mapping_name),
    raise_on_exception=False,
    request_timeout=30,
)
# Show the outcome instead of discarding it.
indexed_count, bulk_errors

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [13]:
# Index info: list the indices after the bulk load; the docs.count column of the
# target index should now reflect the indexed rows.
es.cat.indices()

'yellow open  28xx             fvC2d0ctQDiW721x98Sj3Q 5 1 12043 0 33.6mb 33.6mb\ngreen  open  .tasks           4d4J387tQUeJVwEBQtEZtQ 1 0     3 0 18.1kb 18.1kb\nyellow open  skyline          xU-aeMljRyyPGeHXT54q3A 5 1  8282 0 24.7mb 24.7mb\nyellow open  tesla_procedures RH4YIj_rSD-gXXX0uCuYxQ 5 1  4849 0  4.8mb  4.8mb\ngreen  open  .kibana_2        hawyYA63Q2KlN6MxJrg33g 1 0    18 2 72.2kb 72.2kb\nyellow open  movies           DfVHWk4pQm6oxgq6HFEp2g 5 1  5000 0  6.3mb  6.3mb\n       close equality         6jlUmliMRNqIGZA6owgWwQ                          \nyellow open  29xx             8CGKhZ4WSk21gV8hF0wQKQ 5 1 15355 0 44.8mb 44.8mb\nyellow open  rapid            GtqrufLiQDGgeiFv0GIL0g 5 1 11759 0 30.9mb 30.9mb\ngreen  open  .kibana_1        c1TjhiZdRgabetvFbEQmAg 1 0    13 0 30.6kb 30.6kb\n'