### MovieLens Dataset

In [None]:
!rm ml-100k.zip
!rm -rf ml-100k
!wget -O ml-100k.zip http://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip ml-100k.zip

In [None]:
import pandas as pd 
# Paths of the three MovieLens files used in this notebook.
user_path = './ml-100k/u.user'
item_path = './ml-100k/u.item'
user_item = './ml-100k/u.data'

# u.user is pipe-separated and has no header row, so column names are given here.
user_df = pd.read_csv(
    user_path,
    sep='|',
    names=['uid', 'age', 'gender', 'occupation', 'zipcode'],
)
user_df.head()

In [None]:
# Names of the one-column-per-genre flags, in the order they are read from u.item.
genres = [
    'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]


# u.item is pipe-separated, header-less and read as ISO-8859-1.
item_df = pd.read_csv(
    item_path,
    sep='|',
    encoding="ISO-8859-1",
    names=['iid', 'title', 'release_date', 'video_release_date', 'imdb url'] + genres,
)



In [None]:
# u.data is tab-separated with the columns: user id, item id, rating, timestamp.
# The user id comes FIRST; naming the columns in the opposite order would make
# both merges below join ratings to the wrong movies and the wrong users.
user_item_df = pd.read_csv(user_item, names=['uid', 'iid', 'rating', 'timestamp'], sep='\t')
# Attach the movie attributes (title, genre flags, ...) to every rating.
user_item_df = user_item_df.merge(item_df, on=['iid'])
# Attach the demographics (age, gender, occupation, ...) of the rating user.
user_item_df = user_item_df.merge(user_df, on=['uid'])
user_item_df.head()

### Observe - Gender vs. Genre

In [None]:
import numpy 
import matplotlib.pyplot as plt
import seaborn as sns

def plot_heat_map(df, figsize=(10,7)):
    """Plot ``df`` as a heat map after scaling every row to sum to 1.

    Args:
        df: DataFrame of counts; each row is divided by its own total, so the
            colours show shares within a row rather than absolute counts.
        figsize: Size passed to ``plt.subplots`` for the new figure.
    """
    row_totals = df.sum(axis=1)
    row_shares = df.div(row_totals, axis=0)
    # Open a fresh figure of the requested size; no ``ax`` is passed to
    # seaborn, so it chooses the axes to draw on by itself.
    plt.subplots(figsize=figsize)
    sns.heatmap(row_shares)

# Genre counts per gender, restricted to ratings above 3.
gender = (
    user_item_df.loc[user_item_df['rating'] > 3, ['gender'] + genres]
    .groupby(['gender'])
    .sum()
)
plot_heat_map(gender, figsize=(10, 2))
    

### Observe - Occupation vs. Genre

In [None]:
# Genre counts per occupation, restricted to ratings above 3.
occupation = (
    user_item_df.loc[user_item_df['rating'] > 3, ['occupation'] + genres]
    .groupby(['occupation'])
    .sum()
)
plot_heat_map(occupation, figsize=(10, 7))

### Observe - Age vs. Genre

In [None]:
# Bucket users by decade of age (integer division by 10: 0-9 -> 0, 10-19 -> 1, ...).
user_item_df['age_segment'] = user_item_df['age'].floordiv(10)
# Genre counts per age bucket, restricted to ratings above 3.
age = (
    user_item_df.loc[user_item_df['rating'] > 3, ['age_segment'] + genres]
    .groupby(['age_segment'])
    .sum()
)
plot_heat_map(age, figsize=(10, 5))

In [None]:
import pandas as pd 
# Genre flag columns, in the order they are read from u.item.
genres = [
    'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]


import re
# Read the movie table again from disk (pipe-separated, header-less, ISO-8859-1).
item_df = pd.read_csv(
    item_path,
    sep='|',
    encoding="ISO-8859-1",
    names=['iid', 'title', 'release_date', 'video_release_date', 'imdb url'] + genres,
)
# Greedy ".*" makes the search settle on the LAST "(digits)" group in a title,
# so "Title (alt. name) (1995)" yields 1995. Compiled once, not on every call.
_MOVIE_YEAR_PATTERN = re.compile(r'.*\((\d+)\)')


def get_year(title):
    """Return the year written in parentheses in a movie title.

    Args:
        title: Movie title such as ``"Toy Story (1995)"``.

    Returns:
        The digits of the last ``(digits)`` group as an ``int``, or ``-1``
        when the title has no such group or is not a string (e.g. NaN/None
        for a missing value).
    """
    if not isinstance(title, str):
        return -1
    match = _MOVIE_YEAR_PATTERN.search(title)
    if match is None:
        return -1
    return int(match.group(1))



# Derive the release year of each movie from its title (-1 when none is found).
item_df['year'] = item_df['title'].apply(get_year)

item_df.head()




### Prepare to insert into Elasticsearch

In [None]:
!pip install requests 
!pip install Elasticsearch 
!pip install urllib3

In [None]:
# Connection settings read by connectES and by the search request further down.
# The values are placeholders: replace them with the real user name, password
# and endpoint host before running the cells that follow.
master_user = 'master_user'
master_user_password = 'master_user_password'
elastic_search_endpoint = 'elastic_search_endpoint'

In [None]:
import boto3
import json
from elasticsearch import Elasticsearch, RequestsHttpConnection
def connectES(esEndPoint):
    """Build an Elasticsearch client for ``esEndPoint``.

    The client targets port 443 with SSL and certificate verification enabled
    and authenticates with the module-level ``master_user`` /
    ``master_user_password`` pair.

    Args:
        esEndPoint: Host name of the Elasticsearch endpoint.

    Returns:
        The constructed ``Elasticsearch`` client. If construction raises, the
        error is printed and ``exit(3)`` is called instead.
    """
    print ('Connecting to the ES Endpoint {0}'.format(esEndPoint))
    try:
        client = Elasticsearch(
            hosts=[{'host': esEndPoint, 'port': 443}],
            http_auth=(master_user, master_user_password),
            use_ssl=True,
            verify_certs=True,
            connection_class=RequestsHttpConnection,
        )
    except Exception as err:
        print("Unable to connect to {0}".format(esEndPoint))
        print(err)
        exit(3)
    else:
        return client

In [None]:
# Shared client used by the indexing cells below.
esClient = connectES(elastic_search_endpoint)

In [None]:
# Body that createIndex passes to ``indices.create`` for the "movies" index.
# NOTE(review): the field definitions sit under "dataRecord" rather than a
# "mappings" section, "title" uses "index": "analyzed", and the "unknown"
# genre flag is typed as text - confirm the target cluster accepts this body.
indexDoc = {
    "dataRecord": {
        "iid": {"type": "integer"},
        "title": {"type": "text", "index": "analyzed"},
        # Fields stored as text with indexing switched off.
        **{
            field: {"type": "text", "index": False}
            for field in ("release_date", "video_release_date", "imdb url", "unknown")
        },
        # Genre flags and the derived release year are plain integers.
        **{
            field: {"type": "integer"}
            for field in (
                "Action", "Adventure", "Animation", "Childrens", "Comedy", "Crime",
                "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical",
                "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
                "year",
            )
        },
    },
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
    },
}

In [None]:
def createIndex(esClient):
    """Create the "movies" index unless the cluster reports that it exists.

    Args:
        esClient: Client exposing ``indices.exists`` and ``indices.create``.

    Returns:
        ``1`` when the existence check (and the creation, if it was needed)
        completed. On any exception the error is printed and ``exit(4)`` is
        called instead.
    """
    try:
        already_exists = esClient.indices.exists('movies')
        print("Index Exists ... {}".format(already_exists))
        # Identity check on purpose: only a literal False triggers creation.
        if already_exists is False:
            esClient.indices.create('movies', body=indexDoc)
    except Exception as err:
        print("Unable to Create Index {0}".format("movies"))
        print(err)
        exit(4)
    else:
        return 1

In [None]:
def indexDocElement(esClient, response):
    """Index one document into the "movies" index under doc type "movie".

    Args:
        esClient: Client exposing an ``index(index=..., doc_type=..., body=...)``
            method.
        response: The document (a JSON-serialisable dict) to store.

    Returns:
        Whatever ``esClient.index`` returns, so callers can inspect the
        outcome. On any exception the error is printed and ``exit(5)`` is
        called instead.
    """
    # No explicit document id is passed to the client.
    # NOTE(review): re-running an indexing cell therefore presumably stores the
    # same movie again under a new id - confirm whether that is intended.
    try:
        return esClient.index(index='movies', doc_type='movie', body=response)
    except Exception as E:
        print("Doc not indexed")
        print("Error: ",E)
        exit(5)

In [None]:
# One hand-written movie record, indexed on its own to check the
# connection before the whole table is loaded.
content = {
    "iid": 18,
    "title": "White Balloon, The (1995)",
    "release_date": "01-Jan-1995",
    "video_release_date": "",
    "imdb url": "http://us.imdb.com/M/title-exact?Badkonake%20Sefid%20(1995)",
    "unknown": 0,
    "Action": 0,
    "Adventure": 0,
    "Animation": 0,
    "Childrens": 0,
    "Comedy": 0,
    "Crime": 0,
    "Documentary": 0,
    "Drama": 1,
    "Fantasy": 0,
    "Film-Noir": 0,
    "Horror": 0,
    "Musical": 0,
    "Mystery": 0,
    "Romance": 0,
    "Sci-Fi": 0,
    "Thriller": 0,
    "War": 0,
    "Western": 0,
    "year": 1995,
}

indexDocElement(esClient, content)

In [None]:
# Replace missing values with empty strings before the rows are converted to
# dicts and sent to the index.
item_df = item_df.fillna("")
# One dict per movie row, keyed by column name.
item_arr = item_df.to_dict(orient="records")
import json
# Send the movies one at a time; indexDocElement calls exit(5) on a failure.
for movie_record in item_arr:
    indexDocElement(esClient, response=movie_record)

In [None]:
import requests
# Ask the index for up to 100 movies whose title matches "black", using the
# same credentials as the client, and show the decoded JSON response.
r = requests.get(
    'https://{}/movies/movie/_search?q=title:black&size=100'.format(elastic_search_endpoint),
    auth=(master_user, master_user_password),
)
rjson = r.json()
rjson

### Save user_item_df and item_df for later usage

In [None]:
# Write both frames to pickle files in the current working directory.
# item_df is saved in its current state, i.e. with the derived 'year' column
# and with missing values already replaced by empty strings.
user_item_df.to_pickle("user_item_df.p")
item_df.to_pickle("item_df.p")
