In [None]:
import ast
import functools
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from neo4j import GraphDatabase
# Show up to 500 characters per cell so long text columns are not truncated in output.
pd.options.display.max_colwidth = 500

# Data exploration and processing # 

In [None]:
## load movies metadata
movies_df = pd.read_csv("movies_metadata.csv", low_memory=False)
movies_df.drop_duplicates(subset=['id'], inplace=True)
# SECURITY: these columns are parsed from text stored in the CSV. The original code used
# eval(), which executes arbitrary expressions found in the file; ast.literal_eval only
# accepts Python literals (lists, dicts, strings, numbers, None, booleans).
movies_df['genres'] = movies_df['genres'].apply(ast.literal_eval)
movies_df.dropna(subset=['title'], inplace=True)
## load cast and credits
credits_df = pd.read_csv('credits.csv')
credits_df.drop_duplicates(subset=['id'], inplace=True)
# Cast both id columns to str so the merge below compares like with like.
movies_df['id'] = movies_df['id'].astype(str)
credits_df['id'] = credits_df['id'].astype(str)
credits_df['cast'] = credits_df['cast'].apply(ast.literal_eval)
# load production houses
movies_df['production_companies'] = movies_df['production_companies'].apply(ast.literal_eval)
movies_df.dropna(subset=['release_date'], inplace=True)
# Adding cast and crew to movie dataframe (inner join: movies without credits are dropped)
movies_df = pd.merge(movies_df, credits_df, how='inner', on='id')

### Neo4j Knowledge Graph

In [None]:
## Connect to DB
# Connection settings are read from the environment so credentials do not have to be
# committed with the notebook; the defaults are the values that were hard-coded before.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "admin")
# `d` is the module-level driver that GraphUtils.insert_HTR opens its sessions from.
d = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD), encrypted=False)

In [None]:
def name_property_check(func):
    """Decorator that prints the keyword arguments of each call and then forwards the call.

    Parameters
    ----------
    func : callable
        The function (or method) to wrap.

    Returns
    -------
    callable
        A wrapper that prints ``kwargs`` and returns ``func(*args, **kwargs)``.
    """
    @functools.wraps(func)
    def validate_func(*args, **kwargs):
        print(kwargs)
        # Forward keyword arguments as keywords: `func(*kwargs)` would unpack only the
        # dict's keys and pass them positionally. Positional args (e.g. `self`) pass through.
        return func(*args, **kwargs)
    return validate_func

In [None]:
class GraphUtils:
    """Helpers that write head-relation-tail triples to Neo4j through the module-level driver ``d``."""
    def __init__(self):
        pass

    @staticmethod
    def _escape_cypher_string(value):
        """Return ``str(value)`` escaped for use inside a double-quoted Cypher string literal."""
        # Escape backslashes first; otherwise the backslashes added for quotes get doubled.
        return str(value).replace("\\", "\\\\").replace('"', '\\"')

    ## Utility to convert a dictionary into a Cypher map literal ({'key1':'value1'} -> '{key1:"value1"}')
    def _get_property_in_cypher_format(self, dic):
        """Render ``dic`` as a Cypher map literal whose values are all double-quoted strings.

        Values are stringified and escaped; keys are emitted verbatim, so they must already
        be valid Cypher property names. An empty dict renders as ``{}``.
        """
        pairs = ['{}:"{}"'.format(key, self._escape_cypher_string(value)) for key, value in dic.items()]
        return "{" + ", ".join(pairs) + "}"

    def _build_htr_query(self, head_type, head_id, head_properties, tail_type, tail_id, tail_properties, rel_type, rel_properties=None):
        """Build the Cypher text that merges head node, tail node and the relation between them."""
        clauses = [
            'MERGE(head: {head_type} {{ id:"{head_id}" }})'.format(
                head_type=head_type, head_id=self._escape_cypher_string(head_id)),
            'SET head += {}'.format(self._get_property_in_cypher_format(head_properties)),
            'MERGE(tail: {tail_type} {{ id:"{tail_id}" }})'.format(
                tail_type=tail_type, tail_id=self._escape_cypher_string(tail_id)),
            'SET tail += {}'.format(self._get_property_in_cypher_format(tail_properties)),
            # MERGE keeps the relation unique: re-inserting the same triple creates no duplicate.
            'MERGE (head)-[rel: {rel_type}]->(tail)'.format(rel_type=rel_type),
        ]
        if rel_properties:
            clauses.append('SET rel += {}'.format(self._get_property_in_cypher_format(rel_properties)))
        clauses.append('return head, tail, rel')
        return "\n".join(clauses)

    def insert_HTR(self, head_type,head_id,head_properties,tail_type,tail_id,tail_properties,rel_type,rel_properties=None):
        """Merge a head node, a tail node and a ``rel_type`` relation from head to tail.

        Parameters
        ----------
        head_type, tail_type : str
            Node labels; inserted into the query verbatim (labels cannot be quoted as strings).
        head_id, tail_id
            Values matched against the nodes' ``id`` property (stored as strings).
        head_properties, tail_properties : dict
            Properties set on the respective node; all values are stored as strings.
            All properties except rel_properties are expected to carry an 'id' field.
        rel_type : str
            Relation type; inserted into the query verbatim.
        rel_properties : dict, optional
            Properties set on the relation. Previously a non-None value made this method
            silently do nothing; it is now applied with ``SET rel += ...``.
        """
        query = self._build_htr_query(head_type, head_id, head_properties,
                                      tail_type, tail_id, tail_properties,
                                      rel_type, rel_properties)
        ## initialize this d object with a singleton
        with d.session() as session:
            session.run(query)
                

In [None]:
class HRT_Extractor:
    """Turns movie dataframe columns into head-relation-tail (HTR) triples and inserts them.

    Every ``_movie_*_htr`` generator yields dicts whose keys match the keyword arguments of
    ``GraphUtils.insert_HTR``. Every ``movie_*_population`` method feeds those dicts to
    ``GraphUtils().insert_HTR`` one by one and prints a running counter.
    The column arguments can be any iterables of equal length (the notebook passes pandas Series).
    """
    def __init__(self):
        pass

    @staticmethod
    def _sanitize(value):
        """Return ``str(value)`` with double quotes replaced by single quotes."""
        return str(value).replace('"', "'")

    @classmethod
    def _sanitize_properties(cls, properties):
        """Copy ``properties``, sanitising string values and leaving other values untouched."""
        return {key: (cls._sanitize(val) if isinstance(val, str) else val)
                for key, val in properties.items()}

    @classmethod
    def _person_properties(cls, person_json):
        """Reduce a cast/crew entry to the id, sanitised name and stringified gender."""
        return {'id': person_json['id'],
                'name': cls._sanitize(person_json['name']),
                'gender': str(person_json['gender'])}

    @staticmethod
    def _clear_progress():
        """Clear the cell output when running under IPython; do nothing otherwise."""
        # Imported lazily: the notebook never imported clear_output, which raised NameError
        # on a fresh kernel.
        try:
            from IPython.display import clear_output
        except ImportError:
            return
        clear_output()

    def _populate(self, htr_dicts, print_every, clear_every):
        """Insert every HTR dict, printing the counter every ``print_every`` items and
        clearing the output every ``clear_every`` items; finish with '<count> Completed'."""
        idx = 0
        for htr in htr_dicts:
            GraphUtils().insert_HTR(**htr)

            if idx % print_every == 0:
                print(idx)
            if idx % clear_every == 0:
                self._clear_progress()
            idx += 1
        self._clear_progress()
        print(idx, "Completed")

    def _movie_genre_htr(self, mids, titles, genres):
        """Yield one Movie -has_genre-> Genre dict per (movie, genre) pair; movies without genres are skipped."""
        for m_id, genre_json_list, title in zip(mids, genres, titles):
            for genre_json in genre_json_list:
                yield {
                    'head_type' : 'Movie',
                    'head_id' : str(m_id),
                    'head_properties' : {'id': str(m_id), 'title': self._sanitize(title)},
                    'tail_type' : 'Genre',
                    'tail_id' : str(genre_json['id']),
                    # Sanitised like titles and person names, so a quote in a genre name
                    # cannot terminate the string in the generated query.
                    'tail_properties' : self._sanitize_properties(genre_json),
                    'rel_type' : 'has_genre'
                }

    def movie_genre_population(self, movie_ids_list: list, movie_titles_list: list, genres: dict):
        """Insert all movie-genre triples; prints the counter per item, clears every 100."""
        self._populate(self._movie_genre_htr(movie_ids_list, movie_titles_list, genres),
                       print_every=1, clear_every=100)

    def _movie_actor_htr(self, mids, actors):
        """Yield one Movie -acted_by-> Actor dict per (movie, cast member) pair."""
        for m_id, actor_json_list in zip(mids, actors):
            for actor_json in actor_json_list:
                actor_props = self._person_properties(actor_json)
                yield {
                    'head_type' : 'Movie',
                    'head_id' : str(m_id),
                    'head_properties' : {'id': str(m_id)},
                    'tail_type' : 'Actor',
                    'tail_id' : str(actor_props['id']),
                    'tail_properties' : actor_props,
                    'rel_type' : 'acted_by'
                }

    def movie_actor_population(self, movie_ids_list: list, actors: dict):
        """Insert all movie-actor triples; prints the counter every 100 items, clears every 1000."""
        self._populate(self._movie_actor_htr(movie_ids_list, actors),
                       print_every=100, clear_every=1000)

    def _movie_production_company_htr(self, mids, titles, prod_comps):
        """Yield one Movie -is_produced_by-> ProductionCompany dict per (movie, company) pair."""
        for m_id, prod_comp_json_list, title in zip(mids, prod_comps, titles):
            for prod_comp_json in prod_comp_json_list:
                yield {
                    'head_type' : 'Movie',
                    'head_id' : str(m_id),
                    'head_properties' : {'id': str(m_id), 'title': self._sanitize(title)},
                    'tail_type' : 'ProductionCompany',
                    'tail_id' : str(prod_comp_json['id']),
                    'tail_properties' : self._sanitize_properties(prod_comp_json),
                    'rel_type' : 'is_produced_by'
                }

    def movie_production_company_population(self, movie_ids_list: list, movie_titles_list: list, prod_comps: dict):
        """Insert all movie-company triples; prints the counter per item, clears every 100."""
        self._populate(self._movie_production_company_htr(movie_ids_list, movie_titles_list, prod_comps),
                       print_every=1, clear_every=100)

    def _movie_year_htr(self, mids, titles, year):
        """Yield one Movie -released_in-> Year dict per movie; ``year`` is the iterable of release years."""
        # The loop variable is deliberately not called `year`, to avoid shadowing the parameter.
        for m_id, release_year, title in zip(mids, year, titles):
            year_id = str(release_year)
            yield {
                'head_type' : 'Movie',
                'head_id' : str(m_id),
                'head_properties' : {'id': str(m_id), 'title': self._sanitize(title)},
                'tail_type' : 'Year',
                'tail_id' : year_id,
                'tail_properties' : {'id': year_id, 'year': year_id},
                'rel_type' : 'released_in'
            }

    def movie_year_population(self, movie_ids_list: list, movie_titles_list: list, year):
        """Insert all movie-year triples; prints the counter per item, clears every 100."""
        self._populate(self._movie_year_htr(movie_ids_list, movie_titles_list, year),
                       print_every=1, clear_every=100)

    def _movie_director_htr(self, mids, directors):
        """Yield one Movie -directed_by-> director dict per (movie, director) pair."""
        for m_id, director_json_list in zip(mids, directors):
            for director_json in director_json_list:
                director_props = self._person_properties(director_json)
                yield {
                    'head_type' : 'Movie',
                    'head_id' : str(m_id),
                    'head_properties' : {'id': str(m_id)},
                    # NOTE(review): label is lower-case unlike 'Actor'/'Genre'; kept as-is because
                    # existing graph data and queries may depend on it.
                    'tail_type' : 'director',
                    'tail_id' : str(director_props['id']),
                    'tail_properties' : director_props,
                    'rel_type' : 'directed_by'
                }

    def movie_director_population(self, movie_ids_list: list, directors: dict):
        """Insert all movie-director triples; prints the counter every 100 items, clears every 1000."""
        self._populate(self._movie_director_htr(movie_ids_list, directors),
                       print_every=100, clear_every=1000)

# Adding Movie - Genre nodes #

In [None]:
# Insert one Movie -has_genre-> Genre triple for every genre listed on every movie.
HRT_Extractor().movie_genre_population(
    movie_ids_list=movies_df['id'],
    movie_titles_list=movies_df['title'],
    genres=movies_df['genres'],
)

2398 Completed


# Adding Movie - Actor nodes #

In [None]:
# Insert one Movie -acted_by-> Actor triple for every cast member of every movie.
HRT_Extractor().movie_actor_population(
    movie_ids_list=movies_df['id'],
    actors=movies_df['cast'],
)

17368 Completed


# Adding production companies #

In [None]:
# Insert one Movie -is_produced_by-> ProductionCompany triple per company of every movie.
HRT_Extractor().movie_production_company_population(
    movie_ids_list=movies_df['id'],
    movie_titles_list=movies_df['title'],
    prod_comps=movies_df['production_companies'],
)

1962 Completed


# Adding Year node #

In [None]:
from datetime import datetime
# function to extract the year from a date
def Extract_year(x):
    """Return the year (int) of a date written as 'YYYY-MM-DD'.

    ``x`` is converted with ``str`` first; text that does not match the
    format makes ``datetime.strptime`` raise ``ValueError``.
    """
    parsed = datetime.strptime(str(x), '%Y-%m-%d')
    return parsed.year

In [None]:
# Derive the release year from the release date, then link each movie to its Year node.
movies_df['release_year'] = movies_df['release_date'].apply(Extract_year)
HRT_Extractor().movie_year_population(
    movie_ids_list=movies_df['id'],
    movie_titles_list=movies_df['title'],
    year=movies_df['release_year'],
)

998 Completed


# Adding director and writer #

In [None]:
def _parse_crew(raw):
    """Parse a stringified crew list; values that are not strings are returned unchanged."""
    # ast.literal_eval replaces eval(): the text comes from a CSV file and must not be executed.
    # The isinstance guard makes re-running this cell harmless once the column is parsed.
    return ast.literal_eval(raw) if isinstance(raw, str) else raw

credits_df['crew'] = credits_df['crew'].apply(_parse_crew)
# movies_df was merged with credits_df before this cell, so it carries its own, still
# unparsed copy of 'crew'. Parse it too, otherwise the director extraction further down
# would index into raw strings and fail.
movies_df['crew'] = movies_df['crew'].apply(_parse_crew)

In [None]:
def MovieDirector(movie_crew):
    """Return the crew entries whose 'job' is exactly 'Director', in their original order.

    ``movie_crew`` is a sequence of dicts that each have a 'job' key; the
    result is a new list (empty when no entry matches).
    """
    return [member for member in movie_crew if member['job'] == 'Director']

In [None]:
# Keep only the crew entries with job 'Director' for each movie.
movies_df['Director'] = movies_df['crew'].apply(MovieDirector)

In [None]:
# Insert one Movie -directed_by-> director triple for every director of every movie.
HRT_Extractor().movie_director_population(
    movie_ids_list=movies_df['id'],
    directors=movies_df['Director'],
)

1046 Completed


<img src="https://drive.google.com/uc?export=view&id=1O4jndDGd5jwqZ1UrzYcYTFqaKblJUmSy"></img>