<a href="https://colab.research.google.com/github/canon14/MovieRecommendationModel/blob/main/ChildrenMovieRecommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly

In [2]:
# Create one dataframe per source CSV file.
# DATA_DIR is the single place to edit when the CSVs live somewhere other than
# '/content' (the directory this notebook was originally written against).
DATA_DIR = '/content'

artist_attributes_df = pd.read_csv(f'{DATA_DIR}/artist_attributes.csv')
artist_to_imdb_df = pd.read_csv(f'{DATA_DIR}/artist_to_imdb.csv')
imdb_attributes_df = pd.read_csv(f'{DATA_DIR}/imdb_attributes.csv')
imdb_genres_df = pd.read_csv(f'{DATA_DIR}/imdb_genres.csv')
links_df = pd.read_csv(f'{DATA_DIR}/links.csv')
movie_genres_df = pd.read_csv(f'{DATA_DIR}/movie_genres.csv')
movie_title_keywords_df = pd.read_csv(f'{DATA_DIR}/movie_title_keywords.csv')
movies_df = pd.read_csv(f'{DATA_DIR}/movies.csv')
ratings_df = pd.read_csv(f'{DATA_DIR}/ratings.csv')
tmdb_attributes_df = pd.read_csv(f'{DATA_DIR}/tmdb_attributes.csv')
tmdb_keywords_df = pd.read_csv(f'{DATA_DIR}/tmdb_keywords.csv')
tmdb_overview_keywords_df = pd.read_csv(f'{DATA_DIR}/tmdb_overview_keywords.csv')

In [8]:
# Print the row count and the column index of every loaded dataframe.
# df_names holds the dataframe objects; names holds the matching labels in the same order.
df_names = [
    artist_attributes_df, artist_to_imdb_df, imdb_attributes_df, imdb_genres_df,
    links_df, movie_genres_df, movie_title_keywords_df, movies_df,
    ratings_df, tmdb_attributes_df, tmdb_keywords_df, tmdb_overview_keywords_df,
]
names = [
    'artist_attributes_df', 'artist_to_imdb_df', 'imdb_attributes_df', 'imdb_genres_df',
    'links_df', 'movie_genres_df', 'movie_title_keywords_df', 'movies_df',
    'ratings_df', 'tmdb_attributes_df', 'tmdb_keywords_df', 'tmdb_overview_keywords_df',
]

# Walk both lists in lockstep so each label is printed next to its own dataframe
for label, frame in zip(names, df_names):
  print(f'{label} : {len(frame)}')
  print(f'{label} columns: {frame.columns}')
  print('--------------------------------------------')


artist_attributes_df : 175719
artist_attributes_df columns: Index(['artist_id', 'name'], dtype='object')
--------------------------------------------
artist_to_imdb_df : 627212
artist_to_imdb_df columns: Index(['artist_id', 'imdb_id'], dtype='object')
--------------------------------------------
imdb_attributes_df : 14761
imdb_attributes_df columns: Index(['tid', 'title', 'url', 'imdbRating', 'duration', 'year', 'nrOfWins',
       'nrOfNominations'],
      dtype='object')
--------------------------------------------
imdb_genres_df : 33040
imdb_genres_df columns: Index(['genre', 'id'], dtype='object')
--------------------------------------------
links_df : 9742
links_df columns: Index(['imdbId', 'movieId', 'tmdbId'], dtype='object')
--------------------------------------------
movie_genres_df : 22084
movie_genres_df columns: Index(['Unnamed: 0', 'genre', 'movieID', 'title'], dtype='object')
--------------------------------------------
movie_title_keywords_df : 47663
movie_title_keywords

# Data Consolidation

We will consolidate all the files, based on their relationships with each other, into one consolidated file on which we will perform visualization and apply the model.

In [44]:
# Artist and IMDB
# 1. Inner-join artist names to the artist -> imdb mapping, then to the IMDB attributes
#    (imdb_id on the left matches tid on the right); the join keys artist_id and tid are
#    dropped once the joins are done.
# 2. Group by every movie-level column and gather the artist names of each group into a
#    list stored in a new column called 'artist_names'.
# NOTE(review): groupby skips rows whose key columns contain NaN by default (dropna=True),
# so movies with a missing attribute would be left out — confirm this is intended.
merged_df = (
    artist_attributes_df
    .merge(artist_to_imdb_df, on='artist_id', how='inner')
    .merge(imdb_attributes_df, left_on='imdb_id', right_on='tid', how='inner')
    .drop(columns=['artist_id', 'tid'])
    .groupby(['imdb_id', 'title', 'url', 'imdbRating', 'duration', 'year', 'nrOfWins', 'nrOfNominations'])['name']
    .apply(list)
    .rename('artist_names')
    .reset_index()
)

In [43]:
# Display the grouped artist/IMDB frame, which carries an 'artist_names' list per row
merged_df

Unnamed: 0,imdb_id,title,url,imdbRating,duration,year,nrOfWins,nrOfNominations,artist_names
0,tt0002844,Fant√¥mas - √Ä l'ombre de la guillotine (1913),http://www.imdb.com/title/tt0002844/,6.8,3240,1913,0,0,"[Renée Carl, Georges Melchior, René Navarre, P..."
1,tt0003740,Cabiria (1914),http://www.imdb.com/title/tt0003740/,6.5,8880,1914,0,0,"[Carolina Catena, Titus Livius, Manlio Mazza, ..."
2,tt0004630,The Spoilers (1914),http://www.imdb.com/title/tt0004630/,6.7,6600,1914,0,0,[Frank Clark]
3,tt0004635,The Squaw Man (1914),http://www.imdb.com/title/tt0004635/,6.3,4440,1914,0,0,"[Oscar Apfel, Sydney Deane, William Elmer, Dus..."
4,tt0004972,Die Geburt einer Nation (1915),http://www.imdb.com/title/tt0004972/,6.9,9900,1915,1,0,"[D.W. Griffith, G.W. Bitzer, Spottiswoode Aitk..."
...,...,...,...,...,...,...,...,...,...
9954,tt3420392,The House of Exorcism (1975),http://www.imdb.com/title/tt3420392/,5.3,5520,1975,0,0,"[Alfredo Leone, Alberto Cittini, Francesca Rus..."
9955,tt3465082,Who the 'Ell Is Tauriel? (2013),http://www.imdb.com/title/tt3465082/,8.5,360,2013,0,0,[Jim Baltaxe]
9956,tt3465488,Karle Pyaar Karle (2014),http://www.imdb.com/title/tt3465488/,3.5,6600,2014,0,0,"[Aham Sharma, Hasleen Kaur, Spandan Mishra, Ra..."
9957,tt3481232,Die Oscars (TV Movie 2014),http://www.imdb.com/title/tt3481232/,7.4,5700,2014,0,0,[Rick Spalla]


In [38]:
# Attach the movieId and tmdbId columns from links_df with a left join, so every row of
# merged_df is kept and rows without a matching imdbId get NaN in the new columns.
# This cell reads and rewrites merged_df, so link columns left over from a previous run
# are removed first; otherwise re-running would yield movieId_x/movieId_y and
# tmdbId_x/tmdbId_y duplicates. On a first run the extra drop is a no-op.
link_id_columns = ['movieId', 'tmdbId']
merged_df = (
    merged_df
    .drop(columns=link_id_columns, errors='ignore')
    .merge(links_df, how='left', left_on='imdb_id', right_on='imdbId')
    .drop(columns=['imdbId'])  # duplicate of imdb_id after the join
)

In [39]:
# Display merged_df after the left join with links_df (adds movieId and tmdbId)
merged_df

Unnamed: 0,imdb_id,title,url,imdbRating,duration,year,nrOfWins,nrOfNominations,artist_names,movieId,tmdbId
0,tt0002844,Fant√¥mas - √Ä l'ombre de la guillotine (1913),http://www.imdb.com/title/tt0002844/,6.8,3240,1913,0,0,"[Renée Carl, Georges Melchior, René Navarre, P...",,
1,tt0003740,Cabiria (1914),http://www.imdb.com/title/tt0003740/,6.5,8880,1914,0,0,"[Carolina Catena, Titus Livius, Manlio Mazza, ...",,
2,tt0004630,The Spoilers (1914),http://www.imdb.com/title/tt0004630/,6.7,6600,1914,0,0,[Frank Clark],,
3,tt0004635,The Squaw Man (1914),http://www.imdb.com/title/tt0004635/,6.3,4440,1914,0,0,"[Oscar Apfel, Sydney Deane, William Elmer, Dus...",,
4,tt0004972,Die Geburt einer Nation (1915),http://www.imdb.com/title/tt0004972/,6.9,9900,1915,1,0,"[D.W. Griffith, G.W. Bitzer, Spottiswoode Aitk...",7065.0,618.0
...,...,...,...,...,...,...,...,...,...,...,...
9954,tt3420392,The House of Exorcism (1975),http://www.imdb.com/title/tt3420392/,5.3,5520,1975,0,0,"[Alfredo Leone, Alberto Cittini, Francesca Rus...",,
9955,tt3465082,Who the 'Ell Is Tauriel? (2013),http://www.imdb.com/title/tt3465082/,8.5,360,2013,0,0,[Jim Baltaxe],,
9956,tt3465488,Karle Pyaar Karle (2014),http://www.imdb.com/title/tt3465488/,3.5,6600,2014,0,0,"[Aham Sharma, Hasleen Kaur, Spandan Mishra, Ra...",,
9957,tt3481232,Die Oscars (TV Movie 2014),http://www.imdb.com/title/tt3481232/,7.4,5700,2014,0,0,[Rick Spalla],,


## Clean movie genres

In [None]:
# Preview the first rows of the genre table before cleaning it
movie_genres_df.head()

Unnamed: 0.1,Unnamed: 0,genre,movieID,title
0,0,Adventure,1,Toy Story (1995)
1,1,Animation,1,Toy Story (1995)
2,2,Children,1,Toy Story (1995)
3,3,Comedy,1,Toy Story (1995)
4,4,Fantasy,1,Toy Story (1995)


In [None]:
# Remove the unnecessary 'Unnamed: 0' column.
# errors='ignore' keeps this cell safe to re-run: without it, a second run raises
# KeyError because the column has already been dropped from movie_genres_df.
movie_genres_df = movie_genres_df.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Collapse the genre table to one row per (movieID, title) pair, gathering that
# pair's genres into a list stored in a new column called 'genre_list'
movie_genres_group_df = (
    movie_genres_df
    .groupby(['movieID', 'title'])['genre']
    .apply(list)
    .rename('genre_list')
    .reset_index()
)

In [None]:
# Display the grouped frame: one row per (movieID, title) with its genre_list
movie_genres_group_df

Unnamed: 0,movieID,title,genre_list
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),"[Action, Animation, Comedy, Fantasy]"
9738,193583,No Game No Life: Zero (2017),"[Animation, Comedy, Fantasy]"
9739,193585,Flint (2017),[Drama]
9740,193587,Bungo Stray Dogs: Dead Apple (2018),"[Action, Animation]"
