<a href="https://colab.research.google.com/github/canon14/MovieRecommendationModel/blob/main/MovieRecommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly

In [3]:
# Create dataframes for all the files.
# All CSVs are read from a single directory. '/content' is the location this
# notebook was written against; change DATA_DIR here to load from elsewhere.
DATA_DIR = '/content'

artist_attributes_df = pd.read_csv(f'{DATA_DIR}/artist_attributes.csv')
artist_to_imdb_df = pd.read_csv(f'{DATA_DIR}/artist_to_imdb.csv')
imdb_attributes_df = pd.read_csv(f'{DATA_DIR}/imdb_attributes.csv')
imdb_genres_df = pd.read_csv(f'{DATA_DIR}/imdb_genres.csv')
links_df = pd.read_csv(f'{DATA_DIR}/links.csv')
movie_genres_df = pd.read_csv(f'{DATA_DIR}/movie_genres.csv')
movie_title_keywords_df = pd.read_csv(f'{DATA_DIR}/movie_title_keywords.csv')
movies_df = pd.read_csv(f'{DATA_DIR}/movies.csv')
tmdb_attributes_df = pd.read_csv(f'{DATA_DIR}/tmdb_attributes.csv')
tmdb_keywords_df = pd.read_csv(f'{DATA_DIR}/tmdb_keywords.csv')
tmdb_overview_keywords_df = pd.read_csv(f'{DATA_DIR}/tmdb_overview_keywords.csv')

In [4]:
# Ratings table, loaded in its own cell, separately from the other CSVs.
ratings_df = pd.read_csv(filepath_or_buffer='/content/ratings.csv')

In [5]:
# Print the row count and column index of every loaded dataframe.
# `df_names` holds the dataframe objects and `names` their labels, in the same order.
df_names = [
    artist_attributes_df, artist_to_imdb_df, imdb_attributes_df, imdb_genres_df,
    links_df, movie_genres_df, movie_title_keywords_df, movies_df,
    ratings_df, tmdb_attributes_df, tmdb_keywords_df, tmdb_overview_keywords_df,
]
names = [
    'artist_attributes_df', 'artist_to_imdb_df', 'imdb_attributes_df', 'imdb_genres_df',
    'links_df', 'movie_genres_df', 'movie_title_keywords_df', 'movies_df',
    'ratings_df', 'tmdb_attributes_df', 'tmdb_keywords_df', 'tmdb_overview_keywords_df',
]

# Walk labels and frames together instead of indexing both lists by position.
for label, frame in zip(names, df_names):
  print(f'{label} : {len(frame)}')
  print(f'{label} columns: {frame.columns}')
  print('--------------------------------------------')


artist_attributes_df : 175719
artist_attributes_df columns: Index(['artist_id', 'name'], dtype='object')
--------------------------------------------
artist_to_imdb_df : 627212
artist_to_imdb_df columns: Index(['artist_id', 'imdb_id'], dtype='object')
--------------------------------------------
imdb_attributes_df : 14761
imdb_attributes_df columns: Index(['tid', 'title', 'url', 'imdbRating', 'duration', 'year', 'nrOfWins',
       'nrOfNominations'],
      dtype='object')
--------------------------------------------
imdb_genres_df : 33040
imdb_genres_df columns: Index(['genre', 'id'], dtype='object')
--------------------------------------------
links_df : 9742
links_df columns: Index(['imdbId', 'movieId', 'tmdbId'], dtype='object')
--------------------------------------------
movie_genres_df : 22084
movie_genres_df columns: Index(['Unnamed: 0', 'genre', 'movieID', 'title'], dtype='object')
--------------------------------------------
movie_title_keywords_df : 47663
movie_title_keywords

# **Data Exploration and Cleaning/Transformation**

### **Artist**

In [6]:
# Artist: preview the first rows of the artist table (artist_id -> name).
artist_attributes_df.head()

Unnamed: 0,artist_id,name
0,nm0000001,Fred Astaire
1,nm0000002,Lauren Bacall
2,nm0000003,Brigitte Bardot
3,nm0000004,John Belushi
4,nm0000005,Ingmar Bergman


In [7]:
# Summary of the artist table; comparing the 'unique' counts of artist_id and
# name shows whether any name is shared by more than one artist_id.
artist_attributes_df.describe()

Unnamed: 0,artist_id,name
count,175719,175719
unique,175719,173131
top,nm0000001,Dinesh
freq,1,7


There are 175,719 unique artist IDs but only 173,131 unique names, so some names appear under more than one artist ID in this dataset.

In [9]:
# Check for duplicate names: count how many artist rows carry each name.
artist_attributes_df.value_counts(subset='name')

name
Dinesh               7
Chris Brown          6
John Gilbert         5
Robert Hall          5
Chris Wright         5
                    ..
Gustavo Pizzi        1
Gustavo Pomeranec    1
Gustavo Rodriguez    1
Gustavo Ron          1
Þórir Waagfjörð      1
Length: 173131, dtype: int64

In [10]:
# Should these duplicate names be removed? Every repeated name still has its own
# unique artist_id, so the same person may be referenced under different
# artist_ids by different movies (or they may be different people with one name).
# It is safe to keep them all for now.
artist_attributes_df.loc[artist_attributes_df['name'].eq('Dinesh')]

Unnamed: 0,artist_id,name
93166,nm10010521,Dinesh
158451,nm5110893,Dinesh
161717,nm5625923,Dinesh
166104,nm6549371,Dinesh
171504,nm8128125,Dinesh
171524,nm8135104,Dinesh
175206,nm9696209,Dinesh


### **IMDB**

In [25]:
# Preview the first rows of the IMDB title attributes (keyed by 'tid').
imdb_attributes_df.head()

Unnamed: 0,tid,title,url,imdbRating,duration,year,nrOfWins,nrOfNominations
0,tt0012349,Der Vagabund und das Kind (1921),http://www.imdb.com/title/tt0012349/,8.4,3240,1921,1,0
1,tt0015864,Goldrausch (1925),http://www.imdb.com/title/tt0015864/,8.3,5700,1925,2,1
2,tt0017136,Metropolis (1927),http://www.imdb.com/title/tt0017136/,8.4,9180,1927,3,4
3,tt0017925,Der General (1926),http://www.imdb.com/title/tt0017925/,8.3,6420,1926,1,1
4,tt0021749,Lichter der Gro√üstadt (1931),http://www.imdb.com/title/tt0021749/,8.7,5220,1931,2,0


In [23]:
# Build one row per (title, artist):
#   1. inner-join titles to the artist<->title link table on tid == imdb_id,
#   2. drop 'tid', which duplicates 'imdb_id' after the join,
#   3. inner-join the artist names on artist_id.
merge_df = (
    imdb_attributes_df
    .merge(artist_to_imdb_df, how='inner', left_on='tid', right_on='imdb_id')
    .drop(columns='tid')
    .merge(artist_attributes_df, how='inner', on='artist_id')
)

In [24]:
# Display the joined title/artist table.
# NOTE(review): in the saved output the last rows for tt0041085 have shifted
# columns (a URL under imdbRating, 'video.movie' under nrOfWins) — looks like
# malformed rows in the source data; confirm and clean before modelling.
merge_df

Unnamed: 0,title,url,imdbRating,duration,year,nrOfWins,nrOfNominations,artist_id,imdb_id,name
0,Der Vagabund und das Kind (1921),http://www.imdb.com/title/tt0012349/,8.4,3240,1921,1,0,nm0001067,tt0012349,Jackie Coogan
1,Die Addams Family (TV Series 1964‚Äì1966),http://www.imdb.com/title/tt0057729/,8.1,1800,1966,0,0,nm0001067,tt0057729,Jackie Coogan
2,Oliver Twist (1922),http://www.imdb.com/title/tt0013450/,6.8,5880,1922,0,0,nm0001067,tt0013450,Jackie Coogan
3,Der Vagabund und das Kind (1921),http://www.imdb.com/title/tt0012349/,8.4,3240,1921,1,0,nm0465231,tt0012349,Toraichi Kono
4,Goldrausch (1925),http://www.imdb.com/title/tt0015864/,8.3,5700,1925,2,1,nm0465231,tt0015864,Toraichi Kono
...,...,...,...,...,...,...,...,...,...,...
108687,Abbott and Costello Meet the Killer\,abbott and costello meet the killer boris karloff,http://www.imdb.com/title/tt0041085/,2476,5040,video.movie,0,nm0041304,tt0041085,Lenore Aubert
108688,Abbott and Costello Meet the Killer\,abbott and costello meet the killer boris karloff,http://www.imdb.com/title/tt0041085/,2476,5040,video.movie,0,nm0551550,tt0041085,Donna Martell
108689,Abbott and Costello Meet the Killer\,abbott and costello meet the killer boris karloff,http://www.imdb.com/title/tt0041085/,2476,5040,video.movie,0,nm0811467,tt0041085,Howard Snyder
108690,Die Abenteuer von Ichabod und Tadd√§us Kr√∂te ...,http://www.imdb.com/title/tt0041094/,7.2,4080,1949,0,0,nm0021502,tt0041094,Claud Allister


In [None]:
# Left-join the link IDs from links_df (movieId, tmdbId) on imdb_id == imdbId,
# then drop 'imdbId', which duplicates 'imdb_id' after the join.
# NOTE(review): `merged_df` is read here but no visible cell above assigns it
# (the earlier cell builds `merge_df`); confirm the cell that creates it still
# exists, otherwise this fails on a fresh kernel.
# NOTE(review): the cell reassigns its own input, so running it twice merges
# links_df again and produces suffixed movieId_x / movieId_y columns.
merged_df = merged_df.merge(links_df, how='left', left_on='imdb_id', right_on='imdbId').drop(['imdbId'], axis=1)

In [None]:
# Display the table after the left join with links_df; movieId / tmdbId are
# missing for titles that had no matching row in links_df.
merged_df

Unnamed: 0,imdb_id,title,url,imdbRating,duration,year,nrOfWins,nrOfNominations,artist_names,movieId,tmdbId
0,tt0002844,Fant√¥mas - √Ä l'ombre de la guillotine (1913),http://www.imdb.com/title/tt0002844/,6.8,3240,1913,0,0,"[Renée Carl, Georges Melchior, René Navarre, P...",,
1,tt0003740,Cabiria (1914),http://www.imdb.com/title/tt0003740/,6.5,8880,1914,0,0,"[Carolina Catena, Titus Livius, Manlio Mazza, ...",,
2,tt0004630,The Spoilers (1914),http://www.imdb.com/title/tt0004630/,6.7,6600,1914,0,0,[Frank Clark],,
3,tt0004635,The Squaw Man (1914),http://www.imdb.com/title/tt0004635/,6.3,4440,1914,0,0,"[Oscar Apfel, Sydney Deane, William Elmer, Dus...",,
4,tt0004972,Die Geburt einer Nation (1915),http://www.imdb.com/title/tt0004972/,6.9,9900,1915,1,0,"[D.W. Griffith, G.W. Bitzer, Spottiswoode Aitk...",7065.0,618.0
...,...,...,...,...,...,...,...,...,...,...,...
9954,tt3420392,The House of Exorcism (1975),http://www.imdb.com/title/tt3420392/,5.3,5520,1975,0,0,"[Alfredo Leone, Alberto Cittini, Francesca Rus...",,
9955,tt3465082,Who the 'Ell Is Tauriel? (2013),http://www.imdb.com/title/tt3465082/,8.5,360,2013,0,0,[Jim Baltaxe],,
9956,tt3465488,Karle Pyaar Karle (2014),http://www.imdb.com/title/tt3465488/,3.5,6600,2014,0,0,"[Aham Sharma, Hasleen Kaur, Spandan Mishra, Ra...",,
9957,tt3481232,Die Oscars (TV Movie 2014),http://www.imdb.com/title/tt3481232/,7.4,5700,2014,0,0,[Rick Spalla],,


## Clean movie genres

In [None]:
# Preview the genre table: one row per (movie, genre), plus a leftover
# 'Unnamed: 0' column.
movie_genres_df.head()

Unnamed: 0.1,Unnamed: 0,genre,movieID,title
0,0,Adventure,1,Toy Story (1995)
1,1,Animation,1,Toy Story (1995)
2,2,Children,1,Toy Story (1995)
3,3,Comedy,1,Toy Story (1995)
4,4,Fantasy,1,Toy Story (1995)


In [None]:
# Remove the unnecessary 'Unnamed: 0' column.
# errors='ignore' keeps this cell safe to re-run: because it reassigns
# movie_genres_df, a second run would otherwise raise KeyError once the column
# is already gone.
movie_genres_df = movie_genres_df.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Collapse the one-row-per-genre table to one row per (movieID, title), with all
# of that movie's genres collected into a list in a new 'genre_list' column.
movie_genres_group_df = (
    movie_genres_df
    .groupby(['movieID', 'title'])['genre']
    .apply(list)
    .reset_index(name='genre_list')
)

In [None]:
# Display the grouped table: one row per movie with its list of genres.
movie_genres_group_df

Unnamed: 0,movieID,title,genre_list
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),"[Action, Animation, Comedy, Fantasy]"
9738,193583,No Game No Life: Zero (2017),"[Animation, Comedy, Fantasy]"
9739,193585,Flint (2017),[Drama]
9740,193587,Bungo Stray Dogs: Dead Apple (2018),"[Action, Animation]"
