<a href="https://colab.research.google.com/github/canon14/MovieRecommendationModel/blob/main/MovieRecommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [132]:
#Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import re

In [133]:
# Create dataframes for all the files
artist_attributes_df = pd.read_csv('/content/artist_attributes.csv')
artist_to_imdb_df = pd.read_csv('/content/artist_to_imdb.csv')
imdb_attributes_df = pd.read_csv('/content/imdb_attributes.csv')
imdb_genres_df = pd.read_csv('/content/imdb_genres.csv')
links_df = pd.read_csv('/content/links.csv')
movie_genres_df = pd.read_csv('/content/movie_genres.csv')
movie_title_keywords_df = pd.read_csv('/content/movie_title_keywords.csv')
movies_df = pd.read_csv('/content/movies.csv')
tmdb_attributes_df = pd.read_csv('/content/tmdb_attributes.csv')
tmdb_keywords_df = pd.read_csv('/content/tmdb_keywords.csv')
tmdb_overview_keywords_df = pd.read_csv('/content/tmdb_overview_keywords.csv')

In [134]:
ratings_df = pd.read_csv('/content/ratings.csv')

In [135]:
#Check total num of rows for each dataframe
df_names = [artist_attributes_df, artist_to_imdb_df, imdb_attributes_df, imdb_genres_df, links_df, movie_genres_df, movie_title_keywords_df, movies_df, ratings_df, tmdb_attributes_df, tmdb_keywords_df, tmdb_overview_keywords_df]
names = ['artist_attributes_df', 'artist_to_imdb_df', 'imdb_attributes_df', 'imdb_genres_df', 'links_df', 'movie_genres_df', 'movie_title_keywords_df', 'movies_df', 'ratings_df', 'tmdb_attributes_df', 'tmdb_keywords_df', 'tmdb_overview_keywords_df']

for i in range(0, len(df_names)):
  print(f'{names[i]} : {len(df_names[i])}')
  print(f'{names[i]} columns: {df_names[i].columns}')
  print('--------------------------------------------')


artist_attributes_df : 175719
artist_attributes_df columns: Index(['artist_id', 'name'], dtype='object')
--------------------------------------------
artist_to_imdb_df : 627212
artist_to_imdb_df columns: Index(['artist_id', 'imdb_id'], dtype='object')
--------------------------------------------
imdb_attributes_df : 14761
imdb_attributes_df columns: Index(['tid', 'title', 'url', 'imdbRating', 'duration', 'year', 'nrOfWins',
       'nrOfNominations'],
      dtype='object')
--------------------------------------------
imdb_genres_df : 33040
imdb_genres_df columns: Index(['genre', 'id'], dtype='object')
--------------------------------------------
links_df : 9742
links_df columns: Index(['imdbId', 'movieId', 'tmdbId'], dtype='object')
--------------------------------------------
movie_genres_df : 22084
movie_genres_df columns: Index(['Unnamed: 0', 'genre', 'movieID', 'title'], dtype='object')
--------------------------------------------
movie_title_keywords_df : 47663
movie_title_keywords

#**Data Exploration and Cleaning/Transformation**

###**Artist**

In [136]:
#Artist 
artist_attributes_df.head()

Unnamed: 0,artist_id,name
0,nm0000001,Fred Astaire
1,nm0000002,Lauren Bacall
2,nm0000003,Brigitte Bardot
3,nm0000004,John Belushi
4,nm0000005,Ingmar Bergman


In [137]:
artist_attributes_df.describe()

Unnamed: 0,artist_id,name
count,175719,175719
unique,175719,173131
top,nm0000001,Dinesh
freq,1,7


There are 175719 unique artists but only 173131 names..Some names might be duplicated in this dataset

In [138]:
#Check for duplicate names
artist_attributes_df.value_counts('name')

name
Dinesh               7
Chris Brown          6
John Gilbert         5
Robert Hall          5
Chris Wright         5
                    ..
Gustavo Pizzi        1
Gustavo Pomeranec    1
Gustavo Rodriguez    1
Gustavo Ron          1
Þórir Waagfjörð      1
Length: 173131, dtype: int64

In [139]:
#Should we remove these duplicates? The problem is these names also have unique artist_id, this means that different movies starred by these artists might be using different artist_id for the same artist
#It is safe to just keep them for now
artist_attributes_df[artist_attributes_df['name'] == 'Dinesh']

Unnamed: 0,artist_id,name
93166,nm10010521,Dinesh
158451,nm5110893,Dinesh
161717,nm5625923,Dinesh
166104,nm6549371,Dinesh
171504,nm8128125,Dinesh
171524,nm8135104,Dinesh
175206,nm9696209,Dinesh


###**IMDB**

In [220]:
imdb_attributes_df = pd.read_csv('/content/imdb_attributes.csv')

In [221]:
imdb_attributes_df.head()

Unnamed: 0,tid,title,url,imdbRating,duration,year,nrOfWins,nrOfNominations
0,tt0012349,Der Vagabund und das Kind (1921),http://www.imdb.com/title/tt0012349/,8.4,3240,1921,1,0
1,tt0015864,Goldrausch (1925),http://www.imdb.com/title/tt0015864/,8.3,5700,1925,2,1
2,tt0017136,Metropolis (1927),http://www.imdb.com/title/tt0017136/,8.4,9180,1927,3,4
3,tt0017925,Der General (1926),http://www.imdb.com/title/tt0017925/,8.3,6420,1926,1,1
4,tt0021749,Lichter der Gro√üstadt (1931),http://www.imdb.com/title/tt0021749/,8.7,5220,1931,2,0


In [222]:
imdb_attributes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14761 entries, 0 to 14760
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   tid              14761 non-null  object
 1   title            14761 non-null  object
 2   url              14761 non-null  object
 3   imdbRating       13582 non-null  object
 4   duration         13730 non-null  object
 5   year             14716 non-null  object
 6   nrOfWins         14760 non-null  object
 7   nrOfNominations  14761 non-null  object
dtypes: object(8)
memory usage: 922.7+ KB


In [223]:
#Transform datatypes 
#We are using errors='coerce' in case if there is any junk data, meaning any data that is not its respective type.. it will be replaced by NaN/Null
def transform_numeric_datatypes(df, cols):
  for col in cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

def transform_int_datatypes(df, cols):
  for col in cols:
    df[col] = df[col].astype('Int64')

def transform_str_datatypes(df, cols):
  for col in cols:
    df[col] =  df[col].astype(str)

def transform_date_datatypes(df, cols):
  for col in cols:
    df[col] = pd.to_datetime(df[col], errors='coerce')

A lot of these values do not make sense (e.g. max year = 276637, max nrOfWins = 7620, etc). We will have to clean the data for further analysis. Let's investigate some of these columns.

In [225]:
transform_numeric_datatypes(imdb_attributes_df, ['imdbRating', 'duration', 'year', 'nrOfWins', 'nrOfNominations'])
transform_str_datatypes(imdb_attributes_df, ['title'])

#drop rows where year is less than 1900 and more than 2022
imdb_attributes_df = imdb_attributes_df[(imdb_attributes_df['year'] > 1900) & (imdb_attributes_df['year'] < 2023)]

#drop rows where nrOfWins is more than 100 
imdb_attributes_df = imdb_attributes_df[imdb_attributes_df['nrOfWins'] <= 100]

# drop rows if not Movie, TV Series, TV Mini-Series, or TV Episode 
imdb_attributes_df = imdb_attributes_df[~imdb_attributes_df.title.str.contains('|'.join(['Video', 'Video Game']))]

#create a new column called 'category' to store title category
imdb_attributes_df['category'] = ""

#Define title category
for index,data in enumerate(imdb_attributes_df['title']):
    if "TV Episode" in data:
      imdb_attributes_df['category'][index] = 'TV Episode'
    elif "TV Series" in data:
      imdb_attributes_df['category'][index] = 'TV Series'
    elif "TV Mini-Series" in data:
      imdb_attributes_df['category'][index] = 'TV Mini-Series'
    else:
      imdb_attributes_df['category'][index] = 'Movie'

#Split title to only keep the title (excluding title category and year released)
imdb_attributes_df['title'] = imdb_attributes_df['title'].apply(lambda x: x.split(' (')[0])

#Check average duration value for each category to fill duration missing value
imdb_attributes_df.groupby('category').mean('imdbRating')
imdb_attributes_df['duration'] = imdb_attributes_df['duration'].fillna(imdb_attributes_df['duration'].mean())

#Drop rows where imdbRating is NULL since these are mostly episodes of TV Episode/Series/Mini-Series
imdb_attributes_df = imdb_attributes_df[imdb_attributes_df['imdbRating'].notnull()]


imdb_attributes_df.describe()

#transform datatypes
# transform_int_datatypes(imdb_attributes_df, ['duration', 'year', 'nrOfWins', 'nrOfNominations'])


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,imdbRating,duration,year,nrOfWins,nrOfNominations
count,12594.0,12594.0,12594.0,12594.0,12594.0
mean,6.871074,5677.366922,1989.056535,3.312609,4.832778
std,1.105552,2605.834593,22.455765,8.144872,13.89707
min,1.0,60.0,1902.0,0.0,0.0
25%,6.3,4980.0,1977.0,0.0,0.0
50%,7.0,5760.0,1997.0,0.0,0.0
75%,7.6,6660.0,2006.0,3.0,4.0
max,9.9,68400.0,2014.0,94.0,418.0


In [226]:
imdb_attributes_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12594 entries, 0 to 14760
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   tid              12594 non-null  object 
 1   title            12594 non-null  object 
 2   url              12594 non-null  object 
 3   imdbRating       12594 non-null  float64
 4   duration         12594 non-null  float64
 5   year             12594 non-null  float64
 6   nrOfWins         12594 non-null  float64
 7   nrOfNominations  12594 non-null  float64
 8   category         12594 non-null  object 
dtypes: float64(5), object(4)
memory usage: 983.9+ KB


In [175]:
#Define title category
for index,data in enumerate(imdb_attributes_df['title']):
    if "TV Episode" in data:
      imdb_attributes_df['category'][index] = 'TV Episode'
    elif "TV Series" in data:
      imdb_attributes_df['category'][index] = 'TV Series'
    elif "TV Mini-Series" in data:
      imdb_attributes_df['category'][index] = 'TV Mini-Series'
    else:
      imdb_attributes_df['category'][index] = 'Movie'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [182]:
#Split title to only keep the title (excluding title category and year released)
imdb_attributes_df['title'] = imdb_attributes_df['title'].apply(lambda x: x.split(' (')[0])

In [191]:
#transform datatypes
transform_int_datatypes(imdb_attributes_df, ['duration', 'year', 'nrOfWins', 'nrOfNominations'])

In [192]:
imdb_attributes_df.head()

Unnamed: 0,tid,title,url,imdbRating,duration,year,nrOfWins,nrOfNominations,category
0,tt0012349,Der Vagabund und das Kind,http://www.imdb.com/title/tt0012349/,8.4,3240,1921,1,0,Movie
1,tt0015864,Goldrausch,http://www.imdb.com/title/tt0015864/,8.3,5700,1925,2,1,Movie
2,tt0017136,Metropolis,http://www.imdb.com/title/tt0017136/,8.4,9180,1927,3,4,Movie
3,tt0017925,Der General,http://www.imdb.com/title/tt0017925/,8.3,6420,1926,1,1,Movie
4,tt0021749,Lichter der Gro√üstadt,http://www.imdb.com/title/tt0021749/,8.7,5220,1931,2,0,Movie


In [193]:
imdb_attributes_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13664 entries, 0 to 14760
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   tid              13664 non-null  object 
 1   title            13664 non-null  object 
 2   url              13664 non-null  object 
 3   imdbRating       12594 non-null  float64
 4   duration         12857 non-null  Int64  
 5   year             13664 non-null  Int64  
 6   nrOfWins         13664 non-null  Int64  
 7   nrOfNominations  13664 non-null  Int64  
 8   category         13664 non-null  object 
dtypes: Int64(4), float64(1), object(4)
memory usage: 1.6+ MB


In [205]:
imdb_attributes_df.describe()

Unnamed: 0,imdbRating,duration,year,nrOfWins,nrOfNominations
count,12594.0,11995.0,12594.0,12594.0,12594.0
mean,6.871074,5686.784494,1989.056535,3.312609,4.832778
std,1.105552,2669.762431,22.455765,8.144872,13.89707
min,1.0,60.0,1902.0,0.0,0.0
25%,6.3,4800.0,1977.0,0.0,0.0
50%,7.0,5820.0,1997.0,0.0,0.0
75%,7.6,6720.0,2006.0,3.0,4.0
max,9.9,68400.0,2014.0,94.0,418.0


In [207]:
#Check average duration value for each category to fill duration missing value
imdb_attributes_df.groupby('category').mean('imdbRating')
imdb_attributes_df['duration'] = imdb_attributes_df['duration'].fillna(imdb_attributes_df['duration'].mean())

TypeError: ignored

In [204]:
#Drop rows where imdbRating is NULL since these are mostly episodes of TV Episode/Series/Mini-Series
imdb_attributes_df[imdb_attributes_df['duration'].isnull()]

Unnamed: 0,tid,title,url,imdbRating,duration,year,nrOfWins,nrOfNominations,category
473,tt1621432,Adventures in the Sin Bin,http://www.imdb.com/title/tt1621432/,5.4,,2012,1,0,Movie
1055,tt1644158,The Nostalgia Critic Conan: Part 1,http://www.imdb.com/title/tt1644158/,7.7,,2010,0,0,Movie
1288,tt0250182,AFI's 100 Years... 100 Laughs: America's Funni...,http://www.imdb.com/title/tt0250182/,6.4,,2000,0,0,Movie
1292,tt0312283,Vaktm√§staren och professorn,http://www.imdb.com/title/tt0312283/,7.5,,2002,0,0,Movie
1302,tt0469050,Qing dian da sheng,http://www.imdb.com/title/tt0469050/,5.0,,2005,0,0,Movie
...,...,...,...,...,...,...,...,...,...
14553,tt0481456,This Week,http://www.imdb.com/title/tt0481456/,6.2,,1996,0,0,
14556,tt0482447,The 52nd Annual Golden Globe Awards,http://www.imdb.com/title/tt0482447/,6.6,,1995,0,0,
14583,tt0488798,Welcome,http://www.imdb.com/title/tt0488798/,6.1,,2007,0,0,
14633,tt0499410,That Mitchell and Webb Look,http://www.imdb.com/title/tt0499410/,8.0,,2006,1,7,


In [None]:
merge_df = imdb_attributes_df.merge(artist_to_imdb_df, how='inner', left_on='tid', right_on='imdb_id').drop('tid', axis=1)
merge_df = merge_df.merge(artist_attributes_df, how='inner', on='artist_id')

In [None]:
merge_df

In [None]:
merged_df = merged_df.merge(links_df, how='left', left_on='imdb_id', right_on='imdbId').drop(['imdbId'], axis=1)

In [None]:
merged_df

##Clean movie genres

In [None]:
movie_genres_df.head()

In [None]:
#Remove Unnecessary column
movie_genres_df = movie_genres_df.drop('Unnamed: 0', axis=1)

In [None]:
#group data by its movieID to consolidate all the genres into a list under a new column called 'genre_list'
movie_genres_group_df = movie_genres_df.groupby(['movieID', 'title'])['genre'].apply(list).rename('genre_list').reset_index()

In [None]:
movie_genres_group_df