In [1]:
#Import the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
#Load the dataset
movie = pd.read_csv('movie.csv')
movie.head(200)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
195,197,"Stars Fell on Henrietta, The (1995)",Drama
196,198,Strange Days (1995),Action|Crime|Drama|Mystery|Sci-Fi|Thriller
197,199,"Umbrellas of Cherbourg, The (Parapluies de Che...",Drama|Musical|Romance
198,200,"Tie That Binds, The (1995)",Thriller


In [3]:
tag = pd.read_csv('tag.csv')
tag.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,18,4141,Mark Waters,2009-04-24 18:19:40
1,65,208,dark hero,2013-05-10 01:41:18
2,65,353,dark hero,2013-05-10 01:41:19
3,65,521,noir thriller,2013-05-10 01:39:43
4,65,592,dark hero,2013-05-10 01:41:18


In [4]:
rating = pd.read_csv('rating.csv')
rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [5]:
link = pd.read_csv('link.csv')
link.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [6]:
genome_tags = pd.read_csv('genome_tags.csv')
genome_tags.head()

Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s


In [7]:
genome_scores = pd.read_csv('genome_scores.csv')
genome_scores.head()

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.025
1,1,2,0.025
2,1,3,0.05775
3,1,4,0.09675
4,1,5,0.14675


#### Data Analysis

In [8]:
movie.shape

(27278, 3)

In [9]:
tag.shape

(465564, 4)

In [10]:
rating.shape

(20000263, 4)

In [11]:
rating['userId'].value_counts()

118205    9254
8405      7515
82418     5646
121535    5520
125794    5491
          ... 
59390       20
23558       20
34668       20
80291       20
58028       20
Name: userId, Length: 138493, dtype: int64

In [12]:
rating['userId'].value_counts().shape

(138493,)

#### Data Cleaning and Merge

In [13]:
movie_details=movie.merge(rating,on='movieId')

In [14]:
movie_details.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3,4.0,1999-12-11 13:36:47
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6,5.0,1997-03-13 17:50:52
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,4.0,1996-06-05 13:37:51
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10,4.0,1999-11-25 02:44:47
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,4.5,2009-01-02 01:13:41


In [15]:
movie_details.shape

(20000263, 6)

In [16]:
movie_details.drop(columns=['timestamp'],inplace=True)

In [17]:
movie_details.head()

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3,4.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6,5.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,4.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10,4.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,4.5


In [18]:
movie_details.drop_duplicates(['title','userId'], inplace = True)

In [19]:
movie_details.shape

(19999939, 5)

In [20]:
number_rating = movie_details.groupby('title')['rating'].count().reset_index()

In [21]:
number_rating

Unnamed: 0,title,rating
0,#chicagoGirl: The Social Network Takes on a Di...,3
1,$ (Dollars) (1971),24
2,$5 a Day (2008),39
3,$9.99 (2008),55
4,$ellebrity (Sellebrity) (2012),2
...,...,...
26724,À propos de Nice (1930),4
26725,Árido Movie (2005),1
26726,Åsa-Nisse - Wälkom to Knohult (2011),2
26727,Üvegtigris (2001),1


In [22]:
number_rating.rename(columns = {'rating':'number of rating'},inplace = True)

In [23]:
number_rating.head()

Unnamed: 0,title,number of rating
0,#chicagoGirl: The Social Network Takes on a Di...,3
1,$ (Dollars) (1971),24
2,$5 a Day (2008),39
3,$9.99 (2008),55
4,$ellebrity (Sellebrity) (2012),2


In [24]:
movie_details=movie_details.merge(number_rating,on='title')

In [25]:
movie_details.shape

(19999939, 6)

In [26]:
movie_details.head()

Unnamed: 0,movieId,title,genres,userId,rating,number of rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3,4.0,49695
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6,5.0,49695
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,4.0,49695
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10,4.0,49695
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,4.5,49695


In [27]:
movie_details=movie_details[movie_details['number of rating']>=50] #selecting valuable books by ratings

In [28]:
movie_details.drop_duplicates(['title','userId'], inplace = True)

In [29]:
movie_details.shape

(19847742, 6)

In [30]:
movie_details.sort_values('number of rating', ascending = False).head(10)

Unnamed: 0,movieId,title,genres,userId,rating,number of rating
1964501,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,51953,4.0,67310
1966041,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,55162,3.5,67310
1966053,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,55177,4.0,67310
1966052,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,55176,5.0,67310
1966051,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,55175,1.0,67310
1966050,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,55174,5.0,67310
1966049,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,55173,4.0,67310
1966048,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,55172,5.0,67310
1966047,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,55171,3.0,67310
1966046,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,55170,5.0,67310


In [31]:
movie_details['rating'] = movie_details['rating'].astype(int)

In [32]:
movie_details.head()

Unnamed: 0,movieId,title,genres,userId,rating,number of rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3,4,49695
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6,5,49695
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8,4,49695
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10,4,49695
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11,4,49695


In [33]:
movie_pivot = movie_details.pivot_table(columns='title',index='userId',values='rating')

In [34]:
movie_pivot.head()

title,$9.99 (2008),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),"...All the Marbles (California Dolls, The) (1981)",...,Zulu (1964),[REC] (2007),[REC]² (2009),"\\""Great Performances\""\"" Cats (1998)""",eXistenZ (1999),loudQUIETloud: A Film About the Pixies (2006),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [35]:
movie_pivot.shape

(138493, 10523)

In [36]:
movie_pivot.fillna(0,inplace=True)

In [37]:
movie_pivot.head()

title,$9.99 (2008),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),"...All the Marbles (California Dolls, The) (1981)",...,Zulu (1964),[REC] (2007),[REC]² (2009),"\\""Great Performances\""\"" Cats (1998)""",eXistenZ (1999),loudQUIETloud: A Film About the Pixies (2006),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Find the Correlation

In [42]:
def movie_recommendation(title):
    
    picked_movie = movie_pivot[title]
    
    #Find the Correlation
    similar_movies = movie_pivot.corrwith(picked_movie)
    
    #Create a new dataset
    corr_movie = pd.DataFrame(similar_movies, columns=['Correlation'])
    
    #Sort the values by correlation
    corr_movie = corr_movie.sort_values(by = 'Correlation',ascending = False)
    
    #set the title as index
    data = movie_details.set_index('title')
    
    # Merge the 'number of rating' to corr_movie
    corr_movie = pd.merge(corr_movie,data['number of rating'],left_index=True,right_index=True)
    
    corr_movie = corr_movie.sort_values(by = 'Correlation',ascending = False)
    
    corr_movie.drop_duplicates(inplace=True)
    
    return corr_movie.head()

In [43]:
movie_recommendation('Stars Fell on Henrietta, The (1995)')

Unnamed: 0_level_0,Correlation,number of rating
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"Stars Fell on Henrietta, The (1995)",1.0,110
Country Life (1994),0.188937,100
Feast of July (1995),0.174797,79
Frankie Starlight (1995),0.166541,194
"Grass Harp, The (1995)",0.151373,304
