In [1]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# NOTE(review): no category/module is given, so this silences every warning
# for the rest of the session, including any pandas/numpy warnings raised by
# later cells. Consider narrowing it once the noisy calls are identified.
warnings.filterwarnings('ignore')

In [2]:
# Load the raw ratings table from disk and preview its first five rows.
df_reviews = pd.read_csv(filepath_or_buffer='reviews.csv')
df_reviews.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
# Summary statistics (count, mean, std, quartiles, min/max) for the ratings table.
df_reviews.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [4]:
# Number of rows per userId, most frequent users first.
df_reviews['userId'].value_counts()

In [5]:
# Load the movie metadata and preview its first five rows.
# index_col=False tells pandas not to use the first column as the row index.
df_movies_titles = pd.read_csv(filepath_or_buffer='movies.csv', index_col=False)
df_movies_titles.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# Attach title and genres to each rating by joining on movieId.
# This is an inner join (the pd.merge default), so rows whose movieId has no
# match on the other side are dropped from the result.
df = df_reviews.merge(df_movies_titles, how='inner', on='movieId')
df.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [7]:
# Summary statistics for the merged ratings + titles table.
df.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,98180.0,98180.0,98180.0,98180.0
mean,326.348747,15938.331259,3.502567,1197962000.0
std,182.344624,28668.177407,1.040068,213527400.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1193.0,3.0,1008558000.0
50%,325.0,2890.0,3.5,1179177000.0
75%,477.0,7121.0,4.0,1428575000.0
max,610.0,131237.0,5.0,1537799000.0


In [8]:
# Per-title mean rating and number of ratings, computed in one groupby pass.
# Columns are renamed to the names the later cells look up.
df_ratings = (
    df.groupby('title')['rating']
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'rating', 'count': 'number_of_ratings'})
)
df_ratings.head(n=5)

Unnamed: 0_level_0,rating,number_of_ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
'71 (2014),4.0,1
'Hellboy': The Seeds of Creation (2004),4.0,1
'Round Midnight (1986),3.5,2
'Salem's Lot (2004),5.0,1
'Til There Was You (1997),4.0,2


In [10]:
# Wide user x title table of ratings: one row per userId, one column per title.
# Cells are NaN where a user has no rating for that title.
movie_matrix = pd.pivot_table(
    df,
    values='rating',
    index='userId',
    columns='title',
)
movie_matrix.head(n=5)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),"...All the Marbles (California Dolls, The) (1981)",...,Zulu (1964),Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [11]:
# Ratings given to 'Avatar (2009)', restricted to the users who rated it.
Avatar_user_rating = movie_matrix['Avatar (2009)'].dropna()
Avatar_user_rating.head(n=5)

In [12]:
# Correlate every title column of movie_matrix with the Avatar ratings.
similar_to_Avatar = movie_matrix.corrwith(Avatar_user_rating)

# One row per title: drop titles whose correlation is NaN, then attach the
# per-title rating count from df_ratings (joined on the title index).
corr_Avatar = (
    similar_to_Avatar
    .to_frame(name='correlation')
    .dropna()
    .join(df_ratings['number_of_ratings'])
)
corr_Avatar.head(n=5)

Unnamed: 0_level_0,correlation,number_of_ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"'burbs, The (1989)",0.353553,17
(500) Days of Summer (2009),0.13112,42
*batteries not included (1987),0.785714,7
10 Things I Hate About You (1999),0.265637,54
"10,000 BC (2008)",-0.075431,17


In [13]:
# Ten titles most correlated with Avatar, among titles with more than 100 ratings.
# The mask is built from corr_Avatar's own number_of_ratings column so that it
# shares the frame's index. A mask taken from df_ratings covers every title
# (corr_Avatar has had NaN rows dropped) and only works through pandas'
# implicit reindexing of boolean keys.
(
    corr_Avatar.loc[corr_Avatar['number_of_ratings'] > 100]
    .sort_values(by='correlation', ascending=False)
    .head(10)
)

Unnamed: 0_level_0,correlation,number_of_ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Trainspotting (1996),0.623735,102
"Monsters, Inc. (2001)",0.541616,132
"Net, The (1995)",0.530552,112
Finding Nemo (2003),0.501461,141
"Matrix, The (1999)",0.493241,278
Die Hard: With a Vengeance (1995),0.466794,144
Men in Black (a.k.a. MIB) (1997),0.459441,165
Star Wars: Episode VI - Return of the Jedi (1983),0.458785,196
"Terminator, The (1984)",0.448443,131
Spider-Man (2002),0.448073,122


In [14]:
# Ratings given to 'Thor (2011)', restricted to the users who rated it.
Thor_user_rating = movie_matrix['Thor (2011)'].dropna()
Thor_user_rating.head(n=5)

In [16]:
# Correlate every title column of movie_matrix with the Thor ratings;
# the result is a Series indexed by title.
similar_to_Thor = movie_matrix.corrwith(Thor_user_rating)
similar_to_Thor

In [17]:
# One row per title: drop titles whose correlation with Thor is NaN, then
# attach the per-title rating count from df_ratings (joined on the title index).
corr_Thor = (
    similar_to_Thor
    .to_frame(name='correlation')
    .dropna()
    .join(df_ratings['number_of_ratings'])
)
corr_Thor.head(n=5)

Unnamed: 0_level_0,correlation,number_of_ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"'burbs, The (1989)",0.997176,17
(500) Days of Summer (2009),0.38682,42
10 Things I Hate About You (1999),0.27356,54
"10,000 BC (2008)",0.245718,17
101 Dalmatians (1996),0.08973,47


In [18]:
# Ten titles most correlated with Thor, among titles with more than 100 ratings.
# The mask is built from corr_Thor's own number_of_ratings column so that it
# shares the frame's index. A mask taken from df_ratings covers every title
# (corr_Thor has had NaN rows dropped) and only works through pandas'
# implicit reindexing of boolean keys.
(
    corr_Thor.loc[corr_Thor['number_of_ratings'] > 100]
    .sort_values(by='correlation', ascending=False)
    .head(10)
)

Unnamed: 0_level_0,correlation,number_of_ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Stargate (1994),0.759494,140
Aladdin (1992),0.730377,183
Jurassic Park (1993),0.714481,238
"Lion King, The (1994)",0.710244,172
"Bourne Identity, The (2002)",0.705183,112
X-Men (2000),0.670755,133
Up (2009),0.665904,105
Batman Begins (2005),0.665208,116
Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001),0.658109,107
Men in Black (a.k.a. MIB) (1997),0.643894,165
