In [1]:
import pandas as pd
import numpy as np

In [2]:
movie_df = pd.read_csv('movies.csv')

In [3]:
rating_df = pd.read_csv('ratings.csv')

In [4]:
## Checking the tables

In [5]:
movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [None]:
## Now combine the two tables and drop things we don´t have to use

In [7]:
combine_movie_rating = pd.merge(rating_df, movie_df, on='movieId')

In [8]:
combine_movie_rating.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [9]:
columns = ['timestamp', 'genres']

In [10]:
combine_movie_rating = combine_movie_rating.drop(columns, axis=1)

In [11]:
combine_movie_rating.head(10)

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)
5,18,1,3.5,Toy Story (1995)
6,19,1,4.0,Toy Story (1995)
7,21,1,3.5,Toy Story (1995)
8,27,1,3.0,Toy Story (1995)
9,31,1,5.0,Toy Story (1995)


In [12]:
combine_movie_rating = combine_movie_rating.dropna(axis = 0, subset = ['title'])

In [13]:
movie_ratingCount = (combine_movie_rating.
                    groupby(by = ['title'])['rating'].
                    count().
                    reset_index().
                    rename(columns={'rating': 'totalRatingCount'})
                    [['title', 'totalRatingCount']]
                    )

In [14]:
movie_ratingCount.head(10)

Unnamed: 0,title,totalRatingCount
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2
5,'Tis the Season for Love (2015),1
6,"'burbs, The (1989)",17
7,'night Mother (1986),1
8,(500) Days of Summer (2009),42
9,*batteries not included (1987),7


In [15]:
rating_with_totalRatingCount = combine_movie_rating.merge(movie_ratingCount, left_on='title', right_on='title', how='left')

In [16]:
rating_with_totalRatingCount.head(10)

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215
5,18,1,3.5,Toy Story (1995),215
6,19,1,4.0,Toy Story (1995),215
7,21,1,3.5,Toy Story (1995),215
8,27,1,3.0,Toy Story (1995),215
9,31,1,5.0,Toy Story (1995),215


In [None]:
## Now drop the duplicate data

In [17]:
user_rating = rating_with_totalRatingCount.drop_duplicates(['userId', 'title'])

In [18]:
user_rating.head(10)

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,1,4.0,Toy Story (1995),215
1,5,1,4.0,Toy Story (1995),215
2,7,1,4.5,Toy Story (1995),215
3,15,1,2.5,Toy Story (1995),215
4,17,1,4.5,Toy Story (1995),215
5,18,1,3.5,Toy Story (1995),215
6,19,1,4.0,Toy Story (1995),215
7,21,1,3.5,Toy Story (1995),215
8,27,1,3.0,Toy Story (1995),215
9,31,1,5.0,Toy Story (1995),215


In [None]:
## Matrix Factorization

In [None]:
# Now create a matrix and fill 0 values

In [19]:
movie_user_rating_pivot = user_rating.pivot(index = 'userId', columns='title', values='rating').fillna(0)

In [20]:
movie_user_rating_pivot.head(10)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
X = movie_user_rating_pivot.values.T

In [22]:
X.shape

(9719, 610)

In [None]:
## Now lets fit the model

In [25]:
import sklearn
from sklearn.decomposition import TruncatedSVD

In [27]:
SVD = TruncatedSVD(n_components=12, random_state=17)
matrix = SVD.fit_transform(X)

In [28]:
matrix.shape

(9719, 12)

In [29]:
import warnings


In [30]:
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [31]:
corr = np.corrcoef(matrix)

In [32]:
corr.shape

(9719, 9719)

In [33]:
## Now check the Results

In [34]:
movie_title = movie_user_rating_pivot.columns

In [35]:
movie_title_list = list(movie_title)

In [45]:
coffey_hands = movie_title_list.index("Guardians of the Galaxy (2014)")

In [46]:
#coffey_hands = movie_title_list.index("Avatar (2009)")

In [47]:
corr_coffey_hands = corr[coffey_hands]

In [60]:
list(movie_title[(corr_coffey_hands >= 0.9)])

['Adjustment Bureau, The (2011)',
 'Amazing Spider-Man, The (2012)',
 'Ant-Man (2015)',
 'Avatar (2009)',
 'Avengers, The (2012)',
 'Avengers: Age of Ultron (2015)',
 'Big Hero 6 (2014)',
 'Brave (2012)',
 'Captain America: Civil War (2016)',
 'Captain America: The First Avenger (2011)',
 'Captain America: The Winter Soldier (2014)',
 'Cloud Atlas (2012)',
 'Cloudy with a Chance of Meatballs (2009)',
 'Dark Knight Rises, The (2012)',
 'Deadpool (2016)',
 'Despicable Me (2010)',
 'District 9 (2009)',
 'Django Unchained (2012)',
 'Doctor Strange (2016)',
 'Edge of Tomorrow (2014)',
 "Ender's Game (2013)",
 'Grand Budapest Hotel, The (2014)',
 'Gravity (2013)',
 'Guardians of the Galaxy (2014)',
 'Guardians of the Galaxy 2 (2017)',
 'Harry Potter and the Deathly Hallows: Part 1 (2010)',
 'Harry Potter and the Deathly Hallows: Part 2 (2011)',
 'Hobbit: An Unexpected Journey, The (2012)',
 'Hobbit: The Desolation of Smaug, The (2013)',
 'How to Train Your Dragon (2010)',
 'Hugo (2011)',
 'I