# Movie Recommender System
### Based on Matrix Factorization
- Author: Alexis
- Updated: 2022/2/27
- Updated: 2021/2/15



In [None]:
import pandas as pd
import numpy as np

### Load and organize the data

In [None]:
# Download and unzip dataset

!wget http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
!unzip ml-latest-small.zip

--2022-02-27 13:36:24--  http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip’


2022-02-27 13:36:25 (4.56 MB/s) - ‘ml-latest-small.zip’ saved [978202/978202]

Archive:  ml-latest-small.zip
   creating: ml-latest-small/
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  


In [None]:
# Load the ratings and the movie metadata, then join them on movieId

# Read from the directory produced by the download/unzip cell so the notebook
# runs on a fresh kernel without a mounted Google Drive.
PATH = 'ml-latest-small/'
ratings = pd.read_csv(PATH + 'ratings.csv')
movies = pd.read_csv(PATH + 'movies.csv')

# One row per rating, extended with the title and genres of the rated movie
# (pd.merge defaults to an inner join).
combine_movie_rating = pd.merge(ratings, movies, on='movieId')

print(combine_movie_rating.shape)
combine_movie_rating.head()

(100836, 6)


Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [None]:
# Count how many ratings each movie title received

# Series of non-null rating counts, indexed by title
ratings_per_title = combine_movie_rating.groupby('title')['rating'].count()

# Back to a two-column frame: title, totalRatingCount
movie_rating_count = (
    ratings_per_title
    .rename('totalRatingCount')
    .reset_index()
    [['title', 'totalRatingCount']]
)

print(movie_rating_count.shape)
movie_rating_count.head()

(9719, 2)


Unnamed: 0,title,totalRatingCount
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2


In [None]:
# Attach each title's total rating count to every individual rating row

rating_with_total_rating_count = pd.merge(
    combine_movie_rating,
    movie_rating_count,
    on='title',   # same key column on both sides
    how='inner',
)

print(rating_with_total_rating_count.shape)
rating_with_total_rating_count.head()

(100836, 7)


Unnamed: 0,userId,movieId,rating,timestamp,title,genres,totalRatingCount
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215


In [None]:
# Keep only the ratings of movies that have at least `popularity_threshold` ratings

popularity_threshold = 20

# Boolean mask, True for rows whose movie reaches the threshold
is_popular = rating_with_total_rating_count['totalRatingCount'].ge(popularity_threshold)
rating_popular_movie = rating_with_total_rating_count.loc[is_popular]

print(rating_popular_movie.shape)
rating_popular_movie.head()

(67901, 7)


Unnamed: 0,userId,movieId,rating,timestamp,title,genres,totalRatingCount
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215


In [None]:
# Keep a single rating per (userId, title) pair.
# DataFrame.pivot raises on duplicate index/column pairs, so the first
# occurrence of each pair is kept and later ones are dropped.

rating_popular_movie = rating_popular_movie.drop_duplicates(
    subset=['userId', 'title'],
    keep='first',
)

print(rating_popular_movie.shape)
rating_popular_movie.head()

(67898, 7)


Unnamed: 0,userId,movieId,rating,timestamp,title,genres,totalRatingCount
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215


In [None]:
# Build the user-item matrix: one row per userId, one column per title

# Cells are NaN where the user has no rating for the title ...
user_by_title = rating_popular_movie.pivot(index='userId', columns='title', values='rating')

# ... and those missing ratings are encoded as 0
rating_pivot = user_by_title.fillna(0)

print(rating_pivot.shape)
rating_pivot.head()

(610, 1297)


title,(500) Days of Summer (2009),10 Things I Hate About You (1999),101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),13 Going on 30 (2004),"13th Warrior, The (1999)",1408 (2007),2001: A Space Odyssey (1968),2012 (2009),21 Grams (2003),21 Jump Street (2012),25th Hour (2002),27 Dresses (2008),28 Days (2000),28 Days Later (2002),28 Weeks Later (2007),300 (2007),3:10 to Yuma (2007),"40-Year-Old Virgin, The (2005)",50 First Dates (2004),"6th Day, The (2000)",8 Mile (2002),A.I. Artificial Intelligence (2001),About Schmidt (2002),About a Boy (2002),"Abyss, The (1989)",Ace Ventura: Pet Detective (1994),Ace Ventura: When Nature Calls (1995),Adaptation (2002),Addams Family Values (1993),"Addams Family, The (1991)","Adjustment Bureau, The (2011)",Adventures in Babysitting (1987),"Adventures of Buckaroo Banzai Across the 8th Dimension, The (1984)","Adventures of Priscilla, Queen of the Desert, The (1994)",Aeon Flux (2005),"African Queen, The (1951)",Air Force One (1997),Airheads (1994),...,What's Eating Gilbert Grape (1993),When Harry Met Sally... (1989),While You Were Sleeping (1995),Whiplash (2014),White Men Can't Jump (1992),Who Framed Roger Rabbit? 
(1988),"Whole Nine Yards, The (2000)",Wild Things (1998),Wild Wild West (1999),William Shakespeare's Romeo + Juliet (1996),Willow (1988),Willy Wonka & the Chocolate Factory (1971),Witness (1985),"Wizard of Oz, The (1939)","Wolf of Wall Street, The (2013)",Wonder Boys (2000),Working Girl (1988),"World Is Not Enough, The (1999)",Wreck-It Ralph (2012),"Wrestler, The (2008)",Wyatt Earp (1994),"X-Files: Fight the Future, The (1998)",X-Men (2000),X-Men Origins: Wolverine (2009),X-Men: Days of Future Past (2014),X-Men: First Class (2011),X-Men: The Last Stand (2006),X2: X-Men United (2003),Yes Man (2008),You've Got Mail (1998),Young Frankenstein (1974),Young Guns (1988),Zack and Miri Make a Porno (2008),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,5.0,0.0,4.0,0.0,0.0,4.0,5.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,5.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Query by user

In [None]:
# Look up a movie by (partial) title
title_matches = movies['title'].str.contains("Moneyball")
x = movies.loc[title_matches]
print(x)

# Movie titles, in the same order as the columns of the user-item matrix
movie_title = rating_pivot.columns
movie_list = movie_title.tolist()

# All ratings of movieId 89492 (the id shown by the lookup above), best first
rated_this_movie = ratings['movieId'].eq(89492)
y = ratings.loc[rated_this_movie].sort_values(by='rating', ascending=False)
print(y)
# userIds 408 and 495 head the list with a 5.0 rating

      movieId             title genres
7688    89492  Moneyball (2011)  Drama
       userId  movieId  rating   timestamp
61679     408    89492     5.0  1468147077
79709     495    89492     5.0  1458635252
64841     414    89492     4.5  1328313081
11726      70    89492     4.5  1355185210
83402     528    89492     4.5  1391736722
52302     339    89492     4.5  1460346554
11554      68    89492     4.5  1321936093
81420     514    89492     4.5  1533873024
46463     305    89492     4.0  1460303444
53186     351    89492     4.0  1326027883
79225     490    89492     4.0  1328145178
9029       62    89492     4.0  1521489573
69978     448    89492     4.0  1355584097
37142     249    89492     4.0  1346758113
25738     177    89492     4.0  1435526646
22418     152    89492     4.0  1450867862
15908     103    89492     4.0  1431955333
11949      73    89492     4.0  1464282937
82390     522    89492     4.0  1325653283
49683     318    89492     3.5  1424797380
16742     105    89

In [None]:
### Matrix factorization

import numpy as np
from sklearn.utils.extmath import randomized_svd

# Number of latent factors kept by the truncated SVD
N_COMPONENTS = 12

# Dense user x movie rating matrix (0 where the pivot had no rating)
A = np.array(rating_pivot)

# randomized_svd relies on random projections; a fixed random_state makes the
# factors (and therefore the recommendations) reproducible across runs.
u, s, vt = randomized_svd(A, n_components=N_COMPONENTS, random_state=0)
print(u.shape)
print(s.shape)
print(vt.shape)

# New rating matrix: the rank-N_COMPONENTS reconstruction A ~ U * diag(s) * V^T.
# The singular values must be part of the product; U * V^T alone weights every
# latent factor equally and is not an approximation of A.
# `u * s` scales column j of u by s[j], which equals u @ np.diag(s) without
# materialising the diagonal matrix.
nR = np.dot(u * s, vt)
print(nR.shape)

print(type(nR))

(610, 12)
(12,)
(12, 1297)
(610, 1297)
<class 'numpy.ndarray'>




In [None]:
# Row position (zero-based) of the user in the reconstructed rating matrix.
# NOTE(review): presumably userId 495, since the pivot rows start at userId 1 -
# confirm that userIds are contiguous before relying on this offset.
user_idx = 494

# Pair every movie position with this user's reconstructed rating, order the
# pairs from highest to lowest rating, and keep the ten best.
ranked_pairs = sorted(
    enumerate(nR[user_idx]),
    key=lambda pair: pair[1],
    reverse=True,
)
scores = ranked_pairs[:10]

# Movie positions of the top 10, then their titles
movie_indices = [position for position, _ in scores]
recomm = [movie_list[position] for position in movie_indices]

# Display the top 10 movies
for r in recomm:
    print(r)

Goodfellas (1990)
Hangover, The (2009)
Departed, The (2006)
Godfather, The (1972)
Social Network, The (2010)
Inglourious Basterds (2009)
Wolf of Wall Street, The (2013)
Inception (2010)
Fight Club (1999)
American History X (1998)
