In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF

## Import data into df's

In [2]:
movies = pd.read_csv('ml-latest-small/movies.csv')

In [3]:
ratings = pd.read_csv('ml-latest-small/ratings.csv')

In [4]:
df = ratings.merge(movies, on='movieId')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100836 entries, 0 to 100835
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
 4   title      100836 non-null  object 
 5   genres     100836 non-null  object 
dtypes: float64(1), int64(3), object(2)
memory usage: 5.4+ MB


In [6]:
df['movieId'].nunique()

9724

In [7]:
df['title'].nunique()

9719

### Transform into a matrix... userID x movieID

In [8]:
mm = df.pivot_table(values='rating', index='userId', columns='title')

In [9]:
mm.fillna(2.5,inplace=True)

In [10]:
mm.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,...,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,4.0,2.5
2,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,...,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5
3,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,...,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5
4,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,...,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5
5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,...,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5


In [11]:
mm.shape

(610, 9719)

In [12]:
R = mm.values

In [13]:
len(R)

610

### Create the NMF model and set the hyperparameters

* model assumes R ~ PQ'

In [14]:
m = NMF(n_components=500, init='random', random_state=10, max_iter=200)

m.fit(R)

Q = m.components_  # movie-genre matrix

P = m.transform(R)  # user-genre matrix

print(m.reconstruction_err_) #reconstruction error



68.25941664239208




In [15]:
nR = np.dot(P, Q)
nR.shape

(610, 9719)

### Handle a new user

In [16]:
# predict the hidden features for a new data point
user = {'title' : ["Fight Club (1999)", "Pretty Woman (1990)" , "The Butterfly Effect (2004)", 
                   "Inception (2010)", "(500) Days of Summer (2009)","Devil Wears Prada, The (2006)"],
        'rating' : [5,2,4,4,2,1]}

In [17]:
user = pd.DataFrame(user)
user

Unnamed: 0,title,rating
0,Fight Club (1999),5
1,Pretty Woman (1990),2
2,The Butterfly Effect (2004),4
3,Inception (2010),4
4,(500) Days of Summer (2009),2
5,"Devil Wears Prada, The (2006)",1


In [18]:
titles = df['title'].unique()

In [19]:
titles = pd.DataFrame(titles)
titles.shape

(9719, 1)

In [20]:
user_merge = pd.merge(titles, user, left_on = 0, right_on = 'title', how = 'left')

In [21]:
user_merge.shape

(9719, 3)

In [22]:
query = user_merge['rating']

In [23]:
query = query.fillna(2.5)

In [24]:
query = np.array(query)

In [25]:
len(query)

9719

In [26]:
user = np.array(query)
user = user.reshape(1, -1)
user.shape

(1, 9719)

In [27]:
profile = m.transform(user)
profile.shape



(1, 500)

In [28]:
Q.shape

(500, 9719)

In [29]:
result = np.dot(profile, Q)          # how strongly our user would like all movies
result.shape

(1, 9719)

In [30]:
s = pd.Series(result[0], index=mm.columns)

In [31]:
s.sort_values(ascending=False).head(5)

title
Adventures of Priscilla, Queen of the Desert, The (1994)    3.011337
Mrs. Doubtfire (1993)                                       2.871751
Animal House (1978)                                         2.866159
City of God (Cidade de Deus) (2002)                         2.848963
Avatar (2009)                                               2.846201
dtype: float64