In [3]:
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF

## Import data into df's

In [53]:
movies = pd.read_csv('ml-latest-small/movies.csv')

In [54]:
ratings = pd.read_csv('ml-latest-small/ratings.csv')

In [55]:
df = ratings.merge(movies, on='movieId')

In [56]:
df['movieId'].nunique()

9724

In [57]:
df['title'].nunique()

9719

### Transform into a matrix... userID x movieID

In [58]:
mm = df.pivot_table(values='rating', index='userId', columns='title')

In [59]:
mm.fillna(2.5,inplace=True)

In [60]:
mm.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,...,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,4.0,2.5
2,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,...,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5
3,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,...,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5
4,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,...,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5
5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,...,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5,2.5


In [61]:
mm.shape

(610, 9719)

In [62]:
R = mm.values

### Create the NMF model and set the hyperparameters

* model assumes R ~ PQ'

In [14]:
m = NMF(n_components=500, init='random', random_state=10, max_iter=200)

m.fit(R)

Q = m.components_  # movie-genre matrix

P = m.transform(R)  # user-genre matrix

print(m.reconstruction_err_) #reconstruction error



68.25941664239208




### Handle a new user

In [37]:
# predict the hidden features for a new data point
user = {'title' : ["Fight Club (1999)", "Pretty Woman (1990)" , "The Butterfly Effect (2004)", 
                   "Inception (2010)", "(500) Days of Summer (2009)","Devil Wears Prada, The (2006)"],
        'rating' : [5,2,4,4,2,1]}

##### this is the new user input. I chose the movie (500) Days of Summer (2009) because it is the 9th movie on the database ordered by title, as you can see in the mm Matrix above. Thus it should be easy to check if it has received indeed **2** as rating.

In [38]:
user = pd.DataFrame(user)
user

Unnamed: 0,title,rating
0,Fight Club (1999),5
1,Pretty Woman (1990),2
2,The Butterfly Effect (2004),4
3,Inception (2010),4
4,(500) Days of Summer (2009),2
5,"Devil Wears Prada, The (2006)",1


In [64]:
titles = df['title'].unique()

In [65]:
titles = pd.DataFrame(titles)
titles.shape

(9719, 1)

In [66]:
user_merge = pd.merge(titles, user, left_on = 0, right_on = 'title', how = 'left')

In [67]:
user_merge.shape

(9719, 3)

In [69]:
query = user_ratings['rating']

#####  So this array should be the array with the ratings for all the movies from the new user. And it should match the order of all the other ratings in the mm Matrix. Thus we should have value 2 for for the 9th item of the array.

#####  This becomes very visible on collab filtering because I have to manually add this array back as a new userID in the matrix to calculate which users have the most similar taste.

In [70]:
query[:15]

0    NaN
1    NaN
2    NaN
3    NaN
4    NaN
5    NaN
6    NaN
7    NaN
8    NaN
9    NaN
10   NaN
11   NaN
12   NaN
13   NaN
14   NaN
Name: rating, dtype: float64

#####  a quick check shows that the are no ratings in the first movies on this array...

In [49]:
def numberOfNonNans(data):
    count = 0
    for i in data:
        if not np.isnan(i):
            count += 1
    return count 

numberOfNonNans(query)

6

#####  this quick test confirms that we have in fact the 6 ratings given by the new user. But where are they? And why are they not matching the movie titles?


In [23]:
query = query.fillna(2.5)

In [24]:
query = np.array(query)

In [48]:
len(query)

9724

In [26]:
user = np.array(query)
user = user.reshape(1, -1)
user.shape

(1, 9719)

In [27]:
profile = m.transform(user)
profile.shape



(1, 500)

In [28]:
Q.shape

(500, 9719)

In [29]:
result = np.dot(profile, Q)          # how strongly our user would like all movies
result.shape

(1, 9719)

In [30]:
s = pd.Series(result[0], index=mm.columns)

In [31]:
s.sort_values(ascending=False).head(5)

title
Adventures of Priscilla, Queen of the Desert, The (1994)    3.011337
Mrs. Doubtfire (1993)                                       2.871751
Animal House (1978)                                         2.866159
City of God (Cidade de Deus) (2002)                         2.848963
Avatar (2009)                                               2.846201
dtype: float64