In [286]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import LabelEncoder
#from sklearn.cross_validation import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, KFold, ShuffleSplit
#from sklearn.grid_search import GridSearchCV
import time, math

import matplotlib.pylab as plt
from datetime import date, datetime
import os
import seaborn as sns
import random

from sklearn.decomposition import NMF
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings("ignore")

from scipy.sparse import csr_matrix

In [544]:
ratings = pd.read_csv("exports/ratings.csv",parse_dates = True,sep=",")
movies = pd.read_csv("exports/movies.csv",parse_dates = True,sep=",")

In [545]:
ratings.shape

(6608, 4)

In [546]:
ratings.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [549]:
###join both datasets

all_ratings =  pd.merge(ratings, movies, on='movieId', how='inner')
#all_ratings.reset_index(drop=True)

all_ratings.set_index('movieId', inplace=True)


In [550]:
all_ratings['movieId2'] = all_ratings.index

In [551]:
all_ratings.head()

Unnamed: 0_level_0,userId,rating,timestamp,title,genres,movie_year,movieId2
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
47,1,5,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,1995.0,47
47,4,2,945173425,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,1995.0,47
47,6,4,845553317,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,1995.0,47
47,8,4,839463546,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,1995.0,47
47,13,5,987895819,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,1995.0,47


In [552]:
ctab = pd.crosstab(index = all_ratings.movieId2, columns = all_ratings.userId, values = all_ratings.userId, aggfunc = 'count')
ctab

userId,1,3,4,5,6,7,8,9,11,12,...,601,602,603,604,605,606,607,608,609,610
movieId2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20,,,,,,,,,,,...,,,,,,,,,,
21,,,1.0,1.0,1.0,,1.0,,,,...,,1.0,1.0,,,,,1.0,,
22,,,,,1.0,,,,,,...,,1.0,,1.0,,,,,,
23,,,,,,,,,,,...,,,,1.0,,,,,,
24,,,,,1.0,,,,,,...,,,,,,,,1.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1302,,1.0,,,,,,,,,...,,,,,,,,1.0,,
1346,,,,,,,,,,,...,,,1.0,,,,,,,
2108,,,,,,,,,,,...,,,,,,,,,,
2478,1.0,,,,,,,,,,...,,,,,,,,,,


In [557]:
movie_names =  pd.merge(ctab, all_ratings, left_on=['movieId2'], right_on=['movieId2'], how='inner')
#movie_names =  pd.merge(ctab, all_ratings, on='movieId', how='inner')

#Both_DFs = pd.merge(df1,df2, how='left',left_on=['A','B'],right_on=['A','CC']).dropna()
movie_names = movie_names[['movieId2','title']]

In [558]:
movie_names

Unnamed: 0,movieId2,title
0,20,Money Train (1995)
1,20,Money Train (1995)
2,20,Money Train (1995)
3,20,Money Train (1995)
4,20,Money Train (1995)
...,...,...
6603,2478,¡Three Amigos! (1986)
6604,2478,¡Three Amigos! (1986)
6605,2478,¡Three Amigos! (1986)
6606,2478,¡Three Amigos! (1986)


In [559]:
movie_names = movie_names.drop_duplicates(keep='first')

In [560]:
#manual creation if the R matrix
R = csr_matrix((all_ratings['rating'], (all_ratings['userId'], all_ratings['movieId2'])))

R = R.todense()   # convert sparse matrix to dense matrix, same as: matrix_sparse.A
R = R[1:,1:]                  # removing the "Python starts at 0" offset
R = np.asarray(R) 

In [239]:
#create sparse matic and convert to dense matrix
def ConvertToDense_new(vals, y, x):
    R = csr_matrix((vals, (y, x)))

    R = R.todense()   # convert sparse matrix to dense matrix, same as: matrix_sparse.A
    R = R[1:,1:]                  # removing the "Python starts at 0" offset
    R = np.asarray(R)
    return R 



In [561]:
#test code to check function X1= ratings[['userId']].values
ConvertToDense_new(all_ratings['rating'], all_ratings['userId'], all_ratings['movieId2'])


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [241]:
#convert R matrix to dataframe for inspection purposes only
Rdf = pd.DataFrame(R)

In [242]:
#caluclate the sparsity of the R matrix
print(len(R.nonzero()[0]) / float(R.shape[0] * R.shape[1]) )

0.003985344335208837


###run the nmf model

In [428]:
##run the nmf model

from sklearn.decomposition import NMF

nmf_model = NMF(n_components=20)     # starts with 20 latents factors

# Matrix factorization               # V ~ W.H  (Find two non-negative matrices (W, H) whose product approximates the non- negative matrix X. )
nmf_model.fit(R)                     # R can be array-like or sparse, here it is array-like (dense)
Theta = nmf_model.transform(R)       # user latent factors (= W, called the features matrix)
M = nmf_model.components_.T          # item latent factors (= H.T) (H is called the coefficient matrix)

# Making the predictions
R_pred = M.dot(Theta.T)              # See http://stackoverflow.com/questions/24739121/nonnegative-matrix-factorization-in-sklearn
R_pred = R_pred.T                    # same dimensions as R

print('Item features - M:', M.shape)
print('User features - Theta:', Theta.shape)

print('R ~ M * Theta.T:')
print(R_pred.round(2) )
print(R_pred.shape )


Item features - M: (2698, 20)
User features - Theta: (610, 20)
R ~ M * Theta.T:
[[0.   0.   0.   ... 0.   0.   0.02]
 [0.   0.   0.   ... 0.   0.   0.  ]
 [0.   0.   0.   ... 0.   0.   0.  ]
 ...
 [0.   0.   0.   ... 0.   0.   0.04]
 [0.   0.   0.   ... 0.   0.   0.  ]
 [0.   0.   0.   ... 0.   0.   0.03]]
(610, 2698)


In [627]:
#userId'], ratings['movieId']

def GetShape(filename):
    names = ['userId', 'movieId2', 'rating', 'timestamp']
    df = pd.read_csv(filename, sep=',',  names=names, header=0) 
    n_users = len(df['userId'].unique())
    n_items = len(df['movieId2'].unique())
    return (n_users, n_items)

def LoadData(filename):
    names = ['userId', 'movieId2', 'rating', 'timestamp']
    #df = pd.read_csv(filename, sep='\t', names=names)  
    df = pd.read_csv(filename, sep=',',  names=names, header=0) 
    X = df[['userId', 'movieId2']].values

    X1= df['userId'].values
    X2= df['movieId2'].values
    y = df['rating'].values 

    return X, y, ConvertToDense_new(y, X1, X2)

R_shape = GetShape('exports/ratings.csv') 
R_shape
X, y, R = LoadData('exports/ratings.csv') 




In [628]:
R


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [384]:
#convert R matrix to dataframe for inspection purposes only
Rdf = pd.DataFrame(R)

In [564]:
X, y, R

(array([[   1,   47],
        [   1,   50],
        [   1,   70],
        ...,
        [ 610,  608],
        [ 610,  904],
        [ 610, 1218]], dtype=int64),
 array([5, 5, 3, ..., 4, 5, 5], dtype=int64),
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64))

In [246]:
'''X1= df['userId'].values
    X2= df['movieId'].values
    y = df['rating'].values 

    return X, y, ConvertToDense_new(y, X1, X2)'''

"X1= df['userId'].values\n    X2= df['movieId'].values\n    y = df['rating'].values \n\n    return X, y, ConvertToDense_new(y, X1, X2)"

In [565]:
X

array([[   1,   47],
       [   1,   50],
       [   1,   70],
       ...,
       [ 610,  608],
       [ 610,  904],
       [ 610, 1218]], dtype=int64)

In [248]:
#X[:, 0].shape
#X[:, 1].shape


In [405]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

#R_train = ConvertToDense_new(X_train, y_train, R_shape)
#R_test = ConvertToDense_new(X_test, y_test, R_shape)

#print(X_train[:,0])
R_train = ConvertToDense_new(y_train, X_train[:,0], X_train[:,1])
R_test = ConvertToDense_new(y_test, X_test[:,0], X_test[:,1])

print(R_train)
print(R_train)
print(R_train.shape)
print()
print(R_test)
print(R_test)
print(R_test.shape)




[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(610, 2698)

[[0 0 0 ... 0 0 4]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[[0 0 0 ... 0 0 4]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(610, 2478)


###2.3 Choose a model: and NMF paramaters to test find the best NMF model with best paramaters

In [406]:
from sklearn.decomposition import NMF

parametersNMF = {
                    'n_components' : 20,     # number of latent factors
                    'init' : 'random', 
                    'random_state' : 0, 
                    'alpha' : 0.01,          # regularization term
                    'l1_ratio' : 0,          # set regularization = L2 
                    'max_iter' : 10
                }

estimator = NMF(**parametersNMF)

##Estimating the error (RMSE) before tuning the hyperparameters

In [305]:
from sklearn.metrics import mean_squared_error

def get_rmse(pred, actual):
    pred = pred[actual.nonzero()].flatten()     # Ignore nonzero terms
    actual = actual[actual.nonzero()].flatten() # Ignore nonzero terms
    return np.sqrt(mean_squared_error(pred, actual))

In [262]:
X[:, 0][].shape

(6608,)

In [284]:
#RMSE for cross validation of n_fold models

err = 0
n_iter = 0
#n_folds = 5
#kf = 5

#kf = KFold(6608, 5)
#n_splits = 5
#kf = KFold(n_splits = n_splits, shuffle = True, random_state = 42)


kf = KFold(n_splits=5, random_state=None, shuffle=True)
for train_index, test_index in kf.split(X_test):  
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Converting sparse array to dense array
    R_train = ConvertToDense_new(y_train, X_train[:,0], X_train[:,1])
    R_test = ConvertToDense_new(y_test, X_test[:,0], X_test[:,1])


    # Training (matrix factorization)
    t0 = time.time()
    estimator.fit(R_train)  
    Theta = estimator.transform(R_train)       # user features
    M = estimator.components_.T                # item features
    print(  time.time() - t0  )
    n_iter += estimator.n_iter_ 

    # Making the predictions
    R_pred = M.dot(Theta.T)
    R_pred = R_pred.T      
    
    # Clipping values                                                    
    #R_pred[R_pred > 5] = 5.           # clips ratings above 5             
    #R_pred[R_pred < 1] = 1.           # clips ratings below 1

    # Computing the error on the validation set 
    #err += get_rmse(R_pred, R_test)
    print( get_rmse(R_pred, R_test) )
    
#print("*** RMSE Error : ," + err / 5 )
print("Mean number of iterations:" + n_iter / 5 )

1.2382566928863525
0.21983531594456193
0.26427769660949707
0.21940861033366837
0.43485403060913086
0.2216021525684641
0.14461374282836914
0.22099459180608405
0.2892475128173828


ValueError: Found input variables with inconsistent numbers of samples: [111, 110]

In [298]:

#cv = ShuffleSplit(X_train.shape[0], n_iter=5, test_size=0.20, random_state=0)  

cv = ShuffleSplit(n_splits = 10,  test_size = 0.2, random_state = 0)      
# 5-fold sc

In [300]:
###Grid search

param =        {
                    'n_components' : [15, 20, 25],
                    'alpha' : [0.001, 0.01, 0.1],
                    'l1_ratio' : [0], 
                    'max_iter' : [15, 20, 25]
                }

# Keep track of RMSE and parameters
grid_search = pd.DataFrame([[0, 0, 0, 0, 0]])
grid_search.columns = ['n_components', 'alpha', 'l1_ratio', 'max_iter'] + ['RMSE']

# nb of folds in ShuffleSplit CV
n_folds = 5      
i = 0

# Performing the Grid search
for n_components in param['n_components']:
    for alpha in param['alpha']:
        for l1_ratio in param['l1_ratio']:
            for max_iter in param['max_iter']:

                err = 0
                n_iter = 0
                #print('Search'),print(str(i)), print('/'), print(str(3 ** 3 - 1))
                
                cv.get_n_splits(X,y)
                for train_index, test_index in cv.split(X, y):  
    
                    X_train_cv, X_test_cv = X_train[train_index], X_train[test_index]
                    y_train_cv, y_test_cv = y_train[train_index], y_train[test_index]
    
                    # Converting sparse array to dense array
                    
                    R_train = ConvertToDense_new(y_train, X_train[:,0], X_train[:,1])
                    R_test = ConvertToDense_new(y_test, X_test[:,0], X_test[:,1])

                    # updating the parameters
                    parametersNMF = {
                    'n_components' : n_components,
                    'init' : 'random', 
                    'random_state' : 0, 
                    'alpha' : alpha,
                    'l1_ratio' : l1_ratio,
                    'max_iter' : max_iter}
                    estimator = NMF(**parametersNMF)
                
                    # Training (matrix factorization)
                    t0 = time.time()
                    estimator.fit(R_train)  
                    Theta = estimator.transform(R_train)       # user features
                    M = estimator.components_.T                # item features
                    #print "Fit in %0.3fs" % (time.time() - t0)
                    n_iter += estimator.n_iter_ 

                    # Making the predictions
                    R_pred = M.dot(Theta.T).T
                    
                    # Clipping values                                                    
                    #R_pred[R_pred > 5] = 5.           # clips ratings above 5             
                    #R_pred[R_pred < 1] = 1.           # clips ratings below 1

                    # Computing the error on the validation set 
                    err += get_rmse(R_pred, R_test)
    
                #print "RMSE Error : ", err / n_folds
                grid_search.loc[i] = [n_components, alpha, l1_ratio, max_iter, err / n_folds]
                print(grid_search.loc[i].tolist()), 
                print("Mean number of iterations:"),
                print( n_iter / n_folds)
                i += 1

best_params = grid_search.sort_values('RMSE')[:1]
print('*** best params ***')
print(best_params)

IndexError: index 2586 is out of bounds for axis 0 with size 1058

In [566]:
parametersNMF_opt = {
                    'n_components' : 20,     # number of latent factors
                    'init' : 'random', 
                    'random_state' : 0, 
                    'alpha' : 0.01,          # regularization term
                    'l1_ratio' : 0,          # set regularization = L2 
                    'max_iter' : 15
                }

In [567]:
estimator = NMF(**parametersNMF_opt)
                
# Training (matrix factorization)
estimator.fit(R_train)  
Theta = estimator.transform(R_train)       # user features
M = estimator.components_.T                # item features

# Making the predictions
R_pred = M.dot(Theta.T).T
                    
# Clipping values                                                    
#R_pred[R_pred > 5] = 5.           # clips ratings above 5             
#R_pred[R_pred < 1] = 1.           # clips ratings below 1

# Computing the error on the test set 
print('RMSE test:', get_rmse(R_pred, R_test) )

RMSE test: 3.319155635313933


In [568]:
R_pred, R

(array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.04842243],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.08330314],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.00392446],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.0498742 ]]),
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64))

##2.7 Train the final model on whole dataset

In [569]:
estimator = NMF(**parametersNMF_opt)
                
# Training (matrix factorization)
estimator.fit(R)  
Theta = estimator.transform(R)            # user features
M = estimator.components_.T               # item features

# Making the predictions
R_pred = M.dot(Theta.T).T
                    
# Clipping values                                                    
#R_pred[R_pred > 5] = 5.           # clips ratings above 5             
#R_pred[R_pred < 1] = 1.           # clips ratings below 1

In [427]:
R_all_df = pd.DataFrame(R)

In [570]:
R.shape, R_pred.shape

#R_df = pd.DataFrame(R)
#R_Pred =pd.DataFrame(R_pred)




((610, 2698), (610, 2698))

In [571]:
R_pred_df_all = pd.DataFrame(R_pred) 

In [608]:
R_df = pd.DataFrame(R).iloc[10, :] 
R_pred_df = pd.DataFrame(R_pred).iloc[10, :] 

#R_pred_df =pd.DataFrame(R_pred)
#R_df = pd.DataFrame(R)



In [576]:
#movies_list = movies[['title', 'movieId']]

In [613]:

R_df = pd.DataFrame(R).iloc[10, :] 
R_pred_df = pd.DataFrame(R_pred).iloc[10, :] 

recList = pd.DataFrame(pd.merge(R_pred_df, R_df , left_index=True, right_index=True)  )

# Sort ascending order
recList.sort_values(recList.columns[0], ascending=False, inplace=True)

###remove movies already viewed by the user
recList.drop(recList[recList.iloc[:,1] > 0].index, inplace=True)

###drop exisitng film rated column
#recList.drop(recList.iloc[:,1].index, inplace=True)
recList=recList.drop(recList.columns[1], axis=1)

##trim predicted ratins when revie > 0
recList.drop(recList[recList.iloc[:,0] == 0].index, inplace=True)

###create movieid column in rec list using the index
recList['movieId'] = recList.index

##creat full movie title list from original movies df
movies_list = movies[['title', 'movieId']]

##join  recList10 with the 'movie_names' dataframe to insert the Movie titles
recList =  pd.merge(recList, movies_list, left_on=['movieId'], right_on=['movieId'], how='inner')

#top 10
recList10 = recList.head(10)
recList10.rename(columns={recList10.columns[0]: 'Prediction'},inplace=True)
recList10['Rank'] = recList10['Prediction'].rank(ascending=False)

recList10[['Rank','title']]

#recList10




Unnamed: 0,Rank,title
0,1.0,Congo (1995)
1,2.0,Eye for an Eye (1996)
2,3.0,How to Make an American Quilt (1995)
3,4.0,Mr. Wonderful (1993)
4,5.0,Dolores Claiborne (1995)
5,6.0,Powder (1995)
6,7.0,Nadja (1994)
7,8.0,Priest (1994)
8,9.0,Up Close and Personal (1996)
9,10.0,Forbidden Planet (1956)


In [414]:
print(R), print(R_pred)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [379]:
import pickle

# Save the trained model to file
with open('acW10_nmf_model_FIN.sav', 'wb') as f:
    pickle.dump(estimator, f)


In [None]:
movies['title'].unique

In [368]:
movie_titles = pd.DataFrame(movies['title'])
#movie_titles = movie_titles['title'].unique

moviealter = pd.DataFrame(movie_titles).iloc[50,:] 


title    Georgia (1995)
Name: 50, dtype: object

###3.1 Item recommendation for an active user (given its rating history)

In [615]:
def make_recommendation_activeuser(R, prediction, user_id, k=10):
    '''
    user_idx ...... select an active user
    k  ............ number of movies to recommend
    '''
    
    #rated_items_df_user = pd.DataFrame(R).iloc[user_idx, :]                 # get the list of actual ratings of user_idx (seen movies)
    #user_prediction_df_user = pd.DataFrame(prediction).iloc[user_idx,:]     # get the list of predicted ratings of user_idx (unseen movies)
    #reco_df = pd.concat([rated_items_df_user, user_prediction_df_user, movie_titles], axis=1)   # merge both lists with the movie's title
    #reco_df.columns = ['rating','prediction','title']
    

    R_df = pd.DataFrame(R).iloc[user_id, :] 
    R_pred_df = pd.DataFrame(prediction).iloc[user_id, :] 

    recList = pd.DataFrame(pd.merge(R_pred_df, R_df , left_index=True, right_index=True)  )

    # Sort ascending order
    recList.sort_values(recList.columns[0], ascending=False, inplace=True)

    ###remove movies already viewed by the user
    recList.drop(recList[recList.iloc[:,1] > 0].index, inplace=True)

    ###drop exisitng film rated column
    #recList.drop(recList.iloc[:,1].index, inplace=True)
    recList=recList.drop(recList.columns[1], axis=1)

    ##trim predicted ratins when revie > 0
    recList.drop(recList[recList.iloc[:,0] == 0].index, inplace=True)

    ###create movieid column in rec list using the index
    recList['movieId'] = recList.index

    ##creat full movie title list from original movies df
    movies_list = movies[['title', 'movieId']]

    ##join  recList10 with the 'movie_names' dataframe to insert the Movie titles
    recList =  pd.merge(recList, movies_list, left_on=['movieId'], right_on=['movieId'], how='inner')

    #top  or top k recommendations
    recList10 = recList.head(k)
    recList10.rename(columns={recList10.columns[0]: 'Predictions'},inplace=True)
    recList10['Rank'] = recList10['Predictions'].rank(ascending=False)

    return recList10[['Rank','title']]




In [621]:
make_recommendation_activeuser(R, R_pred, user_id=50, k=10)
#make_recommendation_activeuser(R, R_pred, user_idx=130, k=10)

Unnamed: 0,Rank,title
0,1.0,Vertigo (1958)
1,2.0,Powder (1995)
2,3.0,Mr. Wonderful (1993)
3,4.0,Crumb (1994)
4,5.0,It Takes Two (1995)
5,6.0,Before Sunrise (1995)
6,7.0,Money Train (1995)
7,8.0,Beautiful Girls (1996)
8,9.0,Eye for an Eye (1996)
9,10.0,Shanghai Triad (Yao a yao yao dao waipo qiao) ...


In [622]:
#####new user function
query = {12: 4, 92: 5, 177: 4, 196: 5, 891: 4, 1128: 5, 1258: 5, 1320: 4}

In [624]:
data = list(query.values())   # the ratings of the new user
row_ind = [0]*len(data)       # we use just a single row 0 for this user 
col_ind = list(query.keys())  # the columns (=movieId) of the ratings
data, row_ind, col_ind

([4, 5, 4, 5, 4, 5, 5, 4],
 [0, 0, 0, 0, 0, 0, 0, 0],
 [12, 92, 177, 196, 891, 1128, 1258, 1320])

In [625]:
R.shape[1]

2698

In [626]:
nuser_vec = csr_matrix((data, (row_ind, col_ind)), shape=(1, R.shape[1]))
nuser_vec

<1x2698 sparse matrix of type '<class 'numpy.intc'>'
	with 8 stored elements in Compressed Sparse Row format>

In [630]:
###converrt dense R matric to sparse
R_sp = csr_matrix(R)
R_sp

<610x2698 sparse matrix of type '<class 'numpy.int64'>'
	with 6559 stored elements in Compressed Sparse Row format>

In [634]:
###trian model 2 NMF


model2 = NMF(n_components=20, init='nndsvd', max_iter=10000, tol=0.01, verbose=2)
# fit it to the user-item rating matrix
model2.fit(R_sp)

violation: 1.0
violation: 0.3628336098224234
violation: 0.2655168124582823
violation: 0.20760339045847945
violation: 0.1632238280341509
violation: 0.13096531918009907
violation: 0.10838819683385183
violation: 0.09108200531368006
violation: 0.07734289138423955
violation: 0.06672526167164686
violation: 0.05879197356986169
violation: 0.05296301754387636
violation: 0.04855998393365482
violation: 0.04524016886380106
violation: 0.042795935866795934
violation: 0.04116509976085029
violation: 0.040198767480496146
violation: 0.038699925411774805
violation: 0.036670524812483805
violation: 0.03426029804143876
violation: 0.0318163969637116
violation: 0.030902872242907307
violation: 0.031031409364830058
violation: 0.031409416264783795
violation: 0.03015372932737926
violation: 0.027910996362005742
violation: 0.025568519647768247
violation: 0.022662622013688984
violation: 0.020566421202884325
violation: 0.018894450893642695
violation: 0.017537856756638118
violation: 0.016582678738989103
violation: 0.0

In [635]:
model2.components_.shape

(20, 2698)

In [636]:
# user-'genre' matrix [611x55]
P=model2.transform(R)
# movie-'genre' matrix [55x168253]
Q=model2.components_
P.shape, Q.shape

violation: 1.0
violation: 0.114924467454566
violation: 0.008384397908123
Converged at iteration 4


((610, 20), (20, 2698))

In [637]:
# user with id 1: sparse format
R_sp[1,:]

<1x2698 sparse matrix of type '<class 'numpy.int64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [639]:
# user with id 1: dense embedding
P[1, :]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.])

In [640]:
# reconstructed matrix Rhat
R_sp_hat = P.dot(Q)
R_sp_hat

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.0253681 ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.03825445],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.00338201],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.03053722]])

In [642]:
# R -> encoding -> P -> decoding -> Rhat (one-liner to get R_hat)
R_sp_hat = model2.inverse_transform(model2.transform(R_sp))

violation: 1.0
violation: 0.11492446745456598
violation: 0.008384397908122983
Converged at iteration 4


In [643]:
R_sp.shape, R_sp_hat.shape

((610, 2698), (610, 2698))

In [644]:
model2.reconstruction_err_

174.6651357190336

In [645]:
with open('nmf_recommenderM2.pkl', 'wb') as file:
    pickle.dump(model2, file)

In [646]:
##read the model back
with open('nmf_recommenderM2.pkl', 'rb') as file:
    model2 = pickle.load(file)

In [647]:
query

{12: 4, 92: 5, 177: 4, 196: 5, 891: 4, 1128: 5, 1258: 5, 1320: 4}

In [649]:
R_sp[1,:]

<1x2698 sparse matrix of type '<class 'numpy.int64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [650]:
#Construct a user vector
data = list(query.values())   # the ratings of the new user
row_ind = [0]*len(data)       # we use just a single row 0 for this user 
col_ind = list(query.keys())  # the columns (=movieId) of the ratings
data, row_ind, col_ind


([4, 5, 4, 5, 4, 5, 5, 4],
 [0, 0, 0, 0, 0, 0, 0, 0],
 [12, 92, 177, 196, 891, 1128, 1258, 1320])

In [651]:
# new user vector: needs to have the same format as the training data

n_user_vec = csr_matrix((data, (row_ind, col_ind)), shape=(1, R_sp.shape[1]))
n_user_vec

<1x2698 sparse matrix of type '<class 'numpy.intc'>'
	with 8 stored elements in Compressed Sparse Row format>

In [652]:
#calculate model2 score

scores = model2.inverse_transform(model2.transform(n_user_vec))

# convert to a pandas series
scores = pd.Series(scores[0])
scores

violation: 1.0
violation: 0.04816419115662636
violation: 0.00023135885686553767
Converged at iteration 4


0       0.000000
1       0.000000
2       0.000000
3       0.000000
4       0.000000
          ...   
2693    0.000000
2694    0.000000
2695    0.000000
2696    0.000000
2697    0.002773
Length: 2698, dtype: float64

In [653]:
query.keys()

dict_keys([12, 92, 177, 196, 891, 1128, 1258, 1320])

In [654]:
# give a zero score to movies the user has allready seen
scores[query.keys()] = 0

In [655]:
#sort scores
scores = scores.sort_values(ascending=False)
scores

61      0.166956
140     0.142244
94      0.138527
167     0.107294
111     0.105264
          ...   
1014    0.000000
1015    0.000000
1016    0.000000
1017    0.000000
1349    0.000000
Length: 2698, dtype: float64

In [656]:
# get the movieIds of the top 10 entries
recommendations = scores.head(10).index
recommendations

Int64Index([61, 140, 94, 167, 111, 134, 35, 276, 30, 75], dtype='int64')

In [663]:
movies_list.columns

Index(['title', 'movieId'], dtype='object')

In [670]:
recommendations.shape


(10,)

In [679]:
#movies_list.set_index('movieId', infile= True)
#movies_list = movies_list.set_index('movieId')
movies_list['movieId'] = movies_list.index

In [680]:
movies_list.columns

Index(['title', 'movieId'], dtype='object')

In [683]:
#movies_list.set_index('movieId').loc[recommendations]


#movies_list = movies_list.set_index('movieId')
recs = pd.DataFrame(recommendations)
recs.rename(columns={recs.columns[0]: 'movieId'},inplace=True)

recs= pd.merge(recs, movies_list, left_on=['movieId'], right_on=movies_list['movieId'], how='inner')
recs[['title']



In [686]:
recs[['title']]

Unnamed: 0,title
0,Eye for an Eye (1996)
1,Up Close and Personal (1996)
2,Beautiful Girls (1996)
3,Taxi Driver (1976)
4,Milk Money (1994)
5,Shanghai Triad (Yao a yao yao dao waipo qiao) ...
6,Big Bully (1996)


In [665]:
query

{12: 4, 92: 5, 177: 4, 196: 5, 891: 4, 1128: 5, 1258: 5, 1320: 4}

In [691]:
# collaborative filtering = look at ratings only!
def new_recommend_nmf(query, model, ratings, k=10):
    """
    Filters and recommends the top k movies for any given input query based on a trained NMF model. 
    Returns a list of k movie ids.
    """
   #Construct a user vector
    data = list(query.values())   # the ratings of the new user
    row_ind = [0]*len(data)       # we use just a single row 0 for this user 
    col_ind = list(query.keys())  # the columns (=movieId) of the ratings
    data, row_ind, col_ind
    
    # construct a user vector
    # new user vector: needs to have the same format as the training data

    n_user_vec = csr_matrix((data, (row_ind, col_ind)), shape=(1, R_sp.shape[1]))
    n_user_vec
   
    # 2. scoring
    
    #calculate model2 score
    scores = model2.inverse_transform(model2.transform(n_user_vec))
    # convert to a pandas series
    scores = pd.Series(scores[0])
    scores = scores.sort_values(ascending=False)
  
    
    # 3. ranking
    
    # filter out movies allready seen by the user
    # give a zero score to movies the user has allready seen
    scores[query.keys()] = 0
    # return the top-k highst rated movie ids or titles
    recommendations = scores.head(k).index
    

    #movies_list = movies_list.set_index('movieId')
    recs = pd.DataFrame(recommendations)
    recs.rename(columns={recs.columns[0]: 'movieId'},inplace=True)

    recs= pd.merge(recs, movies_list, left_on=['movieId'], right_on=movies_list['movieId'], how='inner')
    return recs[['title']]
#movies.set_index('movieId').loc[recommendations]

In [692]:
new_recommend_nmf(query, model2, ratings, k=10)

violation: 1.0
violation: 0.04816419115662636
violation: 0.00023135885686553767
Converged at iteration 4


Unnamed: 0,title
0,Eye for an Eye (1996)
1,Up Close and Personal (1996)
2,Beautiful Girls (1996)
3,Taxi Driver (1976)
4,Milk Money (1994)
5,Shanghai Triad (Yao a yao yao dao waipo qiao) ...
6,Big Bully (1996)
