# NMF_for_MovieRatings

In [1]:
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pytest import approx
from scipy.sparse import coo_matrix, csr_matrix
from scipy.spatial.distance import jaccard, cosine
from scipy.spatial.distance import pdist, squareform
from sklearn.decomposition import NMF
from sklearn.model_selection import train_test_split
from collections import namedtuple

# Read the four CSV tables (users, movies, train ratings, test ratings) in one pass.
MV_users, MV_movies, train, test = map(
    pd.read_csv,
    ('data/users.csv', 'data/movies.csv', 'data/train.csv', 'data/test.csv'),
)

# Bundle the tables so the recommender classes receive a single `data` object.
Data = namedtuple('Data', ['users','movies','train','test'])
data = Data(users=MV_users, movies=MV_movies, train=train, test=test)
display(data.test)

Unnamed: 0,uID,mID,rating
0,2233,440,4
1,4274,587,5
2,2498,454,3
3,2868,2336,5
4,1636,2686,5
...,...,...,...
300058,810,247,4
300059,1193,3210,4
300060,6039,2289,4
300061,5397,429,3


In [2]:
class RecSys():
    """
    Base recommender: holds the data tables, id-to-index lookups, the training
    rating matrix and simple baseline predictors.

    `data` must expose four DataFrames as attributes:
      - users  : has a 'uID' column
      - movies : has 'mID', 'title', 'year' columns; every other column is treated as a genre
      - train  : has 'uID', 'mID', 'rating' columns
      - test   : has 'uID', 'mID', 'rating' columns
    """
    def __init__(self,data):
        self.data=data
        self.allusers = list(self.data.users['uID'])
        self.allmovies = list(self.data.movies['mID'])
        self.genres = list(self.data.movies.columns.drop(['mID', 'title', 'year']))
        # Row/column position of every movie and user id inside the rating matrix.
        self.mid2idx = {mid: idx for idx, mid in enumerate(self.allmovies)}
        self.uid2idx = {uid: idx for idx, uid in enumerate(self.allusers)}
        self.Mr=self.rating_matrix()
        self.Mm=None 
        self.sim=np.zeros((len(self.allmovies),len(self.allmovies)))
        self.movies_rated_by_dict = self.train_movies_rated()
        # Mean of all training ratings: fallback for users without any training rating.
        self.global_avg_rating = float(self.data.train['rating'].mean())
        self.avg_user_ratings, self.avg_user_rating_array = self.get_avg_user_ratings()

    def get_avg_user_ratings(self):
        """
        Compute each user's average training rating.
        Returns (dict, array): the dict maps uid -> average for users that appear in train;
        the array is indexed by user index and covers ALL users, using the mean of all
        training ratings for users that have no training rating.
        """
        user_means = self.data.train.groupby('uID')['rating'].mean()
        avg_by_uid = user_means.to_dict()
        fallback = self.data.train['rating'].mean()
        avg_by_idx = np.full(len(self.data.users), fallback, dtype=float)
        for uid, avg in avg_by_uid.items():
            avg_by_idx[self.uid2idx[uid]] = avg
        return avg_by_uid, avg_by_idx

    def _user_average(self, uid):
        """
        Average training rating of `uid`, or the global training average when the
        user has no training ratings.
        """
        return self.avg_user_ratings.get(uid, self.global_avg_rating)

    def train_movies_rated(self):
        """
        For each unique uid in train, construct a list of movie INDEXES rated by that user.
        """
        rated = {}
        # Select the two id columns by name so extra columns in train cannot break unpacking.
        for uid, mid in self.data.train[['uID', 'mID']].itertuples(index=False, name=None):
            rated.setdefault(uid, []).append(self.mid2idx[mid])
        return rated

    def rating_matrix(self):
        """
        Convert the training ratings to a dense numpy array of shape (#allusers,#allmovies).
        Cells without a training rating are 0.
        """
        train_df = self.data.train  # use the instance's data, not a notebook-level global
        ind_movie = [self.mid2idx[x] for x in train_df.mID]
        ind_user = [self.uid2idx[x] for x in train_df.uID]
        rating_train = list(train_df.rating)
        shape = (len(self.allusers), len(self.allmovies))
        return coo_matrix((rating_train, (ind_user, ind_movie)), shape=shape).toarray()

    def predict_everything_to_3(self):
        """
        Predict everything to 3 for the test data.
        Returns numpy array of shape (# of rows in testdata,)
        """
        return np.full(self.data.test.shape[0], 3)

    def predict_to_user_average(self):
        """
        Predict each test row to the average training rating of its user.
        Returns numpy array of shape (# of rows in testdata,)
        """
        return np.array([self._user_average(uid) for uid in self.data.test['uID']], dtype=float)

    def predict_from_sim(self,uid,mid):
        """
        Predict a user rating on a movie given userID and movieID.

        The prediction is the similarity-weighted mean of the ratings this user gave in
        train, weighted by self.sim between each rated movie and the target movie.
        Falls back to the user's average rating when the similarity weights do not sum
        to a positive number (including the case of a user with no training ratings).
        """
        uidx = self.uid2idx[uid]
        m_target = self.mid2idx[mid]
        movies_rated = np.asarray(self.movies_rated_by_dict.get(uid, []), dtype=int)
        ratings = self.Mr[uidx, movies_rated]
        similarity = self.sim[m_target, movies_rated]
        denominator = np.sum(similarity)
        if denominator > 0:
            return np.dot(ratings, similarity) / denominator
        return self._user_average(uid)

    def predict(self):
        """
        Predict ratings in the test data. Returns predicted rating in a numpy array of size (# of rows in testdata,)
        """
        pairs = self.data.test[['uID','mID']].itertuples(index=False, name=None)
        return np.array([self.predict_from_sim(uid, mid) for uid, mid in pairs], dtype=float)

    def rmse(self,yp):
        """
        Root-mean-squared error of predictions `yp` against the test ratings.
        NaN predictions are scored as 3. The caller's array is left unmodified.
        """
        yp = np.asarray(yp, dtype=float)
        yp = np.where(np.isnan(yp), 3.0, yp)  # builds a new array instead of writing into the input
        yt=np.array(self.data.test.rating)
        return np.sqrt(((yt-yp)**2).mean())
    
class ContentBased(RecSys):
    """
    Content-based recommender: item-item similarity is the Jaccard similarity of the
    movies' genre indicator vectors.
    """
    def __init__(self,data):
        super().__init__(data)  # already stores `data` on self
        self.Mm = self.calc_movie_feature_matrix()

    def calc_movie_feature_matrix(self):
        """
        Create movie feature matrix in a numpy array of shape (#allmovies, #genres).
        Every column of the movies table other than mID, title and year is a feature.
        """
        return self.data.movies.drop(columns=['mID','title','year']).to_numpy()

    def jaccard(self, sa, sb):
        """
        Calculate Jaccard similarity for two binary vectors
        which represent the movie genres present in two movies being compared.
        Returns 0.0 when neither vector has any genre set (empty union), instead of
        dividing by zero.
        """
        union = np.sum(sa | sb)
        if union == 0:
            return 0.0
        return np.sum(sa & sb) / union

    def calc_item_item_similarity(self):
        """
        Create item-item similarity using Jaccard similarity and store it in self.sim
        as an array of shape (#allmovies, #allmovies).

        Computed for all pairs at once: for binary genre vectors the intersection size
        is the dot product, and |A or B| = |A| + |B| - |A and B|. Pairs whose union is
        empty get similarity 0.
        """
        n = self.Mm.shape[0]
        print(f'n = number of Movies = {n}')
        genre_flags = self.Mm.astype(bool).astype(np.int32)
        intersection = genre_flags @ genre_flags.T
        genres_per_movie = genre_flags.sum(axis=1)
        union = genres_per_movie[:, np.newaxis] + genres_per_movie[np.newaxis, :] - intersection
        # `where` skips the empty-union cells, which keep the 0.0 from `out`.
        self.sim = np.divide(intersection, union, out=np.zeros((n, n)), where=union > 0)
                
class Collaborative(RecSys):    
    """
    Item-item collaborative-filtering recommender. Similarities are computed between
    the columns (movies) of a user-by-movie matrix and stored in self.sim, which the
    inherited predict()/predict_from_sim() then use.
    """
    def __init__(self,data):
        super().__init__(data)
        self.X = self.impute_missing_ratings()
        
    def impute_missing_ratings(self):
        """
        Build the mean-centred rating matrix of shape (#allusers, #allmovies):
        unrated cells (0 in self.Mr) are set to the user's average rating, then each
        user's average is subtracted from that user's row, so unrated cells end up as 0.
        """
        user_avg = self.avg_user_rating_array[:, np.newaxis]
        # Vectorised equivalent of visiting every (user, movie) cell in Python.
        X = np.where(self.Mr == 0, user_avg, self.Mr)
        return X - user_avg
                
    def calc_item_item_similarity(self, simfunction, *X):  
        """
        Create item-item similarity using similarity function and store it in self.sim.
        X is an optional transformed matrix of Mr; only the first extra argument is used.
        """    
        if len(X)==0:
            self.sim = simfunction()            
        else:
            # *X arrives as a tuple (X,), so X[0] is the actual transformed matrix
            self.sim = simfunction(X[0])
            
    def cossim(self):    
        """
        Calculates item-item similarity for all pairs of items using cosine similarity
        on the mean-centred matrix self.X, rescaled from [-1, 1] to [0, 1].
        Returns a cosine similarity matrix of size (#all movies, #all movies)

        Pairs whose cosine is undefined (NaN, e.g. a zero-norm column) are treated as
        raw cosine 0, i.e. 0.5 after rescaling. Intermediate results are kept on the
        instance as self.pdist, self.squareform, self.cosine_raw and self.cosine.
        """
        # pdist returns cosine DISTANCE (1 - cosine similarity) in condensed form.
        self.pdist = pdist(self.X.T, metric='cosine')
        self.squareform = squareform(self.pdist)
        self.cosine_raw = -1 * self.squareform + 1
        self.cosine_raw = np.nan_to_num(self.cosine_raw)
        self.cosine = 0.5 + 0.5 * self.cosine_raw
        return self.cosine
    
    def jacsim(self,Xr):
        """
        Calculates item-item similarity for all pairs of items using jaccard similarity (values from 0 to 1)
        Xr is the transformed rating matrix; similarity is 1 minus the Jaccard distance
        between its columns. Intermediate results are kept on the instance as
        self.jac_pdist, self.jac_squareform and self.jaccard.
        """     
        self.jac_pdist = pdist(Xr.T, metric='jaccard')
        self.jac_squareform = squareform(self.jac_pdist)
        self.jaccard = -1 * self.jac_squareform + 1
        return self.jaccard
    
    

In [3]:
# Baseline 1: predict a constant rating of 3 for every test row
# (exercises RecSys.predict_everything_to_3) and report the RMSE on the test ratings.
rs = RecSys(data)
yp = rs.predict_everything_to_3()
print(rs.rmse(yp))

1.2585510334053043


In [4]:
# Baseline 2: predict each test row to its user's average training rating
# (exercises RecSys.predict_to_user_average) and report the RMSE on the test ratings.
yp = rs.predict_to_user_average()
print(rs.rmse(yp))

1.0352910334228647


## Baseline for comparison with NMF: the best prediction RMSE obtained with item-item similarity using the methods from Homework 3

In [7]:
%%time
# Item-item collaborative filtering baseline: Jaccard similarity between movie columns
# of the raw training rating matrix (0 = not rated), then similarity-weighted prediction.
cf = Collaborative(data)
Xr = cf.Mr.astype(int)
# Time only the similarity computation; %%time above reports the cost of the whole cell.
t0=time.perf_counter()
cf.calc_item_item_similarity(cf.jacsim,Xr)
t1=time.perf_counter()
time_sim = t1-t0
print('similarity calculation time',time_sim)
yp = cf.predict()
rmse = cf.rmse(yp)
print(rmse)
# Regression guard: this baseline is expected to stay below an RMSE of 0.96.
assert(rmse<0.96)

similarity calculation time 206.0671339999999
0.9509147941162469
CPU times: total: 4min 28s
Wall time: 4min 29s


In [8]:
cf.Mr.shape

(6040, 3883)

In [9]:
cf.genres

['Doc',
 'Com',
 'Hor',
 'Adv',
 'Wes',
 'Dra',
 'Ani',
 'War',
 'Chi',
 'Cri',
 'Thr',
 'Sci',
 'Mys',
 'Rom',
 'Fil',
 'Fan',
 'Act',
 'Mus']

In [10]:
len(cf.genres)

18

In [26]:
# Total number of (user, movie) cells in the training rating matrix.
cf.Mr.size

23453320

In [12]:
# How often each value occurs in the training rating matrix; 0 marks a (user, movie)
# cell that has no training rating.
np.unique(cf.Mr, return_counts=True)

(array([0, 1, 2, 3, 4, 5]),
 array([22753174,    39436,    75174,   182802,   244225,   158509],
       dtype=int64))

In [27]:
# Share of (user, movie) cells without a training rating, computed from the matrix itself
# rather than from numbers copied out of earlier cell outputs.
proportion_of_missing_values = np.count_nonzero(cf.Mr == 0) / cf.Mr.size
print(f'{proportion_of_missing_values=}')

proportion_of_missing_values=0.9701472542053747


## Imputing missing values in the Movie Rating Matrix

We want to apply NMF (non-negative matrix factorization) to a matrix MovieRatings of shape #users by #movies which contains
user ratings from 1 to 5 for some of the movies. Missing ratings are represented by zero. Note that about 97%
of the values are missing; on average, each user rated about 3% of the movies.
We would like to factor this matrix to infer the genres of the movies, on the theory that the genre
weighting of the various movies is the main determiner of user preference for the movies.
So in theory we should be able to factor MovieRatings into the product of two matrices
of dimensions #users by #genres and #genres by #movies.

However, we have a problem because the NMF method does not work with missing values. If we leave the missing values
represented as zero, this would assume that most users have a very low opinion of most movies. I propose replacing
all of the zero ratings by each user's average movie rating before we attempt the matrix factorization. This
is still not a very good solution because it will bias any predictions toward each user's average rating.

In [21]:
## Complete the Movie Ratings Matrix by replacing zeros with the average movie rating for each user.
MovieRatings = cf.Mr
display(MovieRatings.shape)

# A stored rating of 0 means "not rated"; real ratings are 1..5.
is_missing = MovieRatings == 0
zero_count = int(np.count_nonzero(is_missing))

# Broadcast each user's average across that user's row and keep the real ratings elsewhere.
# One vectorised pass replaces a Python loop over every (user, movie) cell.
MovieRatingsImputed = np.where(is_missing, cf.avg_user_rating_array[:, np.newaxis], MovieRatings)

(6040, 3883)

In [28]:
np.mean(MovieRatingsImputed)

3.702782708415624

# Matrix Factorization

In [31]:
%%time
from sklearn.decomposition import NMF

MAX_NMF_ITERATIONS = 10000
NUMBER_OF_CATEGORIES = 18 # number of Genres associated with movie ratings
# Run the nmf model
nmf = NMF(
    n_components=NUMBER_OF_CATEGORIES, 
    init='nndsvd',
    max_iter=MAX_NMF_ITERATIONS,
    l1_ratio=0.0,
    solver='cd',
    alpha_W=0.0, 
    alpha_H='same',
    tol=1e-5,
    random_state=42
).fit(MovieRatingsImputed)

weights = nmf.transform(MovieRatingsImputed)
components = nmf.components_
print(f'{weights.shape=}')
print(f'{components.shape=}')



weights.shape=(6040, 18)
components.shape=(18, 3883)
CPU times: total: 36min 43s
Wall time: 6min 8s


In [35]:
def predict_from_NMF_weights(user_weights, movie_components, user_idx, movie_idx):
    """
    Predicted rating for one (user, movie) pair from an NMF factorization:
    the dot product of row `user_idx` of `user_weights` with column `movie_idx`
    of `movie_components`.
    """
    user_factors = user_weights[user_idx, :]
    movie_factors = movie_components[:, movie_idx]
    return np.dot(user_factors, movie_factors)


In [38]:
def predict_from_NMF(rec_sys, user_weights, movie_components):
    """
    Predict a rating for every row of rec_sys.data.test from the NMF factors.
    User and movie ids are translated to matrix positions through rec_sys.uid2idx
    and rec_sys.mid2idx. Returns a numpy array of size (# of rows in testdata,)
    """
    test_pairs = rec_sys.data.test[['uID','mID']].itertuples(index=False, name=None)
    return np.array(
        [
            predict_from_NMF_weights(
                user_weights, movie_components, rec_sys.uid2idx[uid], rec_sys.mid2idx[mid]
            )
            for uid, mid in test_pairs
        ],
        dtype=float,
    )


In [39]:
# Score the NMF reconstruction on the held-out test ratings
# (rmse() scores any NaN prediction as 3).
yp = predict_from_NMF(cf, weights, components)
print(cf.rmse(yp))

0.9694834109776633
