In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time
from sklearn.model_selection import train_test_split
from scipy.sparse import coo_matrix, csr_matrix
from scipy.spatial.distance import jaccard, cosine 
#from pytest import approx

In [3]:
MV_users = pd.read_csv('data/movie/users.csv')
MV_movies = pd.read_csv('data/movie/movies.csv')
train = pd.read_csv('data/movie/train.csv')
test = pd.read_csv('data/movie/test.csv')

In [4]:
from collections import namedtuple
Data = namedtuple('Data', ['users','movies','train','test'])
data = Data(MV_users, MV_movies, train, test)

In [17]:
#data.train[data.train['uID'] == 2]['rating'].mean()
data.train.rating.values

array([5, 4, 5, ..., 3, 3, 1], dtype=int64)

In [47]:
data.test

Unnamed: 0,uID,mID,rating
0,2233,440,4
1,4274,587,5
2,2498,454,3
3,2868,2336,5
4,1636,2686,5
...,...,...,...
300058,810,247,4
300059,1193,3210,4
300060,6039,2289,4
300061,5397,429,3


1. Load the movie ratings data (as in the HW3-recommender-system) and use matrix factorization technique(s) and predict the missing ratings from the test data. Measure the RMSE. You should use sklearn library. [10 pts]

> The lowest RMSE from matrix factorization after label permutation is 1.609.

In [58]:
class MatrixFac():
    def __init__(self,data):
        self.data=data
        
    
    def rmse(self,yp):
        #yp[np.isnan(yp)]=3 
        yt=np.array(self.data.test.rating)
        return np.sqrt(((yt-yp)**2).mean())
    
    def rand(self, yp):
        from sklearn.metrics import rand_score
        return rand_score(yp, self.data.test.rating.values)
    
    def matrix_factor_alg(self):
        '''
        use matrix factorization alg to predict test rating
        '''
        
        from sklearn.decomposition import NMF

        nmf = NMF(n_components=5, max_iter=1000,solver='cd', #beta_loss='kullback-leibler',
                  init='random',random_state=10) 

        
        uid_mid_matrix = self.data.train[['uID','mID']].values
        nmf.fit(uid_mid_matrix)
        pred_prob = nmf.transform(self.data.test[['uID','mID']].values)
        pred_labels = np.argmax(pred_prob, axis=1)
        
        return pred_labels
        

In [59]:
mf = MatrixFac(data)
yp = mf.matrix_factor_alg()
print(mf.rmse(yp))

3.2879881971694904


In [54]:
from itertools import permutations

def label_permute_compare(ytdf,yp,n=5):
    """
    ytdf: labels dataframe object
    yp: clustering label prediction output
    Returns permuted label order and accuracy. 
    Example output: (3, 4, 1, 2, 0), 0.74 
    """
    # your code here
    # map permutaion list into label list
    # eg each permu like (3, 4, 1, 2, 0), first num 3 map to 'PRAD' ...etc
    
    

    label_lst = [1,2,3,4,5]
    result = [] # list of tuple, [(permu1, acc1), (permu2, acc2)...]
    permus = permutations(range(n))
    for permu in permus:
        ytdf['temp_label'] = ytdf['rating'].map({label_lst[0]:permu[0], 
                                                  label_lst[1]:permu[1],
                                                  label_lst[2]:permu[2],
                                                  label_lst[3]:permu[3],
                                                  label_lst[4]:permu[4]})
        #acc = metrics.accuracy_score(yp, ytdf['temp_label'])
        yt = ytdf['temp_label']
        dist = np.sqrt(((yt-yp)**2).mean())
        result.append((permu, round(dist,3)))
        
    return result

In [55]:
yt = data.test

result = label_permute_compare(yt, yp)

result.sort(key=lambda x: x[1], reverse=False)
result

[((4, 3, 2, 1, 0), 1.609),
 ((4, 3, 0, 1, 2), 1.613),
 ((4, 3, 1, 0, 2), 1.617),
 ((4, 3, 2, 0, 1), 1.626),
 ((4, 3, 1, 2, 0), 1.659),
 ((4, 3, 0, 2, 1), 1.672),
 ((3, 4, 2, 1, 0), 1.698),
 ((3, 4, 0, 1, 2), 1.703),
 ((3, 4, 1, 0, 2), 1.706),
 ((3, 4, 2, 0, 1), 1.714),
 ((3, 4, 1, 2, 0), 1.746),
 ((4, 2, 0, 1, 3), 1.754),
 ((4, 2, 1, 0, 3), 1.757),
 ((3, 4, 0, 2, 1), 1.758),
 ((4, 2, 3, 1, 0), 1.778),
 ((4, 2, 3, 0, 1), 1.793),
 ((4, 0, 2, 1, 3), 1.816),
 ((4, 1, 2, 0, 3), 1.831),
 ((4, 0, 3, 1, 2), 1.843),
 ((4, 1, 3, 0, 2), 1.858),
 ((4, 0, 1, 2, 3), 1.861),
 ((4, 1, 0, 2, 3), 1.872),
 ((2, 4, 0, 1, 3), 1.888),
 ((2, 4, 1, 0, 3), 1.891),
 ((4, 1, 3, 2, 0), 1.894),
 ((4, 0, 3, 2, 1), 1.895),
 ((2, 4, 3, 1, 0), 1.91),
 ((4, 2, 1, 3, 0), 1.914),
 ((2, 4, 3, 0, 1), 1.924),
 ((4, 2, 0, 3, 1), 1.925),
 ((0, 4, 2, 1, 3), 1.964),
 ((4, 0, 1, 3, 2), 1.975),
 ((1, 4, 2, 0, 3), 1.982),
 ((4, 0, 2, 3, 1), 1.982),
 ((4, 1, 2, 3, 0), 1.982),
 ((4, 1, 0, 3, 2), 1.986),
 ((0, 4, 3, 1, 2), 1.989),
 (

2. Discuss the results and why sklearn's non-negative matrix facorization library did not work well compared to simple baseline or similarity-based methods we’ve done in Module 3. Can you suggest a way(s) to fix it? [10 pts]

> The baseline has rmse 1.26, and the nmf model has the smallest rmse 1.609 after label correction.
Some possible reason this method did not work well: the matrix has small number of column, only 2 mid and uid, compare to tf-idf, so the original dimension is not large, even the n_compoent=5 is larger than the number of column in data matrix. 
Suggestion on fixing: let the data matrix include more features, movie genre, date, director, number of views, etc.