In [164]:
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error 
from sklearn.manifold import TSNE
from scipy.linalg import sqrtm
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.io
import math

In [30]:
movies = pd.read_csv("movie_lense/movies.csv", index_col=0)
movies.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [148]:
R = pd.read_csv("movie_lense/ratings.csv").head(100000)
print ('{0}x{1} user by movie matrix'.format(*R.shape))
R.head()

100000x4 user by movie matrix


Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264


In [155]:
pivoted = R.pivot(index='userId', columns='movieId', values='rating')
pivoted.shape

(1041, 9289)

In [156]:
pivoted = pivoted.loc[:, pivoted.sum(axis=0) > 50]
pivoted.shape


(1041, 1580)

In [157]:
pivoted

movieId,1,2,3,5,6,7,9,10,11,12,...,140174,142488,148626,152077,152081,164179,166528,168252,174055,176371
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,4.0,4.0,,2.0,4.5,,,4.0,3.5,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1037,,,,,,,,,,,...,,,,,,,,,,
1038,,,3.0,,,,,,,,...,,,,,,,,,,
1039,,,,,,,,3.0,,,...,,,,,,,,,,
1040,2.5,,,,2.5,,,,,,...,,,,,4.0,4.0,2.5,3.0,2.5,


In [158]:
train = pivoted[:750]
test = pivoted[750:]
train

movieId,1,2,3,5,6,7,9,10,11,12,...,140174,142488,148626,152077,152081,164179,166528,168252,174055,176371
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,4.0,4.0,,2.0,4.5,,,4.0,3.5,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
746,,,,,,,,4.0,,,...,,,,,,,,,,
747,,,,,,,,,,,...,,,,,,,,,,
748,,,,,,,,,5.0,,...,,,,,,,,,,
749,,,,,,,,,,,...,,,,,,,,,,


In [172]:
def svd(train, k):
    utilMat = np.array(train)
    # the nan or unavailable entries are masked
    mask = np.isnan(utilMat)
    masked_arr = np.ma.masked_array(utilMat, mask)
    item_means = np.mean(masked_arr, axis=0)
    # nan entries will replaced by the average rating for each item
    utilMat = masked_arr.filled(item_means)
    x = np.tile(item_means, (utilMat.shape[0],1))
    # we remove the per item average from all entries.
    # the above mentioned nan entries will be essentially zero now
    utilMat = utilMat - x
    # The magic happens here. U and V are user and item features
    U, s, V=np.linalg.svd(utilMat, full_matrices=False)
    s=np.diag(s)
    # we take only the k most significant features
    s=s[0:k,0:k]
    U=U[:,0:k]
    V=V[0:k,:]
    s_root=sqrtm(s)
    Usk=np.dot(U,s_root)
    skV=np.dot(s_root,V)
    UsV = np.dot(Usk, skV)
    UsV = UsV + x
    print("svd done")
    return UsV


In [184]:
itemcols = list(pivoted.columns)
items_index = {itemcols[i]: i for i in range(len(itemcols))}

pred= []
no_of_features = [8,10,12,14,17]
for f in no_of_features: 
    svdout = svd(train, k=f)
    pred = [] #to store the predicted ratings
    for _,row in test.iterrows():
        user = int(row['userId'])
        item = int(row['movieId'])
        print(user, item)
        
        u_index = pivoted.iloc[int(user)]
        if item in items_index:
            i_index = items_index[item]
            pred_rating = svdout[u_index, i_index]
        else:
            pred_rating = np.mean(svdout[u_index, :])
        pred.append(pred_rating)
        


print(rmse(test['rating'], pred))


svd done
780 100498


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices