In [1]:
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error 
from sklearn.manifold import TSNE
from scipy.linalg import sqrtm
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.io
import math

In [35]:
movies = pd.read_csv("movie_lense/movies.csv", index_col=0)
movies.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [84]:
tags = pd.read_csv("movie_lense/tags.csv", index_col=0)
tags.head()

  mask |= (ar1 == a)


Unnamed: 0_level_0,movieId,tag,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
14,110,epic,1443148538
14,110,Medieval,1443148532
14,260,sci-fi,1442169410
14,260,space action,1442169421
14,318,imdb top 250,1442615195


In [53]:
R = pd.read_csv("movie_lense/ratings.csv").head(10000)
print ('{0}x{1} user by movie matrix'.format(*R.shape))
R.head()

10000x4 user by movie matrix


Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264


In [87]:
result = pd.concat([tags, R], keys=['movieId'])
result

Unnamed: 0_level_0,Unnamed: 1_level_0,movieId,tag,timestamp
Unnamed: 0_level_1,userId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
movieId,14,110,epic,1443148538
movieId,14,110,Medieval,1443148532
movieId,14,260,sci-fi,1442169410
movieId,14,260,space action,1442169421
movieId,14,318,imdb top 250,1442615195
movieId,...,...,...,...
movieId,283206,73017,fun,1264379059
movieId,283206,73017,homoerotic subtext,1264379058
movieId,283206,73017,pacing,1264379058
movieId,283206,73017,plot,1264379058


In [69]:
pivoted = R.pivot(index='userId', columns='movieId', values='rating')
pivoted.shape

(100, 3198)

In [70]:
pivoted = pivoted.loc[:, pivoted.sum(axis=0) > 120]
pivoted.shape


(100, 7)

In [71]:
pivoted

movieId,260,296,318,356,593,2571,2858
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,,,,,,,
2,,,,,,,
3,,,,,,,
4,4.5,5.0,5.0,3.5,4.5,4.5,4.0
5,,5.0,5.0,,,,5.0
...,...,...,...,...,...,...,...
96,3.5,,,,,,
97,,,,,,,
98,,,,,,,5.0
99,,,5.0,5.0,,4.5,


In [72]:
train = pivoted[:75]
test = pivoted[75:]
train

movieId,260,296,318,356,593,2571,2858
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,,,,,,,
2,,,,,,,
3,,,,,,,
4,4.5,5.0,5.0,3.5,4.5,4.5,4.0
5,,5.0,5.0,,,,5.0
...,...,...,...,...,...,...,...
71,4.0,4.5,4.0,3.0,,4.5,4.5
72,5.0,4.5,3.0,4.0,4.5,4.5,
73,4.5,5.0,4.0,4.0,4.5,4.5,4.5
74,,,,,,,


In [59]:
def svd(train, k):
    utilMat = np.array(train)
    # the nan or unavailable entries are masked
    mask = np.isnan(utilMat)
    masked_arr = np.ma.masked_array(utilMat, mask)
    item_means = np.mean(masked_arr, axis=0)
    # nan entries will replaced by the average rating for each item
    utilMat = masked_arr.filled(item_means)
    x = np.tile(item_means, (utilMat.shape[0],1))
    # we remove the per item average from all entries.
    # the above mentioned nan entries will be essentially zero now
    utilMat = utilMat - x
    # The magic happens here. U and V are user and item features
    U, s, V=np.linalg.svd(utilMat, full_matrices=False)
    s=np.diag(s)
    # we take only the k most significant features
    s=s[0:k,0:k]
    U=U[:,0:k]
    V=V[0:k,:]
    s_root=sqrtm(s)
    Usk=np.dot(U,s_root)
    skV=np.dot(s_root,V)
    UsV = np.dot(Usk, skV)
    UsV = UsV + x
    print("svd done")
    return UsV


In [83]:
itemcols = list(pivoted.columns)
items_index = {itemcols[i]: i for i in range(len(itemcols))}

pred= []
no_of_features = [8]
for f in no_of_features: 
    svdout = svd(pivoted, k=f)
    for i in range(0, 7):
        print(svdout[4][i], pivoted.iloc[4,i])

svd done
4.258620689655173 nan
5.0 5.0
5.0 5.0
4.069444444444445 nan
4.1 nan
4.348484848484849 nan
5.0 5.0
