In [17]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from math import sqrt

#Split data as testing and training 
from sklearn import cross_validation as cv

#For calculating cosine similarity
from sklearn.metrics.pairwise import pairwise_distances

import scipy.sparse as sp
from scipy.sparse.linalg import svds

import plotly.offline as plot
import plotly.graph_objs as go
# Initialise Plotly's offline notebook mode so that iplot() figures render inline.
# `plot` is already the plotly.offline package, which exposes this function directly.
plot.init_notebook_mode(connected=True)

## Getting the data ##

In [2]:
# Directory holding the extracted MovieLens 100K files.
# Kept in one place so the notebook can be pointed at another location with a single edit.
DATA_DIR = '/home/user/Downloads/ml-100k'

# u.user: pipe-separated user attributes, no header row.
user_details = pd.read_csv(DATA_DIR + '/u.user',
                           sep='|',
                           header=None,
                           names=['user id', 'age', 'gender', 'occupation', 'zip code'])

# u.data: tab-separated ratings, no header row.
# read_csv with an explicit tab separator is equivalent to read_table's default.
user_data = pd.read_csv(DATA_DIR + '/u.data',
                        sep='\t',
                        header=None,
                        names=['user id', 'item id', 'rating', 'timestamp'])


In [3]:
# Number of distinct user ids and item ids present in the ratings table.
# These become the dimensions of the user-item matrices built below.
n_users = len(user_data['user id'].unique())
n_items = len(user_data['item id'].unique())

summary = 'Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items)
print(summary)

Number of users = 943 | Number of movies = 1682


#### Separating the training and testing data ####

In [4]:
# Hold out 25% of the rating rows for evaluation.
# A fixed random_state makes the split, and every RMSE/plot derived from it,
# reproducible across Restart & Run All.
# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20; on newer versions
# the same function is available as sklearn.model_selection.train_test_split.
train_data, test_data = cv.train_test_split(user_data, test_size=0.25, random_state=42)

## Memory-based algorithm ##

In [5]:
def _ratings_to_matrix(frame, n_rows, n_cols):
    """Return a dense (n_rows, n_cols) array filled from a ratings DataFrame.

    Each tuple from ``itertuples()`` is (index, user id, item id, rating, ...).
    Ids are used as 1-based positions, so they are shifted by one. Cells with
    no corresponding row stay at 0.
    """
    matrix = np.zeros((n_rows, n_cols))
    for row in frame.itertuples():
        user_id, item_id, rating = row[1], row[2], row[3]
        matrix[user_id - 1, item_id - 1] = rating
    return matrix


# One user-item matrix per split, both with the full (n_users, n_items) shape.
train_data_matrix = _ratings_to_matrix(train_data, n_users, n_items)
test_data_matrix = _ratings_to_matrix(test_data, n_users, n_items)


In [6]:
# Sanity check: the matrix should be (number of users, number of items).
train_data_matrix.shape

(943, 1682)

#### Cosine similarity: user-user and item-item ####

In [7]:
# pairwise_distances returns cosine *distance*; similarity is its complement (1 - distance).
# Rows of the training matrix are users, so this compares users with each other.
user_similarity = 1 - pairwise_distances(
    train_data_matrix,
    metric='cosine',
)

# Transposing puts items on the rows, giving an item-item similarity matrix.
item_similarity = 1 - pairwise_distances(
    train_data_matrix.T,
    metric='cosine',
)


In [8]:
def predict(ratings, similarity, type = 'user'):
    """Predict a full rating matrix with similarity-weighted collaborative filtering.

    Parameters
    ----------
    ratings : 2-D numpy array, one row per user and one column per item.
    similarity : square 2-D numpy array; user-user when ``type == 'user'``,
        item-item when ``type == 'item'``.
    type : 'user' or 'item', selecting which neighbourhood model to apply.
        (The name shadows the builtin but is kept for existing keyword callers.)

    Returns
    -------
    2-D numpy array with the same shape as ``ratings``.

    Raises
    ------
    ValueError
        If ``type`` is neither 'user' nor 'item'.
    """
    if type == 'user':
        # NOTE: the mean runs over every column of the row, including zero entries.
        mean_user_rating = ratings.mean(axis = 1)[:, np.newaxis]
        ratings_diff = ratings - mean_user_rating
        # Row u of `similarity` holds the weights used for user u, so normalise per row.
        weight_totals = np.abs(similarity).sum(axis = 1)[:, np.newaxis]
        return mean_user_rating + similarity.dot(ratings_diff) / weight_totals
    if type == 'item':
        # ratings.dot(similarity)[u, j] weights the ratings by column j of `similarity`,
        # so the normaliser is the column sum. For a symmetric matrix this equals the
        # row sum used previously; for an asymmetric one only the column sum yields a
        # proper weighted average.
        weight_totals = np.abs(similarity).sum(axis = 0)[np.newaxis, :]
        return ratings.dot(similarity) / weight_totals
    # Previously an unknown type fell through to `return pred` and raised UnboundLocalError.
    raise ValueError("type must be 'user' or 'item', got %r" % (type,))

In [9]:
# Score every (user, item) pair with both neighbourhood models, using only the training matrix.
user_prediction = predict(ratings=train_data_matrix, similarity=user_similarity, type='user')
item_prediction = predict(ratings=train_data_matrix, similarity=item_similarity, type='item')

#### RMSE value ####

In [10]:
def rmse(prediction, ground_truth):
    """Root-mean-squared error over the non-zero entries of ``ground_truth``.

    Parameters
    ----------
    prediction : 2-D numpy array of predicted ratings.
    ground_truth : 2-D numpy array of the same shape; entries equal to 0 are
        excluded from the score.

    Returns
    -------
    float
        sqrt(mean((ground_truth - prediction) ** 2)) over the selected entries.

    Raises
    ------
    ValueError
        If ``ground_truth`` has no non-zero entry, or a selected value is NaN/inf.
    """
    # Compute the mask once; fancy indexing already yields 1-D arrays, so no flatten is needed.
    rated = ground_truth.nonzero()
    actual = np.asarray(ground_truth[rated], dtype=float)
    predicted = np.asarray(prediction[rated], dtype=float)

    if actual.size == 0:
        raise ValueError('ground_truth has no non-zero entries to evaluate')
    if not (np.isfinite(actual).all() and np.isfinite(predicted).all()):
        raise ValueError('prediction/ground_truth contain NaN or infinite values')

    return float(np.sqrt(np.mean((actual - predicted) ** 2)))

In [11]:
# Report test-set RMSE for each neighbourhood model, user-based first.
for label, predicted in (('User-based CF RMSE: ', user_prediction),
                         ('Item-based CF RMSE: ', item_prediction)):
    print(label + str(rmse(predicted, test_data_matrix)))

User-based CF RMSE: 2.9586589316449934
Item-based CF RMSE: 3.1601006304211925


## Model-based algorithm ##

In [12]:
# Sparsity = share of user-item cells with no rating: 1 - (rating rows / total cells).
total_cells = float(n_users * n_items)
sparsity = round(1.0 - len(user_data) / total_cells, 3)

print('The sparsity level of MovieLens100K is ' + str(sparsity*100) + '%')

The sparsity level of MovieLens100K is 93.7%


In [18]:

# Truncated SVD of the training matrix, keeping k singular values/vectors.
# k is the rank of the approximation and is the knob to tune here.
u, s, vt = svds(train_data_matrix, k = 20)

# svds returns the singular values as a 1-D array; expand to a diagonal matrix
# so that U * S * Vt rebuilds a dense low-rank estimate of every rating.
s_diag_matrix = np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)

# The value is an RMSE of the SVD reconstruction; the label previously read
# "User-based CF MSE", which matched neither the model nor the metric.
print ('SVD-based CF RMSE: ' + str(rmse(X_pred, test_data_matrix)))

User-based CF MSE: 2.714519947272233


## Box Plot ##

In [19]:
# Compare user-based CF predictions with the held-out ratings, restricted to the
# non-zero cells of the test matrix. The mask is computed once and reused.
test_mask = test_data_matrix.nonzero()
prediction = user_prediction[test_mask]
ground_truth = test_data_matrix[test_mask]

# Residual = actual rating - predicted rating.
trace0 = go.Box(
    y = (ground_truth - prediction),
    name = 'User-based CF'
)

data = [trace0]

# Title and axis label so the figure can be read on its own.
layout = go.Layout(
    title = 'User-based CF residuals on the test set',
    yaxis = dict(title = 'Actual rating - predicted rating')
)

plot.iplot(dict(data = data, layout = layout))

## Histogram frequency plot ##

In [20]:

# Frequency distribution of the user-based CF residuals (actual - predicted).
# `ground_truth` and `prediction` are the arrays built in the box-plot cell.
trace0 = go.Histogram(
    x = (ground_truth - prediction),
    name = 'User-based CF'
)

data = [trace0]

# Title and axis labels so the figure can be read on its own.
layout = go.Layout(
    title = 'Distribution of user-based CF residuals on the test set',
    xaxis = dict(title = 'Actual rating - predicted rating'),
    yaxis = dict(title = 'Number of test ratings')
)

plot.iplot(dict(data = data, layout = layout))