# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error
from math import sqrt

# Reading the Data

In [6]:
# u.data is tab-separated with no header row, so the column names are supplied here.
columns = ['user_id', 'item_id', 'rating', 'timestamp']
user_data = pd.read_csv('ml-100k/u.data', sep='\t', names=columns)

# u.item is pipe-delimited, ISO-8859-1 encoded and also has no header row.
movie_data = pd.read_csv('ml-100k/u.item', delimiter='|', encoding='ISO-8859-1', header=None)

# Keep only the first two columns. `.ix` was removed in pandas 1.0; with the
# default integer column labels, `.ix[:, :1]` selected labels 0 and 1, which
# is exactly what the positional slice `.iloc[:, :2]` selects.
# `.copy()` makes the result an independent frame before it is relabelled.
movie_data = movie_data.iloc[:, :2].copy()
movie_data.columns = ['item_id', 'movie_name']

# Re-index rows starting at 1 instead of 0.
movie_data.index = range(1, len(movie_data) + 1)

# Checking my Data

In [7]:
# Count the distinct user ids and item ids that appear in the ratings table.
n_users = len(user_data['user_id'].unique())
n_items = len(user_data['item_id'].unique())

summary = 'Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items)
print(summary)

Number of users = 943 | Number of movies = 1682


# Converting dataframe into matrix

In [15]:
# Dense (n_users x n_items) matrix, initialised to 0; cells that never receive
# a rating below stay at 0.
data_matrix = np.zeros((n_users, n_items))

# Ids are shifted by one to obtain 0-based row/column positions
# (presumably the ids are 1-based and contiguous -- verify against the data).
for user_id, item_id, rating in zip(user_data['user_id'],
                                    user_data['item_id'],
                                    user_data['rating']):
    data_matrix[user_id - 1, item_id - 1] = rating

# Computing Cosine Similarity

In [16]:
# Cosine similarity is obtained as 1 minus the pairwise cosine distance.
# Rows of data_matrix are compared for the user-user matrix ...
user_similarity = 1 - pairwise_distances(
    X=data_matrix,
    metric='cosine',
)
# ... and rows of the transpose (i.e. columns of data_matrix) for item-item.
item_similarity = 1 - pairwise_distances(
    X=data_matrix.T,
    metric='cosine',
)

# Prediction function

In [6]:
def predict(ratings, similarity, type='user'):
    """Predict ratings as a similarity-weighted combination of existing ratings.

    Parameters
    ----------
    ratings : numpy.ndarray
        2-D ratings array. The call sites in this notebook pass the
        user x item matrix in which 0 marks a cell with no rating.
    similarity : numpy.ndarray
        Square similarity matrix: one row per row of ``ratings`` when
        ``type='user'``, one row per column of ``ratings`` when ``type='item'``.
    type : {'user', 'item'}, default 'user'
        Which formula to apply. (The name shadows the ``type`` builtin inside
        this function; it is kept so existing keyword callers continue to work.)

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``ratings`` holding the predictions.

    Raises
    ------
    ValueError
        If ``type`` is neither ``'user'`` nor ``'item'``.
    """
    if type == 'user':
        # Per-row mean over *all* columns, zeros included, kept as a column
        # vector so it broadcasts across the columns of ``ratings``.
        mean_user_rating = ratings.mean(axis=1)[:, np.newaxis]
        ratings_diff = ratings - mean_user_rating
        # Column vector: total absolute similarity of each row of ``similarity``.
        normaliser = np.abs(similarity).sum(axis=1)[:, np.newaxis]
        pred = mean_user_rating + similarity.dot(ratings_diff) / normaliser
    elif type == 'item':
        # Row vector: each prediction column is divided by the total absolute
        # similarity found in the matching row of ``similarity``.
        normaliser = np.abs(similarity).sum(axis=1)[np.newaxis, :]
        pred = ratings.dot(similarity) / normaliser
    else:
        # Previously an unknown value fell through and failed with an
        # UnboundLocalError on ``return pred``; fail with a clear message instead.
        raise ValueError(
            "type must be 'user' or 'item', got {!r}".format(type)
        )
    return pred


# Calling the prediction function

In [7]:
# Build both prediction matrices from the same ratings matrix, pairing each
# mode with its matching similarity matrix.
item_prediction = predict(
    ratings=data_matrix,
    similarity=item_similarity,
    type='item',
)
user_prediction = predict(
    ratings=data_matrix,
    similarity=user_similarity,
    type='user',
)

# Defining RMSE

In [8]:
def rmse(prediction, ground_truth):
    """Return the root-mean-squared error over the non-zero cells of ``ground_truth``.

    Only positions where ``ground_truth`` is non-zero are compared; the values
    of ``prediction`` at every other position are ignored.

    Parameters
    ----------
    prediction : array indexable by the result of ``ground_truth.nonzero()``
        Predicted values.
    ground_truth : array providing ``.nonzero()``
        Reference values.

    Returns
    -------
    float
        Square root of the mean squared error over the selected cells.
    """
    # Locate the non-zero reference cells once and reuse the index for both arrays.
    observed = ground_truth.nonzero()
    predicted_values = prediction[observed].flatten()
    actual_values = ground_truth[observed].flatten()
    mse = mean_squared_error(predicted_values, actual_values)
    return sqrt(mse)

# Computing RMSE

In [9]:
# NOTE(review): both scores are measured against data_matrix, the same matrix
# the predictions were built from -- there is no held-out split here.
user_rmse = rmse(user_prediction, data_matrix)
print('User-based CF RMSE: ' + str(user_rmse))

item_rmse = rmse(item_prediction, data_matrix)
print('Item-based CF RMSE: ' + str(item_rmse))

User-based CF RMSE: 2.691101891161349
Item-based CF RMSE: 2.950193296897328
