In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.sparse as sp
from time import time
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import pairwise_distances
from scipy.stats.stats import pearsonr
import graphlab
from sklearn.cross_validation import train_test_split
from math import sqrt
%matplotlib inline

In [2]:
df = pd.read_csv("processed_data.csv")

In [3]:
reg = df[df["user_review_count"] > 2]
reg.shape

(47583, 36)

In [35]:
n_users = reg.userid.nunique()
n_items = reg.course_id.nunique()

In [36]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
validate_df = pd.read_csv("validation.csv")

In [37]:
train_df = train_df[["new_course_id", "new_user_id", "rating"]]
validate_df = validate_df[["new_course_id", "new_user_id", "rating"]]
test_df = test_df[["new_course_id", "new_user_id", "rating"]]

In [38]:
train_df_matrix = np.zeros((n_users, n_items))
for line in train_df.itertuples():
    train_df_matrix[line[2]-1, line[1]-1] = line[3]

In [39]:
validation_df_matrix = np.zeros((n_users, n_items))
for line in validate_df.itertuples():
    validation_df_matrix[line[2]-1, line[1]-1] = line[3]

In [40]:
test_df_matrix = np.zeros((n_users, n_items))
for line in test_df.itertuples():
    test_df_matrix[line[2]-1, line[1]-1] = line[3]

In [41]:
item_similarity = pairwise_distances(train_df_matrix.T, metric='cosine')

In [42]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis]) 
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])     
    return pred

In [43]:
item_prediction = predict(train_df_matrix, item_similarity, type='item')

In [44]:
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten() 
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [45]:
print('Item-based CF RMSE training ' + str(rmse(item_prediction, train_df_matrix)))
print('Item-based CF RMSE validation ' + str(rmse(item_prediction, validation_df_matrix)))
print('Item-based CF RMSE test ' + str(rmse(item_prediction, test_df_matrix)))

Item-based CF RMSE training 4.79705213754
Item-based CF RMSE validation 4.78990200143
Item-based CF RMSE test 4.78970437877
