In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.sparse as sp
from time import time
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats.stats import pearsonr
import graphlab
from sklearn.cross_validation import train_test_split
%matplotlib inline



In [2]:
# Read the full processed table from disk. Later cells use its `userid`,
# `course_id`, `user_review_count` and `rating` columns.
df = pd.read_csv(filepath_or_buffer="processed_data.csv")

In [3]:
# Number of distinct user ids and distinct course ids in the full table.
n_users, n_items = (df[column].nunique() for column in ("userid", "course_id"))

In [4]:
# Rows whose `user_review_count` is at most 2; the complement is selected into `reg`.
cold_users = df.loc[df["user_review_count"].le(2)]
cold_users.shape

(289628, 36)

In [5]:
# Rows whose `user_review_count` is strictly greater than 2.
reg = df.loc[df["user_review_count"].gt(2)]
reg.shape

(47583, 36)

In [6]:
# Load the pre-made train / test / validation splits from their CSV files
# (read in that order).
train_df, test_df, validate_df = (
    pd.read_csv(path) for path in ("train.csv", "test.csv", "validation.csv")
)

In [7]:
# Global mean of the training ratings; used as the constant baseline prediction
# and as the reference point for the per-user / per-course offsets.
ybar = train_df["rating"].mean()
# Parenthesised single-argument print: same output as `print ybar` on Python 2,
# and valid syntax on Python 3.
print(ybar)

4.77225723057


In [8]:
# Constant baseline: predict the training mean `ybar` for every row of each split.
# Each result is a plain list with one entry per row of the corresponding frame.
predictions_train_base, predictions_test_base, predictions_validation_base = (
    [ybar] * len(frame) for frame in (train_df, test_df, validate_df)
)

In [9]:
def get_rmse(s, s_predict):
    """Return the root-mean-squared error between ``s`` and ``s_predict``.

    Both arguments are forwarded unchanged to ``mean_squared_error``
    (``s`` first, ``s_predict`` second); the square root of that value
    is returned.
    """
    return np.sqrt(mean_squared_error(s, s_predict))

In [10]:
# RMSE of the constant-mean baseline on each split (train, test, validation).
train_base_rmse, test_base_rmse, validation_base_rmse = (
    get_rmse(frame["rating"], preds)
    for frame, preds in (
        (train_df, predictions_train_base),
        (test_df, predictions_test_base),
        (validate_df, predictions_validation_base),
    )
)

In [11]:
# Report the constant-baseline RMSE for each split.
# A single pre-formatted string passed to print(...) yields the same text the
# old `print label, value` statement produced on Python 2 (label, one space,
# str(value)) and is also valid syntax on Python 3.
for label, score in (
    ("base model training rmse: ", train_base_rmse),
    ("base model validation rmse: ", validation_base_rmse),
    ("base model test rmse: ", test_base_rmse),
):
    print("%s %s" % (label, score))

base model training rmse:  0.598308674307
base model validation rmse:  0.604317926174
base model test rmse:  0.616235861638


In [12]:
# Per-user offset: each training user's mean rating minus the global mean `ybar`,
# keyed by userid.
# A single groupby pass replaces the original loop, which re-filtered the whole
# frame once per unique user (O(users * rows)).
# NOTE: groupby skips rows whose userid is NaN, so no NaN key is produced, and
# the means may differ from the loop's in the last floating-point bits.
train_userids_dic = (train_df.groupby("userid")["rating"].mean() - ybar).to_dict()

In [13]:
# Per-course offset: each training course's mean rating minus the global mean
# `ybar`, keyed by course_id.
# A single groupby pass replaces the original loop, which re-filtered the whole
# frame once per unique course (O(courses * rows)).
# NOTE: groupby skips rows whose course_id is NaN, so no NaN key is produced, and
# the means may differ from the loop's in the last floating-point bits.
train_courseids_dic = (train_df.groupby("course_id")["rating"].mean() - ybar).to_dict()

In [14]:
# Bundle the global mean and the two offset tables under the keys
# 'mean', 'users' and 'items'.
train_avgs = dict(mean=ybar, users=train_userids_dic, items=train_courseids_dic)

In [15]:
def bias_baseline(df, mean=None, user_bias=None, item_bias=None):
    """Predict one rating per row of ``df`` as mean + user offset + course offset.

    Parameters
    ----------
    df : DataFrame with ``userid`` and ``course_id`` columns.
    mean : global mean rating; defaults to the notebook-level ``ybar``.
    user_bias : mapping userid -> offset; defaults to ``train_userids_dic``.
    item_bias : mapping course_id -> offset; defaults to ``train_courseids_dic``.

    Returns
    -------
    list
        One prediction per row, in row order. A user or course missing from
        its table contributes no offset, so a row where both are unknown is
        predicted as ``mean``.
    """
    # Resolve defaults at call time so one-argument calls keep using the
    # notebook globals exactly as before.
    if mean is None:
        mean = ybar
    if user_bias is None:
        user_bias = train_userids_dic
    if item_bias is None:
        item_bias = train_courseids_dic

    # Walk the two id columns directly instead of materialising each row with
    # .iloc, and use hashed dict lookups instead of `x in d.keys()` (which is a
    # list scan on Python 2). Adding 0.0 for an unknown id leaves the sum equal
    # to the value the original branch chain produced.
    return [
        mean + user_bias.get(user, 0.0) + item_bias.get(course, 0.0)
        for user, course in zip(df["userid"], df["course_id"])
    ]

In [16]:
# Bias-model predictions for the train, test and validation frames (in that order).
(predictions_train_biasmodel,
 predictions_test_biasmodel,
 predictions_validation_biasmodel) = map(bias_baseline, (train_df, test_df, validate_df))

In [17]:
# RMSE of the bias model on each split (train, test, validation).
train_bias_rmse, test_bias_rmse, validation_bias_rmse = (
    get_rmse(frame["rating"], preds)
    for frame, preds in (
        (train_df, predictions_train_biasmodel),
        (test_df, predictions_test_biasmodel),
        (validate_df, predictions_validation_biasmodel),
    )
)

In [18]:
# Report the bias-model RMSE for each split.
# A single pre-formatted string passed to print(...) yields the same text the
# old `print label, value` statement produced on Python 2 (label, one space,
# str(value)) and is also valid syntax on Python 3.
for label, score in (
    ("bias model training rmse: ", train_bias_rmse),
    ("bias model validation rmse: ", validation_bias_rmse),
    ("bias model test rmse: ", test_bias_rmse),
):
    print("%s %s" % (label, score))

bias model training rmse:  0.365101642064
bias model validation rmse:  0.735938573011
bias model test rmse:  0.707701339847


#### Random train/validation/test split (rows with `user_review_count` > 2)

In [19]:
# Hold out 25% of the rows in `reg` as the test set.
# A fixed random_state makes the split, and every metric computed from it,
# repeatable across kernel restarts.
train_data, test_data = train_test_split(reg, test_size=0.25, random_state=42)

In [20]:
# Keep 75% of the current training rows for training; the rest becomes the
# validation set. A fixed random_state makes the split repeatable.
# NOTE: this rebinds `train_data`, so re-running this cell on its own shrinks
# the training set again; re-run the preceding split cell first.
train_data, validation_data = train_test_split(train_data, train_size=0.75, random_state=42)

In [21]:
# Global mean rating of the randomly split training rows.
ybar_random = train_data["rating"].mean()
# Parenthesised single-argument print: same output as `print ybar_random` on
# Python 2, and valid syntax on Python 3.
print(ybar_random)

4.76256304876


In [22]:
# Constant baseline on the random split: predict `ybar_random` for every row.
# Each result is a plain list with one entry per row of the corresponding frame.
(predictions_train_random_base,
 predictions_test_random_base,
 predictions_validation_random_base) = (
    [ybar_random] * len(frame) for frame in (train_data, test_data, validation_data)
)

In [23]:
# RMSE of the constant baseline on each random split (train, test, validation).
train_base_random_rmse, test_base_random_rmse, validation_base_random_rmse = (
    get_rmse(frame["rating"], preds)
    for frame, preds in (
        (train_data, predictions_train_random_base),
        (test_data, predictions_test_random_base),
        (validation_data, predictions_validation_random_base),
    )
)

In [24]:
# Report the constant-baseline RMSE for each random split.
# A single pre-formatted string passed to print(...) yields the same text the
# old `print label, value` statement produced on Python 2 (label, one space,
# str(value)) and is also valid syntax on Python 3.
for label, score in (
    ("random base model training rmse: ", train_base_random_rmse),
    ("random base model validation rmse: ", validation_base_random_rmse),
    ("random base model test rmse: ", test_base_random_rmse),
):
    print("%s %s" % (label, score))

random base model training rmse:  0.606185445945
random base model validation rmse:  0.592666298427
random base model test rmse:  0.609199399768


In [25]:
# Per-user offset on the random split: each training user's mean rating minus
# `ybar_random`, keyed by userid.
# A single groupby pass replaces the original loop, which re-filtered the whole
# frame once per unique user (O(users * rows)).
# NOTE: groupby skips rows whose userid is NaN, so no NaN key is produced, and
# the means may differ from the loop's in the last floating-point bits.
train_userids_dic_random = (
    train_data.groupby("userid")["rating"].mean() - ybar_random
).to_dict()

In [26]:
# Per-course offset on the random split: each training course's mean rating
# minus `ybar_random`, keyed by course_id.
# A single groupby pass replaces the original loop, which re-filtered the whole
# frame once per unique course (O(courses * rows)).
# NOTE: groupby skips rows whose course_id is NaN, so no NaN key is produced, and
# the means may differ from the loop's in the last floating-point bits.
train_courseids_dic_random = (
    train_data.groupby("course_id")["rating"].mean() - ybar_random
).to_dict()

In [27]:
# Bundle the random-split mean and offset tables under the keys
# 'mean', 'users' and 'items'.
train_avgs_random = dict(
    mean=ybar_random,
    users=train_userids_dic_random,
    items=train_courseids_dic_random,
)

In [28]:
def bias_baseline_random(df, mean=None, user_bias=None, item_bias=None):
    """Predict one rating per row of ``df`` as mean + user offset + course offset.

    Parameters
    ----------
    df : DataFrame with ``userid`` and ``course_id`` columns.
    mean : global mean rating; defaults to the notebook-level ``ybar_random``.
    user_bias : mapping userid -> offset; defaults to
        ``train_userids_dic_random``.
    item_bias : mapping course_id -> offset; defaults to
        ``train_courseids_dic_random``.

    Returns
    -------
    list
        One prediction per row, in row order. A user or course missing from
        its table contributes no offset, so a row where both are unknown is
        predicted as ``mean``.
    """
    # Resolve defaults at call time so one-argument calls keep using the
    # notebook globals exactly as before.
    if mean is None:
        mean = ybar_random
    if user_bias is None:
        user_bias = train_userids_dic_random
    if item_bias is None:
        item_bias = train_courseids_dic_random

    # Walk the two id columns directly instead of materialising each row with
    # .iloc, and use hashed dict lookups instead of `x in d.keys()` (which is a
    # list scan on Python 2). Adding 0.0 for an unknown id leaves the sum equal
    # to the value the original branch chain produced.
    return [
        mean + user_bias.get(user, 0.0) + item_bias.get(course, 0.0)
        for user, course in zip(df["userid"], df["course_id"])
    ]

In [29]:
# Bias-model predictions for the random train, test and validation frames (in that order).
(predictions_train_biasmodel_random,
 predictions_test_biasmodel_random,
 predictions_validation_biasmodel_random) = map(
    bias_baseline_random, (train_data, test_data, validation_data)
)

In [30]:
# RMSE of the bias model on each random split (train, test, validation).
train_bias_rmse_random, test_bias_rmse_random, validation_bias_rmse_random = (
    get_rmse(frame["rating"], preds)
    for frame, preds in (
        (train_data, predictions_train_biasmodel_random),
        (test_data, predictions_test_biasmodel_random),
        (validation_data, predictions_validation_biasmodel_random),
    )
)

In [31]:
# Report the bias-model RMSE for each random split.
# A single pre-formatted string passed to print(...) yields the same text the
# old `print label, value` statement produced on Python 2 (label, one space,
# str(value)) and is also valid syntax on Python 3.
for label, score in (
    ("bias model training rmse random: ", train_bias_rmse_random),
    ("bias model validation rmse random: ", validation_bias_rmse_random),
    ("bias model test rmse random: ", test_bias_rmse_random),
):
    print("%s %s" % (label, score))

bias model training rmse random:  0.400233964998
bias model validation rmse random:  0.645229978019
bias model test rmse random:  0.669751988115


In [32]:
# Load the precomputed similarities object; .item() unwraps the single element
# held by the loaded array (presumably a dict saved with np.save, since .keys()
# is called on it later; confirm against the code that wrote the file).
# allow_pickle=True is spelled out because NumPy >= 1.16.3 refuses pickled
# object arrays by default.
# SECURITY: unpickling can execute arbitrary code; only load this file if it
# comes from a trusted source.
similarities_dictionary = np.load('similarities_corr.npy', allow_pickle=True).item()

In [33]:
# Number of keys in the loaded similarities mapping.
len(similarities_dictionary)

475800