In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.sparse as sp
from time import time
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from scipy.stats.stats import pearsonr
import graphlab
from sklearn.cross_validation import train_test_split
from math import sqrt
%matplotlib inline

In [32]:
df = pd.read_csv("processed_data.csv")

In [33]:
reg = df[df["user_review_count"] > 2]
reg.shape

(47583, 36)

In [34]:
high_user = reg.new_user_id.max()
high_item = reg.new_course_id.max()

In [35]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
validate_df = pd.read_csv("validation.csv")

In [36]:
train_df = train_df[["new_course_id", "new_user_id", "rating"]]
validate_df = validate_df[["new_course_id", "new_user_id", "rating"]]
test_df = test_df[["new_course_id", "new_user_id", "rating"]]

In [37]:
train_df.head()

Unnamed: 0,new_course_id,new_user_id,rating
0,198,90879,5.0
1,643,90879,5.0
2,79,90879,5.0
3,365,257277,5.0
4,198,124321,5.0


In [38]:
train_df_matrix = np.zeros((high_user, high_item))
for line in train_df.itertuples():
    train_df_matrix[line[2]-1, line[1]-1] = line[3]

In [39]:
validation_df_matrix = np.zeros((high_user, high_item))
for line in validate_df.itertuples():
    validation_df_matrix[line[2]-1, line[1]-1] = line[3]

In [40]:
test_df_matrix = np.zeros((high_user, high_item))
for line in test_df.itertuples():
    test_df_matrix[line[2]-1, line[1]-1] = line[3]

In [41]:
items_sim = cosine_similarity(train_df_matrix.T)
least_to_most = np.argsort(items_sim,1)
neighborhood = least_to_most[:, -75:]

In [42]:
n_items = reg["new_course_id"].nunique()
userid = 90879

In [43]:
items_rated_by_user = train_df_matrix[userid-1].nonzero()

In [44]:
items_rated_by_user

(array([ 78, 197, 642]),)

In [45]:
def pred_one_user(userid):
    items_rated_by_user = train_df_matrix[userid-1].nonzero()[0]
    out = np.zeros(n_items)
    for item_to_rate in range(n_items):
        #print item_to_rate
        relevant_items = np.intersect1d(neighborhood[item_to_rate],items_rated_by_user, assume_unique=True)
        
        out[item_to_rate] = np.mean((train_df_matrix[userid-1, relevant_items] * items_sim[item_to_rate, relevant_items]) / items_sim[item_to_rate, relevant_items].sum())
    cleaned_out = np.nan_to_num(out)
    return np.where(cleaned_out > 0)[0], cleaned_out[cleaned_out > 0]

In [46]:
pred_one_user(userid)

(array([  5,   7,  12,  15,  16,  21,  24,  28,  30,  37,  38,  43,  44,
         47,  53,  54,  59,  64,  68,  71,  72,  73,  78,  82,  86,  87,
         88,  94,  96,  98,  99, 103, 104, 110, 112, 119, 128, 129, 133,
        145, 150, 153, 154, 155, 156, 157, 167, 168, 170, 174, 182, 184,
        185, 187, 197, 206, 207, 210, 211, 213, 214, 216, 217, 222, 224,
        229, 230, 232, 233, 235, 237, 238, 245, 247, 251, 254, 256, 257,
        259, 262, 263, 265, 268, 270, 271, 276, 285, 286, 287, 288, 289,
        293, 295, 298, 301, 302, 303, 306, 307, 310, 312, 318, 323, 328,
        331, 333, 341, 342, 345, 346, 348, 351, 353, 357, 360, 363, 364,
        366, 368, 369, 371, 374, 375, 377, 380, 381, 385, 386, 387, 390,
        393, 395, 397, 398, 402, 410, 413, 416, 417, 419, 420, 422, 423,
        424, 429, 430, 431, 434, 438, 439, 440, 442, 452, 459, 464, 470,
        471, 480, 483, 484, 485, 486, 489, 490, 493, 494, 499, 501, 506,
        511, 512, 515, 524, 529, 531, 533, 534, 537

In [47]:
item_item_pred = {}
for i in test_df["new_user_id"].unique():
    #print i
    item_item_pred[i] = pred_one_user(i)

  


In [48]:
item_item_pred[229383]

(array([  2,   6,   7,  21,  27,  28,  37,  40,  41,  47,  49,  54,  55,
         59,  65,  66,  75,  76,  85,  86,  87,  94,  96,  98, 104, 107,
        110, 115, 123, 128, 129, 133, 150, 151, 153, 154, 155, 163, 170,
        172, 174, 175, 180, 182, 184, 185, 187, 197, 201, 206, 210, 211,
        217, 219, 223, 224, 226, 231, 233, 235, 236, 237, 239, 249, 250,
        251, 252, 254, 256, 259, 260, 265, 272, 275, 276, 282, 285, 286,
        289, 293, 294, 298, 303, 306, 309, 311, 318, 326, 330, 333, 341,
        344, 345, 350, 351, 360, 364, 366, 368, 369, 371, 374, 375, 380,
        388, 390, 392, 395, 396, 397, 398, 402, 406, 407, 410, 413, 415,
        417, 418, 419, 422, 423, 425, 430, 431, 433, 434, 438, 441, 450,
        452, 464, 467, 469, 472, 474, 480, 482, 484, 485, 489, 494, 496,
        498, 499, 500, 501, 506, 509, 511, 512, 513, 516, 525, 526, 539,
        541, 542, 545, 546, 549, 552, 553, 555, 557, 559, 565, 567, 575,
        577, 580, 583, 585, 593, 594, 601, 603, 605

In [49]:
ybar = train_df.rating.mean()

In [50]:
predictions = []
for i in xrange(len(test_df)):
    userid = test_df.iloc[i]["new_user_id"]
    item_not_rated = test_df.iloc[i]["new_course_id"]
    rel_items = item_item_pred[userid][0]
    if item_not_rated in rel_items:
        idx = np.where(item_item_pred[userid][0] == item_not_rated)
        predictions.append(item_item_pred[userid][1][idx])
    else:
        if item_not_rated in train_df.new_course_id.unique():
            predictions.append(train_df[train_df["new_course_id"] == item_not_rated]["rating"].mean())
        else:
            predictions.append(ybar)

In [51]:
len(predictions)

12867

In [52]:
test_df[test_df["new_user_id"] == 229383]

Unnamed: 0,new_course_id,new_user_id,rating
2342,342,229383,5.0


In [53]:
def get_rmse(true_rat, predictions):
    return np.sqrt(mean_squared_error(true_rat, predictions))

In [54]:
get_rmse(test_df["rating"], predictions)

0.83251624505910471