In [67]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from IPython.display import Image
np.set_printoptions(precision = 3)

In [68]:
data = pd.read_csv("jester-data-1.csv", header=None)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,74,-7.82,8.79,-9.66,-8.16,-7.52,-8.5,-9.85,4.17,-8.98,...,2.82,99.0,99.0,99.0,99.0,99.0,-5.63,99.0,99.0,99.0
1,100,4.08,-0.29,6.36,4.37,-2.38,-9.66,-0.73,-5.34,8.88,...,2.82,-4.95,-0.29,7.86,-0.19,-2.14,3.06,0.34,-4.32,1.07
2,49,99.0,99.0,99.0,99.0,9.03,9.27,9.03,9.27,99.0,...,99.0,99.0,99.0,9.08,99.0,99.0,99.0,99.0,99.0,99.0
3,48,99.0,8.35,99.0,99.0,1.8,8.16,-2.82,6.21,99.0,...,99.0,99.0,99.0,0.53,99.0,99.0,99.0,99.0,99.0,99.0
4,91,8.5,4.61,-4.17,-5.39,1.36,1.6,7.04,4.61,-0.44,...,5.19,5.58,4.27,5.19,5.73,1.55,3.11,6.55,1.8,1.6


In [108]:
data = data.head(50)

In [109]:
data.shape

(10, 101)

In [110]:
alpha = 0.001
iterations = 30
k = 5
n_features = 2

As the missing values are replaced with the number "99", the missing values are now taken as NaN values

In [111]:
new_data = data[data.applymap(lambda x: x!=99)]

In [112]:
data_narray = np.array(new_data)

In [113]:
data_narray

array([[ 7.40e+01, -7.82e+00,  8.79e+00, ...,       nan,       nan,
              nan],
       [ 1.00e+02,  4.08e+00, -2.90e-01, ...,  3.40e-01, -4.32e+00,
         1.07e+00],
       [ 4.90e+01,       nan,       nan, ...,       nan,       nan,
              nan],
       ...,
       [ 1.00e+02,  6.84e+00,  3.16e+00, ..., -5.00e-02,  1.31e+00,
         0.00e+00],
       [ 1.00e+02, -3.79e+00, -3.54e+00, ..., -2.90e-01, -3.40e+00,
        -4.95e+00],
       [ 7.20e+01,  3.01e+00,  5.15e+00, ...,       nan,       nan,
              nan]])

In [114]:
train_indices = np.argwhere(~np.isnan(data_narray))
null_indices = np.argwhere(np.isnan(data_narray))

In [115]:
train_indices

array([[ 0,  0],
       [ 0,  1],
       [ 0,  2],
       ...,
       [ 9, 70],
       [ 9, 85],
       [ 9, 92]], dtype=int64)

In [116]:
user_ratings = new_data.values
latent_user_preferences = np.random.random((user_ratings.shape[0], n_features))
latent_item_features = np.random.random((user_ratings.shape[1],n_features))

In [117]:
print("latent jokes feature size:  ", latent_item_features.shape)
print("latent user feature size:  ", latent_user_preferences.shape)

latent jokes feature size:   (101, 2)
latent user feature size:   (10, 2)


In [118]:
def predict_rating(user_id,item_id,latent_user_preferences, latent_item_features):
    """ Predict a rating given a user_id and an item_id.
    """
    user_preference = latent_user_preferences[user_id]
    item_preference = latent_item_features[item_id]
    return user_preference.dot(item_preference)

def train(user_id, item_id, rating, latent_user_preferences, latent_item_features, alpha = 0.001):
    
    #print item_id
    prediction_rating = predict_rating(user_id, item_id,latent_user_preferences, latent_item_features)
    err =  ( prediction_rating- rating );
    #print err
    user_pref_values = latent_user_preferences[user_id][:]
    latent_user_preferences[user_id] -= alpha * err * latent_item_features[item_id]
    latent_item_features[item_id] -= alpha * err * user_pref_values
    return err, latent_user_preferences, latent_item_features

In [119]:
def sgd(fold_train, fold_test,latent_user_preferences, latent_item_features, iterations = 50):
    """ Iterate over all users and all items and train for 
        a certain number of iterations
    """
    #fold_train= [ list(item) for item in fold_train ]
    #fold_test= [ list(item) for item in fold_test ]
    
    train_mse = 0 
    test_mse = 0
    
    for iteration in range(0,iterations):
        error = []
        for i,j in fold_train:
            user_id = fold_train[i][0]
            item_id = fold_train[i][1]
            rating = user_ratings[i][j]
            train_err,latent_user_preferences, latent_item_features= train(user_id, 
                                                                           item_id, rating, 
                                                                           latent_user_preferences, 
                                                                           latent_item_features, alpha = 0.001)
            error.append(train_err)
       
        train_mse = (np.array(error) ** 2).mean()   
        #if(iteration%10 == 0 ):
         #   print("train mse ",train_mse)
    
    test_error =[]
    for i,j in fold_test:
        user_id = fold_test[i][0]
        item_id = fold_test[i][1]
        rating = user_ratings[i][j]
        err  =user_ratings[i][j]- predict_rating(user_id, item_id,latent_user_preferences, latent_item_features)
        test_error.append(err)
    
    test_mse = (np.array(test_error) ** 2).mean()   
    #print("test mse ",test_mse)
    
    return train_mse, test_mse

In [120]:
kf = KFold(n_splits=k)
for train_ind, test_ind in kf.split(train_indices):
    fold_train = train_indices[train_ind]
    fold_test = train_indices[test_ind]
    latent_user_preferences = np.random.random((user_ratings.shape[0], n_features))
    latent_item_features = np.random.random((user_ratings.shape[1],n_features))
    train_mse, test_mse = sgd(fold_train, fold_test,latent_user_preferences, latent_item_features)
    print("train mse "+ str(train_mse) +"   test mse: " + str(test_mse))

train mse 94.16000000197829   test mse: 137.74774832516653
train mse 103.42827111659626   test mse: 115.83663832355063
train mse 103.18122899537327   test mse: 109.85636803890681
train mse 88.01239821011623   test mse: 159.21689750836467
train mse 116.79619155542491   test mse: 48.41479519588243
