                Estimating Non-existing Ratings from Existing Ones

In [31]:
#import the necessary tools
import numpy as np
import pandas as pd
from fancyimpute import BiScaler
from helperFunCF import GenerateTrainingSet
from soft_impute_local import SoftImpute


In [13]:
# Load the ratings CSV into a NumPy array and preview its first five rows.
# NOTE(review): judging by the preview, each row looks like
# (user id, joke id, rating) -- confirm against the data source.
rating_frame = pd.read_csv('jester-data-3.csv', sep=",")
rating = rating_frame.values
print(rating[:5])

[[ 1.    1.   -7.82]
 [ 1.    2.    8.79]
 [ 1.    3.   -9.66]
 [ 1.    4.   -8.16]
 [ 1.    5.   -7.52]]


In [15]:
# Create an incomplete matrix with one row per distinct value in column 0 of
# `rating` and one column per distinct value in column 1 (the joke ids).
# NOTE(review): column 0 is presumably the user id -- confirm.
n_users = len(np.unique(rating[:, 0]))
n_jokes = len(np.unique(rating[:, 1]))
matrix_incomplete = np.zeros((n_users, n_jokes))
print(matrix_incomplete)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [17]:
# Re-label the joke ids (column 1) as consecutive integers 1..K, because not
# every joke id in the original numbering has ratings.
# `usedID` holds the sorted distinct original ids, so usedID[k - 1] is the
# original id of the joke that is re-labelled k.
# np.unique(..., return_inverse=True) returns, for every row, the position of
# its id inside `usedID` in a single vectorized pass; this replaces the
# per-row np.where scan over all ids and produces the same values.
usedID, joke_positions = np.unique(rating[:, 1], return_inverse=True)
rating[:, 1] = joke_positions + 1


In [18]:
# Mark every cell as missing (NaN) so that only observed ratings hold a value.
matrix_incomplete.fill(np.nan)
# Turn the ids in columns 0 and 1 into 0-based integer positions by
# subtracting 1, and pair them up as a (rows, columns) index.
# NOTE(review): the row position assumes the ids in column 0 are contiguous
# 1..n_rows; only column 1 is re-labelled in this notebook -- confirm.
row_positions = (rating[:, 0] - 1).astype(int)
col_positions = (rating[:, 1] - 1).astype(int)
indices = row_positions, col_positions
# Write the known ratings (column 2) into their cells.
matrix_incomplete[indices] = rating[:, 2]


In [19]:
# Split the observed ratings into a training part and a validation part using
# the project helper GenerateTrainingSet. It receives column 0 and column 1 of
# `rating` plus 0.80, the training share (80%); the rest is held out.
# NOTE(review): the returned values are used directly to index the rating
# matrices in later cells, so the helper presumably returns 0-based index
# pairs -- confirm against helperFunCF.
# NOTE(review): no seed is passed here; check whether the split is
# reproducible across runs.
train_indices, validation_indices = GenerateTrainingSet(rating[:,0], rating[:,1], 0.80)


In [20]:
# Build the incomplete training set: a matrix of the same shape that is NaN
# everywhere except at the training index pairs, which take their values from
# the full observed matrix.
matrix_train = np.full_like(matrix_incomplete, np.nan)
matrix_train[train_indices] = matrix_incomplete[train_indices]

SoftImpute Model for Collaborative Filtering

In [37]:
# With the matrices ready, normalize the training matrix before running
# SoftImpute. Both scale_rows and scale_columns are False, so the BiScaler
# does not rescale the spread of rows or columns; it is used here to give
# rows and columns a zero mean.
# NOTE(review): the centering relies on BiScaler's default centering
# options -- confirm against the installed fancyimpute version.
biscaler = BiScaler(
    scale_rows=False,
    scale_columns=False,
    max_iters=50,
    verbose=False,
)
matrix_train_normalized = biscaler.fit_transform(matrix_train)


In [34]:
# Configure the SoftImpute model that will complete the matrix, using J=9
# (9 archetypes), at most 200 iterations and a fixed random seed.
# NOTE(review): J is presumably the rank of the low-rank factorization --
# confirm against soft_impute_local.
softImpute = SoftImpute(
    J=9,
    maxit=200,
    random_seed=2022,
    verbose=False,
)

In [35]:
# Fit the SoftImpute model on the normalized training matrix. The value
# returned by fit() is kept and used as the fitted model in the prediction
# step.
# NOTE(review): fit() presumably returns the fitted estimator itself --
# confirm against soft_impute_local.
matrix_train_softImpute = softImpute.fit(matrix_train_normalized)

In [39]:
# Use the fitted SoftImpute model to produce the predicted (completed) matrix.
# copyto=False is passed so that matrix_train_normalized itself is not
# changed by the prediction.
matrix_train_filled_normalized = matrix_train_softImpute.predict(
    matrix_train_normalized,
    copyto=False,
)
# Undo the normalization applied earlier by biscaler.fit_transform() so the
# predictions are back on the original rating scale.
matrix_train_filled = biscaler.inverse_transform(matrix_train_filled_normalized)
print(matrix_train_filled)

[[-1.84722898e+00 -8.75379265e-03 -5.19988667e+00 ... -2.30893147e+00
  -7.24343457e+00 -5.43703966e+00]
 [ 5.38981785e+00  2.09806557e+00  8.18552860e+00 ...  1.21377518e+00
   2.15783324e+00  6.26199274e+00]
 [ 9.76160769e+00  8.89171544e+00  6.97654940e+00 ...  5.85590944e+00
   6.78354250e+00  6.83737736e+00]
 ...
 [ 2.45095659e+00  1.81927842e+00  1.76973025e+00 ...  3.64457193e-01
  -3.85398257e-01  2.47811704e+00]
 [ 1.81586993e+00  9.07516130e-01 -4.61188973e-01 ... -2.56892433e+00
  -3.37971006e+00  1.19925568e+00]
 [ 1.67496737e+00 -9.97810303e-01  1.30932792e+00 ...  3.99815775e+00
   6.22565021e+00  1.69814605e+00]]


                Analyze the Predicted Ratings for Model Performance (OOS R^2)

In [40]:
# Baseline predictor: the plain average of all ratings in the training set.
observed_train_ratings = matrix_train[train_indices]
train_average = np.mean(observed_train_ratings)

In [41]:
# Compute out-of-sample (validation) and in-sample (training) R2.
# R2 = 1 - MSE(model) / MSE(baseline), where the baseline always predicts the
# training-set average.
actual_validation = matrix_incomplete[validation_indices]
actual_training = matrix_incomplete[train_indices]

# Mean squared error of the completed matrix on each split.
validation_mse = ((matrix_train_filled[validation_indices] - actual_validation) ** 2).mean()
training_mse = ((matrix_train_filled[train_indices] - actual_training) ** 2).mean()

# Mean squared error of the constant baseline on each split.
validation_mse_baseline = ((train_average - actual_validation) ** 2).mean()
training_mse_baseline = ((train_average - actual_training) ** 2).mean()

out_of_sample_r2 = 1 - validation_mse / validation_mse_baseline
in_sample_r2 = 1 - training_mse / training_mse_baseline
print("out-of-sample R2: %.4f, in-sample R2: %.4f." % (out_of_sample_r2, in_sample_r2))

out-of-sample R2: 0.3404, in-sample R2: 0.5671.


In [None]:
# inspect a single entry of the completed matrix after prediction

In [42]:
# Spot-check one entry of the completed matrix: row 10882, column 64
# (0-based positions).
sample_entry = matrix_train_filled[10882, 64]
print("After matrix completion =", sample_entry)

After matrix completion = 2.0547280898465257


                    The END!