In [4]:
import pandas as pd
import numpy as np
from scipy.io import loadmat
from scipy.sparse import coo_matrix
from sklearn.preprocessing import Imputer
from sklearn.metrics.pairwise import cosine_similarity

from predictive_imputer import PredictiveImputer


In [34]:
def eucl_dist(x,y):
    return np.sqrt(np.sum(np.square(x-y)))    

def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

scores = [3.0, 1.0, 0.2]
print(softmax(scores))

[0.8360188  0.11314284 0.05083836]


In [6]:
train_mat = loadmat('/home/bsong/kaggle_uci/kaggle77b_trainset.mat')
test_mat = loadmat('/home/bsong/kaggle_uci/kaggle77b_testset.mat')

train = np.array(train_mat['trainset'])
test = np.array(test_mat['testset'])

In [7]:
print(train.shape, test.shape)

((21983, 100), (3000, 100))


In [8]:
train[train==99] = np.NaN
test[test==99] = np.NaN

coor_test_pred_vals = np.where(test == 55)
test[test==55] = np.NaN

In [9]:
# impute with mean 

imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp_fit = imp.fit(train)
train = imp.transform(train)


# impute with knn (using fancyimpute package)
#train = KNN(k=3).complete(train)
'''
# impute with rf using predictive_imputer
imp = PredictiveImputer()
imp.fit(train)
train = imp.transform(train)
'''

'\n# impute with rf using predictive_imputer\nimp = PredictiveImputer()\nimp.fit(train)\ntrain = imp.transform(train)\n'

In [10]:
coor_notnan_test = np.where(~np.isnan(test)) # tuple of ((x1,x2,...xn),(y1,y2,...,yn))

In [11]:
pred_mat = np.array([[0,0,0]*3000]).reshape(-1,3).astype(float)
k=200

for user in range(3000):
    #user = 10
    #get inds of user i and items that are not NaN
    user_chunk = np.where(coor_notnan_test[0] == user) 
    user_notnan_ind = coor_notnan_test[1][user_chunk]

    #get inds of items j that are for predicting
    pred_item_chunk = np.where(coor_test_pred_vals[0] == user)
    pred_item_ind = coor_test_pred_vals[1][pred_item_chunk]

    #find 10 similar users in train
    #user_cos_simil = cosine_similarity(test[user,user_notnan_ind].reshape(1,-1), train[:,user_notnan_ind])
    
    simil_inds = np.squeeze(np.argsort(-user_cos_simil))[:k] #
    simil_inds_abs = np.squeeze(np.argsort(-np.abs(user_cos_simil)))[:k] # absolute max

    #filter train set to just similar users 
    train_short_col = train[:, pred_item_ind]
    train_short_rowcol = train_short_col[simil_inds,:].copy() 
    train_short_rowcol_abs = train_short_col[simil_inds_abs,:].copy()

    #find weighted mean rating of item j for user i 
    # any variables with '_abs' at the end represents the variables used for the absolute similar results  
    
    weights_users = user_cos_simil[0,simil_inds].reshape(1,k)
    weights_users_abs = user_cos_simil[0,simil_inds_abs].reshape(1,k)

    #get a matrix of +1/-1 to multiply with weights that lose signs when obtaining softmax 
    weights_users_ones = weights_users_abs.copy()
    weights_users_ones[weights_users_ones < 0] = -1
    weights_users_ones[weights_users_ones > 0] = 1

    soft_weights = softmax(weights_users)#*weights_users_ones
    soft_weights_abs = softmax(np.abs(weights_users_abs))*weights_users_ones

    preds_notabs = np.matmul(soft_weights,train_short_rowcol)
    preds_abs = np.matmul(soft_weights_abs, train_short_rowcol_abs)

    preds = np.mean(np.array([preds_notabs,preds_abs]), axis = 0 )

    # append to matrix
    pred_mat[user,:] = preds
    
    if (user+1)%100 == 0:
        print('Done with ' + str(user+1) + ' users.')


ValueError: cannot reshape array of size 8800 into shape (1,200)

In [35]:
euc_distances = [eucl_dist(test[user,user_notnan_ind].reshape(1,-1),i) for i in train[:,user_notnan_ind]]



In [36]:
euc_distances

[60.62769927504638,
 40.32134794373819,
 34.65304774227715,
 45.76053597667841,
 33.77050179502226,
 70.0410208377919,
 58.3526803154748,
 42.38220499218982,
 30.285944254479848,
 31.99862855047561,
 41.84590063554613,
 28.74322472778777,
 42.98743653673711,
 57.571381779491794,
 41.37871312643737,
 41.73111757209049,
 31.436943235626455,
 35.839728664328355,
 42.10778156331203,
 40.98618431388643,
 36.38287974924559,
 40.90559741649057,
 33.23376324440945,
 40.68544488040629,
 57.86510623325132,
 29.711283225233228,
 41.77542459389252,
 36.017762513466316,
 42.67129037085444,
 24.636146208366274,
 37.79608965029982,
 39.31331987234684,
 28.809165850345725,
 27.03435591982912,
 50.32053175663802,
 41.322900665193636,
 38.17088845071042,
 36.499149335427525,
 50.03843114534669,
 35.902227220403205,
 44.549315275182416,
 46.921507002497904,
 48.44976883329785,
 41.11367898887182,
 29.29726802910397,
 44.791236313991945,
 62.70004625197656,
 48.19131587959942,
 46.500436214981896,
 41.873

In [None]:
pred_mat[:6,]

In [None]:
pred_df = pd.DataFrame({'UserId':[i for i in range(1,3001)],'Rating1':pred_mat[:,0],'Rating2':pred_mat[:,1],'Rating3':pred_mat[:,2]})
pred_df=pred_df[['UserId','Rating1','Rating2','Rating3']]
pred_df.to_csv('/home/bsong/kaggle_uci/predictions.csv',index = False)