In [21]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
import math

In [2]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [3]:
%cd /content/gdrive/MyDrive
%cd CF_Data

/content/gdrive/MyDrive
/content/gdrive/MyDrive/CF_Data


In [4]:
df = pd.read_csv('ml-100k/u.data', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
df

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [5]:
#creating a matrix of users and items
ratings = df.pivot_table(index='user_id', columns='item_id', values='rating')

In [6]:
ratings

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,2.0,,,4.0,5.0,3.0,,...,,,,,,,,,,
941,5.0,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [7]:
def make_sim_mat(ratings):
    user_sim = np.zeros((ratings.shape[0], ratings.shape[0]))
    print(ratings.shape[0])
    for i in range(ratings.shape[0]):
        for j in range(i, ratings.shape[0]):
            rating1 = np.array(ratings.iloc[i])
            rating2 = np.array(ratings.iloc[j])
            nan_index = np.argwhere(np.isnan(rating1) | np.isnan(rating2))
            rating1[nan_index] = 0
            rating2[nan_index] = 0
            norm1=np.linalg.norm(rating1)
            norm2=np.linalg.norm(rating2)
            if norm1==0 or norm2==0:
              continue
            sim = np.dot(rating1, rating2)/(norm1*norm2)
            user_sim[i,j] = sim
            user_sim[j,i] = sim
    return user_sim

sim_mat = make_sim_mat(ratings)

943


In [8]:
sim_mat

array([[1.        , 0.96058196, 0.85707467, ..., 0.97083582, 0.92350499,
        0.92974418],
       [0.96058196, 1.        , 0.93560149, ..., 0.94022545, 0.94849315,
        0.97747425],
       [0.85707467, 0.93560149, 1.        , ..., 0.9732227 , 0.91200932,
        0.98994949],
       ...,
       [0.97083582, 0.94022545, 0.9732227 , ..., 1.        , 0.98230247,
        0.96075428],
       [0.92350499, 0.94849315, 0.91200932, ..., 0.98230247, 1.        ,
        0.9762865 ],
       [0.92974418, 0.97747425, 0.98994949, ..., 0.96075428, 0.9762865 ,
        1.        ]])

In [50]:
def find_neighbors(usr, sim_mat1, k):
    return np.argsort(np.delete(sim_mat1[usr],usr))[-50:]+1

def recommend_rating(usr, item, ratings, sim_mat, neighbors):
    rating = 0
    #print(usr, '------',neighbors,'\n--------------------------')
    train_data = usr
    test_data = item
    prediction = np.zeros((len(test_data), len(test_data[0])))
    for i in range(len(test_data)):
        for j in range(len(test_data[0])):
            if test_data[i][j] == 0:
                prediction[i][j] = -1
            else:
                count = 0
                sum = 0
                # for l in range(len(sim_mat)):
                for l in neighbors[i]:
                    if sim_mat[i][l] != 0 and train_data[l][j] != 0:
                        sum += sim_mat[i][l] * train_data[l][j]
                        count += 1
                if count != 0:
                    prediction[i][j] = sum / count
                else:
                    prediction[i][j] = -1
    return prediction

In [51]:

user_mae_table = np.zeros((6,5))  
def train_test(sim_mat, ratings, fold_index):
    train_data = pd.read_csv(f'ml-100k/u{fold_index}.base', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
    test_data = pd.read_csv(f'ml-100k/u{fold_index}.test', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
    train_data_matrix = np.zeros((943, 1682))
    for line in train_data.itertuples():
        train_data_matrix[line[1]-1, line[2]-1] = line[3]

    test_data_matrix = np.zeros((943, 1682))
    for line in test_data.itertuples():
        test_data_matrix[line[1]-1, line[2]-1] = line[3]

    knn_mae = np.zeros(5)
    knn_mae_count = np.zeros(5)
    for k in range(10,51,10):
        top_k_similar_users = np.zeros((len(sim_mat), k))
        for i in range(len(sim_mat)):
            top_k_similar_users[i] = np.argsort(sim_mat[i])[::-1][1:k+1]
        top_k_similar_users = top_k_similar_users.astype(int)
        prediction = recommend_rating(train_data_matrix, test_data_matrix, ratings, sim_mat, top_k_similar_users)
        mae = 0
        count = 0
        for i in range(len(test_data_matrix)):
            for j in range(len(test_data_matrix[0])):
                if test_data_matrix[i][j] != 0 and prediction[i][j] !=-1:
                    knn_mae[k//10-1] += abs(prediction[i][j] - test_data_matrix[i][j])
                    knn_mae_count[k//10-1] += 1
    for ind in range(5):
        knn_mae[ind] = knn_mae[ind]/knn_mae_count[ind]
        user_mae_table[fold_index-1][ind] = knn_mae[ind]
    print(knn_mae)

##User Based Recommeder System

In [52]:
for fold_index in range(1, 6):
    print(f'Fold {fold_index}')
    print('K=10 \t K=20 \t K=30 \t K=40 \t K=50')
    train_test(sim_mat, ratings, fold_index)
    print('-------------------------------------------------\n')

Fold 1
K=10 	 K=20 	 K=30 	 K=40 	 K=50
[0.69341833 0.66217463 0.65243522 0.65583832 0.66030812]
-------------------------------------------------

Fold 2
K=10 	 K=20 	 K=30 	 K=40 	 K=50
[0.7056505  0.68045424 0.66443122 0.66749875 0.66135999]
-------------------------------------------------

Fold 3
K=10 	 K=20 	 K=30 	 K=40 	 K=50
[0.74456908 0.69148426 0.67425742 0.67142735 0.67258492]
-------------------------------------------------

Fold 4
K=10 	 K=20 	 K=30 	 K=40 	 K=50
[0.73412287 0.68913558 0.66738187 0.66640145 0.66417656]
-------------------------------------------------

Fold 5
K=10 	 K=20 	 K=30 	 K=40 	 K=50
[0.73603414 0.693678   0.67126889 0.66589191 0.66727361]
-------------------------------------------------



In [53]:
for i in range(5):
  user_mae_table[5][i] = np.mean(user_mae_table.T[i][:-1])
user_mae_table

array([[0.69341833, 0.66217463, 0.65243522, 0.65583832, 0.66030812],
       [0.7056505 , 0.68045424, 0.66443122, 0.66749875, 0.66135999],
       [0.74456908, 0.69148426, 0.67425742, 0.67142735, 0.67258492],
       [0.73412287, 0.68913558, 0.66738187, 0.66640145, 0.66417656],
       [0.73603414, 0.693678  , 0.67126889, 0.66589191, 0.66727361],
       [0.72275898, 0.68338534, 0.66595492, 0.66541156, 0.66514064]])

In [54]:
user_based_mae_df = pd.DataFrame(user_mae_table, columns=['k = 10', 'k = 20', 'k = 30', 'k = 40', 'k = 50'], index=['Fold 1', 'Fold 2', 'Fold 3', 'Fold 4', 'Fold 5', 'Average'])
user_based_mae_df

Unnamed: 0,k = 10,k = 20,k = 30,k = 40,k = 50
Fold 1,0.693418,0.662175,0.652435,0.655838,0.660308
Fold 2,0.705651,0.680454,0.664431,0.667499,0.66136
Fold 3,0.744569,0.691484,0.674257,0.671427,0.672585
Fold 4,0.734123,0.689136,0.667382,0.666401,0.664177
Fold 5,0.736034,0.693678,0.671269,0.665892,0.667274
Average,0.722759,0.683385,0.665955,0.665412,0.665141


#Item-Based Recommender System

In [55]:
ratings_item = df.pivot_table(index='item_id', columns='user_id', values='rating')
ratings_item

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,4.0,,,4.0,4.0,,,,4.0,...,2.0,3.0,4.0,,4.0,,,5.0,,
2,3.0,,,,3.0,,,,,,...,4.0,,,,,,,,,5.0
3,4.0,,,,,,,,,,...,,,4.0,,,,,,,
4,3.0,,,,,,5.0,,,4.0,...,5.0,,,,,,2.0,,,
5,3.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1678,,,,,,,,,,,...,,,,,,,,,,
1679,,,,,,,,,,,...,,,,,,,,,,
1680,,,,,,,,,,,...,,,,,,,,,,
1681,,,,,,,,,,,...,,,,,,,,,,


In [56]:
sim_mat_item = make_sim_mat(ratings_item)
sim_mat_item

1682


array([[1.        , 0.94873739, 0.91329972, ..., 0.        , 1.        ,
        1.        ],
       [0.94873739, 1.        , 0.90887971, ..., 0.        , 1.        ,
        1.        ],
       [0.91329972, 0.90887971, 1.        , ..., 0.        , 0.        ,
        1.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [1.        , 1.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [1.        , 1.        , 1.        , ..., 0.        , 0.        ,
        1.        ]])

In [57]:
item_mae_table = np.zeros((6,5))  
def train_test_item(sim_mat, ratings, fold_index):
    train_data = pd.read_csv(f'ml-100k/u{fold_index}.base', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
    test_data = pd.read_csv(f'ml-100k/u{fold_index}.test', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
    train_data_matrix = np.zeros((943, 1682))
    for line in train_data.itertuples():
        train_data_matrix[line[1]-1, line[2]-1] = line[3]

    test_data_matrix = np.zeros((943, 1682))
    for line in test_data.itertuples():
        test_data_matrix[line[1]-1, line[2]-1] = line[3]
    test_data_matrix = test_data_matrix.T
    knn_mae = np.zeros(5)
    knn_mae_count = np.zeros(5)
    for k in range(10,51,9):
        top_k_similar_items = np.zeros((len(sim_mat), k))
        for i in range(len(sim_mat)):
            top_k_similar_items[i] = np.argsort(sim_mat[i])[::-1][1:k+1]
        top_k_similar_items = top_k_similar_items.astype(int)
        prediction = recommend_rating(train_data_matrix.T, test_data_matrix, ratings, sim_mat, top_k_similar_items)
        mae = 0
        count = 0
        for i in range(len(test_data_matrix)):
            for j in range(len(test_data_matrix[0])):
                if test_data_matrix[i][j] != 0 and prediction[i][j] !=-1:
                    knn_mae[k//9-1] += abs(prediction[i][j] - test_data_matrix[i][j])
                    knn_mae_count[k//9-1] += 1
    for ind in range(5):
        knn_mae[ind] = knn_mae[ind]/knn_mae_count[ind]
        item_mae_table[fold_index-1][ind] = knn_mae[ind]
    print(knn_mae)

In [58]:
for fold_index in range(1, 6):
    print(f'Fold {fold_index}')
    print('K=10 \t K=20 \t K=30 \t K=40 \t K=50')
    train_test_item(sim_mat_item, ratings_item, fold_index)
    print('-------------------------------------------------\n')

Fold 1
K=10 	 K=20 	 K=30 	 K=40 	 K=50
[0.58194109 0.76436431 0.82764885 0.87929665 0.90119949]
-------------------------------------------------

Fold 2
K=10 	 K=20 	 K=30 	 K=40 	 K=50
[0.57267618 0.74757766 0.83813614 0.8751361  0.90464595]
-------------------------------------------------

Fold 3
K=10 	 K=20 	 K=30 	 K=40 	 K=50
[0.56173208 0.72981686 0.82406259 0.88137369 0.92750844]
-------------------------------------------------

Fold 4
K=10 	 K=20 	 K=30 	 K=40 	 K=50
[0.47517637 0.65488628 0.76255868 0.83719287 0.87335419]
-------------------------------------------------

Fold 5
K=10 	 K=20 	 K=30 	 K=40 	 K=50
[0.45926615 0.64218053 0.74437248 0.79214206 0.83013066]
-------------------------------------------------



In [59]:
for i in range(5):
  item_mae_table[5][i] = np.mean(item_mae_table.T[i][:-1])
item_mae_table

array([[0.58194109, 0.76436431, 0.82764885, 0.87929665, 0.90119949],
       [0.57267618, 0.74757766, 0.83813614, 0.8751361 , 0.90464595],
       [0.56173208, 0.72981686, 0.82406259, 0.88137369, 0.92750844],
       [0.47517637, 0.65488628, 0.76255868, 0.83719287, 0.87335419],
       [0.45926615, 0.64218053, 0.74437248, 0.79214206, 0.83013066],
       [0.53015837, 0.70776513, 0.79935575, 0.85302827, 0.88736775]])

In [60]:
item_based_mae_df = pd.DataFrame(item_mae_table, columns=['k = 10', 'k = 20', 'k = 30', 'k = 40', 'k = 50'], index=['Fold 1', 'Fold 2', 'Fold 3', 'Fold 4', 'Fold 5', 'Average'])
item_based_mae_df

Unnamed: 0,k = 10,k = 20,k = 30,k = 40,k = 50
Fold 1,0.581941,0.764364,0.827649,0.879297,0.901199
Fold 2,0.572676,0.747578,0.838136,0.875136,0.904646
Fold 3,0.561732,0.729817,0.824063,0.881374,0.927508
Fold 4,0.475176,0.654886,0.762559,0.837193,0.873354
Fold 5,0.459266,0.642181,0.744372,0.792142,0.830131
Average,0.530158,0.707765,0.799356,0.853028,0.887368


#Variance Weighting

In [61]:
def calculate_variance_weights(ratings_user, ratings_item):
    var_weights_user = np.zeros(943)
    var_weights_item = np.zeros(1682)
    for i in range(943):
        var_weights_user[i] = np.var(ratings_user.iloc[i])
    var_weights_user = var_weights_user - (np.min(var_weights_user)/(np.max(var_weights_user) - np.min(var_weights_user)+1))
    for i in range(1682):
      var_weights_item[i] = np.var(ratings_item.iloc[i])
    var_weights_item = var_weights_item - np.min(var_weights_item)/(np.max(var_weights_item) - np.min(var_weights_item)+1)
    return var_weights_user, var_weights_item

In [62]:
def recommend_rating(usr, item, ratings, sim_mat, neighbors):
    rating = 0
    #print(usr, '------',neighbors,'\n--------------------------')
    train_data = usr
    test_data = item
    prediction = np.zeros((len(test_data), len(test_data[0])))
    for i in range(len(test_data)):
        for j in range(len(test_data[0])):
            if test_data[i][j] == 0:
                prediction[i][j] = -1
            else:
                count = 0
                sum = 0
                # for l in range(len(sim_mat)):
                for l in neighbors[i]:
                    if sim_mat[i][l] != 0 and train_data[l][j] != 0:
                        sum += sim_mat[i][l] * train_data[l][j] * var_weights[i]
                        count += 0.99*var_weights[i]
                if count != 0:
                    prediction[i][j] = sum / count
                else:
                    prediction[i][j] = -1
    return prediction

In [63]:
var_weights_user, var_weights_item = calculate_variance_weights(ratings, ratings_item)
var_weights = var_weights_user
for fold_index in range(1, 6):
    print(f'Fold {fold_index}')
    print('K=10 \t K=20 \t K=30 \t K=40 \t K=50')
    train_test(sim_mat, ratings, fold_index)
    print('-------------------------------------------------\n')

Fold 1
K=10 	 K=20 	 K=30 	 K=40 	 K=50
[0.69775036 0.66279288 0.65257804 0.65571311 0.6603044 ]
-------------------------------------------------

Fold 2
K=10 	 K=20 	 K=30 	 K=40 	 K=50
[0.71106315 0.68194845 0.66513336 0.668097   0.66185654]
-------------------------------------------------

Fold 3
K=10 	 K=20 	 K=30 	 K=40 	 K=50
[0.75339425 0.69555167 0.67626844 0.67327229 0.67468055]
-------------------------------------------------

Fold 4
K=10 	 K=20 	 K=30 	 K=40 	 K=50
[0.74275872 0.69274368 0.6701209  0.66927943 0.66701543]
-------------------------------------------------

Fold 5
K=10 	 K=20 	 K=30 	 K=40 	 K=50
[0.74531563 0.69746781 0.67396288 0.66843857 0.6696856 ]
-------------------------------------------------



In [64]:
for i in range(5):
  user_mae_table[5][i] = np.mean(user_mae_table.T[i][:-1])
user_mae_table

array([[0.69775036, 0.66279288, 0.65257804, 0.65571311, 0.6603044 ],
       [0.71106315, 0.68194845, 0.66513336, 0.668097  , 0.66185654],
       [0.75339425, 0.69555167, 0.67626844, 0.67327229, 0.67468055],
       [0.74275872, 0.69274368, 0.6701209 , 0.66927943, 0.66701543],
       [0.74531563, 0.69746781, 0.67396288, 0.66843857, 0.6696856 ],
       [0.73005642, 0.6861009 , 0.66761273, 0.66696008, 0.6667085 ]])

In [65]:
user_based_mae_df = pd.DataFrame(user_mae_table, columns=['k = 10', 'k = 20', 'k = 30', 'k = 40', 'k = 50'], index=['Fold 1', 'Fold 2', 'Fold 3', 'Fold 4', 'Fold 5', 'Average'])
user_based_mae_df

Unnamed: 0,k = 10,k = 20,k = 30,k = 40,k = 50
Fold 1,0.69775,0.662793,0.652578,0.655713,0.660304
Fold 2,0.711063,0.681948,0.665133,0.668097,0.661857
Fold 3,0.753394,0.695552,0.676268,0.673272,0.674681
Fold 4,0.742759,0.692744,0.670121,0.669279,0.667015
Fold 5,0.745316,0.697468,0.673963,0.668439,0.669686
Average,0.730056,0.686101,0.667613,0.66696,0.666709


###Item-Based Variance

In [66]:
var_weights = var_weights_item
for fold_index in range(1, 6):
    print(f'Fold {fold_index}')
    print('K=10 \t K=20 \t K=30 \t K=40 \t K=50')
    train_test_item(sim_mat_item, ratings_item, fold_index)
    print('-------------------------------------------------\n')

Fold 1
K=10 	 K=20 	 K=30 	 K=40 	 K=50
[0.58993362 0.77617792 0.8390573  0.89075924 0.91134111]
-------------------------------------------------

Fold 2
K=10 	 K=20 	 K=30 	 K=40 	 K=50
[0.59041915 0.76229984 0.85154878 0.88910855 0.91725298]
-------------------------------------------------

Fold 3
K=10 	 K=20 	 K=30 	 K=40 	 K=50
[0.57190063 0.73996342 0.83426859 0.89081956 0.93659602]
-------------------------------------------------

Fold 4
K=10 	 K=20 	 K=30 	 K=40 	 K=50
[0.4858706  0.66701735 0.77376258 0.84806459 0.88369944]
-------------------------------------------------

Fold 5
K=10 	 K=20 	 K=30 	 K=40 	 K=50
[0.47226109 0.65527435 0.75714331 0.8056776  0.84301766]
-------------------------------------------------



In [67]:
for i in range(5):
  item_mae_table[5][i] = np.mean(item_mae_table.T[i][:-1])
item_mae_table

array([[0.58993362, 0.77617792, 0.8390573 , 0.89075924, 0.91134111],
       [0.59041915, 0.76229984, 0.85154878, 0.88910855, 0.91725298],
       [0.57190063, 0.73996342, 0.83426859, 0.89081956, 0.93659602],
       [0.4858706 , 0.66701735, 0.77376258, 0.84806459, 0.88369944],
       [0.47226109, 0.65527435, 0.75714331, 0.8056776 , 0.84301766],
       [0.54207702, 0.72014658, 0.81115611, 0.86488591, 0.89838144]])

In [68]:
item_based_mae_df = pd.DataFrame(item_mae_table, columns=['k = 10', 'k = 20', 'k = 30', 'k = 40', 'k = 50'], index=['Fold 1', 'Fold 2', 'Fold 3', 'Fold 4', 'Fold 5', 'Average'])
item_based_mae_df

Unnamed: 0,k = 10,k = 20,k = 30,k = 40,k = 50
Fold 1,0.589934,0.776178,0.839057,0.890759,0.911341
Fold 2,0.590419,0.7623,0.851549,0.889109,0.917253
Fold 3,0.571901,0.739963,0.834269,0.89082,0.936596
Fold 4,0.485871,0.667017,0.773763,0.848065,0.883699
Fold 5,0.472261,0.655274,0.757143,0.805678,0.843018
Average,0.542077,0.720147,0.811156,0.864886,0.898381


#Significance Weighting

In [69]:
def make_sign_sim_mat(ratings):
    n = ratings.shape[0]
    user_sim = np.zeros((943,943))
    for i in range(n):
        for j in range(i+1,n):
            count = 0
            for k in range(1,1683):
                if i>942 or j>942:
                    print(i,j)
                    return
                if math.isnan(ratings[i+1][k]) or math.isnan(ratings[j+1][k]):
                    pass
                else:
                    count+=1
            if count<50:
                user_sim[i][j] = count/50
                user_sim[j][i] = count/50
            else:
                user_sim[i][j] = 1
                user_sim[j][i] = 1
    return user_sim

In [70]:
def make_sign_sim_mat(ratings):
    n = ratings.shape[0]
    user_sim = np.ones((n, n))
    for i in range(n):
        for j in range(i, n):
            rating1 = np.array(ratings.iloc[i])
            rating2 = np.array(ratings.iloc[j])
            nan_index = np.argwhere(np.isnan(rating1) | np.isnan(rating2))
            count = 1643 - len(nan_index)
            if count< 50:
                user_sim[i,j] = count/50
                user_sim[j,i] = user_sim[i,j]
            else:
                user_sim[i,j] = 1
                user_sim[j,i] = user_sim[i,j]
    return user_sim

In [81]:
def recommend_rating(usr, item, ratings, sim_mat, neighbors):
    rating = 0
    #print(usr, '------',neighbors,'\n--------------------------')
    train_data = usr
    test_data = item
    prediction = np.zeros((len(test_data), len(test_data[0])))
    usr_sign_sim_mat = np.ones((len(sim_mat), len(sim_mat)))
    for i in range(len(test_data)):
        for j in range(len(test_data[0])):
            if test_data[i][j] == 0:
                prediction[i][j] = -1
            else:
                count = 0
                sum = 0
                # for l in range(len(sim_mat)):
                for l in neighbors[i]:
                    if sim_mat[i][l] != 0 and train_data[l][j] != 0:
                        sum += sim_mat[i][l] * train_data[l][j] * usr_sign_sim_mat[i][l]
                        count += 0.99*usr_sign_sim_mat[i][l]
                if count != 0:
                    prediction[i][j] = sum / count
                else:
                    prediction[i][j] = -1
    return prediction

In [72]:
ratings.shape

(943, 1682)

In [73]:
ratings

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,2.0,,,4.0,5.0,3.0,,...,,,,,,,,,,
941,5.0,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


####User-Based

In [74]:
usr_sign_sim_mat = make_sign_sim_mat(ratings)

In [75]:
for fold_index in range(1, 6):
    print(f'Fold {fold_index}')
    print('K=10 \t K=20 \t K=30 \t K=40 \t K=50')
    train_test(sim_mat, ratings, fold_index)
    print('-------------------------------------------------\n')

Fold 1
K=10 	 K=20 	 K=30 	 K=40 	 K=50
[0.69775036 0.66279288 0.65257804 0.65571311 0.6603044 ]
-------------------------------------------------

Fold 2
K=10 	 K=20 	 K=30 	 K=40 	 K=50
[0.71106315 0.68194845 0.66513336 0.668097   0.66185654]
-------------------------------------------------

Fold 3
K=10 	 K=20 	 K=30 	 K=40 	 K=50
[0.75339425 0.69555167 0.67626844 0.67327229 0.67468055]
-------------------------------------------------

Fold 4
K=10 	 K=20 	 K=30 	 K=40 	 K=50
[0.74275872 0.69274368 0.6701209  0.66927943 0.66701543]
-------------------------------------------------

Fold 5
K=10 	 K=20 	 K=30 	 K=40 	 K=50
[0.74531563 0.69746781 0.67396288 0.66843857 0.6696856 ]
-------------------------------------------------



In [76]:
for i in range(5):
  user_mae_table[5][i] = np.mean(user_mae_table.T[i][:-1])
user_mae_table

array([[0.69775036, 0.66279288, 0.65257804, 0.65571311, 0.6603044 ],
       [0.71106315, 0.68194845, 0.66513336, 0.668097  , 0.66185654],
       [0.75339425, 0.69555167, 0.67626844, 0.67327229, 0.67468055],
       [0.74275872, 0.69274368, 0.6701209 , 0.66927943, 0.66701543],
       [0.74531563, 0.69746781, 0.67396288, 0.66843857, 0.6696856 ],
       [0.73005642, 0.6861009 , 0.66761273, 0.66696008, 0.6667085 ]])

In [77]:
user_based_mae_df = pd.DataFrame(user_mae_table, columns=['k = 10', 'k = 20', 'k = 30', 'k = 40', 'k = 50'], index=['Fold 1', 'Fold 2', 'Fold 3', 'Fold 4', 'Fold 5', 'Average'])
user_based_mae_df

Unnamed: 0,k = 10,k = 20,k = 30,k = 40,k = 50
Fold 1,0.69775,0.662793,0.652578,0.655713,0.660304
Fold 2,0.711063,0.681948,0.665133,0.668097,0.661857
Fold 3,0.753394,0.695552,0.676268,0.673272,0.674681
Fold 4,0.742759,0.692744,0.670121,0.669279,0.667015
Fold 5,0.745316,0.697468,0.673963,0.668439,0.669686
Average,0.730056,0.686101,0.667613,0.66696,0.666709


#### Item-Based

In [78]:
item_sign_sim_mat = make_sign_sim_mat(ratings_item)

In [82]:
for fold_index in range(1, 6):
    print(f'Fold {fold_index}')
    print('K=10 \t K=20 \t K=30 \t K=40 \t K=50')
    train_test_item(sim_mat_item, ratings_item, fold_index)
    print('-------------------------------------------------\n')

Fold 1
K=10 	 K=20 	 K=30 	 K=40 	 K=50
[0.59977187 0.77860405 0.83988438 0.8905043  0.91162556]
-------------------------------------------------

Fold 2
K=10 	 K=20 	 K=30 	 K=40 	 K=50
[0.59147038 0.76269351 0.85120272 0.88710465 0.91561832]
-------------------------------------------------

Fold 3
K=10 	 K=20 	 K=30 	 K=40 	 K=50
[0.58107511 0.74400861 0.8360423  0.89112035 0.93625836]
-------------------------------------------------

Fold 4
K=10 	 K=20 	 K=30 	 K=40 	 K=50
[0.49471186 0.67019214 0.77560388 0.84906328 0.8843987 ]
-------------------------------------------------

Fold 5
K=10 	 K=20 	 K=30 	 K=40 	 K=50
[0.47915422 0.65780862 0.75750118 0.80411188 0.84134641]
-------------------------------------------------



In [83]:
for i in range(5):
  item_mae_table[5][i] = np.mean(item_mae_table.T[i][:-1])
item_mae_table

array([[0.59977187, 0.77860405, 0.83988438, 0.8905043 , 0.91162556],
       [0.59147038, 0.76269351, 0.85120272, 0.88710465, 0.91561832],
       [0.58107511, 0.74400861, 0.8360423 , 0.89112035, 0.93625836],
       [0.49471186, 0.67019214, 0.77560388, 0.84906328, 0.8843987 ],
       [0.47915422, 0.65780862, 0.75750118, 0.80411188, 0.84134641],
       [0.54923669, 0.72266139, 0.81204689, 0.86438089, 0.89784947]])

In [84]:
item_based_mae_df = pd.DataFrame(item_mae_table, columns=['k = 10', 'k = 20', 'k = 30', 'k = 40', 'k = 50'], index=['Fold 1', 'Fold 2', 'Fold 3', 'Fold 4', 'Fold 5', 'Average'])
item_based_mae_df

Unnamed: 0,k = 10,k = 20,k = 30,k = 40,k = 50
Fold 1,0.599772,0.778604,0.839884,0.890504,0.911626
Fold 2,0.59147,0.762694,0.851203,0.887105,0.915618
Fold 3,0.581075,0.744009,0.836042,0.89112,0.936258
Fold 4,0.494712,0.670192,0.775604,0.849063,0.884399
Fold 5,0.479154,0.657809,0.757501,0.804112,0.841346
Average,0.549237,0.722661,0.812047,0.864381,0.897849


###Storing similarity matrices and rankings at last

In [85]:
import pickle
try:
    fd = open('sim_mat_user', 'wb')
    pickle.dump(sim_mat, fd)
    fd.close()
    
    fd = open('sim_mat_item', 'wb')
    pickle.dump(sim_mat_item, fd)
    fd.close()

    fd = open('item_sign_sim_mat', 'wb')
    pickle.dump(item_sign_sim_mat, fd)
    fd.close()
    fd = open('user_sign_sim_mat', 'wb')
    pickle.dump(usr_sign_sim_mat, fd)
    fd.close()
except:
  print('Error while saving')
