# Install and load necessary packages

In [1]:
# Please don't change this cell

import pandas as pd
import numpy as np  

import warnings
warnings.filterwarnings("ignore")

## Load the dataset using pandas

In [2]:
# Please don't change this cell
# Read the tab-separated ratings file at ml-100k/u.data, naming its four
# columns explicitly.
df = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    names=['user_id', 'item_id', 'rating', 'timestamp'],
)

# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


# Split dataset


## Randomly hold out 20% of the ratings as the test set

In [3]:
# please do not change this cell

from sklearn.model_selection import train_test_split

# The rating matrices are sized by the number of distinct user and item ids.
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print(str(n_users) + ' users')
print(str(n_items) + ' items')

# Hold out 20% of the rating rows for testing; the fixed random_state makes
# the split repeatable.
train_df, test_df = train_test_split(df, test_size=0.2, random_state = 10)

# Training Dataset
# Ids are shifted by one so they can be used as 0-based matrix positions.
train_user_pos = train_df.user_id.values - 1
train_item_pos = train_df.item_id.values - 1

train_ds = np.zeros((n_users, n_items))
train_ds[train_user_pos, train_item_pos] = train_df.rating.values

# Number of training ratings each item received. np.add.at accumulates
# repeated positions, which a plain fancy-index "+=" would not.
item_popularity = np.zeros(n_items)
np.add.at(item_popularity, train_item_pos, 1)

# Testing Dataset
# Only test ratings whose item has more than 30 training ratings are kept.
test_user_pos = test_df.user_id.values - 1
test_item_pos = test_df.item_id.values - 1
is_popular = item_popularity[test_item_pos] > 30

test_ds = np.zeros((n_users, n_items))
test_ds[test_user_pos[is_popular], test_item_pos[is_popular]] = test_df.rating.values[is_popular]
testsize = int(np.sum(is_popular))

print("Construct the rating matrix based on train_df:")
print(train_ds)

print("Construct the rating matrix based on test_df:")
print(test_ds)

print("Testsize = " + str(testsize))

943 users
1682 items
Construct the rating matrix based on train_df:
[[0. 3. 4. ... 0. 0. 0.]
 [4. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [5. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 5. 0. ... 0. 0. 0.]]
Construct the rating matrix based on test_df:
[[5. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Testsize = 17678


# MAE and RMSE Utils

In [4]:
# Please don't change this cell
# You can use the evaluate utility below, or implement your own MAE and RMSE calculation.

EPSILON = 1e-9

def evaluate(test_ds, predicted_ds):
    '''
    Score predictions against the held-out ratings.

    Only positions where test_ds is greater than zero take part; every other
    position is ignored.

    Parameters:
        test_ds: array of true ratings, with non-positive entries meaning "no rating".
        predicted_ds: array of predicted ratings, indexed the same way as test_ds.

    Returns:
        (MAE, RMSE) computed over the positions with a test rating. When no
        position has a test rating the divisions are 0 / 0.
    '''
    observed = test_ds > 0
    n_observed = np.sum(observed.astype(np.float32))

    # Prediction error at every observed position, shared by both metrics.
    errors = test_ds[observed] - predicted_ds[observed]

    mean_abs_error = np.sum(np.abs(errors)) / n_observed
    root_mean_sq_error = np.sqrt(np.sum(np.square(errors)) / n_observed)

    return mean_abs_error, root_mean_sq_error

# Solution for using USER-AVERAGE

In [None]:


MAE_solution1 = 0 # placeholder, replaced below by the measured MAE
RMSE_solution1 = 0 # placeholder, replaced below by the measured RMSE

# Mean of each user's training ratings; users without any rating keep 0.
user_averages = np.zeros(n_users)
for user_index, ratings_row in enumerate(train_ds):
    rated = ratings_row[ratings_row > 0]
    if rated.size:
        user_averages[user_index] = rated.mean()

# Every item in a user's row is predicted as that user's average.
np_predictions = np.repeat(user_averages[:, np.newaxis], n_items, axis=1)

MAE_solution1, RMSE_solution1 = evaluate(test_ds, np_predictions)


In [6]:
# Please don't change this cell

# Print the banner and then the user-average metrics on their own line.
print(
    "===================== The MAE and RMSE of Your Implementation =====================",
    "MAE: {}, RMSE: {}".format(MAE_solution1, RMSE_solution1),
    sep="\n",
)

MAE: 0.8258905090105331, RMSE: 1.031143070595138


# Solution for using ITEM-AVERAGE

In [None]:


MAE_solution2 = 0 # placeholder, replaced below by the measured MAE
RMSE_solution2 = 0 # placeholder, replaced below by the measured RMSE

# Mean of each item's training ratings; items without any rating keep 0.
item_avg = np.zeros(n_items)
for item_idx, ratings_col in enumerate(train_ds.T):
    rated = ratings_col[ratings_col > 0]
    if rated.size:
        item_avg[item_idx] = rated.mean()

# Every user in an item's column is predicted as that item's average.
np_predictions = np.tile(item_avg, (n_users, 1))

MAE_solution2, RMSE_solution2 = evaluate(test_ds, np_predictions)

In [8]:
# Please don't change this cell

# Print the banner and then the item-average metrics on their own line.
print(
    "===================== The MAE and RMSE of Your Implementation =====================",
    "MAE: {}, RMSE: {}".format(MAE_solution2, RMSE_solution2),
    sep="\n",
)

MAE: 0.7961203950990416, RMSE: 1.0013142101585724


# Solution for using USER-BASED KNN

In [None]:

MAE_solution3 = 0 # placeholder, replaced at the end of this cell by the measured MAE
RMSE_solution3 = 0 # placeholder, replaced at the end of this cell by the measured RMSE
EPSILON = 1e-9 # keeps the divisions below finite when a denominator is zero
DELTA = 25 # number of co-rated items at which a similarity receives full weight

## similarity part
# Indices of the items each user rated in training, found once per user
# instead of once per user pair.
rated_items_by_user = [np.where(user_vec > 0)[0] for user_vec in train_ds]

user_pearson_corr = np.zeros((n_users, n_users))

for i in range(n_users):
    user_pearson_corr[i, i] = 1.0 #Sim with itself is 1

    # The similarity is symmetric in (i, j), so each unordered pair is
    # computed once and written to both triangles of the matrix.
    for j in range(i + 1, n_users):

        # items rated by both users; the pair keeps similarity 0 if there are none
        corrated_item_idices = np.intersect1d(
            rated_items_by_user[i], rated_items_by_user[j], assume_unique=True
        )
        if len(corrated_item_idices) == 0:
            continue

        # co-rated ratings centred on each user's overall training average
        u1_sub_mean = train_ds[i, corrated_item_idices] - user_averages[i]
        u2_sub_mean = train_ds[j, corrated_item_idices] - user_averages[j]

        #Pearson formula
        numerator = np.sum(u1_sub_mean * u2_sub_mean)
        denominator = np.sqrt(np.sum(np.square(u1_sub_mean))) * np.sqrt(np.sum(np.square(u2_sub_mean)))

        if denominator < EPSILON:
            sim = 0.0
        else:
            sim = numerator / (denominator + EPSILON)

        # significance weighting: scale down similarities backed by fewer
        # than DELTA co-rated items
        weighted_sim = (min(len(corrated_item_idices), DELTA) / DELTA) * sim

        user_pearson_corr[i, j] = weighted_sim
        user_pearson_corr[j, i] = weighted_sim


##prediction part
np_predictions = np.zeros((n_users, n_items))

# Each user's neighbours in ascending order of similarity, sorted once per
# user rather than once per test rating.
user_neighbor_order = [np.argsort(sim_row) for sim_row in user_pearson_corr]

# (user, item) positions that hold a test rating, in row-major order
test_rows, test_cols = np.where(test_ds > 0)

k_vals = range(10,60,10)
for K in k_vals:
    for i, j in zip(test_rows, test_cols):
        # K most similar users; the last slot is dropped on the assumption
        # that it is user i itself (similarity 1.0)
        sim_user_ids = user_neighbor_order[i][-(K + 1):-1]

        # similarity of each neighbour to user i
        sim_val = user_pearson_corr[i][sim_user_ids]

        # the neighbours' training ratings of item j (0 where unrated)
        neighbor_ratings = train_ds[sim_user_ids, j]

        # 1 for neighbours who rated item j, 0 for those who did not
        w = np.clip(neighbor_ratings, 0, 1)

        # sim(u, v) * (r_v - mean_v), kept only for neighbours who rated item j
        sim_r_sum_mean = sim_val * (neighbor_ratings - user_averages[sim_user_ids]) * w

        denominator = np.sum(sim_val * w)
        prediction = user_averages[i] + np.sum(sim_r_sum_mean) / (denominator + EPSILON)
        np_predictions[i, j] = np.clip(prediction, 0, 5)
    print(evaluate(test_ds, np_predictions), K)

# np_predictions now holds the predictions of the last K in k_vals
MAE_solution3, RMSE_solution3 = evaluate(test_ds, np_predictions)



(0.8176868089593603, 1.0537452742361622) 10
(0.7789155496292159, 1.006261237768326) 20
(0.7585247507403177, 0.9788151143128819) 30
(0.7465332179252864, 0.9635091063494504) 40
(0.7398983623469628, 0.9549637706107575) 50


In [10]:
# Please don't change this cell

# Print the banner and then the user-based KNN metrics on their own line.
print(
    "===================== The MAE and RMSE of Your Implementation =====================",
    "MAE: {}, RMSE: {}".format(MAE_solution3, RMSE_solution3),
    sep="\n",
)

MAE: 0.7398983623469628, RMSE: 0.9549637706107575


# Solution for using ITEM-BASED KNN

In [None]:


MAE_solution4 = 0 # placeholder, replaced at the end of this cell by the measured MAE
RMSE_solution4 = 0 # placeholder, replaced at the end of this cell by the measured RMSE
GAMMA = 30 # number of co-rating users at which a similarity receives full weight
EPSILON = 1e-9 # keeps the divisions below finite when a denominator is zero

## similarity part
# Indices of the users who rated each item in training, found once per item
# instead of once per item pair.
raters_by_item = [np.where(item_vec > 0)[0] for item_vec in train_ds.T]

# Average training rating of each item (0 for items nobody rated), computed
# once instead of inside the pair loop and again for every test rating.
item_means = np.array([
    np.sum(item_vec) / (np.sum(np.clip(item_vec, 0, 1)) + EPSILON)
    for item_vec in train_ds.T
])

np_item_pearson_corr = np.zeros((n_items, n_items))

for i in range(n_items):
    # an item without ratings shares no raters with anything; its row stays 0
    if len(raters_by_item[i]) == 0:
        continue

    # The similarity is symmetric in (i, j), so each unordered pair is
    # computed once and mirrored. j starts at i because the diagonal is
    # computed by the formula as well, not forced to 1.
    for j in range(i, n_items):

        # users who rated both items; the pair keeps similarity 0 if there are none
        corrated_index = np.intersect1d(
            raters_by_item[i], raters_by_item[j], assume_unique=True
        )
        if len(corrated_index) == 0:
            continue

        # co-rated ratings centred on each item's overall training average
        item_i_sub_mean = train_ds[corrated_index, i] - item_means[i]
        item_j_sub_mean = train_ds[corrated_index, j] - item_means[j]

        # compute pearson corr
        r_ui_sub_ri_sq_sum_sqrt = np.sqrt(np.sum(np.square(item_i_sub_mean)))
        r_uj_sub_rj_sq_sum_sqrt = np.sqrt(np.sum(np.square(item_j_sub_mean)))

        sim = np.sum(item_i_sub_mean * item_j_sub_mean) / (r_ui_sub_ri_sq_sum_sqrt * r_uj_sub_rj_sq_sum_sqrt + EPSILON)

        # significance weighting: scale down similarities backed by fewer
        # than GAMMA co-rating users
        weighted_sim = (min(len(corrated_index), GAMMA) / GAMMA) * sim

        np_item_pearson_corr[i][j] = weighted_sim
        np_item_pearson_corr[j][i] = weighted_sim

#prediction
np_predictions = np.zeros((n_users, n_items))

# Each item's neighbours in ascending order of similarity, sorted once per
# item rather than once per test rating.
item_neighbor_order = [np.argsort(sim_row) for sim_row in np_item_pearson_corr]

# (user, item) positions that hold a test rating, in row-major order
test_rows, test_cols = np.where(test_ds > 0)

k_vals = range(10,60,10)
for K in k_vals:
    for i, j in zip(test_rows, test_cols):
        # K most similar items; the last slot is dropped on the assumption
        # that it is item j itself
        sim_item_ids = item_neighbor_order[j][-(K + 1):-1]

        # similarity of each neighbour to item j
        sim_val = np_item_pearson_corr[j][sim_item_ids]

        # user i's training ratings of the neighbouring items (0 where unrated)
        neighbor_ratings = train_ds[i, sim_item_ids]

        # 1 for neighbours user i rated, 0 for those the user did not rate
        w = np.clip(neighbor_ratings, 0, 1)

        # sim(i, j) * (r_j - mean_j), kept only for neighbours user i rated
        sim_r_sum_mean = sim_val * (neighbor_ratings - item_means[sim_item_ids]) * w

        prediction = item_means[j] + np.sum(sim_r_sum_mean) / (np.sum(sim_val * w) + EPSILON)
        np_predictions[i][j] = np.clip(prediction, 0, 5)
    print(evaluate(test_ds, np_predictions), K)

# np_predictions now holds the predictions of the last K in k_vals
MAE_solution4, RMSE_solution4 = evaluate(test_ds, np_predictions)
    

(0.785626188371224, 1.022689983213929) 10
(0.7546639297859863, 0.9784834702365811) 20
(0.74187503536625, 0.9578082470400942) 30
(0.7299099695112868, 0.9399649580052881) 40
(0.7250457199849071, 0.9326101988117486) 50


In [12]:
# Please don't change this cell

# Print the banner and then the item-based KNN metrics on their own line.
print(
    "===================== The MAE and RMSE of Your Implementation =====================",
    "MAE: {}, RMSE: {}".format(MAE_solution4, RMSE_solution4),
    sep="\n",
)

MAE: 0.7250457199849071, RMSE: 0.9326101988117486


# Solution for using HYBRID-KNN

In [13]:
# Write your code here for Method 5
# You are required to implement the required solution 1 here. 
# Then, evaluate your implementation by predicting the ratings in the test set (test_ds).
# Finally, save the corresponding MAE and RMSE of your implementation 
# into the following defined corresponding variable. 

MAE_solution5 = 0 # placeholder, replaced at the end of this cell by the measured MAE
RMSE_solution5 = 0 # placeholder, replaced at the end of this cell by the measured RMSE

EPSILON = 1e-9 # keeps the divisions below finite when a denominator is zero
# Co-rating count at which a similarity receives full weight; used for both
# the user and the item similarities in this cell.
# NOTE(review): the item-based KNN cell uses GAMMA = 30 for its item
# similarities, while this cell uses 25 — confirm the difference is intended.
DELTA = 25
K = 40 # neighbourhood size for both predictors

# (user, item) positions that hold a test rating, in row-major order
test_rows, test_cols = np.where(test_ds > 0)


# ---------------- user-based part ----------------
# Indices of the items each user rated in training, found once per user.
rated_items_by_user = [np.where(user_vec > 0)[0] for user_vec in train_ds]

user_pearson_corr = np.zeros((n_users, n_users))

for i in range(n_users):
    user_pearson_corr[i, i] = 1.0 #Sim with itself is 1

    # The similarity is symmetric in (i, j), so each unordered pair is
    # computed once and written to both triangles of the matrix.
    for j in range(i + 1, n_users):

        # items rated by both users; the pair keeps similarity 0 if there are none
        corrated_item_idices = np.intersect1d(
            rated_items_by_user[i], rated_items_by_user[j], assume_unique=True
        )
        if len(corrated_item_idices) == 0:
            continue

        # co-rated ratings centred on each user's overall training average
        u1_sub_mean = train_ds[i, corrated_item_idices] - user_averages[i]
        u2_sub_mean = train_ds[j, corrated_item_idices] - user_averages[j]

        #Pearson formula
        numerator = np.sum(u1_sub_mean * u2_sub_mean)
        denominator = np.sqrt(np.sum(np.square(u1_sub_mean))) * np.sqrt(np.sum(np.square(u2_sub_mean)))

        if denominator < EPSILON:
            sim = 0.0
        else:
            sim = numerator / (denominator + EPSILON)

        # significance weighting
        weighted_sim = (min(len(corrated_item_idices), DELTA) / DELTA) * sim

        user_pearson_corr[i, j] = weighted_sim
        user_pearson_corr[j, i] = weighted_sim

##prediction part
u_np_predictions = np.zeros((n_users, n_items))

# Each user's neighbours in ascending order of similarity, sorted once per user.
user_neighbor_order = [np.argsort(sim_row) for sim_row in user_pearson_corr]

for i, j in zip(test_rows, test_cols):
    # K most similar users; the last slot is dropped on the assumption that
    # it is user i itself (similarity 1.0)
    sim_user_ids = user_neighbor_order[i][-(K + 1):-1]
    sim_val = user_pearson_corr[i][sim_user_ids]

    # the neighbours' training ratings of item j (0 where unrated)
    neighbor_ratings = train_ds[sim_user_ids, j]

    # 1 for neighbours who rated item j, 0 for those who did not
    w = np.clip(neighbor_ratings, 0, 1)

    # sim(u, v) * (r_v - mean_v), kept only for neighbours who rated item j
    sim_r_sum_mean = sim_val * (neighbor_ratings - user_averages[sim_user_ids]) * w

    denominator = np.sum(sim_val * w)
    prediction = user_averages[i] + np.sum(sim_r_sum_mean) / (denominator + EPSILON)
    u_np_predictions[i, j] = np.clip(prediction, 0, 5)


# ---------------- item-based part ----------------
# Indices of the users who rated each item in training, found once per item.
raters_by_item = [np.where(item_vec > 0)[0] for item_vec in train_ds.T]

# Average training rating of each item (0 for items nobody rated), computed
# once instead of inside the pair loop and again for every test rating.
item_means = np.array([
    np.sum(item_vec) / (np.sum(np.clip(item_vec, 0, 1)) + EPSILON)
    for item_vec in train_ds.T
])

np_item_pearson_corr = np.zeros((n_items, n_items))

for i in range(n_items):
    # an item without ratings shares no raters with anything; its row stays 0
    if len(raters_by_item[i]) == 0:
        continue

    # Symmetric as well; j starts at i because the diagonal is computed by
    # the formula here, not forced to 1.
    for j in range(i, n_items):

        # users who rated both items; the pair keeps similarity 0 if there are none
        corrated_index = np.intersect1d(
            raters_by_item[i], raters_by_item[j], assume_unique=True
        )
        if len(corrated_index) == 0:
            continue

        # co-rated ratings centred on each item's overall training average
        item_i_sub_mean = train_ds[corrated_index, i] - item_means[i]
        item_j_sub_mean = train_ds[corrated_index, j] - item_means[j]

        # compute pearson corr
        r_ui_sub_ri_sq_sum_sqrt = np.sqrt(np.sum(np.square(item_i_sub_mean)))
        r_uj_sub_rj_sq_sum_sqrt = np.sqrt(np.sum(np.square(item_j_sub_mean)))

        sim = np.sum(item_i_sub_mean * item_j_sub_mean) / (r_ui_sub_ri_sq_sum_sqrt * r_uj_sub_rj_sq_sum_sqrt + EPSILON)

        # significance weighting
        weighted_sim = (min(len(corrated_index), DELTA) / DELTA) * sim

        np_item_pearson_corr[i][j] = weighted_sim
        np_item_pearson_corr[j][i] = weighted_sim

#prediction
i_np_predictions = np.zeros((n_users, n_items))

# Each item's neighbours in ascending order of similarity, sorted once per item.
item_neighbor_order = [np.argsort(sim_row) for sim_row in np_item_pearson_corr]

for i, j in zip(test_rows, test_cols):
    # K most similar items; the last slot is dropped on the assumption that
    # it is item j itself
    sim_item_ids = item_neighbor_order[j][-(K + 1):-1]
    sim_val = np_item_pearson_corr[j][sim_item_ids]

    # user i's training ratings of the neighbouring items (0 where unrated)
    neighbor_ratings = train_ds[i, sim_item_ids]

    # 1 for neighbours user i rated, 0 for those the user did not rate
    w = np.clip(neighbor_ratings, 0, 1)

    # sim(i, j) * (r_j - mean_j), kept only for neighbours user i rated
    sim_r_sum_mean = sim_val * (neighbor_ratings - item_means[sim_item_ids]) * w

    prediction = item_means[j] + np.sum(sim_r_sum_mean) / (np.sum(sim_val * w) + EPSILON)
    i_np_predictions[i][j] = np.clip(prediction, 0, 5)


# ---------------- blending ----------------
# Sweep the weight x given to the user-based prediction; the item-based
# prediction receives 1 - x.
x_vals = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
for x in x_vals:
    np_predictions_combined = (x * u_np_predictions) + ((1-x) * i_np_predictions)
    print(evaluate(test_ds,  np_predictions_combined), x)

# Final blend: x = 0.5 had the lowest MAE and RMSE in the recorded sweep output.
x = 0.5
np_predictions_combined = (x * u_np_predictions) + ((1-x) * i_np_predictions)
MAE_solution5, RMSE_solution5 = evaluate(test_ds, np_predictions_combined)


(0.7393043679551462, 0.9524984512063569) 0
(0.7314893191750428, 0.9411349734550902) 0.1
(0.7254761564109624, 0.9324523708423107) 0.2
(0.7212722076730144, 0.9265260149817204) 0.3
(0.7191411179050609, 0.9234089754305488) 0.4
(0.7189302242546299, 0.9231297104377113) 0.5
(0.7206124900618229, 0.92569078833013) 0.6
(0.7242200543656325, 0.9310687707280436) 0.7
(0.7297140413949159, 0.9392152697705308) 0.8
(0.7372141557616179, 0.9500590702869794) 0.9
(0.7465332179252864, 0.9635091063494504) 1


In [14]:
# Please don't change this cell

# Print the banner and then the hybrid KNN metrics on their own line.
print(
    "===================== The MAE and RMSE of Your Implementation =====================",
    "MAE: {}, RMSE: {}".format(MAE_solution5, RMSE_solution5),
    sep="\n",
)

MAE: 0.7189302242546299, RMSE: 0.9231297104377113
