# Assignment 3

In [1]:
import pandas as pd
import numpy as np

# Load MovieLens 100K dataset into a dataframe of pandas
names = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('ml-100k/u.data', sep='\t', names=names)
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [2]:
# Select 500 most active users and 500 most active items from the dataset
n_most_active_users = 500
n_most_active_items = 500

user_ids = df.groupby('user_id').count().sort_values(by='rating', ascending=False).head(n_most_active_users).index
item_ids = df.groupby('item_id').count().sort_values(by='rating', ascending=False).head(n_most_active_items).index
df = df[(df['user_id'].isin(user_ids)) & (df['item_id'].isin(item_ids))]

In [3]:
# Map new internal ID for items
i_ids = df['item_id'].unique().tolist()
item_dict = dict(zip(i_ids, [i for i in range(len(i_ids))]))
df['item_id'] = df['item_id'].map(item_dict)

# Split Dataset

In [4]:
# The number of training users and active users
n_training_users = 300
n_active_users = n_most_active_users - n_training_users

# The number of GIVEN ratings for active users
GIVEN = 20

# Randomly select users from the most active users as training set
random_uids = np.random.choice(df.user_id.unique(), n_training_users, replace=False)
train_df = df[df['user_id'].isin(random_uids)]
# Map new internal ID for all users in the training set
u_ids = train_df['user_id'].unique().tolist()
user_dict = dict(zip(u_ids, [i for i in range(len(u_ids))]))
train_df['user_id'] = train_df['user_id'].map(user_dict)

# The rest of users are active users for testing
remain_df = df[~df['user_id'].isin(random_uids)]
# Map new internal ID for all active users
u_ids = remain_df['user_id'].unique().tolist()
user_dict = dict(zip(u_ids, [i for i in range(len(u_ids))]))
remain_df['user_id'] = remain_df['user_id'].map(user_dict)

# Randomly select GIVEN ratings for active users
active_df = remain_df.groupby('user_id').sample(n=GIVEN, random_state=1024)

test_df = remain_df[~remain_df.index.isin(active_df.index)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['user_id'] = train_df['user_id'].map(user_dict)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  remain_df['user_id'] = remain_df['user_id'].map(user_dict)


In [5]:
# Convert the format of datasets to matrices
df_zeros = pd.DataFrame({'user_id': np.tile(np.arange(0, n_training_users), n_most_active_items), 'item_id': np.repeat(np.arange(0, n_most_active_items), n_training_users), 'rating': 0})
train_ds = df_zeros.merge(train_df, how='left', on=['user_id', 'item_id']).fillna(0.).pivot_table(values='rating_y', index='user_id', columns='item_id')

df_zeros = pd.DataFrame({'user_id': np.tile(np.arange(0, n_active_users), n_most_active_items), 'item_id': np.repeat(np.arange(0, n_most_active_items), n_active_users), 'rating': 0})
active_ds = df_zeros.merge(active_df, how='left', on=['user_id', 'item_id']).fillna(0.).pivot_table(values='rating_y', index='user_id', columns='item_id')
test_ds = df_zeros.merge(test_df, how='left', on=['user_id', 'item_id']).fillna(0.).pivot_table(values='rating_y', index='user_id', columns='item_id')

train_ds, active_ds, test_ds

(item_id  0    1    2    3    4    5    6    7    8    9    ...  490  491  492  \
 user_id                                                    ...                  
 0        3.0  0.0  0.0  0.0  0.0  0.0  0.0  4.0  0.0  0.0  ...  0.0  0.0  0.0   
 1        0.0  2.0  0.0  4.0  0.0  4.0  4.0  0.0  0.0  2.0  ...  0.0  4.0  4.0   
 2        0.0  0.0  4.0  4.0  4.0  0.0  0.0  4.0  0.0  0.0  ...  0.0  0.0  0.0   
 3        4.0  0.0  0.0  2.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
 4        0.0  0.0  0.0  0.0  5.0  0.0  0.0  0.0  0.0  4.0  ...  0.0  0.0  0.0   
 ...      ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
 295      4.0  0.0  0.0  0.0  0.0  0.0  0.0  4.0  0.0  0.0  ...  0.0  0.0  4.0   
 296      0.0  0.0  5.0  0.0  0.0  0.0  4.0  0.0  0.0  4.0  ...  0.0  0.0  0.0   
 297      4.0  0.0  4.0  0.0  0.0  4.0  3.0  4.0  0.0  0.0  ...  0.0  0.0  0.0   
 298      0.0  0.0  0.0  5.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
 299      0.0  1

In [6]:
# Predicting All Missing Data in training set
imputed_train_ds = train_ds.values.copy()

# Your implementation to predict the missing values
(Put all your implementation for your algorithm in the following cell only to handle the missing values; )

In [7]:
## Put all your implementation for your solutioin in this cell only to predict the missing values; 
## NOTE 1: DO NOT change anything in the rest of the cells in this framework, 
## otherwise the changes might cause errors and make your implementation invalid.

## Note 2: 
## The user-item rating matrix is imputed_train_ds, 
## and the missing values are those 0s in imputed_train_ds. 
## You are required to predict them by using the solution in the given report. 

## The following parameters are required in the given report, 
## which is named "Effective Missing Data Prediction for Collaborative Filtering", 
## and you will need to use them. But, please do not change their values. 
LAMBDA = 0.7    # λ
GAMMA = 10      # γ
DELTA = 10      # δ
ITA = 0.7       # η
THETA = 0.7     # θ
EPSILON = 1e-9

#Pearson Correlation Coefficient for USER
user_pcc = np.zeros((n_training_users, n_training_users))

for i, user_vec_i in enumerate(imputed_train_ds):
    for j, user_vec_j in enumerate(imputed_train_ds):

        # Co-rated ratings by the pairs of users
        mask_i = user_vec_i > 0
        mask_j = user_vec_j > 0

        # Item index for co-rated ratings
        corrated_index = np.intersect1d(np.where(mask_i), np.where(mask_j))
        if len(corrated_index) == 0:
            continue

        # Calculating average for user_mean_i and user_mean_j
        user_mean_i = np.sum(user_vec_i) / (np.sum(np.clip(user_vec_i, -1, 1)) + EPSILON)
        user_mean_j = np.sum(user_vec_j) / (np.sum(np.clip(user_vec_j, -1, 1)) + EPSILON)
        
        # PCC Compute
        user_sub_mean_i = user_vec_i[corrated_index] - user_mean_i     
        user_sub_mean_j = user_vec_j[corrated_index] - user_mean_j
        
        # Caluclating square
        user_sub_sq_i = np.square(user_sub_mean_i)
        user_sub_sq_j = np.square(user_sub_mean_j)
        
        # Calculating square root
        user_sub_sq_sum_sqrt_i = np.sqrt(np.sum(user_sub_sq_i))
        user_sub_sq_sum_sqrt_j = np.sqrt(np.sum(user_sub_sq_j))
        
        # Calculating similarity
        ij_sim = np.sum(user_sub_mean_i * user_sub_mean_j) / (user_sub_sq_sum_sqrt_i * user_sub_sq_sum_sqrt_j + EPSILON)

        # Weighted Significance calculation
        weight_sig = (min(len(corrated_index), GAMMA) / GAMMA) * ij_sim
        
        # Check similar pair of users
        user_pcc[i][j] = weight_sig
        
        # Selecting similar neighbours between the current user and neighbour 
        user_index=0
        sim_userid=[]
        for user in user_pcc[i]:
            if user>ITA and user_index!=i:
                sim_userid.append(user_index)
            user_index=user_index+1
                 

#Pearson Correlation Coefficient for ITEM

item_pcc = np.zeros((n_most_active_items, n_most_active_items))

for i, item_vec_i in enumerate(imputed_train_ds.T):
    for j, item_vec_j in enumerate(imputed_train_ds.T):

        # Co-rated ratings by the pairs of items
        mask_i = item_vec_i > 0
        mask_j = item_vec_j > 0

        # Item index for co-rated ratings
        corrated_index = np.intersect1d(np.where(mask_i), np.where(mask_j))
        if len(corrated_index) == 0:
            continue

        # Calculating average for item_mean_i and item_mean_j
        item_mean_i = np.sum(item_vec_i) / (np.sum(np.clip(item_vec_i, -1, 1)) + EPSILON)
        item_mean_j = np.sum(item_vec_j) / (np.sum(np.clip(item_vec_j, -1, 1)) + EPSILON)

        # PCC Compute
        item_sub_mean_i = item_vec_i[corrated_index] - item_mean_i
        item_sub_mean_j = item_vec_j[corrated_index] - item_mean_j
        
        # Caluclating square
        item_sub_sq_i = np.square(item_sub_mean_i)
        item_sub_sq_j = np.square(item_sub_mean_j)
        
        # Calculating square root
        item_sub_sq_sum_sqrt_i = np.sqrt(np.sum(item_sub_sq_i))
        item_sub_sq_sum_sqrt_j = np.sqrt(np.sum(item_sub_sq_j))
        
        # Calculating similarity
        sim_ij = np.sum(item_sub_mean_i * item_sub_mean_j) / (item_sub_sq_sum_sqrt_i * item_sub_sq_sum_sqrt_j + EPSILON)

        # Weighted Significance calculation
        sig_weight = (min(len(corrated_index), DELTA) / DELTA) * sim_ij
        
        # Check similar pair of users
        item_pcc[i][j] = sig_weight
        
        # Selecting similar neighbours between the current user and neighbour
        item_index=0
        item_id_sim=[]
        for item in item_pcc[i]:
            if item>THETA and item_index!=i:
                item_id_sim.append(item_index)
            item_index=item_index+1


# Predicting the missing data

# Prediction of user-matrix
np_predictions = np.zeros((n_training_users, n_most_active_items))


K = 100

for (i, j), rating in np.ndenumerate(imputed_train_ds):
    if rating > 0:
    # Find k most similar users as the current user and sort them based on similar users
        sim_userid = np.argsort(user_pcc[i])[-(K + 1):-1]
    # The values of all similar users
        user_val_sim = user_pcc[i][sim_userid]
    # Current user's ratings average value count
        users_sim = train_ds.values[sim_userid]
    # Current user's mean value count
        avg_user = np.sum(train_ds.values[i]) / (np.sum(np.clip(train_ds.values[i], 0, 1)) + EPSILON)
    # Nearest similar neighbour mean value count
        avg_sim_user = np.sum(users_sim, axis=1) / (np.sum(np.clip(users_sim, 0, 1), axis=1) + EPSILON)
    # Neighbour users who have rated on item j
        mask_rated_j = users_sim[:, j] > 0
        
    # Subtracting the mean value from the product of similarity of two users and their rating on item j
        sim_user_r_sum_mean = user_val_sim[mask_rated_j] * (users_sim[mask_rated_j, j] - avg_sim_user[mask_rated_j])

    # Find most similar item as the current item and sort them based on similarity
        item_id_sim = np.argsort(item_pcc[j])[-(K + 1):-1]
   
    # Similar item coefficient values
        val_item_sim = item_pcc[j][item_id_sim]
        # Current item's rating average value count
        item_sim = train_ds.T.values[item_id_sim]
        # Current item mean values count
        avg_item = np.sum(train_ds.T.values[j]) / (np.sum(np.clip(train_ds.T.values[j], 0, 1)) + EPSILON)
        # Nearest similar neighbour mean values count
        avg_sim_item = np.sum(item_sim, axis=1) / (np.sum(np.clip(item_sim, 0, 1), axis=1) + EPSILON)
        # Subtracting the mean from the product of similarity of two items and the similar item
        sim_sum_mean_item_r = user_val_sim * (item_sim[:, i] - avg_sim_item) 
        # Unrated Filtering for items
        w = np.clip(item_sim[:, i], 0, 1)
        sim_sum_mean_item_r *= w

        # Predicting missing values using the conditions from the research paper
        
        # Similar user and item equal to 0
        if len(sim_userid)==0 and len(item_id_sim)==0:
            np_predictions[i][j]=0
            imputed_train_ds=np_predictions

        # Similar user and item not equal to zero
        elif len(sim_userid)!=0 and len(item_id_sim)!=0:
            user_based=avg_user+np.sum(sim_user_r_sum_mean)/(np.sum(user_val_sim[mask_rated_j]) + EPSILON)
            item_based=avg_item+np.sum(sim_sum_mean_item_r)/(np.sum(val_item_sim*w)+EPSILON)
            np_predictions[i][j]=(LAMBDA*user_based)+((1-LAMBDA)*item_based)
            np_predictions[i][j]=np.clip(np_predictions[i][j],0,5)  
            imputed_train_ds=np_predictions
        
        # Similar user equal to 0 and similar item not equal to 0
        elif len(sim_userid)==0 and len(item_id_sim)!=0:
            np_predictions[i][j]=avg_item+np.sum(sim_sum_mean_item_r)/(np.sum(val_item_sim*w)+EPSILON)
            np_predictions[i][j]=np.clip(np_predictions[i][j],0,5)
            imputed_train_ds=np_predictions
            
        # Similar item equal to 0 and similar user not equal to 0
        elif len(sim_userid)!=0 and len(item_id_sim)==0:
            np_predictions[i][j]=avg_user+np.sum(sim_user_r_sum_mean)/(np.sum(user_val_sim[mask_rated_j])+EPSILON)
            np_predictions[i][j]=np.clip(np_predictions[i][j],0,5)
            imputed_train_ds=np_predictions


# Evaluation

### Compute Pearson Correlation Coefficient of All Pairs of Items between active set and imputed training set

In [8]:
imputed_train_ds = pd.DataFrame(imputed_train_ds)
imputed_train_ds

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,3.928652,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,3.752353,0.0,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,2.814868,0.0,2.581883,0.0
1,0.000000,3.416942,0.000000,3.826805,0.000000,3.513604,3.871329,0.000000,0.0,3.360299,...,0.000000,2.930718,3.757410,0.0,2.959803,2.135975,0.000000,0.0,0.000000,0.0
2,0.000000,0.000000,4.427129,4.387463,4.201333,0.000000,0.000000,4.122487,0.0,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0
3,4.638417,0.000000,0.000000,3.513902,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0
4,0.000000,0.000000,0.000000,0.000000,4.193294,0.000000,0.000000,0.000000,0.0,4.087396,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,4.375828,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,3.726510,0.0,0.000000,...,0.000000,0.000000,3.968263,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0
296,0.000000,0.000000,4.786563,0.000000,0.000000,0.000000,4.176519,0.000000,0.0,3.681791,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0
297,4.390190,0.000000,4.228517,0.000000,0.000000,3.710075,3.767581,3.782153,0.0,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0
298,0.000000,0.000000,0.000000,4.871003,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.0


In [9]:
active_user_pearson_corr = np.zeros((active_ds.shape[0], train_ds.shape[0]))

# Compute Pearson Correlation Coefficient of All Pairs of Users between active set and imputed training set
for i, user_i_vec in enumerate(active_ds.values):
    for j, user_j_vec in enumerate(imputed_train_ds.values):
        
        # ratings corated by the current pair od users
        mask_i = user_i_vec > 0
        mask_j = user_j_vec > 0

        # corrated item index, skip if there are no corrated ratings
        corrated_index = np.intersect1d(np.where(mask_i), np.where(mask_j))
        if len(corrated_index) == 0:
            continue

        # average value of user_i_vec and user_j_vec
        mean_user_i = np.sum(user_i_vec) / (np.sum(np.clip(user_i_vec, 0, 1)) + EPSILON)
        mean_user_j = np.sum(user_j_vec) / (np.sum(np.clip(user_j_vec, 0, 1)) + EPSILON)

        # compute pearson corr
        user_i_sub_mean = user_i_vec[corrated_index] - mean_user_i
        user_j_sub_mean = user_j_vec[corrated_index] - mean_user_j

        r_ui_sub_r_i_sq = np.square(user_i_sub_mean)
        r_uj_sub_r_j_sq = np.square(user_j_sub_mean)

        r_ui_sum_sqrt = np.sqrt(np.sum(r_ui_sub_r_i_sq))
        r_uj_sum_sqrt = np.sqrt(np.sum(r_uj_sub_r_j_sq))

        sim = np.sum(user_i_sub_mean * user_j_sub_mean) / (r_ui_sum_sqrt * r_uj_sum_sqrt + EPSILON)

        # significance weighting
        weighted_sim = (min(len(corrated_index), GAMMA) / GAMMA) * sim

        active_user_pearson_corr[i][j] = weighted_sim

active_user_pearson_corr

array([[-0.0685823 ,  0.51640614,  0.41049662, ...,  0.29960603,
         0.08507505,  0.4188894 ],
       [-0.08481885,  0.40201782,  0.33002594, ...,  0.2040237 ,
        -0.05045505,  0.61634892],
       [ 0.28534399, -0.21855464,  0.23557669, ...,  0.26187237,
        -0.02680054,  0.60217245],
       ...,
       [-0.1       ,  0.53974145,  0.25504865, ...,  0.18606147,
         0.04876661,  0.06730294],
       [-0.32620436, -0.26592319, -0.36555965, ..., -0.52322042,
        -0.26485309, -0.342131  ],
       [-0.05631098,  0.07634945, -0.32696332, ..., -0.11201222,
         0.10228312, -0.21578345]])

## Predict Ratings of Testing Set

In [10]:
K = 10

test_ds_pred = np.zeros_like(test_ds.values)

for (i, j), rating in np.ndenumerate(test_ds.values):

    if rating > 0:

        sim_user_ids = np.argsort(active_user_pearson_corr[i])[-1:-(K + 1):-1]

        #==================user-based==================#
        # the coefficient values of similar users
        sim_val = active_user_pearson_corr[i][sim_user_ids]

        # the average value of the current user's ratings
        sim_users = imputed_train_ds.values[sim_user_ids]
        user_mean = np.sum(active_ds.values[i]) / (np.sum(np.clip(active_ds.values[i], 0, 1)) + EPSILON)
        sim_user_mean = np.sum(sim_users, axis=1) / (np.sum(np.clip(sim_users, 0, 1), axis=1) + EPSILON)

        # select the users who rated item j
        mask_rated_j = sim_users[:, j] > 0
        
        # sim(u, v) * (r_vj - mean_v)
        sim_r_sum_mean = sim_val[mask_rated_j] * (sim_users[mask_rated_j, j] - sim_user_mean[mask_rated_j])
        
        user_based_pred = user_mean + np.sum(sim_r_sum_mean) / (np.sum(sim_val[mask_rated_j]) + EPSILON)
        user_based_pred = np.clip(user_based_pred, 0, 5)

        test_ds_pred[i][j] = user_based_pred
        
test_ds_pred


array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [4.58710104, 0.        , 4.58283013, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 3.76280127, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [3.81162359, 0.        , 4.03412747, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

## Compute MAE and RMSE

In [11]:
# MAE
MAE = np.sum(np.abs(test_ds_pred - test_ds.values)) / np.sum(np.clip(test_ds.values, 0, 1))

# RMSE
RMSE = np.sqrt(np.sum(np.square(test_ds_pred - test_ds.values)) / np.sum(np.clip(test_ds.values, 0, 1)))

print("MAE: {}, RMSE: {}" .format(MAE, RMSE))

MAE: 0.7421383272384069, RMSE: 0.9487228624044206
