In [52]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_absolute_error
import seaborn as sns
import matplotlib.pyplot as plt



In [53]:
# Load the data
data = pd.read_csv('ml-100k/u.data', sep='\t', header=None,
                   names=['user_id', 'item_id', 'rating', 'timestamp'])

folds = []
for i in range(1, 6):
    train = pd.read_csv('ml-100k/u' + str(i) + '.base', sep='\t', header=None,
                        names=['user_id', 'item_id', 'rating', 'timestamp'])
    test = pd.read_csv('ml-100k/u' + str(i) + '.test', sep='\t', header=None,
                       names=['user_id', 'item_id', 'rating', 'timestamp'])
    folds.append({'train':train,'test': test})



In [54]:
# Define the user-based recommender system
def user_based_recommendation(train_data, test_data, K):
    # Compute the user-item matrix for the training data
    user_item_train = train_data.pivot(
        index='user_id', columns='item_id', values='rating').fillna(0)

    # Compute the mean rating for each user in the training data
    user_means = train_data.groupby('user_id')['rating'].mean()
    
    global_mean = train_data['rating'].mean()

    # Compute the cosine similarity between users
    user_sim = cosine_similarity(user_item_train)

    predicted_ratings = []
    ratings = []
    
    for _, row in test_data.iterrows():
        user_id = row['user_id']
        item_id = row['item_id']
        if user_id in user_item_train.index:
            if item_id in user_item_train.columns:
                user_item_train_sub = user_item_train.loc[:,item_id]
                
                users_rated_idx = user_item_train_sub[user_item_train_sub != 0].index
                
                user_ratings = user_item_train_sub[users_rated_idx]
                
                user_sims = user_sim[user_id-1][users_rated_idx-1]
                
                user_sims,user_ratings = zip(*sorted(
                    (zip(user_sims, user_ratings)), reverse=True)[:K])
                
                user_ratings = list(user_ratings)
                for i in range(len(user_ratings)):
                    user_ratings[i] = user_ratings[i] - \
                        user_means[users_rated_idx[i]]
                
                user_sims = np.array(user_sims)
                user_ratings = np.array(user_ratings)
                
                predicted_rating = (
                    user_ratings * user_sims).sum() / user_sims.sum()
                predicted_ratings.append(predicted_rating + user_means[user_id])
                ratings.append(row['rating'])
            else:
                predicted_ratings.append(user_means[user_id])
                ratings.append(row['rating'])
                
        else:
            predicted_ratings.append(global_mean)
            ratings.append(row['rating'])
    
    mae = mean_absolute_error(ratings , predicted_ratings)
    return mae
    


In [55]:
def item_based_recommendation(train_data, test_data, K):
    # Compute the user-item matrix for the training data
    user_item_train = train_data.pivot(
        index='user_id', columns='item_id', values='rating').fillna(0)
    
    item_sim = cosine_similarity(user_item_train.T)
    global_mean = train_data['rating'].mean()
    
    item_means = train_data.groupby('item_id')['rating'].mean()
    
    predicted_ratings = []
    ratings = []
    
    for _, row in test_data.iterrows():
        user_id = row['user_id']
        item_id = row['item_id']
        idxx = user_item_train.columns.get_loc(100)
        
        
        if item_id in user_item_train.columns:
            if user_id in user_item_train.index:
                user_item_train_sub = user_item_train.loc[user_id,:]

                item_idx = user_item_train_sub[user_item_train_sub != 0].index
                
                real_idx = []

                for i in item_idx:
                    real_idx.append(user_item_train_sub.index.get_loc(i))
                    
                # item_idx = real_idx

                item_ratings = user_item_train_sub[item_idx]
                
                
                real_idx = np.array(real_idx)
                # item_sims = item_sim[item_id-1][item_idx-1]
                item_sims = item_sim[user_item_train_sub.index.get_loc(
                    item_id)][real_idx]

                item_sims, item_ratings = zip(*sorted(
                    (zip(item_sims, item_ratings)), reverse=True)[:K])

                item_ratings = list(item_ratings)
                for i in range(len(item_ratings)):
                    item_ratings[i] = item_ratings[i] - \
                        item_means[item_idx[i]]

                item_sims = np.array(item_sims)
                item_ratings = np.array(item_ratings)

                predicted_rating = (
                    item_ratings * item_sims).sum() / item_sims.sum()
                predicted_ratings.append(
                    predicted_rating + item_means[item_id])
                ratings.append(row['rating'])
                
                
                
            else:
                predicted_ratings.append(item_means[item_id])
                ratings.append(row['rating'])
        
        else:
            predicted_ratings.append(global_mean)
            ratings.append(row['rating'])
            
    mae = mean_absolute_error(ratings, predicted_ratings)
    return mae

    


In [56]:
train_data = folds[2]['train']
test_data = folds[2]['test']

for _, row in test_data.iterrows():
    user_id = row['user_id']
    item_id = row['item_id']
    


user_item_train = train_data.pivot(
    index='user_id', columns='item_id', values='rating').fillna(0)

item_sim = cosine_similarity(user_item_train.T)

user_item_train_sub = user_item_train.loc[100, :]

item_idx = user_item_train_sub[user_item_train_sub != 0].index

real_idx = []
for i in item_idx:
    real_idx.append(user_item_train_sub.index.get_loc(i))
real_idx = np.array(real_idx)


item_ratings = user_item_train_sub[item_idx]
item_ratings = user_item_train_sub.iloc[real_idx]

item_ratings

item_sims = item_sim[user_item_train_sub.index.get_loc(100)][real_idx]
item_sims
# real_idx




array([0.0544435 , 0.35678835, 0.27460033, 0.43308358, 0.40204893,
       0.27148951, 0.14284582, 0.3402773 , 0.33620133, 0.2061753 ,
       0.31223473, 0.23812454, 0.23207875, 0.24850205, 0.23455783,
       0.28462826, 0.28496758, 0.22411936, 0.10917016, 0.10424581,
       0.2202853 , 0.20903456, 0.08383688, 0.04528133, 0.18261675,
       0.10243166, 0.270386  , 0.17361464, 0.19877264, 0.04856571,
       0.2193436 , 0.21044207, 0.09335747, 0.1252894 , 0.16820545,
       0.08245681, 0.02746908, 0.12270175, 0.09707112, 0.1449671 ,
       0.11044014, 0.09489157, 0.02263798, 0.07480155, 0.00937023,
       0.        , 0.        , 0.        , 0.        ])

In [57]:
def cross_validation(data, algorithm, K):
    # Initialize the list of MAEs
    maes = []

    # For each fold
    for fold in data:
        # Compute the MAE for the algorithm and the fold
        mae = algorithm(fold['train'], fold['test'], K)

        # Add the MAE to the list of MAEs
        maes.append(mae)

    avg_mae = np.mean(maes)
    maes.append(avg_mae)
    
    # Return the list of MAEs
    return maes

In [58]:
K = [10, 20, 30, 40, 50]

user_maes = []
item_maes = []
for k in K:
    user_maes.append(cross_validation(folds, user_based_recommendation, k))
    item_maes.append(cross_validation(folds, item_based_recommendation, k))


In [59]:
user_maes = np.array(user_maes)
results = pd.DataFrame({'K': K, 'Fold 1': user_maes[:,0], 'Fold 2': user_maes[:,1], 'Fold 3': user_maes[:,2], 'Fold 4': user_maes[:,3], 'Fold 5': user_maes[:,4], 'Average': user_maes[:,5]})
results.set_index('K', inplace=True)
results

Unnamed: 0_level_0,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Average
K,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10,0.784967,0.770047,0.768103,0.759655,0.766609,0.769876
20,0.766772,0.753878,0.748906,0.743244,0.749272,0.752414
30,0.764448,0.749988,0.744862,0.740816,0.745723,0.749167
40,0.763632,0.749495,0.743992,0.74022,0.746297,0.748727
50,0.762808,0.749746,0.74404,0.740529,0.746177,0.74866


In [60]:
item_maes = np.array(item_maes)
results = pd.DataFrame({'K': K, 'Fold 1': item_maes[:, 0], 'Fold 2': item_maes[:, 1], 'Fold 3': item_maes[:,
                       2], 'Fold 4': item_maes[:, 3], 'Fold 5': item_maes[:, 4], 'Average': item_maes[:, 5]})
results.set_index('K', inplace=True)
results


Unnamed: 0_level_0,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,Average
K,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10,0.75949,0.75104,0.760182,0.757165,0.762456,0.758067
20,0.744422,0.735169,0.74279,0.740425,0.743442,0.74125
30,0.741219,0.731949,0.737895,0.7364,0.738931,0.737279
40,0.73957,0.7308,0.735328,0.734636,0.73788,0.735643
50,0.738131,0.729627,0.734264,0.734168,0.737493,0.734737
