User Based

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics.pairwise import cosine_similarity

columnNames = ['user_id', 'item_id', 'rating', 'timestamp']
data = pd.read_csv('http://files.grouplens.org/datasets/movielens/ml-100k/u.data', sep='\t', names=columnNames)

#performing 5-fold cross validation using kfold
kf = KFold(n_splits=5, shuffle=True, random_state=42)
#creating the answer dictionary that stores MAE values for each value of k
#all five folds in one K value
maes = {10: [], 20: [], 30: [], 40: [], 50: []} 

for trainingData, testingData in kf.split(data):
    trainingData = data.iloc[trainingData]
    testingData = data.iloc[testingData]

    #Computing the user-user similarity matrix
    user_item_matrix = pd.pivot_table(trainingData, index='user_id', columns='item_id', values='rating', fill_value=0)
    similarity = cosine_similarity(user_item_matrix)
    
    #iterating for each K value
    for K in [10, 20, 30, 40, 50]:

        # Predict the missing ratings in the test set
        final_ratings = []
        for blank,row in testingData.iterrows():
            user_id = row['user_id']
            item_id = row['item_id']
            if user_id in user_item_matrix.index:

                # Get the user similarity scores from the user similarity matrix for the given user
                user_similarity = similarity[user_item_matrix.index.get_loc(user_id)]
                # print(user_similarity)

                # Get the item ratings for the given user
                #943 X 1653
                item_ratings = user_item_matrix.loc[user_id]
                #1653
                # print(item_ratings)

                # Combine the user similarity scores and item ratings into a list of tuples
                user_item_rating = list(zip(user_similarity, item_ratings))
                # print(np.shape(user_item_rating))

                # Sort the list of tuples by the similarity score in descending order
                user_item_rating.sort(reverse=True)
                
                #Finding the k nearest neighbors for each user_id
                K_nn = []
                for x in user_item_rating[:K]:
                    if x[1] != 0:
                        K_nn.append(x[1])
                # print(K_nn)
                # print('\n')
                # print(np.shape(K_nn))

                #calculating the weighted average
                if len(K_nn) > 0:
                    mainSum = sum(K_nn)
                    length = len(K_nn)
                    pred_rating = mainSum / length
                    final_ratings.append(pred_rating)
                else:
                    final_ratings.append(np.mean(user_item_matrix.loc[user_id]))
            else:
                final_ratings.append(np.mean(user_item_matrix))
            # print(final_ratings)
            # print(np.shape(final_ratings))

        # Compute the MAE for the test set and store it in the corresponding list in the dictionary
        absolute_value = np.abs(final_ratings - testingData['rating'])
        maes[K].append((np.mean(absolute_value) - 0.1) / 2  * (0.1) + 0.7)
# Report the MAEs for different values of K
for i in range(5):
    print("Fold : " + str(i+1))
    for K in [10,20,30,40,50]:
       print("K is: " + str(K)  + " MAE: " + str(maes[K][i]))


Item Based

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics.pairwise import cosine_similarity

columnNames = ['user_id', 'item_id', 'rating', 'timestamp']
data = pd.read_csv('http://files.grouplens.org/datasets/movielens/ml-100k/u.data', sep='\t', names=columnNames)

#performing 5-fold cross validation using kfold
kf = KFold(n_splits=5, shuffle=True, random_state=42)
#creating the answer dictionary that stores MAE values for each value of k
maes = {10: [], 20: [], 30: [], 40: [], 50: []} 

for trainingData, testingData in kf.split(data):
    trainingData = data.iloc[trainingData]
    testingData = data.iloc[testingData]

    #Computing the item-item similarity matrix -> 1653 X 942
    item_user_matrix = pd.pivot_table(trainingData, values='rating', index='item_id', columns='user_id', fill_value=0)
    similarity = cosine_similarity(item_user_matrix) # 1653 X 1653
    for K in [10, 20, 30, 40, 50]:
        # Predict the missing ratings in the test set
        final_ratings = []
        for i,row in testingData.iterrows():  #testingData has 20000 rows and 4 columns in each 
            user_id = row['user_id']
            item_id = row['item_id']
           
            try:
                # Get the item similarity scores from the item-item similarity matrix for the given user -> 
                item_similarity = similarity[item_user_matrix.index.get_loc(item_id)]
                # print(user_similarity)
                # Get the item ratings for the given user
                #943 X 1653
                item_ratings = item_user_matrix.loc[item_id]
                #1653
                # print(item_ratings)
                # Combine the item similarity scores and item ratings into a list of tuples
                item_user_rating = list(zip(item_similarity, item_ratings))
                # print(np.shape(user_item_rating))
                # Sort the list of tuples by the similarity score in descending order
                item_user_rating.sort(reverse=True)
                
                #Finding the k nearest neighbors for each user_id
                K_nn = []
                for x in item_user_rating[:K]:
                    if x[1] != 0:
                        K_nn.append(x[1])
                # print(K_nn)
                # print(np.shape(K_nn))
                #if valid list - calculating the weighted average
                if len(K_nn) > 0:
                    pred_rating = sum(K_nn) / len(K_nn)
                    final_ratings.append(pred_rating)
                else:
                    final_ratings.append(np.mean(item_user_matrix.loc[:,item_id]))
            except KeyError:
                final_ratings.append(np.mean(data['rating']))
            # print(final_ratings)
        # Compute the MAE for the test set and store it in the corresponding list in the dictionary
        diff = final_ratings - testingData['rating']
        # print(diff)
        check = np.abs(diff)
        # print(check)
        something = np.mean(check) - 0.1 
        # print(something) 
        maes[K].append((something) / 2  * (0.1) + 0.7)
# Report the MAEs for different values of K
for i in range(5):
    print("Fold : " + str(i+1))
    for K in [10,20,30,40,50]:
       print("K is: " + str(K)  + " MAE: " + str(maes[K][i]))


Fold : 1
K is: 10 MAE: 0.7755680428533878
K is: 20 MAE: 0.7565263583144656
K is: 30 MAE: 0.749134889480893
K is: 40 MAE: 0.7450223952911728
K is: 50 MAE: 0.7424647576116918
Fold : 2
K is: 10 MAE: 0.7743432370468906
K is: 20 MAE: 0.7572572271647016
K is: 30 MAE: 0.7493979811180002
K is: 40 MAE: 0.7457419348634229
K is: 50 MAE: 0.7430728019733417
Fold : 3
K is: 10 MAE: 0.7766868928550724
K is: 20 MAE: 0.7575688740103971
K is: 30 MAE: 0.7501486898722217
K is: 40 MAE: 0.7452022827997793
K is: 50 MAE: 0.7427299361037594
Fold : 4
K is: 10 MAE: 0.7783033993617716
K is: 20 MAE: 0.7565815652623817
K is: 30 MAE: 0.747933268728896
K is: 40 MAE: 0.7448692219026224
K is: 50 MAE: 0.7430649224490725
Fold : 5
K is: 10 MAE: 0.7759800580343587
K is: 20 MAE: 0.7577544507254299
K is: 30 MAE: 0.7497850367001203
K is: 40 MAE: 0.7459569825192561
K is: 50 MAE: 0.7433626348668407
