In [7]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error

In [8]:
def compute_similarity_matrix(interaction_matrix):
    interaction_matrix_filled = interaction_matrix.fillna(0)

    user_similarity = cosine_similarity(interaction_matrix_filled)

    return pd.DataFrame(user_similarity,index=interaction_matrix.index,columns = interaction_matrix.index)

In [9]:
def make_predictions(interaction_matrix, user_similarity_matrix):
    # Create a copy of the interaction matrix to store predicted ratings
    predicted_matrix = interaction_matrix.copy()

    for userid in interaction_matrix.index:
        similar_users = user_similarity_matrix[userid]

        for itemid in interaction_matrix.columns:
            if pd.notna(interaction_matrix.loc[userid, itemid]):
                continue

            ratings_by_similar_users = interaction_matrix[itemid][similar_users.index]

            numerator = np.dot(ratings_by_similar_users.fillna(0), similar_users)
            denominator = similar_users[ratings_by_similar_users.notna()].sum()

            if denominator != 0:
                predicted_matrix.loc[userid, itemid] = numerator / denominator
            else:
                predicted_matrix.loc[userid, itemid] = np.nan  # Leave as NaN if no similar users have rated this item

    return predicted_matrix


In [None]:
def calculate_mse(predicted_train_matrix,test_interaction_matrix):
  test_interaction_matrix = test_interaction_matrix.fillna(0)
  # Calculate MSE
  mse_test = mean_squared_error(test_interaction_matrix.values, predicted_train_matrix.values)
  return mse_test

In [11]:
def get_top_N_recommendations(predicted,original,n=5):
  top_n_recommendations = {}

  for userid in predicted.index:
    user_ratings = predicted.loc[userid]
    already_rated = original.loc[userid]

    user_ratings = user_ratings[~already_rated]

    top_n_items = user_ratings.nlargest(n).index
    top_n_recommendations[userid] = top_n_items.tolist()

  return top_n_recommendations

In [12]:
# load the ratings
ratings = pd.read_csv("traina.csv")
ratings = ratings.drop(columns="timestamp")
ratings.head()

Unnamed: 0,userid,itemid,rating
0,1,1,5
1,1,2,3
2,1,3,4
3,1,4,3
4,1,5,3


In [13]:
# convert into interaction matrix
interaction_matrix = ratings.pivot(index="userid",columns="itemid",values="rating")
interaction_matrix.head()

itemid,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [14]:
# compute similarity matrix
user_similarity_matrix = compute_similarity_matrix(interaction_matrix)
user_similarity_matrix.head()

userid,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.146751,0.050677,0.051298,0.364836,0.412213,0.438001,0.295494,0.082464,0.361966,...,0.349292,0.087165,0.260172,0.169418,0.196876,0.103213,0.301227,0.038705,0.172718,0.380396
2,0.146751,1.0,0.125808,0.117674,0.049376,0.223628,0.102842,0.086079,0.095941,0.122703,...,0.116036,0.261259,0.302436,0.338562,0.293885,0.239819,0.191536,0.173707,0.173185,0.080943
3,0.050677,0.125808,1.0,0.236743,0.023378,0.072965,0.062271,0.073452,0.0,0.053468,...,0.035452,0.052882,0.138932,0.076569,0.085671,0.017493,0.152912,0.027988,0.124816,0.029693
4,0.051298,0.117674,0.236743,1.0,0.013061,0.0,0.050802,0.154807,0.0,0.01713,...,0.014671,0.054709,0.132054,0.142586,0.092633,0.0,0.125996,0.10424,0.133524,0.015798
5,0.364836,0.049376,0.023378,0.013061,1.0,0.232726,0.36129,0.22667,0.079715,0.188558,...,0.317783,0.035441,0.091655,0.092367,0.140991,0.053634,0.244862,0.086167,0.136153,0.302223


In [15]:
# make predictions and fill the interaction matrix
predicted_ratings = make_predictions(interaction_matrix,user_similarity_matrix)
predicted_ratings.head()

itemid,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,3.0,4.0,3.0,2.0,3.0,1.0,3.0,2.0,3.0,3.0
2,4.0,3.15553,2.981008,3.537192,3.225757,3.43767,3.758011,4.075177,3.874298,2.0,...,3.0,4.0,3.0,2.0,3.0,1.0,3.0,2.0,3.0,3.0
3,3.837115,3.124045,3.089907,3.514577,3.163248,3.207143,3.805887,4.017196,3.862357,3.746249,...,3.0,4.0,3.0,2.0,3.0,1.0,3.0,2.0,3.0,3.0
4,3.833326,3.103673,3.038599,3.500304,3.142846,3.257575,3.811564,4.007069,3.863064,3.584306,...,3.0,4.0,3.0,2.0,3.0,1.0,3.0,2.0,3.0,3.0
5,3.919051,3.185836,3.007481,3.561022,3.257549,3.369034,3.877851,4.030065,3.886632,3.819993,...,3.0,4.0,3.0,2.0,3.0,1.0,3.0,2.0,3.0,3.0


In [16]:
predicted_ratings.isna().sum().sum()

3221

In [17]:
pred_ratings = predicted_ratings.copy()

In [18]:
pred_ratings = np.round(pred_ratings)
pred_ratings.head()

itemid,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,3.0,4.0,3.0,2.0,3.0,1.0,3.0,2.0,3.0,3.0
2,4.0,3.0,3.0,4.0,3.0,3.0,4.0,4.0,4.0,2.0,...,3.0,4.0,3.0,2.0,3.0,1.0,3.0,2.0,3.0,3.0
3,4.0,3.0,3.0,4.0,3.0,3.0,4.0,4.0,4.0,4.0,...,3.0,4.0,3.0,2.0,3.0,1.0,3.0,2.0,3.0,3.0
4,4.0,3.0,3.0,4.0,3.0,3.0,4.0,4.0,4.0,4.0,...,3.0,4.0,3.0,2.0,3.0,1.0,3.0,2.0,3.0,3.0
5,4.0,3.0,3.0,4.0,3.0,3.0,4.0,4.0,4.0,4.0,...,3.0,4.0,3.0,2.0,3.0,1.0,3.0,2.0,3.0,3.0


In [19]:
test_data = pd.read_csv("testa.csv")
test_data = test_data.drop(columns=['timestamp'])
test_data.head()

Unnamed: 0,userid,itemid,rating
0,1,20,4
1,1,33,4
2,1,61,4
3,1,117,3
4,1,155,2


In [20]:
test_interaction_matrix = test_data.pivot(index='userid',columns='itemid',values='rating')
test_interaction_matrix.head()

itemid,1,2,3,4,5,6,7,8,9,10,...,1591,1592,1600,1612,1617,1646,1653,1656,1662,1664
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


In [21]:
pred_ratings = pred_ratings.reindex_like(test_interaction_matrix)
pred_ratings = pred_ratings.fillna(0)
pred_ratings.head()
print(pred_ratings.isna().sum().sum())

0


In [24]:
mse_test = calculate_mse(pred_ratings,test_interaction_matrix)
print("Mean Squared Error on test set: ",mse_test)

Mean Squared Error on test set:  11.111615399282577
