In [1]:
import pandas as pd
import numpy as np
from math import sqrt

dataPath = "https://files.grouplens.org/datasets/movielens/ml-100k/u.data"
columnNames = ["user_id", "item_id", "rating", "timestamp"]
data = pd.read_csv(dataPath, sep="\t", names=columnNames)

# Remove timestamp column
data.drop(columns=["timestamp"], inplace=True)

print(data.head())
print(data.info())
print(f"Number of unique users: {data['user_id'].nunique()}")
print(f"Number of unique items: {data['item_id'].nunique()}")


   user_id  item_id  rating
0      196      242       3
1      186      302       3
2       22      377       1
3      244       51       2
4      166      346       1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype
---  ------   --------------   -----
 0   user_id  100000 non-null  int64
 1   item_id  100000 non-null  int64
 2   rating   100000 non-null  int64
dtypes: int64(3)
memory usage: 2.3 MB
None
Number of unique users: 943
Number of unique items: 1682


In [2]:
# Create a user-item matrix
userItemMatrix = data.pivot(index='user_id', columns='item_id', values='rating')

print(userItemMatrix.head())

item_id  1     2     3     4     5     6     7     8     9     10    ...  \
user_id                                                              ...   
1         5.0   3.0   4.0   3.0   3.0   5.0   4.0   1.0   5.0   3.0  ...   
2         4.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   2.0  ...   
3         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
4         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
5         4.0   3.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   

item_id  1673  1674  1675  1676  1677  1678  1679  1680  1681  1682  
user_id                                                              
1         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
2         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
3         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
4         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
5         NaN   NaN   NaN   NaN   NaN   NaN   N

In [3]:
testSize = 0.2
trainMatrix = userItemMatrix.copy()

# Select 20% of users and mask 20% of their ratings
testUsers = np.random.choice(userItemMatrix.index, size=int(testSize * userItemMatrix.shape[0]), replace=False)

testData = []
for user in testUsers:
    userRatings = trainMatrix.loc[user]
    ratedItems = userRatings[userRatings.notna()].index
    testItems = np.random.choice(ratedItems, size=int(testSize * len(ratedItems)), replace=False)
    for item in testItems:
        testData.append((user, item, userItemMatrix.loc[user, item]))
        # Mask the rating
        trainMatrix.loc[user, item] = np.nan

testData = pd.DataFrame(testData, columns=["user_id", "item_id", "rating"])
print(testData.head())


   user_id  item_id  rating
0      304      294     4.0
1      304      300     5.0
2      304      323     3.0
3      304      259     1.0
4      304      274     4.0


In [4]:
def calculateSimilarityMatrix(matrix):
    users = matrix.index
    userData = matrix.to_numpy()
    #Initialize the similarity matrix
    similarityMatrix = np.zeros((len(users), len(users)))

    for i in range(len(users)):
        for j in range(i + 1, len(users)):
            user1 = userData[i]
            user2 = userData[j]

            # Fetch items rated by both users
            commonItems = (~np.isnan(user1) & ~np.isnan(user2))
            # Continue if users have no common ratings
            if np.sum(commonItems) == 0:
                continue

            # Cosine similarity
            vec1 = user1[commonItems]
            vec2 = user2[commonItems]
            norm1 = np.sqrt(np.sum(vec1**2))
            norm2 = np.sqrt(np.sum(vec2**2))


            if norm1 != 0 and norm2 != 0:
                similarity = np.dot(vec1, vec2)/(norm1*norm2)
                similarityMatrix[i,j] = similarity
                similarityMatrix[j,i] = similarity

    return pd.DataFrame(similarityMatrix, index=users, columns=users)

userSimilarity = pd.DataFrame(calculateSimilarityMatrix(trainMatrix), index=trainMatrix.index, columns=trainMatrix.index)

print(userSimilarity.head())


user_id       1         2         3         4         5         6         7    \
user_id                                                                         
1        0.000000  0.960582  0.857075  0.919264  0.932614  0.953845  0.949546   
2        0.960582  0.000000  0.935601  0.946756  0.984803  0.955547  0.980502   
3        0.857075  0.935601  0.000000  0.919528  1.000000  0.890713  0.966636   
4        0.919264  0.946756  0.919528  0.000000  0.994692  0.936996  0.932050   
5        0.932614  0.984803  1.000000  0.994692  0.000000  0.933627  0.918380   

user_id       8         9         10   ...       934       935       936  \
user_id                                ...                                 
1        0.979932  0.976404  0.968304  ...  0.942682  0.875338  0.965889   
2        0.946414  0.940056  0.982614  ...  0.950765  0.957073  0.967400   
3        0.914207  0.919145  0.902444  ...  0.964901  0.948683  0.896328   
4        0.966988  0.993884  0.969117  ...  0.982905

In [50]:
def predictRating(user, item, k):
    # Sort similar users
    similarUsers = userSimilarity[user].sort_values(ascending=False)

    # Filter only users who have rated the target item
    similarUsers = similarUsers[~userItemMatrix.loc[similarUsers.index, item].isna()]
    # Filter top k most similar users
    similarUsers = similarUsers[:k]

    # Compute weighted average of ratings
    rating = userItemMatrix.loc[similarUsers.index, item]
    weight = similarUsers.values

    numerator = np.dot(rating, weight)
    denominator = np.sum(np.abs(weight))
    if denominator != 0:
      return numerator / denominator
    return np.nan

testData["predicted_rating"] = testData.apply(lambda row: predictRating(row["user_id"], row["item_id"], 5), axis=1)


In [51]:
def calculateRMSE(actualRatings, predictedRatings):
    mask = ~np.isnan(predictedRatings)
    return sqrt((np.mean((actualRatings[mask] - predictedRatings[mask])** 2)))

rmse = calculateRMSE(testData["rating"], testData["predicted_rating"])
print("RMSE:" + str(rmse))

RMSE:1.0670423125067228


Analysis:
My model achieved a RMSE value of between 1.05 - 1.1
This means that the actual ratings deviate from the predicted ratings by around 1.05 to 1.1 points. For example if the actual rating of a movie is 4.0 then the predicted rating could be in the range 2.95 to 5.0. Obviously a lower RMSE value is better, guaranteeing a closer/better prediction. This RMSE value is decent, but could be further optimized by tuning the k value or other parameters.

Strengths:
It effectively recommends items to users that they might not have discovered on their own. It doesn't require specific information/data about the items themselves like item-based collaborative filtering does, meaning it can be applicable to variety of different areas.

Weaknesses:
Not good at recommending items to new users that don't have much data to indicate their preferences. Not good if the data is too sparse, because there will not be many similar users to generate recommendations. The scalability can be an issue when the number of users grows, since you need to calculate similarity between every user.