<a href="https://colab.research.google.com/github/alienverarslan/Movie_Recommender_Systems/blob/main/Item_to_Item_CollaborativeFilterng_with_GlobalBaseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn import neighbors as nei
from sklearn import metrics as met

In [2]:
data = pd.read_csv("train.csv")
data.head()

Unnamed: 0,userID,movieID,rating
0,4490,2109,4
1,5839,3471,4
2,5382,150,3
3,1262,1237,5
4,6005,2273,4


In [3]:
data.describe()

Unnamed: 0,userID,movieID,rating
count,990209.0,990209.0,990209.0
mean,3016.430061,1865.5635,3.581463
std,1728.237347,1096.00108,1.117173
min,1.0,1.0,1.0
25%,1565.0,1030.0,3.0
50%,2970.0,1835.0,4.0
75%,4535.0,2770.0,4.0
max,6040.0,3952.0,5.0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 990209 entries, 0 to 990208
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype
---  ------   --------------   -----
 0   userID   990209 non-null  int64
 1   movieID  990209 non-null  int64
 2   rating   990209 non-null  int64
dtypes: int64(3)
memory usage: 22.7 MB


In [5]:
#Train test split
test = data.sample(2000, random_state=42)
train = data.drop(test.index)
U_train = train.pivot("movieID", "userID", "rating")

In [6]:
print("All Data Utility Matrix:", data.pivot("movieID", "userID", "rating").shape)
print("Train Utility Matrix:", U_train.shape)

All Data Utility Matrix: (3705, 6040)
Train Utility Matrix: (3705, 6040)


In [60]:
#Define the Algorithm
def CollaborativeFiltering_GB(train_df, test_df, k=20, met='minkowski'):
  global_avereage = np.nanmean(np.array(train_df))
  myPreds = []
  realLabels = []
  for idx in test_df.index:
    movieId = test_df["movieID"].loc[idx]
    userId = test_df["userID"].loc[idx]

    user_avg_rating = np.nanmean(train_df.loc[:,userId].values)
    user_deviation = user_avg_rating - global_avereage 

    movie_avg_rating = train_df.loc[:,movieId].mean()
    
    movie_deviation = movie_avg_rating - global_avereage

    baseline_estimate = global_avereage + movie_deviation + user_deviation    

    filtered_train = train_df.dropna(subset=[userId])
    filtered_train = filtered_train - filtered_train.mean(axis=1).values.reshape(-1,1)
    filtered_train.fillna(0, inplace=True)    

    if k < len(filtered_train):
      nn = nei.NearestNeighbors(n_neighbors=k, metric=met, n_jobs=-1)
      nn.fit(filtered_train)   

      test_row = train_df.loc[movieId:movieId, :].copy()
      test_row[userId] = np.nan
      test_row = test_row - test_row.mean(axis=1).values.reshape(-1,1)
      test_row.fillna(0, inplace=True)

      distances, indices = nn.kneighbors(test_row, return_distance=True)

      movieIds = filtered_train.iloc[indices[0], :].index

      local_neighborhood = np.average(train_df.loc[movieIds, userId].values - baseline_estimate, weights=(1 - distances[0]))

      pred = baseline_estimate + local_neighborhood
      
    elif len(filtered_train) >= 10:
      nn = nei.NearestNeighbors(n_neighbors=len(filtered_train), metric=met, n_jobs=-1)
      nn.fit(filtered_train) 

      test_row = train_df.loc[movieId:movieId, :].copy()
      test_row[userId] = np.nan
      test_row = test_row - test_row.mean(axis=1).values.reshape(-1,1)
      test_row.fillna(0, inplace=True)

      distances, indices = nn.kneighbors(test_row, return_distance=True)

      movieIds = filtered_train.iloc[indices[0], :].index
      
      local_neighborhood = np.average(train_df.loc[movieIds, userId].values - baseline_estimate, weights=(1 - distances[0]))

      pred = baseline_estimate + local_neighborhood

    else:
      pred = baseline_estimate


    if pred > 5:
      pred = 5
    if pred < 1:
      pred = 1

    myPreds.append(pred)

    real_label = test_df["rating"].loc[idx]
    realLabels.append(real_label)
  
  return [realLabels, myPreds] 
  

In [62]:
#Finding the optimal parameters with the first 500 ratings of the test data
metrics = ['cosine', "minkowski"]
k_size = [10, 15, 20, 25, 30, 40, 50]
params = []
for m in metrics:
  for k in k_size:
    params.append([m, k])

print("params                    rmse")
results = []
for i in range(len(params)):
  labels, preds = CollaborativeFiltering_GB(train_df=U_train, test_df=test[:500], met=params[i][0], k=params[i][1])
  results.append(met.mean_squared_error(labels, preds, squared=False))  
  print(params[i], results[i])

params                    rmse
['cosine', 10] 0.8645186302458518
['cosine', 15] 0.8601450469690083
['cosine', 20] 0.8586520211991988
['cosine', 25] 0.8636390257226119
['cosine', 30] 0.8666540054375494
['cosine', 40] 0.8724940715913292
['cosine', 50] 0.8810464938283162
['minkowski', 10] 1.1528769671466366
['minkowski', 15] 1.1327905293224705
['minkowski', 20] 1.1194356635582652
['minkowski', 25] 1.1056510948466876
['minkowski', 30] 1.092826077291299
['minkowski', 40] 1.0877109935708014
['minkowski', 50] 1.0813488842681993


In [63]:
best_params = params[np.argmin(results)]
best_score = results[np.argmin(results)]
best_params

['cosine', 20]

In [64]:
#Fine Tune the optimal parameters 
metrics = ['cosine']
k_size = [17, 18, 19, 21, 22, 23]
params = []
for m in metrics:
  for k in k_size:
    params.append([m, k])

print("params                    rmse")
results2 = []
for i in range(len(params)):
  labels, preds = CollaborativeFiltering_GB(train_df=U_train, test_df=test[:500], met=params[i][0], k=params[i][1])
  results2.append(met.mean_squared_error(labels, preds, squared=False))  
  print(params[i], results2[i])

params                    rmse
['cosine', 17] 0.8586020631867272
['cosine', 18] 0.8575839777786033
['cosine', 19] 0.8577686564961469
['cosine', 21] 0.8604309355958599
['cosine', 22] 0.862125718203779
['cosine', 23] 0.8619870076275947


In [68]:
best_params2 = params[np.argmin(results2)]
best_score2 = results2[np.argmin(results2)]
print("Best Parameters:", best_params2)
print("Best RMSE:", best_score2)

Best Parameters: ['cosine', 18]
Best RMSE: 0.8575839777786033


In [70]:
#Predict the whole test data
labels, preds = CollaborativeFiltering_GB(train_df=U_train, test_df=test, met=best_params2[0], k=best_params2[1])
print("RMSE on test data:",met.mean_squared_error(labels, preds, squared=False))  


RMSE on test data: 0.8721163224820848
