#**Download The Dataset From Kaggle**

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)  # Force remount if already mounted
!pip install kaggle
!mkdir -p ~/.kaggle  # Use -p to avoid error if directory exists
# Make sure to place your kaggle.json file in the specified path in your Google Drive
!cp /content/drive/MyDrive/kaggle/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d shubhammehta21/movie-lens-small-latest-dataset --force  # Use --force to overwrite existing files
!unzip -o movie-lens-small-latest-dataset.zip  # Use -o to overwrite existing files


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
mkdir: cannot create directory ‘/root/.kaggle’: File exists
Dataset URL: https://www.kaggle.com/datasets/shubhammehta21/movie-lens-small-latest-dataset
License(s): unknown
movie-lens-small-latest-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  movie-lens-small-latest-dataset.zip
replace README.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

#**Imports**

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb


# **Load The Datasets**

In [None]:
ratings = pd.read_csv('ratings.csv')
movies = pd.read_csv('movies.csv')

#**Model Implementation**

#**Encode movieId and userId**

In [None]:
user_enc = LabelEncoder()
ratings['user'] = user_enc.fit_transform(ratings['userId'].values)
movie_enc = LabelEncoder()
ratings['movie'] = movie_enc.fit_transform(ratings['movieId'].values)
num_users = ratings['user'].nunique()
num_movies = ratings['movie'].nunique()

#**Create item-user matrix**

In [None]:
Y = np.zeros((num_movies, num_users))
for row in ratings.itertuples():
    Y[row.movie, row.user] = row.rating

R = (Y > 0).astype(int)

#**Normalize Ratings**

In [None]:
def normalizeRatings(Y, R):
    Ymean = np.sum(Y, axis=1) / np.sum(R, axis=1)
    Ymean = Ymean.reshape(-1, 1)
    Ynorm = (Y - Ymean) * R
    return Ynorm, Ymean

Ynorm, Ymean = normalizeRatings(Y, R)

#**Train-Validation-Test Split**

In [None]:
train_indices, temp_indices = train_test_split(np.arange(num_users), test_size=0.4, random_state=42)
val_indices, test_indices = train_test_split(temp_indices, test_size=0.5, random_state=42)


Y_train = Y[:, train_indices]
R_train = R[:, train_indices]
Y_val = Y[:, val_indices]
R_val = R[:, val_indices]
Y_test = Y[:, test_indices]
R_test = R[:, test_indices]

#**For Consistent Results**

In [None]:
tf.random.set_seed(1234)

#**Define Number Of Features**

In [None]:
num_features = 100

#**Initial Parameters (W, X, b)**

In [None]:
W = tf.Variable(tf.random.normal((num_users, num_features), dtype=tf.float64), name='W')
X = tf.Variable(tf.random.normal((num_movies, num_features), dtype=tf.float64), name='X')
b = tf.Variable(tf.random.normal((1, num_users), dtype=tf.float64), name='b')

#**Define Cost function**

In [None]:
def cost_func(X, W, b, Y, R, lambda_):
    Y = np.nan_to_num(Y)
    j = (tf.linalg.matmul(X, tf.transpose(W)) + b - Y) * R
    J = 0.5 * tf.reduce_sum(j**2) + (lambda_/2) * (tf.reduce_sum(X**2) + tf.reduce_sum(W**2))
    return J

#**Instantiate The optimizer**

In [None]:
optimizer = keras.optimizers.Adam(learning_rate=1e-1)
epochs = 200
lambda_ = 1

#**KNN & DT**

In [None]:
Ymean = np.mean(Y, axis=1)

user_ids_train, item_ids_train = np.where(R_train == 1)
ratings_train = Y_train[R_train == 1]

ratings_train_normalized = ratings_train - Ymean[item_ids_train]

knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(np.column_stack((user_ids_train, item_ids_train)), ratings_train_normalized)

dt = DecisionTreeRegressor(random_state=42)
dt.fit(np.column_stack((user_ids_train, item_ids_train)), ratings_train_normalized)

user_ids_test, item_ids_test = np.where(R_test == 1)
ratings_test = Y_test[R_test == 1]

Y_pred_knn = knn.predict(np.column_stack((user_ids_test, item_ids_test))) + Ymean[item_ids_test]
Y_pred_knn = np.clip(Y_pred_knn, 0.5, 5.0)

Y_pred_knn_flat = Y_pred_knn.flatten()
ratings_test_flat = ratings_test.flatten()

rmse_knn = np.sqrt(mean_squared_error(ratings_test_flat, Y_pred_knn_flat))
mse_knn = mean_squared_error(ratings_test_flat, Y_pred_knn_flat)
mae_knn = mean_absolute_error(ratings_test_flat, Y_pred_knn_flat)
r2_knn = r2_score(ratings_test_flat, Y_pred_knn_flat)
relative_mae_knn = mae_knn / np.mean(ratings_test_flat)

Y_pred_dt = dt.predict(np.column_stack((user_ids_test, item_ids_test))) + Ymean[item_ids_test]
Y_pred_dt = np.clip(Y_pred_dt, 0.5, 5.0)

Y_pred_dt_flat = Y_pred_dt.flatten()

rmse_dt = np.sqrt(mean_squared_error(ratings_test_flat, Y_pred_dt_flat))
mse_dt = mean_squared_error(ratings_test_flat, Y_pred_dt_flat)
mae_dt = mean_absolute_error(ratings_test_flat, Y_pred_dt_flat)
r2_dt = r2_score(ratings_test_flat, Y_pred_dt_flat)
relative_mae_dt = mae_dt / np.mean(ratings_test_flat)

print(f"KNN RMSE: {rmse_knn}, MSE: {mse_knn}, MAE: {mae_knn}, Relative MAE: {relative_mae_knn}")
print("__________________________________")
print(f"Decision Tree RMSE: {rmse_dt}, MSE: {mse_dt}, MAE: {mae_dt}, Relative MAE: {relative_mae_dt}")


KNN RMSE: 1.1305569458931535, MSE: 1.2781590079072547, MAE: 0.896549606769004, Relative MAE: 0.25543482191137723
__________________________________
Decision Tree RMSE: 1.4545092597654767, MSE: 2.115597186743515, MAE: 1.1316111124567982, Relative MAE: 0.3224059001319848


#**SVR**

In [None]:
Ymean = np.mean(Y, axis=1)

user_ids_train, item_ids_train = np.where(R_train == 1)
ratings_train = Y_train[R_train == 1]

ratings_train_normalized = ratings_train - Ymean[item_ids_train]

svr = SVR(kernel='rbf', C=1.0, epsilon=0.2)
svr.fit(np.column_stack((user_ids_train, item_ids_train)), ratings_train_normalized)

user_ids_test, item_ids_test = np.where(R_test == 1)
ratings_test = Y_test[R_test == 1]

Y_pred_svr = svr.predict(np.column_stack((user_ids_test, item_ids_test))) + Ymean[item_ids_test]
Y_pred_svr = np.clip(Y_pred_svr, 0.5, 5.0)

rmse_svr = np.sqrt(mean_squared_error(ratings_test, Y_pred_svr))
mse_svr = mean_squared_error(ratings_test, Y_pred_svr)
mae_svr = mean_absolute_error(ratings_test, Y_pred_svr)
mean_actual_ratings = np.mean(ratings_test)
relative_mae = mae_svr / mean_actual_ratings

print(f"SVR Model RMSE: {rmse_svr}, MSE: {mse_svr}, MAE: {mae_svr},Relative MAE: {relative_mae}")


SVR Model RMSE: 1.0669038200834173, MSE: 1.1382837613085888, MAE: 0.8444090950877963,Relative MAE: 0.24057953424508205


#**Training loop**

In [None]:
Ynorm, Ymean = normalizeRatings(Y, R)
training_losses = []
validation_losses = []

for epoch in range(epochs):
    with tf.GradientTape() as tape:
        cost_value = cost_func(X, W, b, Ynorm, R, lambda_)
    grads = tape.gradient(cost_value, [X, W, b])
    training_losses.append(cost_value.numpy())
    optimizer.apply_gradients(zip(grads, [X, W, b]))
    val_cost_value = cost_func(X, tf.gather(W, val_indices, axis=0), tf.gather(b, val_indices, axis=1), Ynorm[:, val_indices], R_val, lambda_)
    validation_losses.append(val_cost_value.numpy())
    if epoch % 20 == 0:
        print(f"Training loss at epoch {epoch+20}: {cost_value:0.1f}")
        print(f"Validation loss at epoch {epoch+20}: {val_cost_value:0.1f}")
        print("_______________________________________________________________")


Training loss at epoch 20: 5673114.3
Validation loss at epoch 20: 1037053.3
_______________________________________________________________
Training loss at epoch 40: 286796.8
Validation loss at epoch 40: 216710.6
_______________________________________________________________
Training loss at epoch 60: 110854.0
Validation loss at epoch 60: 91674.2
_______________________________________________________________
Training loss at epoch 80: 54378.4
Validation loss at epoch 80: 44372.1
_______________________________________________________________
Training loss at epoch 100: 31172.3
Validation loss at epoch 100: 24716.7
_______________________________________________________________
Training loss at epoch 120: 19957.1
Validation loss at epoch 120: 15357.3
_______________________________________________________________
Training loss at epoch 140: 13910.6
Validation loss at epoch 140: 10377.6
_______________________________________________________________
Training loss at epoch 160: 10421.3

#**Make predictions**

In [None]:
p = np.matmul(X.numpy(), np.transpose(W.numpy())) + b.numpy()
pm = p + Ymean
pm = np.clip(pm, 0.5, 5.0)


# **Evaluation**

In [None]:
Y_pred = pm[:, test_indices]
Y_true = Y[:, test_indices]

Y_pred_flat = Y_pred[R_test == 1]
Y_true_flat = Y_true[R_test == 1]

rmse = np.sqrt(mean_squared_error(Y_true_flat, Y_pred_flat))
mse = mean_squared_error(Y_true_flat, Y_pred_flat)
mae = mean_absolute_error(Y_true_flat, Y_pred_flat)
r2 = r2_score(Y_true_flat, Y_pred_flat)

print(f"RMSE: {rmse}")
print("__________________________________")
print(f"MSE: {mse}")
print("__________________________________")
print(f"MAE: {mae}")
print("__________________________________")
print(f"R2 Score: {r2}")


RMSE: 0.088557551509485
__________________________________
MSE: 0.007842439929355089
__________________________________
MAE: 0.06425721929156199
__________________________________
R2 Score: 0.9925705810833888


In [None]:
# Calculate baseline MAE
baseline_predictions = np.tile(Ymean, (1, Y.shape[1]))
baseline_predictions_flat = baseline_predictions[R == 1]
baseline_mae = mean_absolute_error(Y[R == 1], baseline_predictions_flat)
# Calculate model MAE
model_mae = mean_absolute_error(Y_true_flat, Y_pred_flat)
# Calculate relative MAE
relative_mae = model_mae / baseline_mae


print(f"Relative MAE: {relative_mae}")

Relative MAE: 0.09637277060766568


#**Plots**

# **Training loss**


In [None]:
plt.plot(range(epochs), training_losses, label='Training Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training Loss vs. Epochs')
plt.legend()
plt.show()

# **Training and validation loss**

In [None]:
plt.plot(range(epochs), training_losses, label='Training Loss')
plt.plot(range(epochs), validation_losses, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training and Validation Loss vs. Epochs')
plt.legend()
plt.show()

#**Predicted vs. Actual Ratings**

In [None]:
plt.figure(figsize=(10, 6))
plt.scatter(Y_true_flat, Y_pred_flat, alpha=0.5)
plt.plot([min(Y_true_flat), max(Y_true_flat)], [min(Y_true_flat), max(Y_true_flat)], color='red', linestyle='--')
plt.xlabel('Actual Ratings')
plt.ylabel('Predicted Ratings')
plt.title('Predicted vs. Actual Ratings')
plt.show()

#**Histogram Of Residuals**

In [None]:
residuals = Y_true_flat - Y_pred_flat

plt.figure(figsize=(10, 6))
plt.hist(residuals, bins=50, alpha=0.75)
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.title('Distribution of Residuals')
plt.show()

#**Predicted vs. Actual Ratings for A Sample User**

In [None]:
user_id = test_indices[0]  # any user in the test set

actual_ratings = Y[:, user_id]
predicted_ratings = pm[:, user_id]

rated_indices = R[:, user_id] == 1
actual_ratings = actual_ratings[rated_indices]
predicted_ratings = predicted_ratings[rated_indices]

plt.figure(figsize=(10, 6))
plt.scatter(actual_ratings, predicted_ratings, alpha=0.5)
plt.plot([min(actual_ratings), max(actual_ratings)], [min(actual_ratings), max(actual_ratings)], color='red', linestyle='--')
plt.xlabel('Actual Ratings')
plt.ylabel('Predicted Ratings')
plt.title(f'Predicted vs. Actual Ratings for User {user_id}')
plt.show()

#**Enhanced visualization for predicted and actual ratings**

In [None]:
user_id = test_indices[22]  # any user in the test set

actual_ratings = Y[:, user_id]
predicted_ratings = pm[:, user_id]

rated_indices = R[:, user_id] == 1
actual_ratings = actual_ratings[rated_indices]
predicted_ratings = predicted_ratings[rated_indices]
movie_ids = np.arange(len(Y))[rated_indices]

data = pd.DataFrame({
    'Movie ID': movie_ids,
    'Actual Ratings': actual_ratings,
    'Predicted Ratings': predicted_ratings
})

data_melted = data.melt(id_vars='Movie ID', value_vars=['Actual Ratings', 'Predicted Ratings'], var_name='Type', value_name='Rating')

plt.figure(figsize=(14, 7))
sns.barplot(x='Movie ID', y='Rating', hue='Type', data=data_melted)

plt.xticks(rotation=90)

plt.xlabel('Movie ID')
plt.ylabel('Ratings')
plt.title(f'Actual vs. Predicted Ratings for User {user_id}')
plt.legend()
plt.show()

#**Recommending Movies To The Random User From Test set**
recommend the top 10 movies for a random user from the test set.

The recommended movies are among the
top-rated movies with at least 50 ratings

In [None]:
random_user = np.random.choice(test_indices)
user_ratings = pm[:, random_user]
watched_movies = R[:, random_user] == 1
unwatched_movies = np.where(watched_movies == 0)[0]

movie_ratings = ratings.groupby('movieId').agg({'rating': ['mean', 'count']})
movie_ratings.columns = ['mean_rating', 'count_rating']
top_rated_movies = movie_ratings[movie_ratings['count_rating'] >= 50].sort_values(by='mean_rating', ascending=False).index

recommended_movies = np.array([movie for movie in unwatched_movies if movie in top_rated_movies])
top_10_recommendations = np.argsort(user_ratings[recommended_movies])[-10:][::-1]

recommendations = pd.DataFrame({
    'movieId': recommended_movies[top_10_recommendations],
    'predicted_rating': user_ratings[recommended_movies][top_10_recommendations]
})

recommendations = recommendations.merge(movies, left_on='movieId', right_on='movieId')
recommendations = recommendations.merge(movie_ratings, left_on='movieId', right_index=True)

print(f"Top 10 movie recommendations for user {random_user}:")
print(recommendations[['title', 'mean_rating', 'count_rating', 'predicted_rating']])


# **Ploting the recommendations**

In [None]:
plt.figure(figsize=(10, 6))
sns.barplot(x='predicted_rating', y='title', data=recommendations, palette='viridis')
plt.xlabel('Predicted Rating')
plt.title(f'Top 10 Movie Recommendations for User {random_user}')
plt.show()
