In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import nltk
import torch
import torch.nn as nn
import torch.optim as optim

from surprise import Dataset, Reader, SVD, NMF, accuracy
from surprise.model_selection import train_test_split as surprise_train_test_split
from sklearn.model_selection import train_test_split as sklearn_train_test_split
from sklearn.metrics import mean_squared_error, precision_recall_fscore_support, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from nltk.tokenize import word_tokenize

# For visualization
sns.set_style("darkgrid")

In [3]:
# Load MovieLens Dataset
df_ratings = pd.read_csv("ratings.csv")
df_movies = pd.read_csv("movies.csv")

# Merge ratings with movies for exploration
df = df_ratings.merge(df_movies, on="movieId")

# Display dataset info
print(df.head())

# Check missing values
print(df.isnull().sum())

# Basic stats
print(df.describe())

   userId  movieId  rating   timestamp          title       genres
0       1       16     4.0  1217897793  Casino (1995)  Crime|Drama
1       9       16     4.0   842686699  Casino (1995)  Crime|Drama
2      12       16     1.5  1144396284  Casino (1995)  Crime|Drama
3      24       16     4.0   963468757  Casino (1995)  Crime|Drama
4      29       16     3.0   836820223  Casino (1995)  Crime|Drama
userId       0
movieId      0
rating       0
timestamp    0
title        0
genres       0
dtype: int64
              userId        movieId         rating     timestamp
count  105339.000000  105339.000000  105339.000000  1.053390e+05
mean      364.924539   13381.312477       3.516850  1.130424e+09
std       197.486905   26170.456869       1.044872  1.802660e+08
min         1.000000       1.000000       0.500000  8.285650e+08
25%       192.000000    1073.000000       3.000000  9.711008e+08
50%       383.000000    2497.000000       3.500000  1.115154e+09
75%       557.000000    5991.000000     

In [5]:
from surprise import Reader, Dataset
from surprise.model_selection import train_test_split as surprise_train_test_split

# Convert ratings to binary (1 = liked, 0 = not liked)
df["liked"] = np.where(df["rating"] >= 3.5, 1, 0)

# Define Surprise Reader
reader = Reader(rating_scale=(0, 5))

# Load data from the DataFrame
data = Dataset.load_from_df(df[["userId", "movieId", "rating"]], reader)

# Use Surprise's train_test_split to split the data into training and testing sets
trainset, testset = surprise_train_test_split(data, test_size=0.2, random_state=42)

# Check the size of the trainset and testset
print(f"Training set size: {trainset.n_users} users, {trainset.n_items} items")
print(f"Test set size: {len(testset)} ratings")

Training set size: 668 users, 9504 items
Test set size: 21068 ratings


In [7]:
# Train Matrix Factorization Models
# Singular Value Decomposition (SVD)

svd = SVD()
svd.fit(trainset)
predictions_svd = svd.test(testset)

# Evaluate SVD
rmse_svd = accuracy.rmse(predictions_svd)
mae_svd = accuracy.mae(predictions_svd)

RMSE: 0.8726
MAE:  0.6720


In [9]:
# Non-Negative Matrix Factorization (NMF)

nmf = NMF()
nmf.fit(trainset)
predictions_nmf = nmf.test(testset)

# Evaluate NMF
rmse_nmf = accuracy.rmse(predictions_nmf)
mae_nmf = accuracy.mae(predictions_nmf)

RMSE: 0.9341
MAE:  0.7165


In [11]:
# Train XGBoost Model

# Convert ratings to binary (1 = liked, 0 = not liked)
df["liked"] = np.where(df["rating"] >= 3.5, 1, 0)

# Define Surprise Reader
reader = Reader(rating_scale=(0, 5))

# Load data from the DataFrame
data = Dataset.load_from_df(df[["userId", "movieId", "rating"]], reader)

# Use Surprise's train_test_split to split the data into training and testing sets
trainset, testset = surprise_train_test_split(data, test_size=0.2, random_state=42)

# Check the size of the trainset and testset
print(f"Training set size: {len(trainset.all_users())}")
print(f"Test set size: {len(testset)}")

# Prepare data for XGBoost using userId and movieId
features = df[["userId", "movieId"]]
labels = df["rating"]

# Train-test split for XGBoost using sklearn
X_train, X_test, y_train, y_test = sklearn_train_test_split(features, labels, test_size=0.2, random_state=42)

# Train XGBoost model
xgb_model = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=100, learning_rate=0.1)
xgb_model.fit(X_train, y_train)

# Predictions
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate XGBoost
rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
print(f"XGBoost RMSE: {rmse_xgb}")

Training set size: 668
Test set size: 21068
XGBoost RMSE: 0.9447678298359136


In [17]:
class NCFModel(nn.Module):
    def __init__(self, num_users, num_movies, embedding_size=50):
        super(NCFModel, self).__init__()
        # Embedding layers for users and movies
        self.user_embedding = nn.Embedding(num_users, embedding_size)
        self.movie_embedding = nn.Embedding(num_movies, embedding_size)
        
        # Fully connected layers
        self.fc1 = nn.Linear(embedding_size * 2, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.output = nn.Linear(32, 1)
    
    def forward(self, user_input, movie_input):
        # Embedding lookups for users and movies
        user_embedded = self.user_embedding(user_input)
        movie_embedded = self.movie_embedding(movie_input)
        
        # Concatenate the embeddings
        x = torch.cat([user_embedded, movie_embedded], dim=-1)
        
        # Pass through fully connected layers
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        x = torch.sigmoid(self.output(x))  # Output between 0 and 1 for like/dislike prediction
        
        return x

# Prepare the data
df_ratings = pd.read_csv("ratings.csv")
df_movies = pd.read_csv("movies.csv")
df_ratings['liked'] = np.where(df_ratings['rating'] >= 3.5, 1, 0)

# Remap user and movie IDs to consecutive integers starting from 0
df_ratings['userId'] = pd.factorize(df_ratings['userId'])[0]
df_ratings['movieId'] = pd.factorize(df_ratings['movieId'])[0]

# Split data into train and test sets
train, test = sklearn_train_test_split(df_ratings, test_size=0.2, random_state=42)

# Convert to PyTorch tensors
train_users = torch.tensor(train['userId'].values, dtype=torch.long)
train_movies = torch.tensor(train['movieId'].values, dtype=torch.long)
train_labels = torch.tensor(train['liked'].values, dtype=torch.float32)

test_users = torch.tensor(test['userId'].values, dtype=torch.long)
test_movies = torch.tensor(test['movieId'].values, dtype=torch.long)
test_labels = torch.tensor(test['liked'].values, dtype=torch.float32)

# Initialize the model with updated number of users and movies
num_users = df_ratings['userId'].nunique()
num_movies = df_ratings['movieId'].nunique()

model = NCFModel(num_users=num_users, num_movies=num_movies, embedding_size=50)

# Define loss function and optimizer
criterion = nn.BCELoss()  # Binary Cross-Entropy loss for binary classification
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
epochs = 5
batch_size = 256
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    
    # Forward pass
    outputs = model(train_users, train_movies).squeeze()
    
    # Compute loss
    loss = criterion(outputs, train_labels)
    
    # Backward pass
    loss.backward()
    optimizer.step()
    
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

# Evaluate the model
model.eval()
with torch.no_grad():
    test_outputs = model(test_users, test_movies).squeeze()
    test_predictions = (test_outputs >= 0.5).float()  # Convert probabilities to binary predictions
    accuracy = (test_predictions == test_labels).float().mean()
    print(f"Test Accuracy: {accuracy.item()}")

Epoch 1/5, Loss: 0.7129961252212524
Epoch 2/5, Loss: 0.7060458064079285
Epoch 3/5, Loss: 0.7003841996192932
Epoch 4/5, Loss: 0.6954838037490845
Epoch 5/5, Loss: 0.6910406351089478
Test Accuracy: 0.604138970375061


In [22]:
#content based filtering with evaluation

from sklearn.metrics import precision_score, recall_score, f1_score

# Function to evaluate Precision, Recall, and F1 for content-based filtering
def evaluate_content_based_recommendation(user_ratings, top_k=10):
    # Initialize Precision, Recall, F1 score lists
    precision_list = []
    recall_list = []
    f1_list = []

    # For each user, calculate Precision@K, Recall@K, F1-Score
    for user_id in user_ratings['userId'].unique():
        # Get the ratings of the current user
        user_data = user_ratings[user_ratings['userId'] == user_id]
        liked_movies = user_data[user_data['rating'] >= 4]['movieId'].values
        
        # Generate top K recommendations based on content-based model (using nearest neighbors)
        top_recommendations = recommend_top_k_movies(user_id, top_k)  # This function needs to return the top K movie ids
        
        # Check how many of the top K recommended movies are in the list of liked movies
        relevant_recommendations = [movie for movie in top_recommendations if movie in liked_movies]
        
        # Calculate Precision@K, Recall@K
        precision = len(relevant_recommendations) / top_k
        recall = len(relevant_recommendations) / len(liked_movies) if len(liked_movies) > 0 else 0
        
        # Calculate F1 Score
        f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0
        
        # Append metrics to lists
        precision_list.append(precision)
        recall_list.append(recall)
        f1_list.append(f1)
    
    # Calculate average Precision, Recall, F1 score across all users
    avg_precision = np.mean(precision_list)
    avg_recall = np.mean(recall_list)
    avg_f1 = np.mean(f1_list)
    
    return avg_precision, avg_recall, avg_f1

# Example function to recommend top K movies for a user based on content-based filtering
def recommend_top_k_movies(user_id, top_k=10):
    # Get movies rated by the user
    user_data = df_ratings[df_ratings['userId'] == user_id]
    rated_movie_ids = user_data['movieId'].values
    
    # For each movie, compute the cosine similarity with the user's rated movies (Content-Based)
    movie_indices = df_movies[df_movies['movieId'].isin(rated_movie_ids)].index.tolist()
    distances, indices = nn_model.kneighbors(tfidf_matrix[movie_indices], n_neighbors=top_k)
    
    # Flatten and get movie indices for top K recommendations
    recommended_movies = [df_movies.iloc[i]["movieId"] for i in indices.flatten()]
    
    return recommended_movies

# Evaluate the Content-Based Filtering model with Precision@K, Recall@K, F1-Score
precision, recall, f1 = evaluate_content_based_recommendation(df_ratings, top_k=10)

print(f"Content-Based Filtering Metrics:")
print(f"Precision@10: {precision:.4f}")
print(f"Recall@10: {recall:.4f}")
print(f"F1-Score@10: {f1:.4f}")

Content-Based Filtering Metrics:
Precision@10: 4.2216
Recall@10: 0.3310
F1-Score@10: 0.5322


In [23]:
#Content-Based Filtering (TF-IDF) - without evaluation

# TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(df_movies["genres"])

# Nearest Neighbors Model
nn_model = NearestNeighbors(metric="cosine", algorithm="brute")
nn_model.fit(tfidf_matrix)

# Function to recommend movies
def recommend_movies(movie_title, n=5):
    idx = df_movies[df_movies["title"] == movie_title].index[0]
    distances, indices = nn_model.kneighbors(tfidf_matrix[idx], n_neighbors=n+1)
    
    print("Recommended Movies:")
    for i in indices.flatten()[1:]:
        print(df_movies.iloc[i]["title"])

# Example Recommendation
recommend_movies("Toy Story (1995)")

Recommended Movies:
Monsters, Inc. (2001)
DuckTales: The Movie - Treasure of the Lost Lamp (1990)
Shrek the Third (2007)
Antz (1998)
Toy Story (1995)
