# Content-based Recommendation

In [None]:
# Grid Search on Matrix Factorization
## Import Libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import mean_squared_error, mean_absolute_error
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
## Load MovieLens Small dataset
data = pd.read_csv(r"C:\Users\yineh\OneDrive\Masaüstü\ml-latest-small\ratings.csv", sep=",")

data.head()
## Check shape of data
data.shape
## Re-index users and movies with dense 0-based integers (order of first appearance)
## so that they can be used directly as rows of an nn.Embedding table.
user_ids = data['userId'].unique()
movie_ids = data['movieId'].unique()

user_mapping = dict(zip(user_ids, range(len(user_ids))))
movie_mapping = dict(zip(movie_ids, range(len(movie_ids))))

# Overwrite the raw IDs with their dense positions.
data['userId'] = data['userId'].map(user_mapping)
data['movieId'] = data['movieId'].map(movie_mapping)
## Create the matrix factorization model
class MatrixFactorization(nn.Module):
    """Dot-product matrix factorization over user and movie embeddings.

    Args:
        num_users: Number of rows in the user embedding table.
        num_movies: Number of rows in the movie embedding table.
        embedding_size: Dimensionality of both embedding tables.
    """

    def __init__(self, num_users, num_movies, embedding_size=20):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_size)
        self.movie_embedding = nn.Embedding(num_movies, embedding_size)
        # NOTE: this layer is registered as a parameter but forward() never
        # calls it; the prediction is the plain embedding dot product.
        self.linear = nn.Linear(2 * embedding_size, 1)

    def forward(self, X):
        """Return one predicted score per row of ``X``.

        ``X`` is indexed as ``X[:, 0]`` (user index) and ``X[:, 1]``
        (movie index); the result has one entry per row.
        """
        user_idx, movie_idx = X[:, 0], X[:, 1]
        user_vectors = self.user_embedding(user_idx)
        movie_vectors = self.movie_embedding(movie_idx)
        # Row-wise dot product between the two embedding batches.
        return (user_vectors * movie_vectors).sum(dim=1)
## Custom Dataset to use Dataloaders
class MovieLensDataset(Dataset):
    """Torch dataset over a ratings frame with userId / movieId / rating columns.

    Each item is ``((user, movie), rating)`` where the ids are ``torch.long``
    scalars and the rating is a ``torch.float32`` scalar.
    """

    def __init__(self, dataframe):
        def column_as_tensor(column, dtype):
            # Materialise one column of the frame as a tensor of the given dtype.
            return torch.tensor(dataframe[column].values, dtype=dtype)

        self.users = column_as_tensor('userId', torch.long)
        self.movies = column_as_tensor('movieId', torch.long)
        self.ratings = column_as_tensor('rating', torch.float32)

    def __len__(self):
        return self.ratings.shape[0]

    def __getitem__(self, idx):
        user_movie_pair = (self.users[idx], self.movies[idx])
        return user_movie_pair, self.ratings[idx]
## Hyper-parameter values explored by the grid search
embedding_sizes = [5, 10, 20, 50]
learning_rates = [0.01, 0.03, 0.1, 0.001]
regularizations = [0, 0.001, 0.01, 0.1]
## Cartesian product of the three lists (embedding size varies slowest)
param_grid = [
    (size_option, lr_option, reg_option)
    for size_option in embedding_sizes
    for lr_option in learning_rates
    for reg_option in regularizations
]
## One result dict is appended per configuration
results = []
## Prefer the GPU when one is available, otherwise run on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")
## Hold out 10% of the ratings as the test set (fixed seed for reproducibility)
train_data, test_data = train_test_split(data, test_size=0.1, random_state=42)
## Wrap both splits in datasets / loaders; only the training loader is shuffled
batch_size = 64
train_dataset = MovieLensDataset(train_data)
test_dataset = MovieLensDataset(test_data)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)
## Loop over all combinations
# These values do not depend on the configuration being tried, so they are
# computed once instead of once per grid point.
num_users = len(user_ids)
num_movies = len(movie_ids)
criterion = nn.MSELoss()
num_epochs = 10

for embedding_size, lr, reg in param_grid:
    # Fresh model and optimizer for every configuration; `reg` is applied as
    # SGD weight decay.
    model = MatrixFactorization(num_users, num_movies, embedding_size).to(device)
    optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=reg)

    for epoch in range(num_epochs):
        model.train()
        for (users, movies), ratings in train_loader:
            users = users.to(device)
            movies = movies.to(device)
            ratings = ratings.to(device)

            optimizer.zero_grad()
            # The model expects one (user, movie) index pair per row.
            outputs = model(torch.stack((users, movies), dim=1))
            loss = criterion(outputs, ratings)
            loss.backward()
            optimizer.step()
            # No running-loss accumulator: it was never read, and calling
            # loss.item() every batch forces a host/device synchronisation.

    # Score the trained model on the held-out loader.
    # NOTE(review): these metrics come from test_loader and are what the
    # configurations are later ranked on, so the reported best score is
    # optimistic; a separate validation split would avoid that.
    model.eval()
    with torch.no_grad():
        all_preds = []
        all_ratings = []
        for (users, movies), ratings in test_loader:
            users = users.to(device)
            movies = movies.to(device)
            ratings = ratings.to(device)

            outputs = model(torch.stack((users, movies), dim=1))
            all_preds.append(outputs.cpu())
            all_ratings.append(ratings.cpu())

        predictions = torch.cat(all_preds)
        y_test = torch.cat(all_ratings)
        test_mae = nn.L1Loss()(predictions, y_test).item()
        test_rmse = torch.sqrt(nn.MSELoss()(predictions, y_test)).item()

    results.append({
        'Embedding Size': embedding_size,
        'Learning Rate': lr,
        'Regularization': reg,
        'Test MAE': test_mae,
        'Test RMSE': test_rmse
    })

    print(f"Params: Embedding Size={embedding_size}, LR={lr}, Reg={reg}")
    print(f"Test MAE: {test_mae:.4f}, Test RMSE: {test_rmse:.4f}")
## Summarize Results: rank every configuration by its test MAE (lowest first)
results_df = pd.DataFrame(results)
sorted_results = results_df.sort_values(by='Test MAE')
print("Top 10 configurations based on Test MAE:")
print(sorted_results.head(10))

## Visualizing the Results: one box plot of test MAE per hyper-parameter
for hyperparameter in ('Embedding Size', 'Learning Rate', 'Regularization'):
    plt.figure(figsize=(8, 6))
    sns.boxplot(x=hyperparameter, y='Test MAE', data=results_df)
    plt.title(f'Effect of {hyperparameter} on Test MAE')
    plt.show()

## Import Libraries

In [8]:
import numpy as np
import pandas as pd
import re
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import torch
import os
# Fall back to the CPU when no CUDA device is present, matching the device
# selection used by the grid-search section of this notebook.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


## Initialize the OMDb API to fetch plot summaries, then save them to a plots.csv file

## Load Datasets

In [2]:
# Alternative inputs: the same five files from the "ml-latest-small" folder.
# Uncomment these (and comment out the block below) to run on the small dataset.
# ratings = pd.read_csv(r"C:\Users\yineh\OneDrive\Masaüstü\ml-latest-small\ratings.csv")
# links = pd.read_csv(r"C:\Users\yineh\OneDrive\Masaüstü\ml-latest-small\links.csv")
# tags = pd.read_csv(r"C:\Users\yineh\OneDrive\Masaüstü\ml-latest-small\tags.csv")
# movies = pd.read_csv(r"C:\Users\yineh\OneDrive\Masaüstü\ml-latest-small\movies.csv")
# plots = pd.read_csv(r"C:\Users\yineh\OneDrive\Masaüstü\ml-latest-small\plots.csv")

# Active inputs: files read from the "ml-latest" folder.
# NOTE(review): the paths are absolute and machine-specific; they must be
# adjusted before running anywhere else.
ratings = pd.read_csv(r"C:\Users\yineh\OneDrive\Masaüstü\ml-latest\ratings.csv")
links = pd.read_csv(r"C:\Users\yineh\OneDrive\Masaüstü\ml-latest\links.csv")
tags = pd.read_csv(r"C:\Users\yineh\OneDrive\Masaüstü\ml-latest\tags.csv")
movies = pd.read_csv(r"C:\Users\yineh\OneDrive\Masaüstü\ml-latest\movies.csv")
# plots.csv must provide at least the 'movieId' and 'plot_summary' columns,
# which are the ones selected when it is merged into `movies`.
plots = pd.read_csv(r"C:\Users\yineh\OneDrive\Masaüstü\ml-latest\plots.csv")

## Merge movie tags into a dataset

In [3]:
def _join_tags(tag_values):
    # Concatenate every non-missing tag of one movie into a single string.
    return ' '.join(tag_values.dropna().astype(str))


# One row per movieId holding all of its tags as a space-separated string.
movie_tags = tags.groupby('movieId')['tag'].apply(_join_tags).reset_index()

# Keep only movies that have both tags and a plot summary (inner joins), then
# drop any row where either text field is still missing.
movies = (
    movies
    .merge(movie_tags, on='movieId', how='inner')
    .merge(plots[['movieId', 'plot_summary']], on='movieId', how='inner')
    .dropna(subset=['tag', 'plot_summary'])
)

## Combine Tags and Plot Summary

In [4]:
def combine_features(row):
    """Build the text to embed for one movie from its plot and its tags.

    ``row`` is indexed with the keys ``'plot_summary'`` and ``'tag'``.
    """
    plot_text = row['plot_summary']
    keyword_text = row['tag']
    return f"Movie Plot: {plot_text} Keywords: {keyword_text}"

# Build the 'plots_tags' text column by calling combine_features on every row
# (axis=1 passes one row at a time).
movies['plots_tags'] = movies.apply(combine_features, axis=1)

## Clean text data

In [None]:
# def clean_text_data(text):
#     text = text.lower()
#     text = re.sub(r'[^a-z0-9\s]', '', text)
#     return text

# tqdm.pandas(desc="Cleaning Text")
# movies['plots_tags'] = movies['plots_tags'].progress_apply(clean_text_data)


## Reset Movie Index

In [5]:
# Renumber rows 0..len(movies)-1 so a row label is also its position in the
# embedding matrix built from `movies`.
movies.reset_index(drop=True, inplace=True)

# Lookup table movieId -> row position.
movie_indices = pd.Series(movies.index, index=movies['movieId'])
# De-duplicate on the *index* (movieId), keeping the first row of each movie.
# Series.drop_duplicates() compares the values, which here are the already
# unique row positions, so it could never remove a repeated movieId; a
# repeated movieId would make `movie_indices[movie_id]` return a Series
# instead of a single position.
movie_indices = movie_indices[~movie_indices.index.duplicated(keep='first')]

## Initialize the pre-trained language model

In [6]:
# Load the pre-trained sentence-embedding model used to encode 'plots_tags'.
# NOTE(review): trust_remote_code=True presumably allows code shipped with the
# model repository to run locally - confirm the repository is trusted.
# This also rebinds the name `model` used by the grid-search section above.
model = SentenceTransformer("dunzhang/stella_en_1.5B_v5", trust_remote_code=True)

## Generate embeddings for all movies

In [9]:
# Default on-disk location of the cached movie embeddings.
embedding_path = 'movie_embeddings.pt'


def save_embeddings(embeddings, path=embedding_path):
    """Serialise ``embeddings`` to ``path`` with ``torch.save`` and report it.

    Args:
        embeddings: Object to persist (the movie embedding tensor).
        path: Destination file; defaults to ``embedding_path``.
    """
    torch.save(obj=embeddings, f=path)
    print(f"Embeddings saved to {path}")

def load_embeddings(path=embedding_path):
    """Load cached embeddings from ``path`` onto ``device``.

    Returns:
        The object stored in the file, or ``None`` when the file does not
        exist (a message is printed in both cases).
    """
    # Guard clause: nothing cached yet, let the caller generate embeddings.
    if not os.path.exists(path):
        print("Embedding file not found. Generating embeddings from scratch.")
        return None

    embeddings = torch.load(path, map_location=device)
    print(f"Embeddings loaded from {path}")
    return embeddings

# Reuse cached embeddings when they exist; otherwise encode every document in
# batches and cache the result.
movie_embeddings = load_embeddings()
if movie_embeddings is None:
    documents = movies['plots_tags'].tolist()
    batch_size = 64
    embedding_batches = []

    for i in tqdm(range(0, len(documents), batch_size), desc="Generating Embeddings in Batches"):
        batch = documents[i:i + batch_size]
        batch_embeddings = model.encode(batch, convert_to_tensor=True, show_progress_bar=False)
        embedding_batches.append(batch_embeddings.to(device))

    movie_embeddings = torch.cat(embedding_batches)

    # Save embeddings after generating them
    save_embeddings(movie_embeddings)

# Normalise dtype and device only once a tensor is guaranteed to exist.
# Doing this before the None check raised AttributeError whenever the cache
# file was missing, so the generation branch could never run.
movie_embeddings = movie_embeddings.to(device=device, dtype=torch.float32)


Embedding file not found. Generating embeddings from scratch.


Generating Embeddings in Batches: 100%|██████████| 829/829 [35:27<00:00,  2.57s/it]


Embeddings saved to movie_embeddings.pt


## Cosine Similarity

In [None]:
# cosine_sim = cosine_similarity(movie_embeddings.cpu())  

array([1.0000001 , 0.56306165, 0.5670217 , ..., 0.49139744, 0.5040304 ,
       0.4888566 ], dtype=float32)

## Filter ratings to include only movies present in the movies DataFrame

In [10]:
# Keep only ratings whose movie survived the tag/plot merge. The explicit
# .copy() makes `ratings` an independent frame, so adding the
# 'predicted_rating' column later does not write to a filtered slice
# (pandas SettingWithCopyWarning).
ratings = ratings[ratings['movieId'].isin(movies['movieId'])].copy()

## Predict rating

In [None]:
def predict_rating(user_id, movie_id, exclude_target=False):
    """Predict a rating as the similarity-weighted mean of the user's ratings.

    The weight of each movie the user rated is the cosine similarity between
    its embedding and the embedding of ``movie_id``.

    Args:
        user_id: Value matched against ``ratings['userId']``.
        movie_id: Value looked up in ``movie_indices``.
        exclude_target: When True, the user's own rating of ``movie_id`` is
            left out of the history. With the default (False) that rating,
            if present, takes part in the average with similarity 1, which
            leaks the true value when this function is evaluated on ratings
            that are also in ``ratings``.

    Returns:
        The predicted rating as a float. Falls back to the global mean of
        ``ratings['rating']`` when the movie is unknown, the user has no
        usable history, or the similarities sum to zero.
    """
    user_ratings = ratings[ratings['userId'] == user_id]
    if exclude_target:
        user_ratings = user_ratings[user_ratings['movieId'] != movie_id]

    if movie_id not in movie_indices.index or user_ratings.empty:
        return ratings['rating'].mean()

    # Map each rated movie to its embedding row and keep ratings and rows
    # aligned by filtering both with the same mask.
    mapped = user_ratings['movieId'].map(movie_indices)
    known = mapped.notna()
    if not known.any():
        return ratings['rating'].mean()

    user_indices_list = mapped[known].astype(int).tolist()
    user_vectors = movie_embeddings[user_indices_list]
    target_vector = movie_embeddings[int(movie_indices[movie_id])].unsqueeze(0)

    # Pandas stores ratings as float64; torch.dot requires both operands to
    # share a dtype, so build the tensor with the embeddings' dtype/device.
    ratings_tensor = torch.as_tensor(
        user_ratings.loc[known, 'rating'].to_numpy(),
        dtype=user_vectors.dtype,
        device=user_vectors.device,
    )

    similarities = torch.nn.functional.cosine_similarity(target_vector, user_vectors)

    # Avoid dividing by zero when the weights cancel out.
    weight_sum = similarities.sum()
    if weight_sum == 0:
        return ratings['rating'].mean()

    return (torch.dot(similarities, ratings_tensor) / weight_sum).item()


def predict_rating_wrapper(row):
    """Adapt one ratings record (a dict) to the predict_rating signature."""
    user_id, movie_id = row['userId'], row['movieId']
    return predict_rating(user_id, movie_id)


# One dict per rating row, in frame order.
ratings_list = ratings.to_dict('records')

# Fan the predictions out over a thread pool; executor.map yields results in
# input order, so they line up with the rows of `ratings`.
with ThreadPoolExecutor(max_workers=10) as executor:
    prediction_iter = executor.map(predict_rating_wrapper, ratings_list)
    progress = tqdm(prediction_iter, total=len(ratings_list), desc="Predicting Ratings")
    results = list(progress)

ratings['predicted_rating'] = results

## Calculate Mean Absolute Error

In [None]:
# Compare every observed rating with its content-based prediction.
y_true = ratings['rating']
y_pred = ratings['predicted_rating']
mae = mean_absolute_error(y_true, y_pred)
print(f"Mean Absolute Error (MAE): {mae}")

Mean Absolute Error (MAE): 0.663473670155574


## Top-N Recommendation and Hit Ratio Setup

In [None]:
# Fixed seed so the same rows are held out on every run.
np.random.seed(42)

test_size = 1000
# NOTE(review): despite the name, no rating threshold is applied here; every
# rating row is a candidate for the test set - confirm this is intended.
positive_preferences = ratings
test_indices = np.random.choice(positive_preferences.index, size=test_size, replace=False)
test_set = positive_preferences.loc[test_indices]

# Everything that was not sampled becomes the history used for recommending;
# `ratings` is rebound to it because the recommender reads that global.
train_set = ratings.drop(test_indices)
ratings = train_set


In [None]:
def get_top_n_recommendations(user_id, N=10):
    """Return up to ``N`` movie ids most similar to the user's taste profile.

    The profile is the rating-weighted mean of the embeddings of the movies
    the user rated; candidates are ranked by cosine similarity to it and
    movies the user already rated are excluded.

    Args:
        user_id: Value matched against ``ratings['userId']``.
        N: Maximum number of recommendations.

    Returns:
        A list of movie ids, best first. Empty when ``N`` is not positive or
        the user has no rated movie with a known embedding.
    """
    # A non-positive N would otherwise select every candidate, because
    # array[-0:] is the whole array.
    if N <= 0:
        return []

    # Filter the ratings frame once instead of once per derived column.
    user_ratings = ratings[ratings['userId'] == user_id]

    # Keep indices and ratings aligned by filtering both with the same mask.
    mapped = user_ratings['movieId'].map(movie_indices)
    known = mapped.notna()
    if not known.any():
        return []

    user_indices_list = mapped[known].astype(int).tolist()
    user_feature_vectors = movie_embeddings[user_indices_list]

    # Match the embeddings' dtype/device so the profile is not silently
    # promoted to float64 by the pandas ratings.
    user_ratings_tensor = torch.as_tensor(
        user_ratings.loc[known, 'rating'].to_numpy(),
        dtype=user_feature_vectors.dtype,
        device=user_feature_vectors.device,
    )

    # Rating-weighted mean of the rated movies' embeddings.
    weighted_sum = (user_feature_vectors * user_ratings_tensor.unsqueeze(1)).sum(dim=0)
    user_profile = weighted_sum / user_ratings_tensor.sum()

    # One similarity per row of movie_embeddings. No squeeze here: it would
    # collapse the result to a scalar when there is only one movie.
    similarities = torch.nn.functional.cosine_similarity(
        user_profile.unsqueeze(0), movie_embeddings
    )
    similarities = similarities.cpu().numpy()

    # Vectorised "not already rated" filter over row positions; this assumes
    # `movies` has the 0..n-1 index produced by the earlier reset_index.
    all_movie_ids = movies['movieId'].to_numpy()
    already_rated = np.isin(all_movie_ids, user_ratings['movieId'].to_numpy())
    candidate_indices = np.flatnonzero(~already_rated)
    candidate_similarities = similarities[candidate_indices]

    # Highest similarity first.
    top_N_indices = np.argsort(candidate_similarities)[-N:][::-1]
    return all_movie_ids[candidate_indices[top_N_indices]].tolist()


def evaluate_top_n_recommendations(test_set, N=10):
    """Compute the hit ratio of the top-``N`` recommender on ``test_set``.

    A row counts as a hit when its ``movieId`` appears in the top-``N``
    recommendations for its ``userId``.

    Args:
        test_set: Frame with ``userId`` and ``movieId`` columns.
        N: Number of recommendations requested per user.

    Returns:
        ``hits / len(test_set)`` as a float, or ``0.0`` for an empty
        ``test_set`` (instead of raising ZeroDivisionError).
    """
    total = len(test_set)
    if total == 0:
        return 0.0

    def evaluate_pair(pair):
        user_id, test_movie_id = pair
        recommended_movie_ids = get_top_n_recommendations(user_id, N)
        return 1 if test_movie_id in recommended_movie_ids else 0

    # Plain (userId, movieId) tuples: avoids building one Series per row with
    # iterrows, which also upcasts the integer ids to float.
    pairs = zip(test_set['userId'].tolist(), test_set['movieId'].tolist())

    with ThreadPoolExecutor(max_workers=10) as executor:
        results = list(tqdm(executor.map(evaluate_pair, pairs), total=total, desc="Evaluating Recommendations"))

    return sum(results) / total

# Size of each recommendation list used for the hit-ratio evaluation.
N = 10
hit_ratio = evaluate_top_n_recommendations(test_set, N=N)
print(f'Hit Ratio: {hit_ratio}')
