In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Download the 'rounakbanik/the-movies-dataset' Kaggle dataset and keep the value that
# kagglehub.dataset_download returns.
# NOTE(review): no other cell visible in this notebook references this variable (the
# analysis below reads MovieLens CSVs from 'data/ml-latest-small/') - confirm whether
# this download is still required.
rounakbanik_the_movies_dataset_path = kagglehub.dataset_download('rounakbanik/the-movies-dataset')

print('Data source import complete.')


# 🦾Different Types of Recommendation Systems :

It is likely that you will concur that there are multiple approaches to determining what to suggest or recommend when a friend seeks our opinion. This principle is equally applicable to artificial intelligence.

In the realm of `machine learning`, three predominant techniques for constructing recommendation engines are
1. Content-based filtering.
2. Collaborative filtering.
3. Hybrid filtering.

## 1- Content-based filtering:
This method generates suggestions *based on items* you have previously liked. It uses historical data such as purchase records and search history to identify similar products.

> For example, if you rated the movie "Inception" with five stars, the system will recommend similar movies such as "Interstellar".

![image.png](attachment:36e37efe-734b-44e0-addf-56e148240783.png)

However, if all recommendation systems relied solely on your viewing history, discovering new genres and films would be challenging. This is where **collaborative filtering** comes into play. But what exactly does it entail?

## 2- Collaborative filtering:
This method identifies other users who have similar preferences to you, and recommends items **based on their choices**.

> For example, if you and your friend both like the movie "The Shawshank Redemption", and your friend also likes "Forrest Gump", the system will recommend "Forrest Gump" to you.

![image.png](attachment:c069599b-e451-4388-befb-b8a251d590da.png)

In the following sections, I will show you how to build a movie recommendation engine using Python and **collaborative filtering techniques**.

## Summary:
![image.png](attachment:45c4702a-aa37-401f-b6c8-8703db115be1.png)

![image.png](attachment:54a4168b-86ce-4141-bf38-9bcbeb1b9fa7.png)

_____

*I'm going to build a recommendation system based on `collaborative filtering`: first I will use `matrix factorization` to learn user and movie embeddings, and then I will run a `k-means` clustering algorithm on top of those embeddings and blend its output into the predictions, to see how well the collaborative filtering works overall.*

____

## What Is Matrix Factorization ??

[*Matrix Factorization*](https://medium.com/@ilyurek/understanding-matrix-factorization-a-simple-guide-20e2b32989eb) is an advanced **collaborative filtering** technique used to solve the problem of missing data (such as movies that have not been rated by a user yet). We divide the large rating matrix into smaller matrices that represent the relationships between users and movies.

**How does it work?**

We take a rating matrix **R** where the rows are users and the columns are movies. The matrix stores the users' ratings for the movies. If a user has not rated a particular movie, that cell is empty. Matrix factorization breaks this matrix into:

- User matrix **U**: each user is represented by a vector that reflects their hidden preferences.
- Movie matrix **V**: each movie is represented by a vector that reflects the hidden characteristics associated with that movie.

We multiply these two matrices to approximately reconstruct the original rating matrix: $R \approx U V^\top$.

The main idea is to predict the missing ratings through the matrix recovery process.

![image.png](attachment:77dbbbed-eac6-48f7-9936-035ef3656349.png)

-----------

# 🎞️ Start To Build Recommendation Systems Based On Collaborative Filtering:

---------------------

# 🗂️ Import necessary libraries

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import pandas as pd
import numpy as np


import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import mean_squared_error, mean_absolute_error

from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from sklearn.decomposition import PCA

import warnings

warnings.filterwarnings("ignore", category=FutureWarning)

# 📈 Download DataSet

I will use the `MovieLens` dataset for my project, which is a popular dataset of movie ratings.
I will download it from the GroupLens website.

In [None]:
! curl http://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o ml-latest-small.zip

In [None]:
# Unpack every member of ml-latest-small.zip into the 'data' directory so the CSV
# files inside the archive can be read directly.
import zipfile
with zipfile.ZipFile('ml-latest-small.zip', mode='r') as archive:
    archive.extractall(path='data')

In [None]:
# Load the two CSV files extracted into data/ml-latest-small/ as DataFrames.
# Later cells read the 'movieId' and 'title' columns of movies_df and the
# 'userId', 'movieId' and 'rating' columns of ratings_df.
import pandas as pd
movies_df = pd.read_csv('data/ml-latest-small/movies.csv')
ratings_df = pd.read_csv('data/ml-latest-small/ratings.csv')

# 📊 Data analysis with the MovieLens dataset.

In [None]:
# Report the (rows, columns) shape of both DataFrames.
print('The dimensions of movies dataframe are:', movies_df.shape,'\nThe dimensions of ratings dataframe are:', ratings_df.shape)

In [None]:
# Last expression of the cell: renders the movies table with the notebook's rich display.
movies_df

In [None]:
# Last expression of the cell: renders the ratings table with the notebook's rich display.
ratings_df

In [None]:
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
movies_df.info()

In [None]:
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
ratings_df.info()

In [None]:
# Histogram of the 'rating' column, drawn on an explicit Axes object.
fig, ax = plt.subplots()
sns.histplot(ratings_df['rating'], bins=25, color='green', edgecolor='black', ax=ax)
ax.set_title('Distribution of Ratings')
ax.set_xlabel('Rating')
ax.set_ylabel('Count')

# Put one x tick on each distinct rating value, in ascending order.
unique_ratings = np.sort(ratings_df['rating'].unique())
ax.set_xticks(unique_ratings)
ax.set_xticklabels(unique_ratings, fontsize=12, rotation=0)

plt.show()

In [None]:
# Headline statistics of the ratings table, printed with two decimals each.
rating_values = ratings_df['rating']
rating_count = len(ratings_df)
active_user_count = len(ratings_df['userId'].unique())

stats = {
    'Total number of ratings': rating_count,
    'Average overall rating': rating_values.mean(),
    'Standard deviation of ratings': rating_values.std(),
    'Number of active users': active_user_count,
    'Average number of ratings per user': rating_count / active_user_count,
}

print("\n".join(f"{label}: {figure:.2f}" for label, figure in stats.items()))

In [None]:
# Lookup table from movieId to title
movie_names = movies_df.set_index('movieId')['title'].to_dict()

# Distinct users and movies that appear in the ratings table
n_users = ratings_df['userId'].unique().size
n_items = ratings_df['movieId'].unique().size

# Size of the dense user x movie matrix versus the number of ratings actually observed
matrix_cells = n_users * n_items
observed_ratings = len(ratings_df)

print("Number of unique users:", n_users)
print("Number of unique movies:", n_items)
print("The full rating matrix will have:", matrix_cells, 'elements.')
print('---------------------------------------------------------------------')
print("Number of ratings:", observed_ratings)
print("Therefore: ", observed_ratings / matrix_cells * 100, '% of the matrix is filled.')

We have an incredibly sparse matrix to work with here.
And... as the number of users and movies grows, the number of elements in the full matrix grows with their product, **n_users × n_items** (i.e. quadratically when both grow together).

You would need a lot of memory to work at global scale... storing the full matrix in memory would be a challenge.
One advantage here is that matrix factorization never has to materialize the full rating matrix: it learns only from the observed (user, movie, rating) triples, so the empty cells are never stored.

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# 📖 Matrix Factorization, Model Initialization & Training Model

In [None]:
# 1. Define Dataset
class MovieDataset(Dataset):
    """Tensor-backed dataset of (user index, movie index, rating) triples.

    Args:
        ratings_df: table with 'userId', 'movieId' and 'rating' columns.
        user_to_idx: mapping from raw userId to a contiguous integer index.
        movie_to_idx: mapping from raw movieId to a contiguous integer index.

    Every ID in ``ratings_df`` is looked up in the corresponding mapping, so an ID
    missing from a plain dict raises ``KeyError``.
    """

    def __init__(self, ratings_df, user_to_idx, movie_to_idx):
        user_rows = list(map(user_to_idx.__getitem__, ratings_df['userId']))
        movie_rows = list(map(movie_to_idx.__getitem__, ratings_df['movieId']))

        self.users = torch.tensor(user_rows, dtype=torch.long)
        self.movies = torch.tensor(movie_rows, dtype=torch.long)
        self.ratings = torch.tensor(ratings_df['rating'].values, dtype=torch.float)

    def __len__(self):
        # One sample per rating row.
        return self.ratings.shape[0]

    def __getitem__(self, idx):
        # Returned in the order the training loops unpack it: user, movie, rating.
        sample = (self.users[idx], self.movies[idx], self.ratings[idx])
        return sample

In [None]:
# 2. Define Matrix Factorization model
class MatrixFactorization(nn.Module):
    """Matrix-factorization rating model with per-user and per-movie bias terms.

    The predicted rating for a (user, movie) pair is the dot product of their
    embedding vectors plus the user's bias and the movie's bias.

    Args:
        n_users: number of rows in the user embedding/bias tables.
        n_movies: number of rows in the movie embedding/bias tables.
        n_factors: length of each embedding vector.
    """

    def __init__(self, n_users, n_movies, n_factors=50):
        super().__init__()
        self.user_factors = nn.Embedding(n_users, n_factors)
        self.movie_factors = nn.Embedding(n_movies, n_factors)
        self.user_biases = nn.Embedding(n_users, 1)
        self.movie_biases = nn.Embedding(n_movies, 1)

    def forward(self, user, movie):
        """Predict ratings for index tensors ``user`` and ``movie`` of equal length.

        Returns:
            A 1-D tensor with one prediction per input pair.
        """
        user_embedding = self.user_factors(user)
        movie_embedding = self.movie_factors(movie)
        user_bias = self.user_biases(user)
        movie_bias = self.movie_biases(movie)

        prediction = (user_embedding * movie_embedding).sum(dim=1, keepdim=True)
        prediction = prediction + user_bias + movie_bias
        # Drop only the trailing size-1 column. A bare squeeze() would also remove the
        # batch dimension when the batch holds a single sample, yielding a 0-d tensor
        # that cannot be iterated or matched against a (1,)-shaped target.
        return prediction.squeeze(1)

    def get_embeddings(self, user, movie):
        """Return the (user_embedding, movie_embedding) pair for the given index tensors."""
        user_embedding = self.user_factors(user)
        movie_embedding = self.movie_factors(movie)
        return user_embedding, movie_embedding

In [None]:
# 3. Combined model with Matrix Factorization and K-means
class CombinedRecommender:
    """Hybrid recommender: matrix factorization blended with a k-means cluster prior.

    A ``MatrixFactorization`` model is trained on ratings; k-means is then fitted on the
    concatenated (user, movie) embeddings of the training pairs, and the mean training
    rating of each cluster is blended into the factorization prediction.

    Args:
        n_users: number of distinct user indices.
        n_movies: number of distinct movie indices.
        n_factors: embedding size passed to ``MatrixFactorization``.
        n_clusters: number of k-means clusters.
    """

    def __init__(self, n_users, n_movies, n_factors=50, n_clusters=5):
        self.mf_model = MatrixFactorization(n_users, n_movies, n_factors)
        self.kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        self.n_factors = n_factors
        self.n_clusters = n_clusters
        # Filled by train_kmeans(); defined up front so predict() can report a clear
        # error instead of an AttributeError when called too early.
        self.cluster_ratings = {}

    def train_mf(self, train_loader, test_loader, n_epochs=10, lr=0.01):
        """Train the matrix-factorization model with Adam on an MSE loss.

        Args:
            train_loader: iterable of (users, movies, ratings) batches used for updates.
            test_loader: iterable of (users, movies, ratings) batches used for evaluation.
            n_epochs: number of passes over ``train_loader``.
            lr: Adam learning rate.

        Returns:
            ``(train_losses, test_losses)``: per-epoch mean squared error per sample.
        """
        criterion = nn.MSELoss()
        optimizer = optim.Adam(self.mf_model.parameters(), lr=lr)

        train_losses = []
        test_losses = []

        for epoch in range(n_epochs):
            # Train the model
            self.mf_model.train()
            train_sq_error, train_count = 0.0, 0
            for users, movies, ratings in train_loader:
                optimizer.zero_grad()
                # reshape(-1) keeps predictions 1-D even for a single-sample batch.
                predictions = self.mf_model(users, movies).reshape(-1)
                loss = criterion(predictions, ratings)
                loss.backward()
                optimizer.step()
                # Weight each batch by its size so a short final batch does not count
                # as much as a full one in the epoch average.
                train_sq_error += loss.item() * ratings.size(0)
                train_count += ratings.size(0)

            # Evaluate the model
            self.mf_model.eval()
            test_sq_error, test_count = 0.0, 0
            with torch.no_grad():
                for users, movies, ratings in test_loader:
                    predictions = self.mf_model(users, movies).reshape(-1)
                    loss = criterion(predictions, ratings)
                    test_sq_error += loss.item() * ratings.size(0)
                    test_count += ratings.size(0)

            # max(..., 1) avoids a ZeroDivisionError on an empty loader.
            train_losses.append(train_sq_error / max(train_count, 1))
            test_losses.append(test_sq_error / max(test_count, 1))

            print(f'Epoch {epoch+1}/{n_epochs}')
            print(f'Training Loss: {train_losses[-1]:.4f}')
            print(f'Test Loss: {test_losses[-1]:.4f}')

        return train_losses, test_losses

    def get_combined_embeddings(self, users, movies):
        """Return a NumPy array whose rows are [user embedding | movie embedding]."""
        self.mf_model.eval()
        with torch.no_grad():
            user_emb, movie_emb = self.mf_model.get_embeddings(users, movies)
            combined_emb = torch.cat([user_emb, movie_emb], dim=1)
            return combined_emb.numpy()

    def train_kmeans(self, train_loader):
        """Fit k-means on the combined embeddings and store each cluster's mean rating.

        Raises:
            ValueError: if ``train_loader`` yields no batches.
        """
        all_embeddings = []
        all_ratings = []

        for users, movies, ratings in train_loader:
            all_embeddings.append(self.get_combined_embeddings(users, movies))
            all_ratings.append(ratings.numpy())

        if not all_embeddings:
            raise ValueError("train_loader yielded no batches; cannot fit k-means")

        all_embeddings = np.vstack(all_embeddings)
        all_ratings = np.concatenate(all_ratings)

        # Train K-means
        self.kmeans.fit(all_embeddings)
        clusters = self.kmeans.predict(all_embeddings)

        # Calculate the average rating per cluster; a cluster without members falls
        # back to the global mean so it never contributes NaN to a prediction.
        global_mean = all_ratings.mean()
        self.cluster_ratings = {}
        for i in range(self.n_clusters):
            members = all_ratings[clusters == i]
            self.cluster_ratings[i] = members.mean() if members.size else global_mean

    def predict(self, users, movies, mf_weight=0.7, cluster_weight=0.3):
        """Predict ratings as a weighted blend of the MF output and the cluster mean.

        Args:
            users: tensor of user indices.
            movies: tensor of movie indices, same length as ``users``.
            mf_weight: weight of the matrix-factorization prediction.
            cluster_weight: weight of the cluster's mean training rating.

        Returns:
            1-D NumPy array with one blended prediction per input pair.

        Raises:
            RuntimeError: if ``train_kmeans`` has not been run yet.
        """
        if not getattr(self, 'cluster_ratings', None):
            raise RuntimeError("train_kmeans() must be called before predict()")

        combined_emb = self.get_combined_embeddings(users, movies)
        clusters = self.kmeans.predict(combined_emb)

        # Get predictions from Matrix Factorization. no_grad is required here:
        # .numpy() raises on a tensor that is still attached to the autograd graph,
        # so the method must not rely on the caller to disable gradients.
        self.mf_model.eval()
        with torch.no_grad():
            mf_predictions = self.mf_model(users, movies).reshape(-1).numpy()

        # Adjust predictions using cluster information
        cluster_predictions = np.array([self.cluster_ratings[c] for c in clusters])

        # Combine predictions
        final_predictions = mf_weight * mf_predictions + cluster_weight * cluster_predictions
        return final_predictions

In [None]:
# 4. Training and evaluation
def _plot_training_diagnostics(model, train_dataset, train_losses, test_losses,
                               all_actuals, all_predictions, random_state):
    """Draw the learning curve, predicted-vs-actual scatter and a t-SNE cluster map."""
    from sklearn.manifold import TSNE  # only needed for this diagnostic figure

    fig, (ax_loss, ax_scatter, ax_clusters) = plt.subplots(1, 3, figsize=(15, 5))

    # Plot learning curve
    ax_loss.plot(train_losses, label='Training Loss')
    ax_loss.plot(test_losses, label='Test Loss')
    ax_loss.set_title('Learning Curve')
    ax_loss.set_xlabel('Epoch')
    ax_loss.set_ylabel('Loss')
    ax_loss.legend()

    # Plot Predictions vs Actuals; the reference diagonal spans the observed rating
    # range rather than a fixed interval.
    lowest, highest = float(np.min(all_actuals)), float(np.max(all_actuals))
    ax_scatter.scatter(all_actuals, all_predictions, alpha=0.1)
    ax_scatter.plot([lowest, highest], [lowest, highest], 'r--')
    ax_scatter.set_title('Predictions vs Actuals')
    ax_scatter.set_xlabel('Actual Ratings')
    ax_scatter.set_ylabel('Predicted Ratings')

    # Plot cluster distribution for the first 1000 training pairs
    combined_emb = model.get_combined_embeddings(
        train_dataset.users[:1000],
        train_dataset.movies[:1000]
    )
    clusters = model.kmeans.predict(combined_emb)

    # Use t-SNE for dimensionality reduction. Perplexity must stay below the number
    # of samples, so it is reduced from 30 when fewer pairs are available.
    perplexity = min(30.0, max(1.0, len(combined_emb) - 1.0))
    tsne = TSNE(n_components=2, random_state=random_state, perplexity=perplexity)
    reduced_emb = tsne.fit_transform(combined_emb)

    ax_clusters.scatter(reduced_emb[:, 0], reduced_emb[:, 1], c=clusters, cmap='viridis')
    ax_clusters.set_title('Cluster Distribution')

    fig.tight_layout()
    plt.show()


def train_and_evaluate(ratings_df, n_factors=50, n_clusters=5, n_epochs=10,
                       batch_size=1024, test_size=0.2, random_state=42):
    """Train a ``CombinedRecommender`` on ``ratings_df``, print test metrics and plot diagnostics.

    Args:
        ratings_df: table with 'userId', 'movieId' and 'rating' columns.
        n_factors: embedding size of the matrix-factorization model.
        n_clusters: number of k-means clusters.
        n_epochs: training epochs for the matrix-factorization model.
        batch_size: batch size of both DataLoaders.
        test_size: fraction of rows held out for testing.
        random_state: seed for the train/test split, for torch (embedding
            initialisation and batch shuffling) and for t-SNE.

    Returns:
        The trained ``CombinedRecommender``.

    Note:
        ``torch.manual_seed`` is called, which changes the global torch RNG state.
    """
    # Seed torch so repeated runs produce the same model.
    torch.manual_seed(random_state)

    # Prepare data: index maps are built from the full table so every ID in either
    # split has an embedding row.
    user_ids = ratings_df['userId'].unique()
    movie_ids = ratings_df['movieId'].unique()

    user_to_idx = {user: idx for idx, user in enumerate(user_ids)}
    movie_to_idx = {movie: idx for idx, movie in enumerate(movie_ids)}

    # Split data
    train_df, test_df = train_test_split(ratings_df, test_size=test_size, random_state=random_state)

    # Create DataLoaders
    train_dataset = MovieDataset(train_df, user_to_idx, movie_to_idx)
    test_dataset = MovieDataset(test_df, user_to_idx, movie_to_idx)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Create and train the combined model
    model = CombinedRecommender(len(user_ids), len(movie_ids), n_factors, n_clusters)

    # 1. Train Matrix Factorization
    print("Training Matrix Factorization...")
    train_losses, test_losses = model.train_mf(train_loader, test_loader, n_epochs=n_epochs)

    # 2. Train K-means
    print("\nTraining K-means...")
    model.train_kmeans(train_loader)

    # Evaluate the model
    print("\nEvaluating model...")
    model.mf_model.eval()
    all_predictions = []
    all_actuals = []

    with torch.no_grad():
        for users, movies, ratings in test_loader:
            predictions = model.predict(users, movies)
            # atleast_1d keeps extend() working if a batch yields a scalar prediction.
            all_predictions.extend(np.atleast_1d(predictions))
            all_actuals.extend(ratings.numpy())

    # Compute and display performance metrics
    mse = mean_squared_error(all_actuals, all_predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(all_actuals, all_predictions)

    print(f"\nFinal Results:")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")

    # Visualize the results
    _plot_training_diagnostics(model, train_dataset, train_losses, test_losses,
                               all_actuals, all_predictions, random_state)

    return model

# 🔁 Train The Model

In [None]:
# Runs the full pipeline defined in train_and_evaluate (split, MF training, k-means,
# metrics, plots) and keeps the returned CombinedRecommender for the cells below.
model = train_and_evaluate(ratings_df)

# 💿 Save The Model

In [None]:
# Persist the trained recommender.
# NOTE: the checkpoint embeds a pickled scikit-learn KMeans object, so it is not a
# weights-only file; load it with torch.load(..., weights_only=False) and only from a
# source you trust.
# NOTE(review): the userId/movieId -> index maps are rebuilt from ratings_df elsewhere in
# this notebook and are not stored here; they must be rebuilt the same way at load time.
torch.save({
    'mf_model_state_dict': model.mf_model.state_dict(),
    'kmeans_model': model.kmeans,
    'cluster_ratings': model.cluster_ratings,
    # Hyper-parameters needed to re-create CombinedRecommender before load_state_dict()
    'n_factors': model.n_factors,
    'n_clusters': model.n_clusters,
}, 'combined_recommender_model.pth')

#### Let's also add a comprehensive evaluation that includes `confusion matrix` and additional analysis:

# 🚀 comprehensive evaluation

In [None]:
class AdvancedModelEvaluation:
    """Collects test-set predictions from a trained recommender and analyses them.

    Args:
        model: object exposing ``mf_model``, ``kmeans``, ``n_clusters``,
            ``predict(users, movies)`` and ``get_combined_embeddings(users, movies)``.
        test_loader: iterable of (users, movies, ratings) batches.

    ``evaluate()`` runs on construction and fills ``predictions``, ``actuals``,
    ``embeddings`` and ``clusters`` as NumPy arrays.
    """

    def __init__(self, model, test_loader):
        self.model = model
        self.test_loader = test_loader
        # Filled by analyze_clusters(): {cluster id: metric dict}.
        self.cluster_metrics = {}
        self.evaluate()

    def evaluate(self):
        """Run the model over ``test_loader`` and cache predictions, targets, embeddings and clusters."""
        self.model.mf_model.eval()
        self.predictions = []
        self.actuals = []
        self.embeddings = []
        self.clusters = []

        with torch.no_grad():
            for users, movies, ratings in self.test_loader:
                # Get predictions and embeddings
                preds = self.model.predict(users, movies)
                emb = self.model.get_combined_embeddings(users, movies)
                clusters = self.model.kmeans.predict(emb)

                # atleast_1d keeps extend() working if a batch yields a scalar prediction.
                self.predictions.extend(np.atleast_1d(preds))
                self.actuals.extend(ratings.numpy())
                self.embeddings.append(emb)
                self.clusters.extend(clusters)

        self.predictions = np.array(self.predictions)
        self.actuals = np.array(self.actuals)
        self.embeddings = np.vstack(self.embeddings)
        self.clusters = np.array(self.clusters)

    def create_confusion_matrix(self, threshold=3.5):
        """Plot a liked/not-liked confusion matrix and print a classification report.

        A rating is treated as class 1 when it is ``>= threshold`` and as class 0 otherwise.
        """
        pred_classes = (self.predictions >= threshold).astype(int)
        actual_classes = (self.actuals >= threshold).astype(int)

        # Fix the label set so the matrix is always 2x2, even if one class never occurs.
        class_labels = [0, 1]
        class_names = [f'< {threshold}', f'>= {threshold}']
        conf_matrix = confusion_matrix(actual_classes, pred_classes, labels=class_labels)

        plt.figure(figsize=(10, 8))
        sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                    xticklabels=class_names, yticklabels=class_names)
        plt.title('Confusion Matrix')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.show()

        # Print classification report
        print("\nClassification Report:")
        print(classification_report(actual_classes, pred_classes, labels=class_labels,
                                    target_names=class_names, zero_division=0))

    def analyze_clusters(self):
        """Print MSE, RMSE and MAE per cluster and store them in ``self.cluster_metrics``.

        Clusters with no test samples are reported with NaN metrics instead of raising.
        The metrics are computed with NumPy as the mean squared / mean absolute
        difference between predictions and actual ratings.
        """
        cluster_metrics = {}

        for cluster in range(self.model.n_clusters):
            # Get the indices of the data points in the current cluster
            cluster_indices = np.where(self.clusters == cluster)[0]

            if cluster_indices.size == 0:
                # No test pair was assigned to this cluster: metrics are undefined.
                cluster_metrics[cluster] = {
                    'MSE': float('nan'),
                    'RMSE': float('nan'),
                    'MAE': float('nan'),
                    'Num Samples': 0
                }
                continue

            errors = self.predictions[cluster_indices] - self.actuals[cluster_indices]

            # Calculate metrics for the current cluster
            mse = float(np.mean(errors ** 2))
            rmse = float(np.sqrt(mse))
            mae = float(np.mean(np.abs(errors)))

            # Store metrics in the cluster_metrics dictionary
            cluster_metrics[cluster] = {
                'MSE': mse,
                'RMSE': rmse,
                'MAE': mae,
                'Num Samples': len(cluster_indices)
            }

        self.cluster_metrics = cluster_metrics

        # Print out cluster metrics
        for cluster, metrics in cluster_metrics.items():
            print(f"\nCluster {cluster} Metrics:")
            print(f"Number of Samples: {metrics['Num Samples']}")
            if metrics['Num Samples'] == 0:
                print("No test samples in this cluster; metrics are undefined.")
                continue
            print(f"MSE: {metrics['MSE']:.4f}")
            print(f"RMSE: {metrics['RMSE']:.4f}")
            print(f"MAE: {metrics['MAE']:.4f}")

    def visualize_embeddings(self):
        """Scatter-plot the combined embeddings in 2-D (PCA), coloured by cluster."""
        pca = PCA(n_components=2)
        reduced_embeddings = pca.fit_transform(self.embeddings)

        plt.figure(figsize=(12, 8))
        scatter = plt.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], c=self.clusters, cmap='viridis', alpha=0.6)
        plt.title('PCA of User and Movie Embeddings')
        plt.xlabel('PCA Component 1')
        plt.ylabel('PCA Component 2')
        plt.colorbar(scatter, label='Cluster')
        plt.show()

In [None]:
# Rebuild the ID -> index maps from the full ratings table, using the same
# construction as train_and_evaluate so the indices line up with the trained model.
user_ids = ratings_df['userId'].unique()
movie_ids = ratings_df['movieId'].unique()
user_to_idx = dict(zip(user_ids, range(len(user_ids))))
movie_to_idx = dict(zip(movie_ids, range(len(movie_ids))))

# Reproduce the train/test split with the same test_size and random_state that
# train_and_evaluate uses.
train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=42)

# Wrap the held-out rows in a Dataset and an ordered (unshuffled) DataLoader
test_dataset = MovieDataset(test_df, user_to_idx, movie_to_idx)
test_loader = DataLoader(test_dataset, batch_size=1024, shuffle=False)

In [None]:
# Constructing the evaluator immediately runs evaluate() over test_loader.
evaluation = AdvancedModelEvaluation(model, test_loader)

In [None]:
# Ratings >= 3.5 are treated as the positive class for the confusion matrix and report.
evaluation.create_confusion_matrix(threshold=3.5)

In [None]:
# Print MSE / RMSE / MAE and sample counts for each k-means cluster of the test pairs.
evaluation.analyze_clusters()

In [None]:
# 2-D PCA scatter of the combined user+movie embeddings, coloured by cluster.
evaluation.visualize_embeddings()

# 🤗 Thank You

________