# Collaborative Filtering Evaluation with Confusion Matrix

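This notebook demonstrates how to evaluate a collaborative filtering algorithm with confusion-matrix metrics by recasting rating prediction and recommendation as classification tasks. The user ratings used here are randomly generated, so the resulting numbers illustrate the evaluation procedure rather than the algorithm's real-world quality.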

In [None]:
# Import necessary libraries
import warnings
from typing import Any, Dict, List, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score

warnings.filterwarnings('ignore')

# Import our collaborative filtering algorithm
from algorithm import get_recommendations, CollaborativeFilter

## Load Movie Dataset

In [None]:
# Load the movie dataset
df = pd.read_csv('dataset/movies.csv')

# Fail fast with a readable message: later cells look movies up through the
# `id` column, so a dataset without it cannot be evaluated.
assert 'id' in df.columns, "dataset/movies.csv must contain an 'id' column"

print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
df.head()

## Create Sample User Ratings for Testing

In [None]:
# Create sample user ratings for evaluation
np.random.seed(42)

# Select a subset of movies for testing.
# The sample size is capped at the dataset size because DataFrame.sample
# (without replacement) raises ValueError when asked for more rows than exist.
test_movies = df.sample(min(100, len(df)), random_state=42)
movie_ids = test_movies['id'].astype(str).tolist()

# Create sample user ratings (simulate multiple users)
def create_sample_ratings(movie_ids: List[str], n_users: int = 5) -> Dict[str, Dict[str, int]]:
    """Create randomly generated ratings for a number of simulated users.

    Args:
        movie_ids: Pool of movie IDs the simulated users can rate.
        n_users: Number of users to simulate; they are named ``user_0``,
            ``user_1``, ...

    Returns:
        Mapping of user name -> ``{movie_id: rating}`` with integer ratings
        from 1 to 10. With more than 20 movies in the pool each user rates
        between 20 and ``min(40, len(movie_ids) - 1)`` distinct movies; with
        20 or fewer movies each user rates every movie in the pool.
    """
    rating_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    # Skewed distribution: mid/high ratings are more likely than extremes.
    rating_probs = [0.05, 0.05, 0.1, 0.1, 0.15, 0.15, 0.2, 0.1, 0.05, 0.05]

    n_movies = len(movie_ids)
    # Exclusive upper bound for the number of ratings per user.
    max_ratings = min(41, n_movies)

    users_ratings = {}

    for user_id in range(n_users):
        if max_ratings > 20:
            # Each user rates 20-40 movies randomly
            n_ratings = np.random.randint(20, max_ratings)
        else:
            # Small pool: randint(20, max_ratings) would raise ValueError,
            # so fall back to rating every available movie.
            n_ratings = n_movies

        if n_ratings:
            rated_movies = np.random.choice(movie_ids, n_ratings, replace=False)
        else:
            rated_movies = []

        user_ratings = {}
        for movie_id in rated_movies:
            rating = np.random.choice(rating_values, p=rating_probs)
            # Convert numpy scalars to the plain str/int promised by the signature.
            user_ratings[str(movie_id)] = int(rating)

        users_ratings[f"user_{user_id}"] = user_ratings

    return users_ratings

# Build the simulated rating data and summarise it per user
sample_users = create_sample_ratings(movie_ids, n_users=5)
print(f"Created ratings for {len(sample_users)} users")
for user in sample_users:
    ratings = sample_users[user]
    print(f"{user}: {len(ratings)} ratings, avg: {np.mean(list(ratings.values())):.1f}")

## Evaluation Method 1: Rating Prediction as Multi-Class Classification

In [None]:
def evaluate_rating_prediction_multiclass(df: pd.DataFrame,
                                         train_ratings: Dict[str, int],
                                         test_ratings: Dict[str, int],
                                         good_score_threshold: float = 8.0,
                                         average_score_threshold: float = 5.0) -> Dict[str, Any]:
    """
    Evaluate collaborative filtering as multi-class classification.
    Classes: Poor (1-3), Average (4-6), Good (7-10)

    A ``CollaborativeFilter`` is built on ``df`` and given ``train_ratings``;
    each movie in ``test_ratings`` is then scored and the score is mapped to a
    class, which is compared with the class of the actual rating.

    Args:
        df: Movie table; must contain an ``id`` column.
        train_ratings: ``{movie_id: rating}`` passed to the recommender.
        test_ratings: ``{movie_id: rating}`` held out for evaluation. Movies
            whose id is not found in ``df`` are skipped.
        good_score_threshold: Scores at or above this are predicted "Good".
        average_score_threshold: Scores at or above this (and below
            ``good_score_threshold``) are predicted "Average"; lower scores
            are predicted "Poor".

    Returns:
        ``{"error": ...}`` when no test movie could be evaluated, otherwise a
        dict with keys ``confusion_matrix`` (3x3, ordered as ``labels``),
        ``labels``, ``accuracy``, ``classification_report`` (dict form) and
        ``predictions`` (list of ``(true_class, predicted_class)``).
    """

    def rating_to_class(rating: int) -> str:
        if rating <= 3:
            return "Poor"
        elif rating <= 6:
            return "Average"
        else:
            return "Good"

    def score_to_class(score: float) -> str:
        if score >= good_score_threshold:
            return "Good"
        elif score >= average_score_threshold:
            return "Average"
        else:
            return "Poor"

    # Get recommendations based on training data
    recommender = CollaborativeFilter(df)
    recommender.update_user_ratings(train_ratings)

    # The string form of the id column does not change inside the loop,
    # so convert it once instead of once per test movie.
    ids_as_str = df['id'].astype(str)

    # For each test movie, predict its class based on recommendation score
    y_true = []
    y_pred = []

    for movie_id, actual_rating in test_ratings.items():
        # Get movie data
        movie_row = df[ids_as_str == str(movie_id)]
        if movie_row.empty:
            continue

        movie = movie_row.iloc[0]

        # Calculate recommendation score
        # NOTE(review): this call looks loop-invariant; it could be hoisted if
        # _get_user_preferences is confirmed to be side-effect free.
        _, user_avg_rating, top_genres, genre_scores = recommender._get_user_preferences()
        if not top_genres:
            predicted_class = "Average"  # Default prediction
        else:
            score = recommender._calculate_movie_score(movie, top_genres, genre_scores, user_avg_rating)
            predicted_class = score_to_class(score)

        y_true.append(rating_to_class(actual_rating))
        y_pred.append(predicted_class)

    if not y_true:
        return {"error": "No valid predictions made"}

    # Calculate metrics
    accuracy = accuracy_score(y_true, y_pred)

    # Create confusion matrix (labels pinned so the matrix is always 3x3)
    labels = ["Poor", "Average", "Good"]
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    # Calculate per-class metrics; zero_division=0 matches the binary
    # evaluators and yields 0 for classes that never occur.
    report = classification_report(y_true, y_pred, labels=labels,
                                   output_dict=True, zero_division=0)

    return {
        "confusion_matrix": cm,
        "labels": labels,
        "accuracy": accuracy,
        "classification_report": report,
        "predictions": list(zip(y_true, y_pred))
    }

# Test with one user
test_user = "user_0"
user_ratings = sample_users[test_user]

# Split ratings into train/test (80/20)
movie_list = list(user_ratings.keys())
np.random.shuffle(movie_list)
split_point = int(0.8 * len(movie_list))

train_ratings = {k: user_ratings[k] for k in movie_list[:split_point]}
test_ratings = {k: user_ratings[k] for k in movie_list[split_point:]}

print(f"Train set: {len(train_ratings)} movies")
print(f"Test set: {len(test_ratings)} movies")

# Evaluate
results_multiclass = evaluate_rating_prediction_multiclass(df, train_ratings, test_ratings)

# The evaluator signals failure by returning {"error": ...}; surface that
# message instead of failing on a missing 'accuracy' key.
if "error" in results_multiclass:
    raise RuntimeError(f"Multi-class evaluation failed: {results_multiclass['error']}")

print(f"\nMulti-class Classification Results:")
print(f"Accuracy: {results_multiclass['accuracy']:.3f}")

In [None]:
# Draw the multi-class confusion matrix as an annotated heatmap
fig, ax = plt.subplots(figsize=(8, 6))
class_names = results_multiclass['labels']
sns.heatmap(
    results_multiclass['confusion_matrix'],
    annot=True,
    fmt='d',
    xticklabels=class_names,
    yticklabels=class_names,
    cmap='Blues',
    ax=ax,
)
ax.set_title('Confusion Matrix - Multi-class Rating Prediction')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
plt.show()

# Per-class precision / recall / F1 taken from the classification report
print("\nDetailed Classification Report:")
for label in class_names:
    metrics = results_multiclass['classification_report'][label]
    print(f"{label:>8}: Precision={metrics['precision']:.3f}, Recall={metrics['recall']:.3f}, F1={metrics['f1-score']:.3f}")

## Evaluation Method 2: Binary Classification (Good vs Not Good)

In [None]:
def evaluate_binary_classification(df: pd.DataFrame,
                                 train_ratings: Dict[str, int],
                                 test_ratings: Dict[str, int],
                                 threshold: int = 7,
                                 score_threshold: float = 6.0) -> Dict[str, Any]:
    """
    Evaluate collaborative filtering as binary classification.
    Positive class: Rating >= threshold (Good movies)
    Negative class: Rating < threshold (Not good movies)

    Args:
        df: Movie table; must contain an ``id`` column.
        train_ratings: ``{movie_id: rating}`` passed to the recommender.
        test_ratings: ``{movie_id: rating}`` held out for evaluation. Movies
            whose id is not found in ``df`` are skipped.
        threshold: Minimum actual rating that counts as "good".
        score_threshold: Minimum recommender score that is predicted "good".

    Returns:
        ``{"error": ...}`` when no test movie could be evaluated, otherwise a
        dict with keys ``confusion_matrix`` (always 2x2, rows/columns ordered
        ``[0, 1]``), ``accuracy``, ``precision``, ``recall``, ``f1_score``,
        ``threshold`` and ``predictions`` (list of ``(true, predicted)``).
    """

    # Get recommendations based on training data
    recommender = CollaborativeFilter(df)
    recommender.update_user_ratings(train_ratings)

    # The string form of the id column does not change inside the loop,
    # so convert it once instead of once per test movie.
    ids_as_str = df['id'].astype(str)

    y_true = []
    y_pred = []

    for movie_id, actual_rating in test_ratings.items():
        # Get movie data
        movie_row = df[ids_as_str == str(movie_id)]
        if movie_row.empty:
            continue

        movie = movie_row.iloc[0]

        # Calculate recommendation score
        # NOTE(review): this call looks loop-invariant; it could be hoisted if
        # _get_user_preferences is confirmed to be side-effect free.
        _, user_avg_rating, top_genres, genre_scores = recommender._get_user_preferences()
        if not top_genres:
            predicted_good = 0  # Default: not good
        else:
            score = recommender._calculate_movie_score(movie, top_genres, genre_scores, user_avg_rating)

            # Convert score to binary prediction
            predicted_good = 1 if score >= score_threshold else 0

        actual_good = 1 if actual_rating >= threshold else 0

        y_true.append(actual_good)
        y_pred.append(predicted_good)

    if not y_true:
        return {"error": "No valid predictions made"}

    # Calculate metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    # Create confusion matrix. Pinning labels keeps the matrix 2x2 even when
    # only one class occurs, so callers can always unpack tn, fp, fn, tp.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

    return {
        "confusion_matrix": cm,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "threshold": threshold,
        "predictions": list(zip(y_true, y_pred))
    }

# Run the binary evaluation for the test user and report the headline metrics
results_binary = evaluate_binary_classification(
    df, train_ratings, test_ratings, threshold=7
)

for line in (
    "Binary Classification Results (Good vs Not Good):",
    f"Threshold: {results_binary['threshold']} stars",
    f"Accuracy: {results_binary['accuracy']:.3f}",
    f"Precision: {results_binary['precision']:.3f}",
    f"Recall: {results_binary['recall']:.3f}",
    f"F1-Score: {results_binary['f1_score']:.3f}",
):
    print(line)

In [None]:
# Visualize binary confusion matrix
# Rebuild the matrix from the stored (true, predicted) pairs with both classes
# pinned. Without explicit labels, a test set containing a single class gives
# a 1x1 matrix, which breaks the 2x2 tick labels and the 4-way unpack below.
binary_true, binary_pred = zip(*results_binary['predictions'])
binary_cm = confusion_matrix(binary_true, binary_pred, labels=[0, 1])

plt.figure(figsize=(6, 5))
sns.heatmap(binary_cm,
            annot=True, fmt='d',
            xticklabels=['Not Good', 'Good'],
            yticklabels=['Not Good', 'Good'],
            cmap='Blues')
plt.title('Confusion Matrix - Binary Classification')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# Calculate additional metrics from confusion matrix
# (row-major order of a [0, 1]-labelled matrix: tn, fp, fn, tp)
tn, fp, fn, tp = binary_cm.ravel()
print(f"\nConfusion Matrix Breakdown:")
print(f"True Negatives (TN): {tn}")
print(f"False Positives (FP): {fp}")
print(f"False Negatives (FN): {fn}")
print(f"True Positives (TP): {tp}")

# Calculate specificity (guarded against a test set with no negatives)
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
print(f"\nSpecificity (True Negative Rate): {specificity:.3f}")
print(f"Sensitivity (True Positive Rate/Recall): {results_binary['recall']:.3f}")

## Evaluation Method 3: Recommendation as Binary Classification

In [None]:
def evaluate_recommendation_binary(df: pd.DataFrame,
                                 train_ratings: Dict[str, int],
                                 test_ratings: Dict[str, int],
                                 n_recommendations: int = 10,
                                 threshold: int = 7) -> Dict[str, Any]:
    """
    Evaluate collaborative filtering recommendations as binary classification.
    Positive: Movie is in top-N recommendations
    Negative: Movie is not in top-N recommendations
    Ground truth based on actual ratings >= threshold

    Args:
        df: Movie table passed to ``get_recommendations``.
        train_ratings: ``{movie_id: rating}`` used to produce recommendations.
        test_ratings: ``{movie_id: rating}`` held out for evaluation.
        n_recommendations: Number of recommendations requested.
        threshold: Minimum actual rating that counts as "liked".

    Returns:
        ``{"error": ...}`` when ``test_ratings`` is empty, otherwise a dict
        with keys ``confusion_matrix`` (always 2x2, rows/columns ordered
        ``[0, 1]``), ``accuracy``, ``precision``, ``recall``, ``f1_score``,
        ``n_recommendations``, ``threshold``, ``recommended_movies`` (the
        value returned by ``get_recommendations``) and ``predictions``.
    """

    # Get recommendations based on training data
    recommended_movies = get_recommendations(df, train_ratings, n_recommendations)
    recommended_ids = set(recommended_movies['id'].astype(str).tolist())

    y_true = []  # 1 if user actually liked the movie (rating >= threshold)
    y_pred = []  # 1 if movie was recommended

    # Evaluate all test movies
    for movie_id, actual_rating in test_ratings.items():
        actual_liked = 1 if actual_rating >= threshold else 0
        # recommended_ids holds strings, so compare in string form; otherwise
        # non-string keys (e.g. integer ids) could never match.
        was_recommended = 1 if str(movie_id) in recommended_ids else 0

        y_true.append(actual_liked)
        y_pred.append(was_recommended)

    if not y_true:
        return {"error": "No valid predictions made"}

    # Calculate metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    # Create confusion matrix. Pinning labels keeps the matrix 2x2 even when
    # nothing was recommended and nothing was liked.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

    return {
        "confusion_matrix": cm,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "n_recommendations": n_recommendations,
        "threshold": threshold,
        "recommended_movies": recommended_movies,
        "predictions": list(zip(y_true, y_pred))
    }

# Score the top-5 recommendations for the test user and report the metrics
results_recommendation = evaluate_recommendation_binary(
    df, train_ratings, test_ratings, n_recommendations=5, threshold=7
)

for line in (
    "Recommendation System Evaluation:",
    f"Number of recommendations: {results_recommendation['n_recommendations']}",
    f"Threshold for 'liked': {results_recommendation['threshold']} stars",
    f"Accuracy: {results_recommendation['accuracy']:.3f}",
    f"Precision: {results_recommendation['precision']:.3f}",
    f"Recall: {results_recommendation['recall']:.3f}",
    f"F1-Score: {results_recommendation['f1_score']:.3f}",
):
    print(line)

# List the first few recommended titles
print(f"\nRecommended Movies:")
top_recommended = results_recommendation['recommended_movies'].head()
for _, movie in top_recommended.iterrows():
    print(f"- {movie['title']} (Rating: {movie['vote_average']})")

In [None]:
# Visualize recommendation confusion matrix
# Rebuild the matrix from the stored (liked, recommended) pairs with both
# classes pinned, so it stays 2x2 and lines up with the tick labels even when
# no test movie was liked or recommended.
rec_true, rec_pred = zip(*results_recommendation['predictions'])
rec_cm = confusion_matrix(rec_true, rec_pred, labels=[0, 1])

plt.figure(figsize=(6, 5))
sns.heatmap(rec_cm,
            annot=True, fmt='d',
            xticklabels=['Not Recommended', 'Recommended'],
            yticklabels=['Not Liked', 'Liked'],
            cmap='Blues')
plt.title('Confusion Matrix - Recommendation System')
plt.ylabel('True Label (User Preference)')
plt.xlabel('Predicted Label (System Recommendation)')
plt.show()

## Cross-User Evaluation

In [None]:
def evaluate_all_users(df: pd.DataFrame, users_ratings: Dict[str, Dict[str, int]]) -> pd.DataFrame:
    """
    Evaluate collaborative filtering across multiple users.

    For every user with at least 10 ratings, the ratings are shuffled (global
    NumPy RNG) and split 80/20 into train/test; users whose test split has
    fewer than 3 ratings are skipped. Both the binary-classification and the
    top-5 recommendation evaluation are run on each split.

    Args:
        df: Movie table forwarded to the evaluators.
        users_ratings: ``{user_id: {movie_id: rating}}``.

    Returns:
        One row per evaluated user with columns ``user_id``, ``n_train``,
        ``n_test``, ``binary_accuracy``, ``binary_precision``,
        ``binary_recall``, ``binary_f1``, ``rec_accuracy``, ``rec_precision``,
        ``rec_recall`` and ``rec_f1``. The columns are present even when no
        user qualifies, so downstream column access does not fail.
    """
    columns = [
        'user_id', 'n_train', 'n_test',
        'binary_accuracy', 'binary_precision', 'binary_recall', 'binary_f1',
        'rec_accuracy', 'rec_precision', 'rec_recall', 'rec_f1',
    ]
    results = []

    for user_id, user_ratings in users_ratings.items():
        if len(user_ratings) < 10:  # Skip users with too few ratings
            continue

        # Split ratings
        movie_list = list(user_ratings.keys())
        np.random.shuffle(movie_list)
        split_point = int(0.8 * len(movie_list))

        train_ratings = {k: user_ratings[k] for k in movie_list[:split_point]}
        test_ratings = {k: user_ratings[k] for k in movie_list[split_point:]}

        if len(test_ratings) < 3:  # Need at least 3 test ratings
            continue

        # Evaluate binary classification
        binary_results = evaluate_binary_classification(df, train_ratings, test_ratings)

        # Evaluate recommendation system
        rec_results = evaluate_recommendation_binary(df, train_ratings, test_ratings,
                                                   n_recommendations=5)

        # Both evaluators report failure as {"error": ...}; such users carry
        # no metrics, so leave them out rather than failing on a missing key.
        if "error" in binary_results or "error" in rec_results:
            continue

        results.append({
            'user_id': user_id,
            'n_train': len(train_ratings),
            'n_test': len(test_ratings),
            'binary_accuracy': binary_results['accuracy'],
            'binary_precision': binary_results['precision'],
            'binary_recall': binary_results['recall'],
            'binary_f1': binary_results['f1_score'],
            'rec_accuracy': rec_results['accuracy'],
            'rec_precision': rec_results['precision'],
            'rec_recall': rec_results['recall'],
            'rec_f1': rec_results['f1_score']
        })

    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(results, columns=columns)

# Run the evaluation for every simulated user
all_results = evaluate_all_users(df, sample_users)
print("Cross-User Evaluation Results:")
print(all_results)

# Mean ± standard deviation of each metric across users
summary_lines = [
    "\nSummary Statistics:",
    "Binary Classification:",
    f"  Mean Accuracy: {all_results['binary_accuracy'].mean():.3f} ± {all_results['binary_accuracy'].std():.3f}",
    f"  Mean Precision: {all_results['binary_precision'].mean():.3f} ± {all_results['binary_precision'].std():.3f}",
    f"  Mean Recall: {all_results['binary_recall'].mean():.3f} ± {all_results['binary_recall'].std():.3f}",
    f"  Mean F1-Score: {all_results['binary_f1'].mean():.3f} ± {all_results['binary_f1'].std():.3f}",
    "\nRecommendation System:",
    f"  Mean Accuracy: {all_results['rec_accuracy'].mean():.3f} ± {all_results['rec_accuracy'].std():.3f}",
    f"  Mean Precision: {all_results['rec_precision'].mean():.3f} ± {all_results['rec_precision'].std():.3f}",
    f"  Mean Recall: {all_results['rec_recall'].mean():.3f} ± {all_results['rec_recall'].std():.3f}",
    f"  Mean F1-Score: {all_results['rec_f1'].mean():.3f} ± {all_results['rec_f1'].std():.3f}",
]
for summary_line in summary_lines:
    print(summary_line)

## Visualization of Results

In [None]:
# Compare both evaluation approaches with one box plot per metric
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# (axes position, binary column, recommendation column, title, y-axis label)
panels = [
    ((0, 0), 'binary_accuracy', 'rec_accuracy', 'Accuracy Comparison', 'Accuracy'),
    ((0, 1), 'binary_precision', 'rec_precision', 'Precision Comparison', 'Precision'),
    ((1, 0), 'binary_recall', 'rec_recall', 'Recall Comparison', 'Recall'),
    ((1, 1), 'binary_f1', 'rec_f1', 'F1-Score Comparison', 'F1-Score'),
]

for position, binary_col, rec_col, panel_title, panel_ylabel in panels:
    panel_ax = axes[position]
    panel_ax.boxplot([all_results[binary_col], all_results[rec_col]],
                     labels=['Binary Classification', 'Recommendation System'])
    panel_ax.set_title(panel_title)
    panel_ax.set_ylabel(panel_ylabel)

plt.tight_layout()
plt.show()

## Additional Metrics: Precision@K and Recall@K

In [None]:
def calculate_precision_at_k(df: pd.DataFrame,
                            train_ratings: Dict[str, int],
                            test_ratings: Dict[str, int],
                            k_values: List[int] = [1, 3, 5, 10],
                            threshold: int = 7) -> Dict[int, float]:
    """
    Calculate Precision@K for different values of K.

    Args:
        df: Movie table passed to ``get_recommendations``.
        train_ratings: ``{movie_id: rating}`` used to produce recommendations.
        test_ratings: ``{movie_id: rating}`` held out; movies rated at or
            above ``threshold`` count as relevant.
        k_values: Recommendation list sizes to evaluate (read only, never
            modified).
        threshold: Minimum rating for a test movie to count as liked.

    Returns:
        ``{k: precision}`` where precision is the share of distinct
        recommended ids that are liked test movies; 0.0 when nothing was
        recommended for that ``k``.
    """
    precision_at_k = {}

    # Get liked movies from test set. Ids are normalised to str because the
    # recommended ids below are stringified; without this, non-string keys
    # (e.g. integer ids) would never intersect.
    liked_movies = {str(movie_id) for movie_id, rating in test_ratings.items()
                    if rating >= threshold}

    for k in k_values:
        # Get top-K recommendations
        recommendations = get_recommendations(df, train_ratings, k)
        recommended_ids = set(recommendations['id'].astype(str).tolist())

        # Calculate precision@k
        if recommended_ids:
            relevant_recommendations = recommended_ids & liked_movies
            precision_at_k[k] = len(relevant_recommendations) / len(recommended_ids)
        else:
            precision_at_k[k] = 0.0

    return precision_at_k

def calculate_recall_at_k(df: pd.DataFrame,
                         train_ratings: Dict[str, int],
                         test_ratings: Dict[str, int],
                         k_values: List[int] = [1, 3, 5, 10],
                         threshold: int = 7) -> Dict[int, float]:
    """
    Calculate Recall@K for different values of K.

    Args:
        df: Movie table passed to ``get_recommendations``.
        train_ratings: ``{movie_id: rating}`` used to produce recommendations.
        test_ratings: ``{movie_id: rating}`` held out; movies rated at or
            above ``threshold`` count as relevant.
        k_values: Recommendation list sizes to evaluate (read only, never
            modified).
        threshold: Minimum rating for a test movie to count as liked.

    Returns:
        ``{k: recall}`` where recall is the share of liked test movies that
        appear among the recommended ids; 0.0 for every ``k`` when the test
        set contains no liked movie.
    """
    recall_at_k = {}

    # Get liked movies from test set. Ids are normalised to str because the
    # recommended ids below are stringified; without this, non-string keys
    # (e.g. integer ids) would never intersect.
    liked_movies = {str(movie_id) for movie_id, rating in test_ratings.items()
                    if rating >= threshold}

    if not liked_movies:
        return {k: 0.0 for k in k_values}

    for k in k_values:
        # Get top-K recommendations
        recommendations = get_recommendations(df, train_ratings, k)
        recommended_ids = set(recommendations['id'].astype(str).tolist())

        # Calculate recall@k
        relevant_recommendations = recommended_ids & liked_movies
        recall_at_k[k] = len(relevant_recommendations) / len(liked_movies)

    return recall_at_k

# Precision@K and Recall@K for the test user at several list sizes
k_values = [1, 3, 5, 10]
precision_at_k = calculate_precision_at_k(df, train_ratings, test_ratings, k_values)
recall_at_k = calculate_recall_at_k(df, train_ratings, test_ratings, k_values)

print("Precision@K and Recall@K Results:")
for k in k_values:
    print(f"K={k}: Precision@{k}={precision_at_k[k]:.3f}, Recall@{k}={recall_at_k[k]:.3f}")

# Side-by-side curves: precision on the left, recall on the right
fig, (precision_ax, recall_ax) = plt.subplots(1, 2, figsize=(10, 6))

precision_curve = [precision_at_k[k] for k in k_values]
precision_ax.plot(k_values, precision_curve, 'bo-', linewidth=2, markersize=8)
precision_ax.set_title('Precision@K')
precision_ax.set_xlabel('K (Number of Recommendations)')
precision_ax.set_ylabel('Precision@K')
precision_ax.grid(True, alpha=0.3)

recall_curve = [recall_at_k[k] for k in k_values]
recall_ax.plot(k_values, recall_curve, 'ro-', linewidth=2, markersize=8)
recall_ax.set_title('Recall@K')
recall_ax.set_xlabel('K (Number of Recommendations)')
recall_ax.set_ylabel('Recall@K')
recall_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Summary and Conclusions

This notebook demonstrates three different approaches to evaluate collaborative filtering algorithms using confusion matrix metrics:

1. **Multi-class Classification**: Converting ratings to classes (Poor/Average/Good)
2. **Binary Classification**: Good vs Not Good movies
3. **Recommendation Evaluation**: Whether recommended movies were actually liked

Key metrics calculated:
- **Confusion Matrix**: Shows true vs predicted classifications
- **Accuracy**: Overall correctness of predictions
- **Precision**: Of predicted positives, how many were actually positive
- **Recall**: Of actual positives, how many were predicted positive
- **F1-Score**: Harmonic mean of precision and recall
- **Precision@K**: Precision of top-K recommendations
- **Recall@K**: Recall of top-K recommendations

These evaluation methods help assess how well the collaborative filtering algorithm predicts user preferences and makes relevant recommendations.