# 📓 Notebook 04: Model Evaluation
## Hệ thống Recommendation Phim

**Mục tiêu:**
1. Load trained models
2. Load test data
3. Tính metrics cho mỗi model (RMSE, MAE, Precision@K, Recall@K, etc.)
4. So sánh hiệu suất các models
5. Lưu metrics vào MongoDB

---

## 1. Import Libraries

In [None]:
# Import libraries
import os
import sys
import pickle
import warnings
from datetime import datetime
from collections import defaultdict

# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# MongoDB
from pymongo import MongoClient

# Progress bar
from tqdm import tqdm

# Suppress warnings
# NOTE(review): this hides every warning category for the rest of the
# session, not only the noisy ones — narrow the filter if a warning matters.
warnings.filterwarnings('ignore')
# Apply the "seaborn-v0_8-whitegrid" matplotlib style to all later figures.
plt.style.use('seaborn-v0_8-whitegrid')

print("âœ… Libraries imported successfully!")

## 2. Configuration & Load Data

In [None]:
# Configuration
# Connection string and database name used for the MongoClient below.
MONGO_URI = "mongodb://localhost:27017"
DATABASE_NAME = "movie_recommendation"
# Paths are relative to the notebook's working directory.
MODELS_DIR = "../models_saved"
PROCESSED_DATA_DIR = "../data/processed"

# Evaluation parameters
TOP_K_VALUES = [5, 10, 20]  # For Precision@K, Recall@K, etc.
RATING_THRESHOLD = 3.5  # Threshold for considering a movie as "liked"

# Connect to MongoDB
# `client` and `db` are module-level handles reused by the later cells that
# write and read the `model_metrics` collection.
client = MongoClient(MONGO_URI)
db = client[DATABASE_NAME]

print(f"Connected to MongoDB: {DATABASE_NAME}")
print(f"Models directory: {MODELS_DIR}")
print(f"Processed data directory: {PROCESSED_DATA_DIR}")

In [None]:
# Load movies and ratings data
movies_df = pd.read_csv(f"{PROCESSED_DATA_DIR}/movies_processed.csv")
ratings_df = pd.read_csv(f"{PROCESSED_DATA_DIR}/ratings_processed.csv")

# Load test data (if available, otherwise create train/test split)
# If either CSV is missing, the fallback below regenerates and overwrites BOTH
# files, so a previously saved split is replaced.
try:
    test_ratings = pd.read_csv(f"{PROCESSED_DATA_DIR}/test_ratings.csv")
    train_ratings = pd.read_csv(f"{PROCESSED_DATA_DIR}/train_ratings.csv")
    print("Loaded existing train/test split")
except FileNotFoundError:
    # Create train/test split
    # 80/20 split over individual rating rows with a fixed seed; no
    # stratification is requested, so a user may end up in only one side.
    from sklearn.model_selection import train_test_split
    train_ratings, test_ratings = train_test_split(
        ratings_df, test_size=0.2, random_state=42
    )
    # Persist the split so later runs take the `try` branch above.
    train_ratings.to_csv(f"{PROCESSED_DATA_DIR}/train_ratings.csv", index=False)
    test_ratings.to_csv(f"{PROCESSED_DATA_DIR}/test_ratings.csv", index=False)
    print("Created new train/test split")

print(f"\nMovies: {len(movies_df)}")
print(f"Total ratings: {len(ratings_df)}")
print(f"Train ratings: {len(train_ratings)}")
print(f"Test ratings: {len(test_ratings)}")

## 3. Load Trained Models

In [None]:
# Load all trained models
def load_model(model_name):
    """Load a trained model from ``{MODELS_DIR}/{model_name}.pkl``.

    Loading is best-effort: a missing or unreadable file is reported on
    stdout and ``None`` is returned, so the caller can carry on with the
    models that did load.

    Args:
        model_name: File stem of the pickle, without the ``.pkl`` suffix.

    Returns:
        The unpickled object, or ``None`` if the file does not exist or
        cannot be unpickled.
    """
    model_path = f"{MODELS_DIR}/{model_name}.pkl"
    try:
        with open(model_path, 'rb') as f:
            # SECURITY: unpickling runs code embedded in the file. Only load
            # pickles produced by this project's own training notebooks.
            model = pickle.load(f)
    except FileNotFoundError:
        print(f"âœ— Model not found: {model_path}")
        return None
    except (pickle.UnpicklingError, EOFError, AttributeError,
            ImportError, IndexError) as exc:
        # The pickle docs list these as the usual failures for a truncated
        # file or one whose classes cannot be imported in this environment.
        print(f"âœ— Could not load {model_path}: {exc!r}")
        return None
    print(f"âœ“ Loaded {model_name}")
    return model

# Load every expected model; each pickle is named "<short name>_model.pkl"
models = {
    short_name: load_model(f"{short_name}_model")
    for short_name in ('content_based', 'item_based', 'user_based', 'hybrid')
}

# Keep only the entries that actually loaded (load_model returns None otherwise)
available_models = dict(
    entry for entry in models.items() if entry[1] is not None
)
print(f"\nAvailable models: {list(available_models.keys())}")

## 4. Define Evaluation Metrics

### 4.1 Prediction Accuracy Metrics (RMSE, MAE)

In [None]:
def rmse(y_true, y_pred):
    """Return the root mean squared error between two array-likes."""
    residuals = np.array(y_true) - np.array(y_pred)
    return np.sqrt(np.square(residuals).mean())

def mae(y_true, y_pred):
    """Return the mean absolute error between two array-likes."""
    absolute_errors = np.absolute(np.array(y_true) - np.array(y_pred))
    return absolute_errors.mean()

# Test the metrics
# Smoke test on five hand-picked (true, predicted) rating pairs; the two
# errors are printed with four decimals for a quick visual check.
y_true_test = [4.0, 3.5, 5.0, 2.0, 4.5]
y_pred_test = [3.8, 3.7, 4.8, 2.3, 4.2]
print(f"Test RMSE: {rmse(y_true_test, y_pred_test):.4f}")
print(f"Test MAE: {mae(y_true_test, y_pred_test):.4f}")

### 4.2 Ranking Metrics (Precision@K, Recall@K, F1@K)

In [None]:
def precision_at_k(recommended, relevant, k):
    """
    Precision@K: share of the top-k slots filled with relevant items.

    The denominator is always ``k``, even when fewer than ``k`` items were
    recommended.

    Args:
        recommended: Ranked list of recommended item IDs (best first)
        relevant: Collection of item IDs the user actually liked
        k: Cut-off rank

    Returns:
        hits / k, or 0.0 when k is not positive
    """
    top_k = set(recommended[:k])
    hit_count = len(top_k.intersection(set(relevant)))
    if k <= 0:
        return 0.0
    return hit_count / k

def recall_at_k(recommended, relevant, k):
    """
    Recall@K: share of the relevant items that show up in the top k.

    Args:
        recommended: Ranked list of recommended item IDs (best first)
        relevant: Collection of item IDs the user actually liked
        k: Cut-off rank

    Returns:
        hits / len(relevant), or 0.0 when there are no relevant items
    """
    relevant_count = len(relevant)
    if relevant_count == 0:
        return 0.0
    found = set(relevant).intersection(recommended[:k])
    return len(found) / relevant_count

def f1_at_k(recommended, relevant, k):
    """
    F1@K: harmonic mean of Precision@K and Recall@K.

    Returns 0.0 when both precision and recall are zero.
    """
    precision = precision_at_k(recommended, relevant, k)
    recall = recall_at_k(recommended, relevant, k)
    total = precision + recall
    return 2 * precision * recall / total if total != 0 else 0.0

# Test the ranking metrics
# Items 2 and 5 are hits within the top 5; item 8 is a third hit within the
# top 10. Items 11 and 15 are relevant but never recommended.
recommended_test = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
relevant_test = {2, 5, 8, 11, 15}

# Expected: K=5 -> precision 0.4, recall 0.4; K=10 -> precision 0.3, recall 0.6
for k in [5, 10]:
    print(f"K={k}:")
    print(f"  Precision@{k}: {precision_at_k(recommended_test, relevant_test, k):.4f}")
    print(f"  Recall@{k}: {recall_at_k(recommended_test, relevant_test, k):.4f}")
    print(f"  F1@{k}: {f1_at_k(recommended_test, relevant_test, k):.4f}")

### 4.3 NDCG@K (Normalized Discounted Cumulative Gain)

In [None]:
def dcg_at_k(relevance_scores, k):
    """
    Discounted Cumulative Gain over the first k positions.

    Each gain at 0-based position i is divided by log2(i + 2), so the first
    item is undiscounted.

    Args:
        relevance_scores: Relevance scores in ranked order
        k: Cut-off rank

    Returns:
        The discounted sum, or 0.0 when there is nothing to score
    """
    gains = np.array(relevance_scores[:k])
    n_items = len(gains)
    if n_items == 0:
        return 0.0

    # Positions 0..n-1 map to log2 arguments 2..n+1
    log_arguments = np.arange(n_items) + 2
    return (gains / np.log2(log_arguments)).sum()

def ndcg_at_k(recommended, relevant_with_scores, k):
    """
    Normalized Discounted Cumulative Gain at K.

    The DCG of the recommended ranking is divided by the DCG of the best
    possible ranking of the user's scored items.

    Args:
        recommended: Ranked list of recommended item IDs (best first)
        relevant_with_scores: Dict mapping item_id -> relevance score (e.g., rating)
        k: Cut-off rank

    Returns:
        DCG / ideal DCG, or 0.0 when the ideal DCG is zero
    """
    # Recommended items without a known score contribute a gain of 0
    gains = [relevant_with_scores.get(item_id, 0) for item_id in recommended[:k]]
    actual_dcg = dcg_at_k(gains, k)

    # Ideal ordering: the user's k highest scores, best first
    best_gains = sorted(relevant_with_scores.values(), reverse=True)[:k]
    ideal_dcg = dcg_at_k(best_gains, k)

    return actual_dcg / ideal_dcg if ideal_dcg != 0 else 0.0

# Test NDCG
# Rebinds `recommended_test` with a shorter list for this demo.
# Items 4 and 5 have no score (gain 0); items 6 and 7 are scored but never
# recommended, so the result stays below 1.
recommended_test = [1, 2, 3, 4, 5]
relevant_with_scores_test = {1: 5, 2: 3, 3: 1, 6: 4, 7: 2}

for k in [3, 5]:
    print(f"NDCG@{k}: {ndcg_at_k(recommended_test, relevant_with_scores_test, k):.4f}")

### 4.4 Coverage (Catalog Coverage)

In [None]:
def catalog_coverage(all_recommendations, total_items):
    """
    Catalog Coverage: fraction of the catalog that appears in at least one
    recommendation list.

    Args:
        all_recommendations: Iterable of recommendation lists, one per user
        total_items: Number of items in the catalog

    Returns:
        distinct recommended items / total_items, or 0.0 when total_items
        is not positive
    """
    distinct_items = set().union(*all_recommendations)
    if total_items <= 0:
        return 0.0
    return len(distinct_items) / total_items

def diversity(recommendations, item_similarity_matrix=None):
    """
    Intra-list Diversity: mean pairwise dissimilarity (1 - similarity) over
    all unordered pairs of recommended items.

    Args:
        recommendations: List of recommended item IDs
        item_similarity_matrix: Optional similarity table, looked up with
            ``.index``, ``.columns`` and ``.loc[row, col]``

    Returns:
        0.0 for fewer than two items, 1.0 when no matrix is supplied,
        otherwise the average dissimilarity
    """
    n_items = len(recommendations)
    if n_items < 2:
        return 0.0

    # Without similarity information, maximum diversity is assumed
    if item_similarity_matrix is None:
        return 1.0

    row_labels = item_similarity_matrix.index
    col_labels = item_similarity_matrix.columns

    pair_dissimilarities = []
    for left in range(n_items):
        for right in range(left + 1, n_items):
            first, second = recommendations[left], recommendations[right]
            if first in row_labels and second in col_labels:
                pair_dissimilarities.append(
                    1 - item_similarity_matrix.loc[first, second]
                )
            else:
                # Pairs missing from the matrix count as fully dissimilar
                pair_dissimilarities.append(1)

    if not pair_dissimilarities:
        return 0.0
    return sum(pair_dissimilarities) / len(pair_dissimilarities)

# Test coverage
# The three lists contain six distinct items (1-6); with a catalog of 10
# items the expected coverage is 0.6.
all_recs_test = [[1, 2, 3], [2, 3, 4], [1, 5, 6]]
print(f"Catalog Coverage: {catalog_coverage(all_recs_test, 10):.4f}")

## 5. Evaluation Helper Functions

In [None]:
def get_user_relevant_items(user_id, ratings_df, threshold=RATING_THRESHOLD):
    """Return the set of movieIds this user rated at or above `threshold`."""
    by_user = ratings_df[ratings_df['userId'] == user_id]
    liked_rows = by_user[by_user['rating'] >= threshold]
    return set(liked_rows['movieId'].tolist())

def get_user_ratings_dict(user_id, ratings_df):
    """Return a {movieId: rating} mapping for every row belonging to the user."""
    is_user_row = ratings_df['userId'] == user_id
    movie_ids = ratings_df.loc[is_user_row, 'movieId'].tolist()
    scores = ratings_df.loc[is_user_row, 'rating'].tolist()
    return {movie_id: score for movie_id, score in zip(movie_ids, scores)}

def _extract_recommended_ids(recommendations):
    """Flatten a model's recommendation output into a ranked list of movie IDs.

    Handles a list of IDs, a list of dicts keyed by ``movieId``/``movie_id``,
    a list of ``(id, ...)`` tuples or lists, a NumPy array of any of those,
    and a DataFrame with a ``movieId``/``movie_id`` column. Anything else
    yields an empty list.
    """
    if isinstance(recommendations, pd.DataFrame):
        for column in ('movieId', 'movie_id'):
            if column in recommendations.columns:
                return recommendations[column].tolist()
        return []

    if isinstance(recommendations, np.ndarray):
        recommendations = recommendations.tolist()

    if not isinstance(recommendations, list) or len(recommendations) == 0:
        return []

    first = recommendations[0]
    if isinstance(first, dict):
        return [r.get('movieId', r.get('movie_id')) for r in recommendations]
    if isinstance(first, (tuple, list)):
        # (id, score, ...) style rows: the ID is the first element
        return [r[0] for r in recommendations]
    return recommendations


def evaluate_model_for_user(model, model_name, user_id, train_ratings, test_ratings, 
                            movies_df, k_values=TOP_K_VALUES):
    """
    Evaluate a model for a single user.

    Args:
        model: Object exposing ``recommend_for_user(user_id, n=...)`` or
            ``recommend(user_id, n=...)``
        model_name: Name of the model (accepted for call symmetry; unused here)
        user_id: User to evaluate
        train_ratings: Training ratings DataFrame (unused here)
        test_ratings: Test ratings DataFrame; supplies the ground truth
        movies_df: Movies DataFrame (unused here)
        k_values: Cut-off ranks for the ranking metrics

    Returns:
        Dict with ``precision@k``, ``recall@k``, ``f1@k`` and ``ndcg@k`` for
        each k plus the ranked ``recommendations`` list, or ``None`` when the
        user has no relevant test items, the model has no recommend method,
        or fetching recommendations raised.
    """
    results = {}
    
    # Ground truth: liked items (for precision/recall) and raw ratings (for NDCG)
    relevant_items = get_user_relevant_items(user_id, test_ratings)
    relevant_with_scores = get_user_ratings_dict(user_id, test_ratings)
    
    if len(relevant_items) == 0:
        return None  # Skip users with no relevant items in test set
    
    # Ask for as many recommendations as the largest cut-off needs
    try:
        if hasattr(model, 'recommend_for_user'):
            recommendations = model.recommend_for_user(user_id, n=max(k_values))
        elif hasattr(model, 'recommend'):
            recommendations = model.recommend(user_id, n=max(k_values))
        else:
            return None
        
        rec_ids = _extract_recommended_ids(recommendations)
            
    except Exception as e:
        # Best effort: one failing user must not abort the whole evaluation
        print(f"Error getting recommendations for user {user_id}: {e}")
        return None
    
    # Calculate metrics for each k
    for k in k_values:
        results[f'precision@{k}'] = precision_at_k(rec_ids, relevant_items, k)
        results[f'recall@{k}'] = recall_at_k(rec_ids, relevant_items, k)
        results[f'f1@{k}'] = f1_at_k(rec_ids, relevant_items, k)
        results[f'ndcg@{k}'] = ndcg_at_k(rec_ids, relevant_with_scores, k)
    
    # Kept so the caller can compute catalog coverage across users
    results['recommendations'] = rec_ids
    
    return results

print("Evaluation helper functions defined successfully!")

## 6. Evaluate All Models

In [None]:
def evaluate_model(model, model_name, train_ratings, test_ratings, movies_df, 
                   sample_users=100, k_values=TOP_K_VALUES):
    """
    Evaluate a model across multiple users.

    Only users present in both the train and test sets are considered. When
    there are more of them than `sample_users`, a fixed-seed sample is drawn.
    Users for whom `evaluate_model_for_user` returns None are left out of the
    averages.
    
    Args:
        model: The recommendation model
        model_name: Name of the model
        train_ratings: Training ratings DataFrame
        test_ratings: Test ratings DataFrame
        movies_df: Movies DataFrame
        sample_users: Maximum number of users to sample for evaluation
        k_values: List of K values for ranking metrics
    
    Returns:
        Dictionary with the model name, the number of users that produced a
        result, the mean of each ranking metric per k, and catalog coverage;
        or None if no user produced a result.
    """
    # Get users who have ratings in both train and test sets
    train_users = set(train_ratings['userId'].unique())
    test_users = set(test_ratings['userId'].unique())
    common_users = list(train_users & test_users)
    
    # Sample users
    if len(common_users) > sample_users:
        # A private RandomState(42) draws exactly what np.random.seed(42) +
        # np.random.choice would, without resetting the global NumPy RNG for
        # the rest of the notebook.
        rng = np.random.RandomState(42)
        sample_user_ids = rng.choice(common_users, sample_users, replace=False)
    else:
        sample_user_ids = common_users
    
    print(f"\nEvaluating {model_name} on {len(sample_user_ids)} users...")
    
    # Collect per-user metric dicts and the recommendation lists behind them
    all_results = []
    all_recommendations = []
    
    for user_id in tqdm(sample_user_ids, desc=f"Evaluating {model_name}"):
        result = evaluate_model_for_user(
            model, model_name, user_id, train_ratings, test_ratings, movies_df, k_values
        )
        if result is not None:
            all_results.append(result)
            all_recommendations.append(result['recommendations'])
    
    if len(all_results) == 0:
        print(f"No valid results for {model_name}")
        return None
    
    # Aggregate results
    aggregated = {'model': model_name, 'num_users': len(all_results)}
    
    # Average ranking metrics over the users that produced a result
    for k in k_values:
        for metric in ['precision', 'recall', 'f1', 'ndcg']:
            key = f'{metric}@{k}'
            aggregated[key] = np.mean([r[key] for r in all_results])
    
    # Coverage is measured against the number of distinct movieIds in movies_df
    total_items = len(movies_df['movieId'].unique())
    aggregated['coverage'] = catalog_coverage(all_recommendations, total_items)
    
    return aggregated

print("Model evaluation function defined successfully!")

In [None]:
# Evaluate all available models
# One aggregated metrics dict is collected per model; the later comparison,
# plotting, MongoDB and CSV cells all read from `evaluation_results`.
evaluation_results = []

for model_name, model in available_models.items():
    result = evaluate_model(
        model=model,
        model_name=model_name,
        train_ratings=train_ratings,
        test_ratings=test_ratings,
        movies_df=movies_df,
        sample_users=100,
        k_values=TOP_K_VALUES
    )
    # evaluate_model returns None when no user produced a usable result
    if result:
        evaluation_results.append(result)
        print(f"\n{model_name} Results:")
        print(f"  Coverage: {result['coverage']:.4f}")
        for k in TOP_K_VALUES:
            print(f"  Precision@{k}: {result[f'precision@{k}']:.4f}")
            print(f"  Recall@{k}: {result[f'recall@{k}']:.4f}")
            print(f"  NDCG@{k}: {result[f'ndcg@{k}']:.4f}")

print(f"\n{'='*50}")
print(f"Evaluation completed for {len(evaluation_results)} models")

## 7. Results Comparison

In [None]:
# Create comparison DataFrame
# `results_df` is only bound when at least one model produced results; the
# CSV export cell uses the same guard before touching it.
if evaluation_results:
    results_df = pd.DataFrame(evaluation_results)
    
    # Reorder columns for better display
    col_order = ['model', 'num_users', 'coverage']
    for k in TOP_K_VALUES:
        col_order.extend([f'precision@{k}', f'recall@{k}', f'f1@{k}', f'ndcg@{k}'])
    
    # Keep only columns that exist
    col_order = [c for c in col_order if c in results_df.columns]
    results_df = results_df[col_order]
    
    print("=" * 80)
    print("MODEL COMPARISON RESULTS")
    print("=" * 80)
    # `display` is not imported in this file; presumably the IPython/Jupyter
    # built-in — this line fails if the code runs as a plain script.
    display(results_df.round(4))
else:
    print("No evaluation results to display")

## 8. Visualize Results

In [None]:
if evaluation_results:
    # Set up the plotting style
    plt.style.use('seaborn-v0_8-whitegrid')
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    models_list = [r['model'] for r in evaluation_results]
    colors = plt.cm.Set2(np.linspace(0, 1, len(models_list)))

    # Shared geometry for the three grouped bar charts
    x = np.arange(len(TOP_K_VALUES))
    # 0.2 per bar as long as a K group fits in 0.8 units (up to four models);
    # beyond that the bars shrink so neighbouring groups do not overlap.
    width = min(0.2, 0.8 / len(models_list))
    tick_positions = x + width * (len(models_list) - 1) / 2
    tick_labels = [f'K={k}' for k in TOP_K_VALUES]

    # Plots 1-3: one grouped bar chart per ranking metric
    ranking_panels = [
        (axes[0, 0], 'precision', 'Precision@K'),
        (axes[0, 1], 'recall', 'Recall@K'),
        (axes[1, 0], 'ndcg', 'NDCG@K'),
    ]
    for ax, metric_key, metric_label in ranking_panels:
        for i, result in enumerate(evaluation_results):
            heights = [result[f'{metric_key}@{k}'] for k in TOP_K_VALUES]
            ax.bar(x + i * width, heights, width, label=result['model'], color=colors[i])
        ax.set_xlabel('K')
        ax.set_ylabel(metric_label)
        ax.set_title(f'{metric_label} Comparison')
        ax.set_xticks(tick_positions)
        ax.set_xticklabels(tick_labels)
        ax.legend()

    # Plot 4: Coverage comparison
    ax4 = axes[1, 1]
    coverage_values = [r['coverage'] for r in evaluation_results]
    bars = ax4.bar(models_list, coverage_values, color=colors[:len(models_list)])
    ax4.set_xlabel('Model')
    ax4.set_ylabel('Coverage')
    ax4.set_title('Catalog Coverage Comparison')
    ax4.set_ylim(0, 1)

    # Add value labels on bars
    for bar, val in zip(bars, coverage_values):
        ax4.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.02, 
                f'{val:.3f}', ha='center', va='bottom')

    plt.tight_layout()
    # Save next to the other processed artifacts, following the configured directory
    chart_path = f"{PROCESSED_DATA_DIR}/model_comparison.png"
    plt.savefig(chart_path, dpi=150, bbox_inches='tight')
    plt.show()
    print(f"\nChart saved to: {chart_path}")
else:
    print("No results to visualize")

In [None]:
# Radar chart for overall comparison
if evaluation_results:
    fig, ax = plt.subplots(figsize=(10, 8), subplot_kw=dict(polar=True))

    # Prefer K=10; if it was not evaluated, use the largest configured K
    # instead of silently plotting zeros for missing metrics.
    k = 10 if 10 in TOP_K_VALUES else max(TOP_K_VALUES)
    metrics = [f'precision@{k}', f'recall@{k}', f'f1@{k}', f'ndcg@{k}', 'coverage']
    metric_labels = [f'Precision@{k}', f'Recall@{k}', f'F1@{k}', f'NDCG@{k}', 'Coverage']

    # One spoke per metric; the first angle is repeated to close the polygon
    num_metrics = len(metrics)
    angles = np.linspace(0, 2 * np.pi, num_metrics, endpoint=False).tolist()
    angles += angles[:1]  # Complete the circle

    # Build the palette here so this cell does not depend on the bar-chart
    # cell having run first (same Set2 sampling, one color per model).
    colors = plt.cm.Set2(np.linspace(0, 1, len(evaluation_results)))

    for i, result in enumerate(evaluation_results):
        values = [result.get(m, 0) for m in metrics]
        values += values[:1]  # Complete the circle

        ax.plot(angles, values, 'o-', linewidth=2, label=result['model'], color=colors[i])
        ax.fill(angles, values, alpha=0.25, color=colors[i])

    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(metric_labels)
    ax.set_title(f'Model Comparison Radar Chart (K={k})', size=14, y=1.1)
    ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.0))

    plt.tight_layout()
    radar_path = f"{PROCESSED_DATA_DIR}/model_radar_chart.png"
    plt.savefig(radar_path, dpi=150, bbox_inches='tight')
    plt.show()
    print(f"\nRadar chart saved to: {radar_path}")

## 9. Save Metrics to MongoDB

In [None]:
# Save evaluation results to MongoDB
if evaluation_results:
    # Create metrics collection
    metrics_collection = db['model_metrics']

    # One timestamp for the whole batch, so documents from the same run share
    # an evaluation_date and can be grouped.
    # NOTE(review): this is a naive local time; PyMongo stores naive
    # datetimes as if they were UTC — confirm that is the intent.
    evaluation_date = datetime.now()

    # Prepare documents for insertion
    evaluation_documents = []

    for result in evaluation_results:
        # Plain Python floats, so no NumPy scalar types reach the BSON encoder
        metrics_doc = {}
        for k in TOP_K_VALUES:
            for metric in ('precision', 'recall', 'f1', 'ndcg'):
                metrics_doc[f'{metric}_at_{k}'] = float(result[f'{metric}@{k}'])
        metrics_doc['coverage'] = float(result['coverage'])

        evaluation_documents.append({
            'model_name': result['model'],
            'evaluation_date': evaluation_date,
            'num_users_evaluated': result['num_users'],
            'rating_threshold': RATING_THRESHOLD,
            'metrics': metrics_doc,
            'parameters': {
                'k_values': TOP_K_VALUES,
                # NOTE(review): this is the number of users that produced a
                # result, not the requested sample size.
                'sample_users': result['num_users']
            }
        })

    # Insert into MongoDB (named distinctly so the loop variable `result`
    # is not shadowed)
    insert_result = metrics_collection.insert_many(evaluation_documents)

    print(f"âœ“ Saved {len(insert_result.inserted_ids)} evaluation results to MongoDB")
    print(f"  Collection: model_metrics")
    print(f"  Inserted IDs: {insert_result.inserted_ids}")
else:
    print("No results to save")

In [None]:
# Verify saved data
print("\n=== Verification: Data in MongoDB ===")
# Resolve the collection here: the save cell only binds `metrics_collection`
# when there were results to store, so relying on it could raise NameError.
metrics_collection = db['model_metrics']
# Newest first; four documents covers one run of the four configured models
saved_metrics = list(metrics_collection.find().sort('evaluation_date', -1).limit(4))

if not saved_metrics:
    print("No saved metrics found in collection: model_metrics")

for metric in saved_metrics:
    print(f"\nModel: {metric['model_name']}")
    print(f"  Evaluated on: {metric['evaluation_date']}")
    print(f"  Users evaluated: {metric['num_users_evaluated']}")
    print(f"  Coverage: {metric['metrics']['coverage']:.4f}")

## 10. Export Results to CSV

In [None]:
# Save results to CSV for reference
# Same guard as the comparison cell, which is where `results_df` is created.
if evaluation_results:
    # Adds the column to `results_df` in place, stamped with local time.
    results_df['evaluation_date'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    # The file is overwritten on every run.
    csv_path = '../data/processed/model_evaluation_results.csv'
    results_df.to_csv(csv_path, index=False)
    print(f"âœ“ Results saved to: {csv_path}")
    
    # Display final summary
    print("\n" + "=" * 80)
    print("FINAL EVALUATION SUMMARY")
    print("=" * 80)
    # `display` is not imported in this file; presumably the IPython/Jupyter built-in.
    display(results_df.round(4))

## 11. Summary & Conclusions

### Key Findings:
- **Best Precision@K**: Model with highest precision correctly identifies relevant items
- **Best Recall@K**: Model with highest recall captures more relevant items
- **Best NDCG@K**: Model with highest NDCG provides better ranking quality
- **Best Coverage**: Model whose recommendations, taken across all evaluated users, span the largest share of the catalog

### Recommendations:
1. **Hybrid model** typically performs best as it combines strengths of multiple approaches
2. **Content-based** is useful for cold-start (new users/items)
3. **Collaborative filtering** excels when sufficient rating data exists
4. Consider using **ensemble methods** to further improve performance

### Next Steps:
1. Fine-tune model hyperparameters based on evaluation results
2. Implement A/B testing for production deployment
3. Add more evaluation metrics (novelty, serendipity)
4. Monitor model performance over time

In [None]:
# Close MongoDB connection
# Closes the client opened in the configuration cell.
client.close()
print("âœ“ MongoDB connection closed")
print("\nðŸŽ‰ Model evaluation complete!")