In [None]:
# Install the plotting libraries (matplotlib, seaborn) and ipykernel into the kernel's environment if missing
# NOTE(review): versions are unpinned — consider pinning them for reproducible runs
%pip install matplotlib seaborn ipykernel

In [None]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Add current directory to path so we can import our modules
sys.path.append(os.getcwd())

from data.data_loader import DataLoader
from models.recommender import RecommenderSystem

# Initialize the shared objects used by every later cell:
# `loader` supplies the interaction and movie frames, and `recommender`
# is constructed on top of that same loader instance.
loader = DataLoader()
recommender = RecommenderSystem(loader)

### 1. View the Interaction Matrix (The 'Scoreboard')
This shows how users have rated movies based on their behavior (Favorites=5, Watch Later=2, Views=1-3).

In [None]:
# Get the raw interaction data
interactions_df = loader.get_interaction_matrix()

# Guard the per-column counts the same way the evaluation cell does, so an
# empty frame reports zeros instead of failing on a column lookup.
has_interactions = not interactions_df.empty
unique_users = interactions_df['user_id'].nunique() if has_interactions else 0
unique_movies = interactions_df['movie_id'].nunique() if has_interactions else 0

print(f"Total Interactions: {len(interactions_df)}")
print(f"Unique Users: {unique_users}")
print(f"Unique Movies: {unique_movies}")

# Show only the first 10 rows; rendering the whole frame floods the output
display(interactions_df.head(10))

# Visualize the distribution of scores (skipped when there is nothing to plot)
if has_interactions:
    # Explicit figure/axes so the plot does not depend on pyplot's global state
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.countplot(x='score', data=interactions_df, ax=ax)
    ax.set_title('Distribution of Interaction Scores')
    plt.show()

### 2. View Movie Metadata
This is the data used for Content-Based filtering (Genres, Ratings, etc.).

In [None]:
# Load the movie metadata frame used by the later cells
movies_df = loader.get_movies_metadata()
display(movies_df.shape)
# Preview the first rows only; the shape above already reports the full size
display(movies_df.head(10))

### 3. Train the Model & Inspect Latent Features
We will train the model and then look at the hidden 'User Features' matrix.

In [None]:
# Fit the recommender and report whatever metrics training returns
metrics = recommender.train()
print("Training Metrics:", metrics)

# Wrap the learned user-feature matrix in a labelled DataFrame for inspection
if recommender.user_features is not None:
    feature_names = [f"Feature_{k}" for k in range(1, recommender.n_components + 1)]
    user_labels = [*recommender.user_id_map.keys()]
    user_features_df = pd.DataFrame(
        recommender.user_features,
        index=user_labels,
        columns=feature_names,
    )

    print("\nUser Latent Features (Top 5 Users):")
    display(user_features_df.head())

    # Heatmap of the latent features for the first 10 users
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.heatmap(user_features_df.head(10), cmap="viridis", ax=ax)
    ax.set_title("Hidden User Preferences (Latent Features)")
    plt.show()

### 4. Test Recommendations
Pick a user ID from the interaction matrix above and see what the model recommends.

In [None]:
# Pick a user ID from the interactions dataframe
if not interactions_df.empty:
    # Prefer the second row (the original choice) but fall back to the first
    # when only one interaction exists, so a single-row frame no longer
    # raises IndexError.
    row_pos = min(1, len(interactions_df) - 1)
    # Select the column before the row: extracting a whole row first can
    # upcast an integer id to float when the row also holds float columns.
    test_user_id = interactions_df['user_id'].iloc[row_pos]
    print(f"Generating recommendations for User ID: {test_user_id}")

    recommendations = recommender.get_recommendations(test_user_id, n_recommendations=5)

    # Convert to DataFrame for nice display
    recs_df = pd.DataFrame(recommendations)

    if not recs_df.empty:
        # Build the movie_id -> title lookup once instead of scanning
        # movies_df for every recommendation; the first title wins when an
        # id appears more than once, and unmatched ids show as "Unknown".
        title_by_id = (
            movies_df
            .drop_duplicates(subset='movie_id')
            .set_index('movie_id')['title']
        )
        recs_df['title'] = recs_df['movie_id'].map(title_by_id).fillna("Unknown")
        display(recs_df[['title', 'score', 'reason', 'movie_id']])
    else:
        print("No recommendations found.")
else:
    print("No data available to generate recommendations.")

### 5. Model Evaluation Metrics
We check the mathematical health of the model.
* **Sparsity**: Percentage of empty cells in the user-movie matrix. At 99.5% or above, the model struggles to find patterns.
* **Reconstruction Error**: How far the model's approximation is from the original scores (lower is better).

In [None]:
from sklearn.metrics import mean_squared_error

# 1. Sparsity
has_interactions = not interactions_df.empty
n_users = interactions_df['user_id'].nunique() if has_interactions else 0
n_movies = interactions_df['movie_id'].nunique() if has_interactions else 0
n_interactions = len(interactions_df)

# A matrix cell is filled once per distinct (user, movie) pair. Counting raw
# rows would understate sparsity (and could push it below zero) whenever the
# same pair appears more than once. Rows with a missing id are dropped to
# stay consistent with nunique(), which ignores missing values.
if has_interactions:
    n_filled_cells = len(
        interactions_df[['user_id', 'movie_id']].dropna().drop_duplicates()
    )
else:
    n_filled_cells = 0
n_cells = n_users * n_movies
sparsity = 1 - (n_filled_cells / n_cells) if n_cells > 0 else 1.0

# 2. Reconstruction Error
# Default to NaN rather than 0.0: a zero would read as a perfect fit when the
# model simply has no reconstruction error to report.
reconstruction_err = float("nan")
if recommender.model is not None and hasattr(recommender.model, 'reconstruction_err_'):
    reconstruction_err = recommender.model.reconstruction_err_

print(f"Matrix Sparsity: {sparsity:.2%}")
print(f"Reconstruction Error: {reconstruction_err:.4f}")

### 6. Content-Based Sanity Check
Does the model understand which movies are similar? We pick a movie and ask for similar ones based on genres.

In [None]:
# Helper to find similar movies
def check_content_similarity(movie_title, n=5):
    """Look up `movie_title` and tabulate the movies the recommender deems similar.

    Reads the notebook-level `movies_df` and `recommender` objects.

    Args:
        movie_title: Exact title to match against the `title` column of `movies_df`.
        n: Number of similar movies requested from the recommender.

    Returns:
        A DataFrame with `Title`, `Genres` and `Score` columns, one row per
        similar movie that could be found in `movies_df`. A plain message
        string is returned instead when the title is not found or when any
        exception is raised along the way.
    """
    try:
        # Resolve the title to its metadata row
        matches = movies_df.loc[movies_df['title'] == movie_title]
        if matches.empty:
            return f"Movie '{movie_title}' not found in database."

        target_row = matches.iloc[0]
        movie_id = target_row['movie_id']
        target_genres = target_row['genres']

        print(f"Target: {movie_title} | Genres: {target_genres}")
        print("-" * 50)

        # Ask the recommender, then attach display metadata to each hit
        rows = []
        for candidate in recommender.get_similar_movies(movie_id, n_similar=n):
            info = movies_df.loc[movies_df['movie_id'] == candidate['movie_id']]
            if info.empty:
                # Similar movie has no metadata row; leave it out of the table
                continue
            first = info.iloc[0]
            rows.append({
                "Title": first['title'],
                "Genres": first['genres'],
                "Score": candidate['score'],
            })

        return pd.DataFrame(rows)
    except Exception as exc:
        # Any failure is reported as text rather than raised
        return f"Error: {exc}"

# Run the similarity check against the first movie in the metadata frame
if movies_df.empty:
    print("No movies to check.")
else:
    sample_movie = movies_df.iloc[0]['title']
    display(check_content_similarity(sample_movie))

### 7. Final Report Card
Is the model good? We evaluate based on data volume and metrics.

In [None]:
# Minimum data volume and maximum sparsity for the model to count as "Good"
THRESHOLDS = {
    "min_users": 50,
    "min_movies": 100,
    "min_interactions": 500,
    "max_sparsity": 0.995
}

# One entry per metric, in the same order across all three lists
stats = {
    "Metric": ["Total Users", "Total Movies", "Total Interactions", "Sparsity", "Reconstruction Error"],
    "Value": [n_users, n_movies, n_interactions, f"{sparsity:.2%}", f"{reconstruction_err:.4f}"],
    "Status": [
        "Good" if n_users >= THRESHOLDS["min_users"] else "Low Data",
        "Good" if n_movies >= THRESHOLDS["min_movies"] else "Low Data",
        "Good" if n_interactions >= THRESHOLDS["min_interactions"] else "Low Data",
        "Good" if sparsity < THRESHOLDS["max_sparsity"] else "Too Sparse",
        "N/A",  # Reconstruction error depends on scale, so it is not graded
    ],
}

# Tabulate and show the report
report_df = pd.DataFrame(stats)

print("=== MODEL HEALTH REPORT ===")
display(report_df)

# Overall verdict: healthy only if no metric was flagged
print("\n=== FINAL VERDICT ===")
is_flagged = any(label in stats["Status"] for label in ("Low Data", "Too Sparse"))
if not is_flagged:
    print("✅ MODEL IS HEALTHY")
    print("The system has enough data to find meaningful user patterns.")
else:
    print("⚠️  MODEL IS UNDER-TRAINED")
    print("Reason: Insufficient data. The model is currently relying heavily on content-based filtering (genres) and popularity.")
    print("Recommendation: Add more users and interactions to enable Collaborative Filtering.")