# Content-Based Filtering Model Development

This notebook implements and evaluates content-based recommendation algorithms using movie features like genres, titles, and metadata.

## 1. Setup and Data Loading

In [None]:
import re
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer

warnings.filterwarnings('ignore')

# Import our custom modules (the parent directory must be on sys.path first)
sys.path.append('../')
from src.data_loader import MovieDataLoader
from src.recommenders.content_based import ContentBasedRecommender
from src.utils import parse_genres, extract_year_from_title, get_unique_genres

In [None]:
# Pull the three frames returned by the project's loader into the kernel.
data_loader = MovieDataLoader()
movies_df, ratings_df, links_df = data_loader.load_data()

# Size summary so the reader can sanity-check what was loaded.
print(
    "Dataset loaded:",
    f"Movies: {len(movies_df)}",
    f"Ratings: {len(ratings_df)}",
    f"Users: {ratings_df['userId'].nunique()}",
    sep="\n",
)

## 2. Content Feature Analysis

In [None]:
# Report how many rows actually carry each raw content column.
n_movies = len(movies_df)
print("Available content features:")
print(f"1. Movie titles: {movies_df['title'].notna().sum()} / {n_movies}")
print(f"2. Genres: {movies_df['genres'].notna().sum()} / {n_movies}")

# Derived columns go on a copy so movies_df itself is left as loaded.
movies_enhanced = movies_df.copy()

# extract_year_from_title produces one pair per title: element 0 is kept as
# the clean title, element 1 as the year.
title_year_data = movies_enhanced['title'].apply(extract_year_from_title)
movies_enhanced['clean_title'] = title_year_data.map(lambda pair: pair[0])
movies_enhanced['year'] = title_year_data.map(lambda pair: pair[1])

# One parsed genre collection per movie, as returned by parse_genres.
movies_enhanced['genre_list'] = movies_enhanced['genres'].apply(parse_genres)

print("\nEnhanced features:")
print(f"3. Release years: {movies_enhanced['year'].notna().sum()} / {len(movies_enhanced)}")
print(f"4. Clean titles: {movies_enhanced['clean_title'].notna().sum()} / {len(movies_enhanced)}")

preview_columns = ['title', 'clean_title', 'year', 'genres', 'genre_list']
display(movies_enhanced[preview_columns].head())

## 3. Genre-Based Content Analysis

In [None]:
# List the distinct genres reported by the project helper.
all_genres = get_unique_genres(movies_df)
print(f"Total unique genres: {len(all_genres)}")
print(f"Genres: {', '.join(all_genres)}")

# One-hot encode the per-movie genre lists: one row per movie, one column per genre.
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(movies_enhanced['genre_list'])
genre_df = pd.DataFrame(genre_matrix, columns=mlb.classes_, index=movies_enhanced.index)

print(f"\nGenre matrix shape: {genre_df.shape}")
print(f"Average genres per movie: {genre_df.sum(axis=1).mean():.2f}")

# Correlation between genre indicator columns, drawn as a lower-triangle heatmap
# (the mask hides the upper triangle and the diagonal).
genre_corr = genre_df.corr()
mask = np.triu(np.ones_like(genre_corr, dtype=bool))

plt.figure(figsize=(12, 10))
sns.heatmap(
    genre_corr,
    mask=mask,
    annot=False,
    cmap='coolwarm',
    center=0,
    square=True,
    fmt='.2f',
    cbar_kws={"shrink": .8},
)
plt.title('Genre Co-occurrence Correlation Matrix')
plt.tight_layout()
plt.show()

# Count how often each exact multi-genre set occurs. Sorting the genres makes
# the tuple key independent of the order in which a movie lists them.
genre_combinations = {}
for genre_list in movies_enhanced['genre_list']:
    combo_key = tuple(sorted(genre_list))
    if len(combo_key) < 2:
        # Single-genre (or genre-less) movies are not combinations.
        continue
    genre_combinations[combo_key] = genre_combinations.get(combo_key, 0) + 1

# Stable sort by descending count keeps first-seen order among ties.
top_combinations = sorted(genre_combinations.items(), key=lambda item: -item[1])[:10]

print("\nTop 10 Genre Combinations:")
for combo, count in top_combinations:
    print(f"{' + '.join(combo)}: {count} movies")

## 4. TF-IDF Feature Engineering

In [None]:
# Prepare content features for TF-IDF
def prepare_content_features(movies_df, use_genres=True, use_titles=True, genre_weight=3):
    """Build one whitespace-joined feature string per movie for TF-IDF.

    Args:
        movies_df: DataFrame of movies. The 'genres' column is read only when
            ``use_genres`` is true and the 'title' column only when
            ``use_titles`` is true.
        use_genres: Include the movie's parsed genres.
        use_titles: Include the lower-cased title words.
        genre_weight: How many times the genre tokens are repeated so that they
            carry more term frequency than the title words.

    Returns:
        list[str]: One string per row of ``movies_df``, in row order. A row
        that contributes no features yields an empty string.
    """
    # Loop-invariant helpers, built once instead of once per movie.
    stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of'}
    # Matches only a trailing "(YYYY)" or "(YYYY-YYYY)" / "(YYYY–YYYY)" suffix.
    # Cutting at the last '(' regardless of its content would discard real
    # title text, e.g. "(500) Days of Summer" would become an empty title.
    year_suffix = re.compile(r'\s*\(\d{4}(?:\s*[-–]\s*\d{4})?\)\s*$')

    content_features = []

    for _, movie in movies_df.iterrows():
        features = []

        # Genres, repeated to give them more weight than title words.
        if use_genres and pd.notna(movie['genres']):
            genres = parse_genres(movie['genres'])
            features.extend(genres * genre_weight)

        # Title words with the release-year suffix removed.
        if use_titles and pd.notna(movie['title']):
            clean_title = year_suffix.sub('', movie['title']).strip()

            # Hyphens become spaces so "Spider-Man" contributes two words.
            title_words = clean_title.lower().replace('-', ' ').split()
            # Drop common filler words and anything of two characters or fewer.
            features.extend(
                word for word in title_words
                if word not in stop_words and len(word) > 2
            )

        content_features.append(' '.join(features))

    return content_features

# Three feature sets to compare: genres alone, titles alone, and both together.
content_configs = [
    {'use_genres': True, 'use_titles': False, 'name': 'Genres Only'},
    {'use_genres': False, 'use_titles': True, 'name': 'Titles Only'},
    {'use_genres': True, 'use_titles': True, 'name': 'Genres + Titles'}
]

# Keyed by configuration name; each entry keeps the fitted vectorizer, the
# TF-IDF matrix, the pairwise similarity matrix and the raw feature strings.
tfidf_results = {}

for config in content_configs:
    config_name = config['name']
    print(f"\nProcessing: {config_name}")

    # Build the per-movie feature strings for this configuration.
    content_features = prepare_content_features(
        movies_enhanced,
        use_genres=config['use_genres'],
        use_titles=config['use_titles'],
    )

    # Unigrams and bigrams, English stop words removed, vocabulary capped at
    # 1000 terms that occur in at least 2 documents and at most 80% of them.
    tfidf = TfidfVectorizer(
        stop_words='english',
        ngram_range=(1, 2),
        min_df=2,
        max_df=0.8,
        max_features=1000,
    )
    tfidf_matrix = tfidf.fit_transform(content_features)

    leading_features = tfidf.get_feature_names_out()[:10]
    print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
    print(f"Feature names (first 10): {list(leading_features)}")

    # Pairwise cosine similarity between every pair of movies.
    cosine_sim = cosine_similarity(tfidf_matrix)

    tfidf_results[config_name] = dict(
        vectorizer=tfidf,
        matrix=tfidf_matrix,
        similarity=cosine_sim,
        features=content_features,
    )

    # Summarise each unordered movie pair once: strictly-upper triangle only,
    # so the diagonal of self-similarities is left out.
    upper_tri_indices = np.triu_indices_from(cosine_sim, k=1)
    similarity_values = cosine_sim[upper_tri_indices]

    similarity_summary = (
        ("Mean", similarity_values.mean()),
        ("Std", similarity_values.std()),
        ("Max", similarity_values.max()),
        ("95th percentile", np.percentile(similarity_values, 95)),
    )
    print("Similarity statistics:")
    for stat_label, stat_value in similarity_summary:
        print(f"  {stat_label}: {stat_value:.4f}")

## 5. Content-Based Recommender Implementation

In [None]:
# Create the project's recommender and fit it with both genre and title
# features switched on.
content_recommender = ContentBasedRecommender()

print("Training content-based recommender...")
content_recommender.fit(ratings_df, movies_df, use_genres=True, use_titles=True)

# Confirm the fit by reporting the shapes of the matrices it exposes.
print(
    "Model trained successfully!",
    f"Feature matrix shape: {content_recommender.feature_matrix.shape}",
    f"Similarity matrix shape: {content_recommender.similarity_matrix.shape}",
    sep="\n",
)

## 6. Movie-to-Movie Similarity Analysis

In [None]:
# Test movie-to-movie similarity
def analyze_movie_similarity(movie_title, n_similar=10):
    """Print the movies the content-based recommender ranks closest to one title.

    The title is looked up as a case-insensitive, literal substring of
    ``movies_df['title']``; when several titles match, the first row is used.

    Args:
        movie_title: Text to search for in the movie titles.
        n_similar: Number of similar movies to request from the recommender.

    Returns:
        None. All results are printed. The function returns early when no title
        matches or when the recommender returns nothing.
    """
    # regex=False makes the query plain text. With the default regex matching,
    # characters such as '(', '*' or '+' in a title are interpreted as pattern
    # syntax and either match the wrong rows or raise.
    movie_match = movies_df[
        movies_df['title'].str.contains(movie_title, case=False, na=False, regex=False)
    ]

    if movie_match.empty:
        print(f"Movie '{movie_title}' not found")
        return

    first_match = movie_match.iloc[0]
    movie_id = first_match['movieId']
    actual_title = first_match['title']
    movie_genres = first_match['genres']

    print(f"\nAnalyzing similarity for: {actual_title}")
    print(f"Genres: {movie_genres}")

    # Item-to-item mode: the recommender is queried with a movie id.
    similar_movies = content_recommender.recommend(
        movie_id=movie_id,
        n_recommendations=n_similar
    )

    if not similar_movies:
        print("No similar movies found")
        return

    print(f"\nTop {n_similar} Similar Movies:")
    print("-" * 80)

    for i, movie in enumerate(similar_movies, 1):
        similarity_score = movie['score']
        title = movie['title']
        genres = movie['genres']
        # "or 0" also covers a key that is present but set to None, which
        # would otherwise fail in the "> 0" comparison below.
        avg_rating = movie.get('avg_rating', 0) or 0
        rating_count = movie.get('rating_count', 0) or 0

        print(f"{i:2d}. {title}")
        print(f"    Similarity: {similarity_score:.3f} | Genres: {genres}")
        if avg_rating > 0:
            print(f"    Rating: {avg_rating:.1f}/5.0 ({rating_count} ratings)")
        print()

# Spot-check the similarity output on three example titles. Each string is
# handed to analyze_movie_similarity, which searches movies_df['title']
# case-insensitively and uses the first matching row.
test_movies = [
    "Toy Story",
    "Matrix",
    "Pulp Fiction"
]

# Ask for five similar movies per title (the function's default is 10).
for test_movie in test_movies:
    analyze_movie_similarity(test_movie, n_similar=5)

## 7. User-Based Content Recommendations

In [None]:
# Test user-based content recommendations
def analyze_user_recommendations(user_id, n_recommendations=10):
    """Print a user's rating profile followed by their content-based recommendations.

    The profile covers the rating count, mean and distribution, the user's
    five best-rated genres (genres with at least two ratings) and up to five
    movies rated 4 or higher. The recommendations come from
    ``content_recommender.recommend(user_id=...)``.

    Args:
        user_id: Value matched against ``ratings_df['userId']``.
        n_recommendations: Number of recommendations to request.

    Returns:
        None. All results are printed. The function returns early when the user
        has no ratings or when no recommendations come back.
    """
    user_ratings = ratings_df[ratings_df['userId'] == user_id]
    if user_ratings.empty:
        print(f"User {user_id} has no ratings")
        return

    # Attach title/genre information to each of the user's ratings.
    user_movies = user_ratings.merge(movies_df, on='movieId')

    # --- Profile header -----------------------------------------------------
    rating_histogram = dict(user_ratings['rating'].value_counts().sort_index())
    print(f"\nUser {user_id} Profile:")
    print(f"Total ratings: {len(user_ratings)}")
    print(f"Average rating: {user_ratings['rating'].mean():.2f}")
    print(f"Rating distribution: {rating_histogram}")

    # --- Genre preferences --------------------------------------------------
    # Gather every rating the user gave under each genre of the rated movie.
    user_genre_prefs = {}
    for genre_string, rating in zip(user_movies['genres'], user_movies['rating']):
        for genre in parse_genres(genre_string):
            user_genre_prefs.setdefault(genre, []).append(rating)

    # Average per genre, ignoring genres backed by a single rating.
    genre_avg_ratings = {}
    for genre, ratings in user_genre_prefs.items():
        if len(ratings) >= 2:
            genre_avg_ratings[genre] = np.mean(ratings)

    if genre_avg_ratings:
        sorted_genres = sorted(genre_avg_ratings.items(), key=lambda item: item[1], reverse=True)
        print("\nGenre Preferences (avg rating):")
        for genre, avg_rating in sorted_genres[:5]:
            print(f"  {genre}: {avg_rating:.2f} ({len(user_genre_prefs[genre])} movies)")

    # --- Favourite movies ---------------------------------------------------
    high_rated = user_movies[user_movies['rating'] >= 4].sort_values('rating', ascending=False)
    favourites = high_rated.head(5)
    print("\nHighly Rated Movies (4+ stars):")
    for rating, title, genres in zip(favourites['rating'], favourites['title'], favourites['genres']):
        print(f"  ⭐ {rating:.1f} - {title} [{genres}]")

    # --- Recommendations ----------------------------------------------------
    recommendations = content_recommender.recommend(
        user_id=user_id,
        n_recommendations=n_recommendations
    )
    if not recommendations:
        print("\nNo recommendations available")
        return

    print("\nContent-Based Recommendations:")
    print("-" * 80)

    for rank, rec in enumerate(recommendations, 1):
        print(f"{rank:2d}. {rec['title']}")
        print(f"    Score: {rec['score']:.3f} | Genres: {rec['genres']}")

        # Optional fields: printed only when present and meaningful.
        avg_rating = rec.get('avg_rating', 0)
        if avg_rating > 0:
            print(f"    Avg Rating: {avg_rating:.1f}/5.0")

        explanation = rec.get('explanation', '')
        if explanation:
            print(f"    Why: {explanation}")
        print()

# Spot-check the user-level output for three user ids. A user id with no rows
# in ratings_df is reported by analyze_user_recommendations and skipped.
test_users = [1, 5, 10]

# Ask for five recommendations per user (the function's default is 10).
for user_id in test_users:
    analyze_user_recommendations(user_id, n_recommendations=5)

## 8. Feature Importance Analysis

In [None]:
# Analyze feature importance for specific movies
def analyze_feature_importance(movie_title, top_n=10):
    """Print the most important content features the recommender reports for a movie.

    The title is looked up as a case-insensitive, literal substring of
    ``movies_df['title']``; when several titles match, the first row is used.

    Args:
        movie_title: Text to search for in the movie titles.
        top_n: Number of features to request from
            ``content_recommender.get_feature_importance``.

    Returns:
        None. All results are printed. The function returns early when no title
        matches or when no feature importance is available.
    """
    # regex=False makes the query plain text. With the default regex matching,
    # characters such as '(', '*' or '+' in a title are interpreted as pattern
    # syntax and either match the wrong rows or raise.
    movie_match = movies_df[
        movies_df['title'].str.contains(movie_title, case=False, na=False, regex=False)
    ]

    if movie_match.empty:
        print(f"Movie '{movie_title}' not found")
        return

    first_match = movie_match.iloc[0]
    movie_id = first_match['movieId']
    actual_title = first_match['title']

    print(f"\nFeature Importance for: {actual_title}")

    # Consumed below as an iterable of (feature, importance) pairs.
    feature_importance = content_recommender.get_feature_importance(movie_id, top_n=top_n)

    if not feature_importance:
        print("No feature importance available")
        return

    print(f"\nTop {top_n} Important Features:")
    print("-" * 40)

    for i, (feature, importance) in enumerate(feature_importance, 1):
        print(f"{i:2d}. {feature}: {importance:.4f}")

# Inspect feature importance for three example titles. Note that this
# reassigns the kernel-global test_movies (same three titles as the
# similarity spot-check cell).
test_movies = ["Toy Story", "Matrix", "Pulp Fiction"]

# Show the top eight features per movie (the function's default is 10).
for movie in test_movies:
    analyze_feature_importance(movie, top_n=8)

## 9. Content-Based Model Evaluation

In [None]:
# Evaluation metrics for content-based recommendations
def evaluate_content_recommendations(sample_users=10, n_recommendations=10, random_state=42):
    """Compute coverage, diversity and genre consistency for sampled users.

    Users with at least 10 ratings are eligible. When more than
    ``sample_users`` are eligible, a random sample is drawn without
    replacement; otherwise all eligible users are used.

    Metrics:
        * coverage: distinct recommended movie ids divided by the number of
          distinct movie ids in ``movies_df``.
        * diversity: per user, the mean of ``1 - Jaccard`` over all pairs of
          recommended movies' genre sets (pairs where both movies have no
          genres are skipped).
        * genre_consistency: per user, the Jaccard overlap between the genres
          of movies the user rated 4 or higher and the genres of the
          recommended movies.

    Args:
        sample_users: Maximum number of users to evaluate.
        n_recommendations: Number of recommendations requested per user.
        random_state: Seed for the user sample so repeated runs evaluate the
            same users. Pass ``None`` for a fresh random sample on every call.

    Returns:
        dict with keys 'coverage' (float), 'diversity' (list of per-user
        floats) and 'genre_consistency' (list of per-user floats).
    """
    # Restrict to users with enough history to have meaningful preferences.
    user_rating_counts = ratings_df.groupby('userId').size()
    active_users = user_rating_counts[user_rating_counts >= 10].index

    if len(active_users) > sample_users:
        # A dedicated, seeded generator keeps the evaluation reproducible
        # without touching NumPy's global random state.
        rng = np.random.default_rng(random_state)
        test_users = rng.choice(active_users, sample_users, replace=False)
    else:
        test_users = active_users[:sample_users]

    results = {
        'coverage': 0.0,  # scalar from the start, matching the value stored below
        'diversity': [],
        'genre_consistency': []
    }

    print(f"Evaluating content-based recommendations for {len(test_users)} users...")

    all_movie_ids = set(movies_df['movieId'])
    recommended_movies = set()

    for user_id in test_users:
        user_ratings = ratings_df[ratings_df['userId'] == user_id]
        user_movies = user_ratings.merge(movies_df, on='movieId')

        # Genres of every movie this user rated 4 or higher.
        user_genres = set()
        for genre_string in user_movies.loc[user_movies['rating'] >= 4, 'genres']:
            user_genres.update(parse_genres(genre_string))

        recommendations = content_recommender.recommend(
            user_id=user_id,
            n_recommendations=n_recommendations
        )

        if not recommendations:
            continue

        recommended_movies.update(rec['movieId'] for rec in recommendations)

        # Parse each recommendation's genres once; the pairwise loop below
        # would otherwise re-parse the same strings for every pair.
        rec_genre_sets = [set(parse_genres(rec.get('genres', ''))) for rec in recommendations]
        rec_genres = set().union(*rec_genre_sets)

        # Genre consistency: Jaccard overlap of liked vs. recommended genres.
        if user_genres and rec_genres:
            genre_overlap = len(user_genres & rec_genres) / len(user_genres | rec_genres)
            results['genre_consistency'].append(genre_overlap)

        # Diversity: average pairwise genre distance within the list.
        if len(rec_genre_sets) > 1:
            diversities = []
            for i in range(len(rec_genre_sets)):
                for j in range(i + 1, len(rec_genre_sets)):
                    combined = rec_genre_sets[i] | rec_genre_sets[j]
                    if not combined:
                        # Neither movie has genres: no distance can be defined.
                        continue
                    jaccard_sim = len(rec_genre_sets[i] & rec_genre_sets[j]) / len(combined)
                    diversities.append(1 - jaccard_sim)

            if diversities:
                results['diversity'].append(np.mean(diversities))

    # Guard against an empty catalog instead of dividing by zero.
    catalog_coverage = len(recommended_movies) / len(all_movie_ids) if all_movie_ids else 0.0
    results['coverage'] = catalog_coverage

    print(f"\nContent-Based Recommendation Evaluation Results:")
    print("-" * 50)
    print(f"Catalog Coverage: {catalog_coverage:.3f} ({len(recommended_movies)} / {len(all_movie_ids)} movies)")

    if results['diversity']:
        avg_diversity = np.mean(results['diversity'])
        print(f"Average Diversity: {avg_diversity:.3f} (higher is better)")

    if results['genre_consistency']:
        avg_consistency = np.mean(results['genre_consistency'])
        print(f"Genre Consistency: {avg_consistency:.3f} (higher is better)")

    return results

# Evaluate on up to 15 users (the function only considers users with at least
# 10 ratings), requesting 10 recommendations per user. The returned dict is
# read again by the summary cell at the end of the notebook.
evaluation_results = evaluate_content_recommendations(sample_users=15, n_recommendations=10)

## 10. Comparison of Different Content Features

In [None]:
# Compare different feature combinations
def compare_content_features():
    """Fit one recommender per feature configuration and compare them on 'Toy Story'.

    For each of three configurations (genres only, titles only, both) a fresh
    ``ContentBasedRecommender`` is fitted, its five most similar movies to the
    first title containing 'Toy Story' are printed, and the mean of their
    similarity scores is recorded. The recorded means are then drawn as a bar
    chart.

    Returns:
        dict mapping configuration name to the average similarity score. A
        configuration that yields no similar movies is left out, and the dict
        is empty when no 'Toy Story' title exists in ``movies_df``.
    """
    configurations = [
        {'use_genres': True, 'use_titles': False, 'name': 'Genres Only'},
        {'use_genres': False, 'use_titles': True, 'name': 'Titles Only'},
        {'use_genres': True, 'use_titles': True, 'name': 'Genres + Titles'}
    ]

    results = {}

    # na=False keeps missing titles out of the boolean mask (a NaN in the mask
    # makes the row selection raise); regex=False matches the text literally.
    toy_story_match = movies_df[
        movies_df['title'].str.contains('Toy Story', case=False, na=False, regex=False)
    ]
    if toy_story_match.empty:
        # Without a reference movie there is nothing to compare; report it
        # instead of failing on .iloc[0].
        print("Movie 'Toy Story' not found")
        return results
    test_movie_id = toy_story_match.iloc[0]['movieId']

    print("Comparing different content feature configurations...\n")

    for config in configurations:
        print(f"Testing: {config['name']}")
        print("-" * 30)

        # Fresh recommender per configuration so the fits do not interact.
        recommender = ContentBasedRecommender()
        recommender.fit(
            ratings_df,
            movies_df,
            use_genres=config['use_genres'],
            use_titles=config['use_titles']
        )

        similar_movies = recommender.recommend(
            movie_id=test_movie_id,
            n_recommendations=5
        )

        print(f"Similar to 'Toy Story':")
        if not similar_movies:
            # Check before slicing: an empty or None result has nothing to
            # list or average for this configuration.
            print("  No similar movies found")
            print()
            continue

        for i, movie in enumerate(similar_movies[:5], 1):
            print(f"  {i}. {movie['title']} (sim: {movie['score']:.3f})")

        avg_similarity = np.mean([m['score'] for m in similar_movies])
        print(f"Average similarity: {avg_similarity:.3f}")
        results[config['name']] = avg_similarity

        print()

    # Bar chart of the average similarity per configuration.
    if results:
        methods = list(results.keys())
        scores = list(results.values())

        plt.figure(figsize=(10, 6))
        bars = plt.bar(methods, scores, color=['skyblue', 'lightcoral', 'lightgreen'])
        plt.title('Content-Based Feature Comparison\n(Average Similarity Scores for "Toy Story")')
        plt.ylabel('Average Similarity Score')
        plt.ylim(0, max(scores) * 1.1)

        # Value label just above each bar.
        for bar, score in zip(bars, scores):
            plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                    f'{score:.3f}', ha='center', va='bottom', fontweight='bold')

        plt.tight_layout()
        plt.show()

    return results

# Fit and compare the three feature configurations. The returned
# {configuration name: average similarity} dict is read by the summary cell.
feature_comparison = compare_content_features()

## 11. Key Findings and Recommendations

In [None]:
# Final report: prints headline numbers computed earlier in the notebook
# (feature_comparison, evaluation_results) alongside fixed narrative text.
summary_rule = "=" * 60

print(
    "CONTENT-BASED FILTERING ANALYSIS SUMMARY",
    summary_rule,
    "\n🎯 KEY FINDINGS:",
    sep="\n",
)

# Dataset characteristics
n_genres = len(get_unique_genres(movies_df))
avg_genres_per_movie = movies_enhanced['genre_list'].map(len).mean()

for summary_line in (
    "\n📊 Dataset Characteristics:",
    f"   • {n_genres} unique genres available for content analysis",
    f"   • Average {avg_genres_per_movie:.1f} genres per movie",
    "   • Rich genre information enables effective content-based filtering",
):
    print(summary_line)

# Feature effectiveness: the configuration with the highest average similarity
# (the first one wins on ties).
if feature_comparison:
    best_config, best_score = max(feature_comparison.items(), key=lambda item: item[1])

    for summary_line in (
        "\n🔍 Feature Effectiveness:",
        f"   • Best configuration: {best_config} (score: {best_score:.3f})",
        "   • Genre information provides strong content signals",
        "   • Title words add contextual information",
        "   • Combined features offer balanced recommendations",
    ):
        print(summary_line)

# Model performance: per-user metric lists are averaged only when non-empty.
if evaluation_results:
    print("\n📈 Model Performance:")
    print(f"   • Catalog coverage: {evaluation_results['coverage']:.1%}")

    diversity_scores = evaluation_results['diversity']
    if diversity_scores:
        avg_div = np.mean(diversity_scores)
        print(f"   • Average recommendation diversity: {avg_div:.3f}")

    consistency_scores = evaluation_results['genre_consistency']
    if consistency_scores:
        avg_cons = np.mean(consistency_scores)
        print(f"   • Genre consistency with user preferences: {avg_cons:.3f}")

for summary_line in (
    "\n💡 RECOMMENDATIONS:",
    "   • Use combined genre + title features for balanced recommendations",
    "   • Weight genres more heavily as they provide stronger content signals",
    "   • Content-based filtering works well for users with clear genre preferences",
    "   • Combine with collaborative filtering to address content limitations",
    "   • Consider movie metadata (year, director, cast) for enhanced content features",
):
    print(summary_line)

for summary_line in (
    "\n🔄 NEXT STEPS:",
    "   • Implement collaborative filtering for comparison",
    "   • Develop hybrid approach combining content and collaborative methods",
    "   • Evaluate recommendations using offline metrics (precision, recall)",
    "   • Test with larger datasets and more diverse content features",
):
    print(summary_line)

print(f"\n{summary_rule}")
print("✅ Content-based filtering analysis complete!")