In [13]:
# Content-Based Movie Recommender System

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import os
import warnings
# NOTE(review): called without a category, this suppresses every warning
# raised after this point, not just a specific noisy one.
warnings.filterwarnings('ignore')

# Banner printed once when this cell starts executing.
print("CONTENT-BASED MOVIE RECOMMENDER SYSTEM")
print("="*50)

# =============================================================================
# CELL 1: Load and Explore Data
# =============================================================================

def find_movie_dataset():
    """Locate the movie CSV by probing a fixed list of relative paths.

    Prints the working directory and its sorted contents as a debugging aid,
    then checks each candidate in priority order.

    Returns:
        str | None: the first candidate path that exists, or None when none
        of them do.
    """
    # File names to look for; the engineered feature file is preferred.
    file_names = ('movie_features_engineered.csv', 'movies_clean.csv')
    # Locations in priority order: parent-relative first (the notebook is
    # expected to live in an algorithm/ folder), then the current directory.
    search_prefixes = ('../data/dataset/', '../', '', 'data/dataset/', './')
    candidates = [prefix + name for prefix in search_prefixes for name in file_names]

    print("Looking for movie dataset...")
    print("Current directory:", os.getcwd())
    print("Files in current directory:")
    entries = os.listdir('.')
    entries.sort()
    for entry in entries:
        print(f"  {entry}")

    located = next((candidate for candidate in candidates if os.path.exists(candidate)), None)
    if located is not None:
        print(f"Found dataset: {located}")
    return located

def load_movie_data():
    """Read the CSV located by find_movie_dataset() and print a short summary.

    Returns:
        pandas.DataFrame | None: the loaded frame, or None when no dataset
        file was found or when reading/summarising it raised an exception
        (the error is printed instead of being propagated).
    """
    csv_path = find_movie_dataset()
    if csv_path is None:
        print("ERROR: No movie dataset found!")
        return None

    try:
        frame = pd.read_csv(csv_path)

        print(f"Successfully loaded: {csv_path}")
        print(f"Dataset shape: {frame.shape}")

        print("\nDataset Info:")
        print(f"Movies: {len(frame):,}")
        print(f"Features: {len(frame.columns)}")

        # Report which of the columns used later in the notebook are present.
        print("\nKey columns available:")
        for name in ('id', 'title', 'weighted_rating', 'vote_count', 'popularity_score'):
            line = f"  ✓ {name}" if name in frame.columns else f"  ✗ {name} (missing)"
            print(line)
    except Exception as err:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"ERROR loading dataset: {err}")
        return None

    return frame

# Load the data
movies_df = load_movie_data()

# Stop here when loading failed. Everything below indexes into movies_df, so
# continuing with None would only surface later as an unrelated TypeError.
if movies_df is None:
    raise RuntimeError("Cannot proceed without dataset. Please check your files.")

print(f"\nDataset loaded successfully! Ready to build recommender.")

# =============================================================================
# CELL 2: Content-Based Recommender Class
# =============================================================================

class ContentBasedRecommender:
    """Holds a movie DataFrame and builds a standardized feature matrix from it.

    Attributes:
        movies_df: the DataFrame passed to the constructor (stored as-is).
        feature_matrix: None until prepare_features() has run.
        similarity_matrix: initialised to None; this class never assigns it,
            callers attach it from outside.
        scaler: the StandardScaler used by prepare_features().
        movie_id_to_idx: maps each value of the 'id' column to a row position
            (for repeated ids the last row wins).
        idx_to_movie_id: maps every row position to its 'id' value.
    """

    def __init__(self, movies_df):
        """Store the frame and build the id <-> row-position lookups.

        Args:
            movies_df: DataFrame with at least an 'id' column.
        """
        self.movies_df = movies_df
        self.feature_matrix = None
        self.similarity_matrix = None
        self.scaler = StandardScaler()

        # Create movie mappings
        self.movie_id_to_idx = {movie_id: idx for idx, movie_id in enumerate(movies_df['id'])}
        # Built straight from row order instead of inverting movie_id_to_idx,
        # so rows that share an id still get an entry.
        self.idx_to_movie_id = dict(enumerate(movies_df['id']))

    def prepare_features(self):
        """Build and store the scaled feature matrix.

        Every column except 'id' and 'title' is a candidate; only numeric and
        boolean columns are kept, because the scaler cannot handle text
        columns. Skipped columns are reported. Missing values are replaced
        with 0 before scaling.

        Returns:
            The array returned by the scaler's fit_transform, also stored on
            self.feature_matrix.
        """
        print("Preparing features for content-based filtering...")

        # Get all candidate feature columns (exclude id and title)
        candidate_cols = [col for col in self.movies_df.columns if col not in ['id', 'title']]

        # Keep only columns the scaler can consume.
        numeric_data = self.movies_df[candidate_cols].select_dtypes(include=['number', 'bool'])
        kept = set(numeric_data.columns)
        skipped = [col for col in candidate_cols if col not in kept]
        if skipped:
            print(f"Skipping {len(skipped)} non-numeric column(s): {', '.join(map(str, skipped))}")

        print(f"Total features available: {numeric_data.shape[1]}")

        # Fill missing values, then hand the scaler a plain float frame.
        feature_data = numeric_data.fillna(0).astype(float)

        # Scale the features
        self.feature_matrix = self.scaler.fit_transform(feature_data)

        print(f"Feature matrix shape: {self.feature_matrix.shape}")
        return self.feature_matrix

# Initialize recommender
# The constructor reads movies_df['id'], so movies_df must be a loaded
# DataFrame at this point.
recommender = ContentBasedRecommender(movies_df)

# Create feature matrix
# The scaled matrix is returned and also kept on recommender.feature_matrix.
feature_matrix = recommender.prepare_features()

# =============================================================================
# CELL 3: Compute Similarity Matrix
# =============================================================================

def compute_similarity_matrix(feature_matrix, method='cosine'):
    """Compute the pairwise similarity between all rows of feature_matrix.

    Args:
        feature_matrix: 2-D array-like with one row per movie.
        method: 'cosine' for cosine similarity, or 'euclidean' for
            1 / (1 + euclidean distance).

    Returns:
        The square similarity matrix produced by the chosen method.

    Raises:
        ValueError: if method is not 'cosine' or 'euclidean'.
    """
    # Validate first: an unknown method used to fall through both branches and
    # fail later with an UnboundLocalError.
    if method not in ('cosine', 'euclidean'):
        raise ValueError(
            f"Unknown similarity method '{method}'; expected 'cosine' or 'euclidean'"
        )

    print(f"\nComputing {method} similarity matrix...")
    print(f"Feature matrix shape: {feature_matrix.shape}")

    if method == 'cosine':
        similarity_matrix = cosine_similarity(feature_matrix)
    else:
        # Convert distances to similarities: distance 0 -> 1, larger -> closer to 0.
        distances = euclidean_distances(feature_matrix)
        similarity_matrix = 1 / (1 + distances)

    print(f"Similarity matrix computed: {similarity_matrix.shape}")

    # Show some statistics (computed over the whole matrix, diagonal included).
    print(f"   Average similarity: {similarity_matrix.mean():.4f}")
    print(f"   Max similarity: {similarity_matrix.max():.4f}")
    print(f"   Min similarity: {similarity_matrix.min():.4f}")

    return similarity_matrix

# Compute similarity matrix
# Uses the default method ('cosine'). The module-level similarity_matrix is
# the one read by the recommendation and analysis functions in this notebook;
# the same object is also attached to the recommender instance.
similarity_matrix = compute_similarity_matrix(feature_matrix)
recommender.similarity_matrix = similarity_matrix

# =============================================================================
# CELL 4: Recommendation Functions
# =============================================================================

def find_movie_index(title, movies_df):
    """Find the row position of a movie by title (case-insensitive).

    An exact title match is preferred; otherwise the first title containing
    `title` as a literal substring is used and reported on stdout.

    Args:
        title: title (or fragment of one) to look for.
        movies_df: DataFrame with a 'title' column.

    Returns:
        int | None: zero-based row position suitable for `.iloc` and for
        indexing the similarity matrix, or None when nothing matches.
    """
    needle = title.lower()
    lowered_titles = movies_df['title'].str.lower()

    # Exact match. Positions (not index labels) are returned because callers
    # use the result with .iloc and as a row of the similarity matrix.
    exact_mask = lowered_titles.eq(needle).fillna(False).to_numpy(dtype=bool)
    exact_positions = np.flatnonzero(exact_mask)
    if len(exact_positions) > 0:
        return int(exact_positions[0])

    # Partial match. regex=False so characters such as '(', '.', '?' or '+' in
    # a title are matched literally instead of being parsed as a pattern.
    partial_mask = (
        lowered_titles.str.contains(needle, na=False, regex=False)
        .fillna(False)
        .to_numpy(dtype=bool)
    )
    partial_positions = np.flatnonzero(partial_mask)
    if len(partial_positions) > 0:
        position = int(partial_positions[0])
        print(f"Found partial match: '{movies_df.iloc[position]['title']}'")
        return position

    return None

def get_movie_recommendations(movie_title, n_recommendations=10, min_rating=0, min_votes=0):
    """Get content-based recommendations for a movie.

    Reads the module-level `movies_df` and `similarity_matrix`.

    Args:
        movie_title: title (or fragment) resolved with find_movie_index().
        n_recommendations: maximum number of recommendations to return.
        min_rating: candidates rated below this are skipped. The rating comes
            from 'weighted_rating', falling back to 'vote_average', then 0.
        min_votes: candidates with a lower 'vote_count' are skipped.

    Returns:
        list[dict] | None: dicts with keys 'title', 'rating', 'votes',
        'genres' and 'similarity', most similar first; None when the movie
        is not found.
    """

    # Find movie index
    movie_idx = find_movie_index(movie_title, movies_df)
    if movie_idx is None:
        print(f"Movie '{movie_title}' not found!")
        return None

    # Get the movie details
    target_movie = movies_df.iloc[movie_idx]

    # Safely get movie details with available columns
    title = target_movie.get('title', 'Unknown')
    rating = target_movie.get('weighted_rating', target_movie.get('vote_average', 0))
    votes = target_movie.get('vote_count', 0)

    print(f"Finding movies similar to: '{title}'")
    print(f"   Rating: {rating:.1f} | Votes: {votes}")

    # Rank every other movie by similarity, best first. The target is removed
    # by index rather than by dropping the first sorted entry: when another
    # movie ties with the self-similarity score, the target is not guaranteed
    # to sort first and would otherwise be recommended to itself.
    sim_scores = [
        (idx, score)
        for idx, score in enumerate(similarity_matrix[movie_idx])
        if idx != movie_idx
    ]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # The genre flag columns are the same for every row, so find them once.
    genre_cols = [col for col in movies_df.columns if col.startswith('genre_')]

    # Apply filters and collect recommendations
    recommendations = []
    for idx, score in sim_scores:
        # Checked before appending so n_recommendations=0 yields an empty list.
        if len(recommendations) >= n_recommendations:
            break

        movie_data = movies_df.iloc[idx]

        # Get movie rating (try multiple column names)
        movie_rating = movie_data.get('weighted_rating', movie_data.get('vote_average', 0))
        movie_votes = movie_data.get('vote_count', 0)

        # Apply filters
        if movie_rating < min_rating:
            continue
        if movie_votes < min_votes:
            continue

        # Get genre information: readable names of the flags set to 1.
        active_genres = [col.replace('genre_', '').replace('_', ' ').title()
                         for col in genre_cols if movie_data.get(col, 0) == 1]
        genres = ', '.join(active_genres[:5])  # Show at most 5 genres

        recommendations.append({
            'title': movie_data.get('title', 'Unknown'),
            'rating': movie_rating,
            'votes': movie_votes,
            'genres': genres,
            'similarity': score
        })

    return recommendations

def display_recommendations(recommendations, title="Recommendations"):
    """Print a numbered list of movie dicts.

    Args:
        recommendations: list of dicts with 'title', 'rating' and 'votes';
            'genres' and 'similarity' are optional and only printed when
            present (genre listings carry no similarity score).
        title: heading printed above the list.

    Returns:
        None. Prints "No recommendations found!" for an empty or None input.
    """
    if not recommendations:
        print("No recommendations found!")
        return

    print(f"\n{title}")
    print("=" * 60)

    for i, rec in enumerate(recommendations, 1):
        print(f"\n{i}. {rec['title']}")
        print(f"   Rating: {rec['rating']:.1f} | Votes: {rec['votes']}")
        if rec.get('genres'):
            print(f"   Genres: {rec['genres']}")
        # Not every producer of these dicts computes a similarity score.
        if 'similarity' in rec:
            print(f"   Similarity: {rec['similarity']:.3f}")

# =============================================================================
# CELL 5: Example Recommendations
# =============================================================================

# Section banner for the worked examples below.
print("\n" + "="*70)
print("CONTENT-BASED MOVIE RECOMMENDATIONS")
print("="*70)

# Example 1: Get recommendations for Avatar
# Up to 5 results, each with rating >= 6.0 and at least 100 votes.
print("\nEXAMPLE 1: Movies similar to 'Avatar'")
avatar_recs = get_movie_recommendations(
    movie_title="Avatar", 
    n_recommendations=5,
    min_rating=6.0,
    min_votes=100
)

# The result is None when the title is not found and may be an empty list
# when every candidate is filtered out; both cases skip the display.
if avatar_recs:
    display_recommendations(avatar_recs, "Movies Similar to Avatar")

# Example 2: Get recommendations for The Dark Knight
# Stricter filters: rating >= 7.0 and at least 500 votes.
print("\nEXAMPLE 2: Movies similar to 'The Dark Knight'")
dark_knight_recs = get_movie_recommendations(
    movie_title="The Dark Knight", 
    n_recommendations=5,
    min_rating=7.0,
    min_votes=500
)

if dark_knight_recs:
    display_recommendations(dark_knight_recs, "Movies Similar to The Dark Knight")

# =============================================================================
# CELL 6: Genre-Based Recommendations
# =============================================================================

def find_movies_by_genre(genre_name, min_rating=7.0, n_movies=10):
    """List the best-rated movies flagged with a given genre.

    Reads the module-level `movies_df`. The genre is looked up as the column
    'genre_<name>' (lower-cased, spaces replaced by underscores); rows where
    that column equals 1 are kept, filtered to rating >= min_rating and
    sorted by rating, highest first.

    Args:
        genre_name: human-readable genre, e.g. 'Action'.
        min_rating: minimum value of the rating column to keep.
        n_movies: maximum number of movies returned.

    Returns:
        list[dict]: dicts with keys 'title', 'rating', 'votes' and 'genres'.
        Empty when the genre column does not exist (the available genres are
        printed in that case).
    """

    def _pretty(column_name):
        # 'genre_science_fiction' -> 'Science Fiction'
        return column_name.replace('genre_', '').replace('_', ' ').title()

    wanted_col = "genre_" + genre_name.lower().replace(' ', '_')

    if wanted_col not in movies_df.columns:
        print(f"Genre '{genre_name}' not found in dataset")
        known = [_pretty(c) for c in movies_df.columns if c.startswith('genre_')]
        print(f"Available genres: {', '.join(known)}")
        return []

    # Rows flagged with the requested genre.
    selected = movies_df[movies_df[wanted_col] == 1].copy()

    # Prefer the weighted rating; otherwise fall back to the raw average.
    rating_col = 'vote_average'
    if 'weighted_rating' in movies_df.columns:
        rating_col = 'weighted_rating'

    if rating_col in selected.columns:
        above_threshold = selected[rating_col] >= min_rating
        selected = selected[above_threshold].sort_values(rating_col, ascending=False)

    results = []
    for _, row in selected.head(n_movies).iterrows():
        # Readable names of every genre flag set on this movie (at most 5 shown).
        flagged = [_pretty(c) for c in row.index
                   if c.startswith('genre_') and row.get(c, 0) == 1]
        results.append({
            'title': row.get('title', 'Unknown'),
            'rating': row.get(rating_col, 0),
            'votes': row.get('vote_count', 0),
            'genres': ', '.join(flagged[:5])
        })

    return results

# Example: Find top Action movies
# The filter applied by find_movies_by_genre is rating >= 7.0. The returned
# dicts carry title/rating/votes/genres only (no similarity score).
print("\nEXAMPLE 3: Top Action Movies (Rating > 7.0)")
action_movies = find_movies_by_genre(
    genre_name='Action',
    min_rating=7.0,
    n_movies=8
)

# An empty list (unknown genre or nothing above the threshold) skips the display.
if action_movies:
    display_recommendations(action_movies, "Top Action Movies")

# =============================================================================
# CELL 7: Interactive Recommendation Function
# =============================================================================

def interactive_recommender():
    """Prompt for movie titles in a loop and print recommendations for each.

    Blank input is ignored. Typing 'quit', 'exit' or 'q' (any case) ends the
    loop. Each lookup asks for up to 5 recommendations with rating >= 5.0 and
    at least 50 votes.
    """
    print("\nINTERACTIVE MOVIE RECOMMENDER")
    print("-" * 40)
    print("Enter a movie title to get recommendations:")
    print("(Or type 'quit' to exit)")

    exit_words = ('quit', 'exit', 'q')

    query = input("\nMovie title: ").strip()
    while query.lower() not in exit_words:
        if query:
            # Get recommendations
            found = get_movie_recommendations(
                movie_title=query,
                n_recommendations=5,
                min_rating=5.0,
                min_votes=50
            )
            if found:
                display_recommendations(found)

            # Separator after every lookup, whether or not anything was shown.
            print("\n" + "-" * 50)

        query = input("\nMovie title: ").strip()

    print("Thanks for using the recommender!")

# Uncomment the line below to run the interactive recommender
# interactive_recommender()

# =============================================================================
# CELL 8: Analysis and Utilities
# =============================================================================

def find_most_similar_pairs(n_pairs=10):
    """Print the n_pairs movie pairs with the highest similarity score.

    Reads the module-level `similarity_matrix` and `movies_df`. Only entries
    above the diagonal are considered, so a movie is never paired with itself
    and each pair is listed once.

    Args:
        n_pairs: number of pairs to print; zero or negative prints only the
            header.
    """
    print(f"\nTOP {n_pairs} MOST SIMILAR MOVIE PAIRS")
    print("="*50)

    # Guard: with n_pairs == 0 a "last n" slice ([-0:]) would select every pair.
    if n_pairs <= 0:
        return

    # Get indices of upper triangle (k=1 leaves out the diagonal)
    mask = np.triu(np.ones_like(similarity_matrix, dtype=bool), k=1)
    indices = np.where(mask)
    similarities = similarity_matrix[mask]

    # Get top similar pairs: descending order, then keep the first n_pairs.
    top_indices = np.argsort(similarities)[::-1][:n_pairs]

    for rank, idx in enumerate(top_indices, 1):
        i, j = indices[0][idx], indices[1][idx]
        similarity = similarities[idx]

        movie1 = movies_df.iloc[i]
        movie2 = movies_df.iloc[j]

        print(f"\n{rank}. Similarity: {similarity:.4f}")
        print(f"   {movie1['title']}")
        print(f"   {movie2['title']}")

def show_available_genres():
    """Print a numbered, alphabetically sorted list of genre names.

    Names are derived from the `movies_df` columns starting with 'genre_':
    the prefix is removed, underscores become spaces and the result is
    title-cased.
    """
    labels = sorted(
        name.replace('genre_', '').replace('_', ' ').title()
        for name in movies_df.columns
        if name.startswith('genre_')
    )

    print("\nAVAILABLE GENRES:")
    print("="*30)
    for position, label in enumerate(labels, 1):
        print(f"{position:2d}. {label}")

def show_top_actors():
    """Print a numbered, alphabetically sorted list of actor names.

    Only actors that have their own 'actor_' column in `movies_df` are
    listed. The prefix is removed, underscores become spaces and the result
    is title-cased.
    """
    names = sorted(
        column.replace('actor_', '').replace('_', ' ').title()
        for column in movies_df.columns
        if column.startswith('actor_')
    )

    print("\nTOP ACTORS IN DATASET:")
    print("="*30)
    for position, name in enumerate(names, 1):
        print(f"{position:2d}. {name}")

# Show what's available
# Both listings are derived from column-name prefixes in movies_df
# ('genre_' and 'actor_').
show_available_genres()
show_top_actors()

# Find most similar pairs
# Prints the 5 highest-scoring distinct pairs from similarity_matrix.
find_most_similar_pairs(5)

# Closing rule for the notebook output.
print("\n" + "="*70)


CONTENT-BASED MOVIE RECOMMENDER SYSTEM
Looking for movie dataset...
Current directory: C:\Users\User\Downloads\goodmovie-main\algorithm
Files in current directory:
  .ipynb_checkpoints
  content_based_movie.ipynb
Found dataset: ../data/dataset/movie_features_engineered.csv
Successfully loaded: ../data/dataset/movie_features_engineered.csv
Dataset shape: (3022, 179)

Dataset Info:
Movies: 3,022
Features: 179

Key columns available:
  ✓ id
  ✓ title
  ✓ weighted_rating
  ✓ vote_count
  ✓ popularity_score

Dataset loaded successfully! Ready to build recommender.
Preparing features for content-based filtering...
Total features available: 177
Feature matrix shape: (3022, 177)

Computing cosine similarity matrix...
Feature matrix shape: (3022, 177)
Similarity matrix computed: (3022, 3022)
   Average similarity: 0.0005
   Max similarity: 1.0000
   Min similarity: -0.3100

CONTENT-BASED MOVIE RECOMMENDATIONS

EXAMPLE 1: Movies similar to 'Avatar'
Finding movies similar to: 'Avatar'
   Rating: 

In [14]:
# Re-print the actor list on its own; requires the cell above to have run.
show_top_actors()


TOP ACTORS IN DATASET:
 1. Adam Sandler
 2. Arnold Schwarzenegger
 3. Ben Stiller
 4. Brad Pitt
 5. Bruce Willis
 6. Cameron Diaz
 7. Denzel Washington
 8. Eddie Murphy
 9. George Clooney
10. Johnny Depp
11. Julia Roberts
12. Mark Wahlberg
13. Matt Damon
14. Morgan Freeman
15. Nicolas Cage
16. Owen Wilson
17. Robert De Niro
18. Samuel L. Jackson
19. Tom Cruise
20. Tom Hanks


In [None]:
# Re-print the genre list on its own; requires the main cell to have run.
show_available_genres()