# Movie Recommendation System
## Content-Based Filtering using TMDB Movie Metadata

This notebook implements a movie recommendation system that suggests movies based on content similarity using genres, overview, keywords, and cast information.

## 1. Import Libraries

In [None]:
import ast
import json
import re
import warnings

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
warnings.filterwarnings('ignore')

## 2. Load Dataset

In [None]:
# Load the TMDB movie metadata
# Download from: https://www.kaggle.com/datasets/tmdb/tmdb-movie-metadata
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

# Merge on the TMDB movie id rather than on the title: titles are not unique,
# so a title join pairs every movie with the credits of each same-titled movie
# and produces duplicate rows. Only the credits columns that are actually used
# are joined, which also keeps a single, unsuffixed 'title' column.
movies = movies.merge(
    credits[['movie_id', 'cast']],
    left_on='id',
    right_on='movie_id',
    how='inner',
    validate='one_to_one',  # fail loudly if either file repeats a movie id
)

# Select relevant columns
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast']]

print(f"Dataset shape: {movies.shape}")
print("\nFirst few rows:")
movies.head()

## 3. Data Preprocessing

In [None]:
# Report how many values are missing in each column
print("Missing values:", movies.isnull().sum(), sep="\n")

# Fill the gaps: free text becomes an empty string, while the serialized list
# columns become an empty-list literal
missing_defaults = {
    'overview': '',
    'genres': '[]',
    'keywords': '[]',
    'cast': '[]',
}
for column_name, default_value in missing_defaults.items():
    movies[column_name] = movies[column_name].fillna(default_value)

In [None]:
def _parse_records(obj):
    """Turn a serialized list (JSON or Python-literal text) into a Python list.

    Returns an empty list for anything that cannot be read as a list.
    """
    if isinstance(obj, list):
        # Already parsed, e.g. when the extraction cell is run a second time
        return obj
    if not isinstance(obj, str):
        return []
    try:
        # Try JSON first: it accepts null/true/false, which ast.literal_eval rejects
        data = json.loads(obj)
    except ValueError:
        try:
            # Fall back to Python-literal syntax (e.g. single-quoted strings)
            data = ast.literal_eval(obj)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return []
    return data if isinstance(data, list) else []


def _names_of(records):
    """Collect the string 'name' of each record; plain strings pass through unchanged."""
    names = []
    for item in records:
        if isinstance(item, str):
            # An already-extracted name (second run of the extraction cell)
            names.append(item)
        elif isinstance(item, dict) and isinstance(item.get('name'), str):
            names.append(item['name'])
        # Anything else is skipped instead of discarding the whole row
    return names


def extract_names(obj):
    """Extract names from JSON-like string format.

    Args:
        obj: A string holding a list of objects with a 'name' key, written as
            JSON or as a Python literal. A list is also accepted so that
            applying the function twice does not erase the data.

    Returns:
        list: The 'name' values in their original order. Empty when the input
        cannot be parsed as a list; records without a string 'name' are skipped.
    """
    return _names_of(_parse_records(obj))


def extract_cast(obj, limit=3):
    """Extract the names of the leading cast members.

    Args:
        obj: Same formats as accepted by ``extract_names``.
        limit (int): Number of leading records to consider (default 3).

    Returns:
        list: Names taken from the first ``limit`` records. Empty when the
        input cannot be parsed as a list.
    """
    return _names_of(_parse_records(obj)[:limit])

# Parse each serialized column into a plain list of names
column_extractors = (
    ('genres', extract_names),
    ('keywords', extract_names),
    ('cast', extract_cast),
)
for column_name, extractor in column_extractors:
    movies[column_name] = movies[column_name].apply(extractor)

print("Sample processed data:")
processed_preview = movies[['title', 'genres', 'keywords', 'cast']].head()
print(processed_preview)

## 4. Feature Engineering - Create Tags Column

In [None]:
def clean_text(text):
    """Normalize text for vectorization.

    A list is first joined with single spaces. The value is then converted to
    a lowercase string, every character other than ASCII letters, digits and
    whitespace is removed, and whitespace runs are collapsed to one space with
    the ends trimmed.
    """
    joined = ' '.join(text) if isinstance(text, list) else text
    lowered = str(joined).lower()
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', lowered)
    return re.sub(r'\s+', ' ', alnum_only).strip()

# Build one text blob per movie: the overview followed by the space-joined
# genres, keywords and cast names
combined_tags = movies['overview']
for list_column in ('genres', 'keywords', 'cast'):
    combined_tags = combined_tags + ' ' + movies[list_column].map(' '.join)

# Normalize the combined text and store it as the tags column
movies['tags'] = combined_tags.apply(clean_text)

print("Sample tags:")
tags_preview = movies[['title', 'tags']].head()
print(tags_preview)

## 5. TF-IDF Vectorization

In [None]:
# Vectorizer settings: cap the vocabulary size, drop English stop words and
# lowercase the input
vectorizer_options = {
    'max_features': 5000,
    'stop_words': 'english',
    'lowercase': True,
}
tfidf = TfidfVectorizer(**vectorizer_options)

# Learn the vocabulary from the tags and encode every movie in one step
tfidf_matrix = tfidf.fit_transform(movies['tags'])

feature_count = len(tfidf.get_feature_names_out())
print(f"TF-IDF Matrix shape: {tfidf_matrix.shape}")
print(f"Number of features: {feature_count}")

## 6. Cosine Similarity Computation

In [None]:
# Pairwise cosine similarity between every pair of rows of the TF-IDF matrix
similarity_matrix = cosine_similarity(tfidf_matrix)

first_movie_preview = similarity_matrix[0][:5]
print(f"Similarity matrix shape: {similarity_matrix.shape}")
print(f"Sample similarity scores for first movie: {first_movie_preview}")

## 7. Recommendation Function

In [None]:
def recommend(movie_title, num_recommendations=5):
    """
    Recommend movies based on content similarity

    Looks the title up case-insensitively in the global ``movies`` frame and
    ranks all other movies by their score in the global ``similarity_matrix``.
    When several rows share the title, the first one is used.

    Args:
        movie_title (str): Title of the movie to base recommendations on
        num_recommendations (int): Number of recommendations to return;
            values below 1 yield an empty list

    Returns:
        list: Top recommended movie titles, most similar first.
        str: A message instead of a list when the title is not found or an
        unexpected error occurs (callers distinguish the two with isinstance).
    """
    try:
        # Work with the row *position*, not the index label: similarity_matrix
        # is positional, so a label is only correct for a default RangeIndex.
        title_matches = (movies['title'].str.lower() == movie_title.lower()).to_numpy()
        match_positions = np.flatnonzero(title_matches)
        if match_positions.size == 0:
            return f"Movie '{movie_title}' not found in the dataset."
        movie_pos = int(match_positions[0])

        # Rank by descending similarity; the stable sort keeps equal scores in
        # dataset order.
        scores = np.asarray(similarity_matrix[movie_pos]).ravel()
        ranked_positions = np.argsort(-scores, kind='stable')

        # Drop the query movie explicitly. It is not guaranteed to be ranked
        # first: a movie with identical tags ties with it, and a movie with
        # empty tags scores 0 against itself.
        ranked_positions = ranked_positions[ranked_positions != movie_pos]

        wanted = max(int(num_recommendations), 0)
        top_positions = ranked_positions[:wanted]

        return movies.iloc[top_positions]['title'].tolist()

    except Exception as e:
        return f"Error: {str(e)}"

## 8. Demonstration with Examples

In [None]:
# Titles used to demonstrate the recommender
test_movies = ['The Dark Knight', 'Inception', 'Avatar']

print("🎬 MOVIE RECOMMENDATION SYSTEM DEMO\n")
print("=" * 50)

for movie in test_movies:
    print(f"\n🎯 Recommendations for '{movie}':")
    print("-" * 40)

    recommendations = recommend(movie)

    # recommend() returns a message string instead of a list on failure
    if not isinstance(recommendations, list):
        print(recommendations)
    else:
        for rank, suggested_title in enumerate(recommendations, start=1):
            print(f"{rank}. {suggested_title}")

    print()

## 9. Additional Analysis

In [None]:
# Function to get movie details
def get_movie_details(movie_title):
    """Get detailed information about a movie

    The title is matched case-insensitively against the global ``movies``
    frame; the first matching row is used.

    Args:
        movie_title (str): Title of the movie to look up

    Returns:
        dict: 'Title', 'Genres', 'Cast' and 'Overview' (the overview is cut to
        200 characters plus '...' when longer; a missing overview becomes '').
        str: A "not found" message when no row matches the title.
    """
    not_found_message = f"Movie '{movie_title}' not found."

    # A non-string title cannot match any row
    if not isinstance(movie_title, str):
        return not_found_message

    matches = movies[movies['title'].str.lower() == movie_title.lower()]
    if matches.empty:
        return not_found_message

    movie = matches.iloc[0]

    # Treat a missing (non-string) overview as empty instead of letting the
    # length check fail and be reported as "not found"
    overview = movie['overview'] if isinstance(movie['overview'], str) else ''
    if len(overview) > 200:
        overview = overview[:200] + '...'

    return {
        'Title': movie['title'],
        'Genres': movie['genres'],
        'Cast': movie['cast'],
        'Overview': overview
    }

# Show details for one of the test movies
print("📋 Movie Details Example:")
print("=" * 30)
details = get_movie_details('Inception')
if isinstance(details, dict):
    for key, value in details.items():
        print(f"{key}: {value}")
else:
    # get_movie_details returns a plain message string when the lookup fails;
    # print it instead of calling .items() on a string
    print(details)

## 10. Interactive Recommendation

In [None]:
# Interactive function for custom movie recommendations
def interactive_recommend():
    """Prompt for movie titles in a loop and print recommendations for each.

    Blank input is ignored; entering 'quit' (any letter case) ends the loop.
    """
    print("🎬 Interactive Movie Recommendation System")
    print("Enter a movie title to get recommendations (or 'quit' to exit)\n")

    while True:
        query = input("Enter movie title: ").strip()

        if query.lower() == 'quit':
            print("Thanks for using the recommendation system!")
            return

        # Nothing typed: ask again
        if not query:
            continue

        result = recommend(query)

        print(f"\n🎯 Recommendations for '{query}':")
        print("-" * 40)

        # recommend() returns a message string instead of a list on failure
        if not isinstance(result, list):
            print(result)
        else:
            for rank, suggested_title in enumerate(result, start=1):
                print(f"{rank}. {suggested_title}")

        print("\n" + "=" * 50 + "\n")

# Uncomment the line below to run interactive mode
# interactive_recommend()

## Summary

This Movie Recommendation System implements content-based filtering using:

✅ **Data Loading**: TMDB 5000 movie and credits metadata, merged and reduced to the columns needed for recommendations  
✅ **Preprocessing**: Handled missing values and cleaned text data  
✅ **Feature Engineering**: Created combined 'tags' column from overview, genres, keywords, and cast  
✅ **Vectorization**: TF-IDF with 5000 max features and English stopwords removal  
✅ **Similarity**: Cosine similarity matrix for efficient movie comparisons  
✅ **Recommendation Function**: Returns the top 5 most similar movies by default (configurable via `num_recommendations`) for any title found in the dataset  
✅ **Demonstration**: Tested with popular movies like The Dark Knight, Inception, and Avatar  

The system successfully recommends movies based on content similarity, making it useful for users who want to find movies similar to ones they already enjoy.