# Content-Based Movie Recommendation System
## Dataset: TMDB 5000 Movies + Credits

### Importing required Libraries

In [9]:
#Required Python Libraries
import pandas as pd
import numpy as np
import re
import ast
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Download NLTK data (run once)

In [10]:
# word_tokenize needs the Punkt sentence-tokenizer data. Recent NLTK releases
# look it up under "punkt_tab" rather than the older pickled "punkt" package,
# so both are fetched to keep this cell working across NLTK versions.
# nltk.download() is a no-op for packages that are already up to date.
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")

# English stop words as a set, so membership checks during preprocessing are O(1).
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Load the Dataset

In [11]:
# Read the two TMDB CSV exports (paths are relative to the notebook's working directory).
movies = pd.read_csv("data/tmdb_5000_movies.csv")
credits = pd.read_csv("data/tmdb_5000_credits.csv")

# Join credits onto movies (movies.id == credits.movie_id), keep only the
# columns used for the recommender, and rename the merge-suffixed "title_x"
# column to plain "title".
movies = (
    movies
    .merge(credits, left_on="id", right_on="movie_id")
    [["title_x", "genres", "keywords", "overview", "cast", "crew"]]
    .rename(columns={"title_x": "title"})
)

### Helper Functions

In [12]:
# Function to extract names from JSON-like fields
def extract_names(x):
    """Join the ``"name"`` value of every record in a stringified list of dicts.

    Parameters
    ----------
    x : str
        Text holding a Python-literal list of dicts, e.g.
        ``'[{"id": 28, "name": "Action"}]'``.

    Returns
    -------
    str
        The names separated by single spaces, or ``""`` when ``x`` cannot be
        parsed (including ``None``/NaN) or does not have the expected
        list-of-dicts-with-``"name"`` shape.
    """
    try:
        data = ast.literal_eval(x)
        return " ".join([d["name"] for d in data])
    # Only the failures that malformed/missing data can cause are treated as
    # "no names"; a bare except would also hide KeyboardInterrupt and bugs.
    except (ValueError, SyntaxError, TypeError, KeyError,
            MemoryError, RecursionError):
        return ""

# Function to get top 5 cast members
def extract_cast(x, top_n=5):
    """Join the ``"name"`` of the first ``top_n`` records in a stringified list of dicts.

    Parameters
    ----------
    x : str
        Text holding a Python-literal list of dicts that each carry a
        ``"name"`` key.
    top_n : int, default 5
        How many leading records to keep (applied as the slice ``[:top_n]``).

    Returns
    -------
    str
        The kept names separated by single spaces, or ``""`` when ``x`` cannot
        be parsed (including ``None``/NaN) or does not have the expected shape.
    """
    try:
        data = ast.literal_eval(x)
        return " ".join([d["name"] for d in data[:top_n]])
    # Only the failures that malformed/missing data can cause are treated as
    # "no cast"; a bare except would also hide KeyboardInterrupt and bugs.
    except (ValueError, SyntaxError, TypeError, KeyError,
            MemoryError, RecursionError):
        return ""

# Function to get director name
def extract_director(x):
    """Join the names of all records whose ``"job"`` is ``"Director"``.

    Parameters
    ----------
    x : str
        Text holding a Python-literal list of dicts; records are expected to
        carry ``"name"`` and (optionally) ``"job"`` keys.

    Returns
    -------
    str
        Director names separated by single spaces (several if the list has
        more than one director), ``""`` if there is none, or ``""`` when ``x``
        cannot be parsed (including ``None``/NaN) or has an unexpected shape.
    """
    try:
        data = ast.literal_eval(x)
        directors = [d["name"] for d in data if d.get("job") == "Director"]
        return " ".join(directors)
    # Only the failures that malformed/missing data can cause are treated as
    # "no director" (AttributeError covers non-dict entries lacking .get);
    # a bare except would also hide KeyboardInterrupt and bugs.
    except (ValueError, SyntaxError, TypeError, KeyError, AttributeError,
            MemoryError, RecursionError):
        return ""

# Text preprocessing
def preprocess_text(text):
    """Normalise free text into a space-separated string of lowercase tokens.

    The input is coerced with ``str()``, every character that is not an ASCII
    letter or whitespace is replaced by a space, the result is lowercased and
    tokenized with ``word_tokenize``, and tokens found in the module-level
    ``stop_words`` set are dropped.

    Parameters
    ----------
    text : object
        Value to clean; converted with ``str(text)`` before processing.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    letters_only = re.sub(r"[^a-zA-Z\s]", " ", str(text)).lower()
    kept = filter(lambda token: token not in stop_words, word_tokenize(letters_only))
    return " ".join(kept)

### Apply Feature Extraction

In [13]:
# NOTE: the four assignments below overwrite the raw list-of-dict text columns
# in place, so this cell is not idempotent. Re-run the dataset-loading cell
# before re-running this one; otherwise the extractors receive text that is no
# longer a parseable list and return "" for every row.
movies["genres"] = movies["genres"].apply(extract_names)
movies["keywords"] = movies["keywords"].apply(extract_names)
movies["cast"] = movies["cast"].apply(extract_cast)
movies["crew"] = movies["crew"].apply(extract_director)

# Combine all features into one column.
# Missing overviews are replaced with "" first: astype(str) on its own turns
# NaN into the literal word "nan", which would then be vectorized as if it
# were a real term shared by every movie without an overview.
movies["combined"] = (
    movies["genres"] + " " +
    movies["keywords"] + " " +
    movies["overview"].fillna("").astype(str) + " " +
    movies["cast"] + " " +
    movies["crew"]
)

# Clean the combined text (letters only, lowercase, stop words removed)
movies["cleaned_text"] = movies["combined"].apply(preprocess_text)

### Vectorization (TF-IDF) & Similarity

In [14]:
# Convert text to TF-IDF Vectors
# Turn each movie's cleaned text into a TF-IDF vector,
# capping the vocabulary at 5000 terms.
tfidf = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf.fit_transform(movies["cleaned_text"])

# Movie-by-movie cosine similarity matrix. With the second argument omitted,
# cosine_similarity compares the rows of the matrix against themselves.
cosine_sim = cosine_similarity(tfidf_matrix)

### Recommendation Function

In [15]:
def recommend_movies(movie_name, top_n=5):
    """Return the titles of the movies most similar to ``movie_name``.

    Reads the module-level ``movies`` DataFrame (needs a ``"title"`` column)
    and the ``cosine_sim`` matrix, whose row/column ``i`` is taken to
    correspond to the ``i``-th row of ``movies``.

    Parameters
    ----------
    movie_name : str
        Title to look up; matched case-insensitively against ``movies["title"]``.
        If several rows match, the first one is used.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or str
        A one-column (``"title"``) DataFrame of the ``top_n`` most similar
        movies, best match first, keeping their original index labels.
        If no title matches, the string ``"Movie '<movie_name>' not found"``
        is returned instead.
    """
    # Locate the movie by *position*, not index label: cosine_sim and .iloc
    # below are positional, so labels would be wrong for any non-default index.
    is_match = (
        movies["title"].str.lower().eq(movie_name.lower()).fillna(False)
        .to_numpy(dtype=bool)
    )
    positions = np.flatnonzero(is_match)
    if len(positions) == 0:
        return f"Movie '{movie_name}' not found"
    idx = int(positions[0])

    # Rank every movie by similarity, highest first. The stable sort keeps
    # tied scores in their original row order.
    scores = np.asarray(cosine_sim[idx], dtype=float)
    ranked = np.argsort(-scores, kind="stable")

    # Drop the query movie explicitly rather than assuming it sorts first:
    # another movie with an equal top score could otherwise displace it and
    # the query would be recommended to itself.
    ranked = ranked[ranked != idx][:max(top_n, 0)]
    return movies.iloc[ranked][["title"]]

### Example: Recommendations for a Movie in the Dataset

In [16]:
# Show the five titles most similar to "The Dark Knight"
# (the lookup is case-insensitive and the title must exist in the dataset).
recommend_movies(movie_name="The Dark Knight", top_n=5)

Unnamed: 0,title
3,The Dark Knight Rises
119,Batman Begins
428,Batman Returns
299,Batman Forever
9,Batman v Superman: Dawn of Justice
