In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

# Load TMDB datasets (download from Kaggle)
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

# Merge TMDB movies and credits on the shared 'title' column.
# NOTE(review): titles are not guaranteed to be unique; a title that occurs
# more than once on either side is cross-matched and produces extra rows.
# If the credits file carries a movie-id column, merging on that would be
# safer -- confirm against the CSV schema.
movies = movies.merge(credits, on='title')

# Keep only the columns used for the recommender, drop rows with missing
# values, and rebuild a contiguous 0..n-1 index. Dropping rows can leave gaps
# in the index, and later cells use index values as row positions into the
# similarity matrix, so labels and positions must agree.
# Chaining (instead of dropna(inplace=True) on a slice) also avoids mutating
# a possible view of the merged frame.
movies = (
    movies[['id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]
    .dropna()
    .reset_index(drop=True)
)
print(movies.head())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\one\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


       id                                     title  \
0   19995                                    Avatar   
1     285  Pirates of the Caribbean: At World's End   
2  206647                                   Spectre   
3   49026                     The Dark Knight Rises   
4   49529                               John Carter   

                                            overview  \
0  In the 22nd century, a paraplegic Marine is di...   
1  Captain Barbossa, long believed to be dead, ha...   
2  A cryptic message from Bondâ€™s past sends him o...   
3  Following the death of District Attorney Harve...   
4  John Carter is a war-weary, former military ca...   

                                              genres  \
0  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
1  [{"id": 12, "name": "Adventure"}, {"id": 14, "...   
2  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
3  [{"id": 28, "name": "Action"}, {"id": 80, "nam...   
4  [{"id": 28, "name": "Action"}, {"id": 12, "nam.

In [2]:
import json
import ast  # For safe parsing

def extract_genres(x):
    """Return the genre names stored in one ``genres`` cell.

    Parameters
    ----------
    x : str
        Text encoding a list of objects that each carry a ``"name"`` key,
        e.g. ``'[{"id": 28, "name": "Action"}]'``.

    Returns
    -------
    list of str
        The ``name`` of every entry, in input order.

    Raises
    ------
    ValueError, SyntaxError
        If ``x`` is neither valid JSON nor a valid Python literal.
    KeyError
        If an entry has no ``"name"`` key.
    """
    try:
        # The cell is JSON, so parse it as JSON: ast.literal_eval rejects
        # valid JSON tokens such as null / true / false.
        records = json.loads(x)
    except (TypeError, ValueError):
        # Fall back to the previous parser for Python-literal style input.
        records = ast.literal_eval(x)
    return [record['name'] for record in records]

def extract_keywords(x):
    """Return the keyword names stored in one ``keywords`` cell.

    Parameters
    ----------
    x : str
        Text encoding a list of objects that each carry a ``"name"`` key.

    Returns
    -------
    list of str
        The ``name`` of every entry, in input order.

    Raises
    ------
    ValueError, SyntaxError
        If ``x`` is neither valid JSON nor a valid Python literal.
    KeyError
        If an entry has no ``"name"`` key.
    """
    try:
        # Parse as JSON first; ast.literal_eval cannot read JSON-only tokens
        # such as null / true / false.
        records = json.loads(x)
    except (TypeError, ValueError):
        # Keep accepting Python-literal style input as before.
        records = ast.literal_eval(x)
    return [record['name'] for record in records]

def extract_cast(x, top_n=3):  # Top 3 cast members
    """Return the names of the first ``top_n`` entries of one ``cast`` cell.

    Parameters
    ----------
    x : str
        Text encoding a list of objects that each carry a ``"name"`` key.
    top_n : int, optional
        How many leading entries to keep (default 3). Entries are taken in
        the order they appear in ``x``.

    Returns
    -------
    list of str
        At most ``top_n`` names.

    Raises
    ------
    ValueError, SyntaxError
        If ``x`` is neither valid JSON nor a valid Python literal.
    KeyError
        If one of the kept entries has no ``"name"`` key.
    """
    try:
        # Parse as JSON first; ast.literal_eval cannot read JSON-only tokens
        # such as null / true / false.
        records = json.loads(x)
    except (TypeError, ValueError):
        # Keep accepting Python-literal style input as before.
        records = ast.literal_eval(x)
    # Slice before extracting so only the kept entries are touched.
    return [member['name'] for member in records[:top_n]]

def extract_director(x):
    """Return the first crew member whose job is ``'Director'``.

    Parameters
    ----------
    x : str
        Text encoding a list of crew objects; the director entry is expected
        to carry ``"job"`` and ``"name"`` keys.

    Returns
    -------
    list of str
        A one-element list with the director's name, or an empty list when
        no entry has ``job == 'Director'``. A list is returned (rather than a
        bare string) so the result can be joined like the other features.

    Raises
    ------
    ValueError, SyntaxError
        If ``x`` is neither valid JSON nor a valid Python literal.
    """
    try:
        # Parse as JSON first; ast.literal_eval cannot read JSON-only tokens
        # such as null / true / false.
        crew = json.loads(x)
    except (TypeError, ValueError):
        # Keep accepting Python-literal style input as before.
        crew = ast.literal_eval(x)
    for member in crew:
        # .get(): an entry without a 'job' key is skipped instead of raising.
        if member.get('job') == 'Director':
            return [member['name']]
    return []

# Apply extractions: replace each raw column with its parsed list of names
in_place_extractors = {
    'genres': extract_genres,
    'keywords': extract_keywords,
    'cast': extract_cast,
}
for column_name, extractor in in_place_extractors.items():
    movies[column_name] = movies[column_name].apply(extractor)

# The director is derived from 'crew' and stored in a new column
movies['director'] = movies['crew'].apply(extract_director)

# Clean overview: lowercase every token and drop English stopwords
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Lowercase ``text`` and drop every token found in ``stop_words``.

    Tokens are produced by splitting on whitespace only, so punctuation stays
    attached to its word. The surviving tokens are re-joined with single
    spaces.
    """
    lowered_tokens = (token.lower() for token in text.split())
    return ' '.join(token for token in lowered_tokens if token not in stop_words)

movies['overview'] = movies['overview'].map(clean_text)

# Combine all features into a single string (feature-based "soup")
def create_soup(row):
    """Build the text "soup" for one movie row.

    The list-valued fields (genres, keywords, cast, director) are each joined
    with spaces, then concatenated in that order, followed by the overview
    text. Fields are separated by a single space, so an empty list still
    contributes its separator.
    """
    list_fields = ('genres', 'keywords', 'cast', 'director')
    parts = [' '.join(row[field]) for field in list_fields]
    parts.append(row['overview'])
    return ' '.join(parts)

# Build the soup row by row, then preview the first few entries
movies['soup'] = movies.apply(create_soup, axis='columns')
print(movies['soup'].head(5))

0    Action Adventure Fantasy Science Fiction cultu...
1    Adventure Fantasy Action ocean drug abuse exot...
2    Action Adventure Crime spy based on novel secr...
3    Action Crime Drama Thriller dc comics crime fi...
4    Action Adventure Science Fiction based on nove...
Name: soup, dtype: object


In [3]:
# TF-IDF Vectorizer: English stop words are removed and the vocabulary is
# capped at the 5000 most frequent terms to bound the feature dimension
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf.fit_transform(movies['soup'])

# Pairwise cosine similarity of every movie against every other movie.
# Note: the result is a dense (n_movies, n_movies) array, so memory grows
# quadratically with the number of movies.
cosine_sim = cosine_similarity(tfidf_matrix)  # Y omitted -> rows compared with themselves
print(cosine_sim.shape)  # Should be (n_movies, n_movies)

(4806, 4806)


In [8]:
# Create a reverse mapping of title -> row position.
# Positions (0..n-1) are stored rather than index labels, because the lookup
# result is used to index rows of the similarity matrix and with .iloc.
indices = pd.Series(range(len(movies)), index=movies['title'])
# Keep only the first row for each title so a lookup always yields a single
# position (Series.drop_duplicates() would compare the values, not the titles).
indices = indices[~indices.index.duplicated(keep='first')]

def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up in the module-level ``indices`` mapping.
    cosine_sim : array-like, optional
        Square similarity matrix; row ``i`` holds the similarity of movie
        ``i`` to every movie. Defaults to the matrix computed above.
    top_n : int, optional
        Number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series or str
        The titles of the ``top_n`` most similar movies, most similar first,
        or the message ``"Movie '<title>' not found."`` when the title is
        not present in ``indices``.
    """
    if title not in indices:
        return f"Movie '{title}' not found."

    idx = indices[title]
    # A title that occurs more than once makes the lookup return a Series;
    # use its first match so the row selection below stays one-dimensional.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Exclude the queried movie by index instead of assuming it is ranked
    # first: with tied scores (e.g. identical soups, or an all-zero vector)
    # another movie can take the top slot.
    candidates = [pair for pair in enumerate(cosine_sim[idx]) if pair[0] != idx]
    ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)

    movie_indices = [position for position, _ in ranked[:top_n]]
    return movies['title'].iloc[movie_indices]

# Test it
# 'Cartoon' is not a title in the dataset, so this call exercises the
# "not found" branch. Pass an existing title (e.g. 'The Dark Knight Rises')
# to get actual recommendations.
print(get_recommendations('Cartoon'))

Movie 'Cartoon' not found.
