In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('tmdb_5000_movies.csv')

# Display the first few rows
print("Data Head:")
print(df.head())

# Get information about columns and data types
print("\nData Info:")
df.info()

Data Head:
      budget                                             genres  \
0  237000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
1  300000000  [{"id": 12, "name": "Adventure"}, {"id": 14, "...   
2  245000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
3  250000000  [{"id": 28, "name": "Action"}, {"id": 80, "nam...   
4  260000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   

                                       homepage      id  \
0                   http://www.avatarmovie.com/   19995   
1  http://disney.go.com/disneypictures/pirates/     285   
2   http://www.sonypictures.com/movies/spectre/  206647   
3            http://www.thedarkknightrises.com/   49026   
4          http://movies.disney.com/john-carter   49529   

                                            keywords original_language  \
0  [{"id": 1463, "name": "culture clash"}, {"id":...                en   
1  [{"id": 270, "name": "ocean"}, {"id": 726, "na...                en   
2  [{"id"

In [6]:
%pip install nltk

import nltk

# Download required NLTK data
nltk.download('punkt') # for tokenization
nltk.download('stopwords') # for stop words


Defaulting to user installation because normal site-packages is not writeable
Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 2.4 MB/s eta 0:00:01
[?25hCollecting click
  Using cached click-8.1.8-py3-none-any.whl (98 kB)
Collecting tqdm
  Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Collecting regex>=2021.8.3
  Downloading regex-2025.8.29-cp39-cp39-macosx_11_0_arm64.whl (286 kB)
[K     |████████████████████████████████| 286 kB 9.4 MB/s eta 0:00:01
Installing collected packages: tqdm, regex, click, nltk
Successfully installed click-8.1.8 nltk-3.9.1 regex-2025.8.29 tqdm-4.67.1
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adityakumar/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adityakumar/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
import pandas as pd
import json
from nltk.corpus import stopwords
import re

# --- Step 1: Load BOTH datasets ---
movies_df = pd.read_csv('tmdb_5000_movies.csv')
credits_df = pd.read_csv('tmdb_5000_credits.csv')

# --- Step 2: Merge them into a single DataFrame ---
# Note: The first file uses 'id', the second uses 'movie_id'. We merge on these.
df = movies_df.merge(credits_df, left_on='id', right_on='movie_id')


# --- Step 3: Now, select the columns from the new 'df' ---
# Your original code will now work perfectly. We'll also add 'cast' which is very useful.
movies = df[['id', 'title_x', 'overview', 'genres', 'keywords', 'cast']].copy()
# We use 'title_x' because the merge creates 'title_x' and 'title_y'. They are the same.
movies.rename(columns={'id': 'movie_id', 'title_x': 'title'}, inplace=True)


# --- Step 4: Your original cleaning code continues from here ---

# Function to extract names from JSON-like columns
def extract_names(text):
    # Check if the text is a valid list of dictionaries
    try:
        items = json.loads(text)
        names = [item['name'] for item in items]
        return " ".join(names)
    except (TypeError, json.JSONDecodeError):
        return ""

# Function to extract top 3 cast members
def extract_cast(text):
    try:
        items = json.loads(text)
        names = [item['name'] for item in items[:3]] # Get top 3 actors
        return " ".join(names)
    except (TypeError, json.JSONDecodeError):
        return ""

# Apply the functions
movies['genres'] = movies['genres'].apply(extract_names)
movies['keywords'] = movies['keywords'].apply(extract_names)
movies['cast'] = movies['cast'].apply(extract_cast)

# Handle missing overview data
movies['overview'] = movies['overview'].fillna('')

print("Successfully merged and started processing the data!")
print(movies.head())

Successfully merged and started processing the data!
   movie_id                                     title  \
0     19995                                    Avatar   
1       285  Pirates of the Caribbean: At World's End   
2    206647                                   Spectre   
3     49026                     The Dark Knight Rises   
4     49529                               John Carter   

                                            overview  \
0  In the 22nd century, a paraplegic Marine is di...   
1  Captain Barbossa, long believed to be dead, ha...   
2  A cryptic message from Bond’s past sends him o...   
3  Following the death of District Attorney Harve...   
4  John Carter is a war-weary, former military ca...   

                                     genres  \
0  Action Adventure Fantasy Science Fiction   
1                  Adventure Fantasy Action   
2                    Action Adventure Crime   
3               Action Crime Drama Thriller   
4          Action Adventure Scie

In [9]:
# --- Step 1: Combine all text columns into a single 'tags' column ---
# We are also including the 'cast' to make the recommender even better!
movies['tags'] = movies['overview'] + ' ' + movies['genres'] + ' ' + movies['keywords'] + ' ' + movies['cast']


# --- Step 2: Perform the final text cleaning on the 'tags' column ---

# Get the list of common English "stop words"
stop_words = set(stopwords.words('english'))

def clean_final_tags(text):
    # Convert all text to lowercase
    text = text.lower()
    # Remove all punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove all numbers (optional, but good for this use case)
    text = re.sub(r'\d+', '', text)
    # Split the text into words and remove the stop words
    words = [word for word in text.split() if word not in stop_words]
    # Join the words back into a single string
    return " ".join(words)

# Apply this final cleaning function to our 'tags' column
movies['cleaned_tags'] = movies['tags'].apply(clean_final_tags)


# --- Step 3: Save the final, ready-to-use data ---

# Select only the columns we need for the next stage of the project
processed_df = movies[['movie_id', 'title', 'cleaned_tags']]

# Save this fully processed data to a new CSV file
processed_df.to_csv('processed_movies.csv', index=False)

print("\nData preprocessing is complete! The final cleaned data has been saved to 'processed_movies.csv'")
print("\nHere's a sample of the final data:")
print(processed_df.head())


Data preprocessing is complete! The final cleaned data has been saved to 'processed_movies.csv'

Here's a sample of the final data:
   movie_id                                     title  \
0     19995                                    Avatar   
1       285  Pirates of the Caribbean: At World's End   
2    206647                                   Spectre   
3     49026                     The Dark Knight Rises   
4     49529                               John Carter   

                                        cleaned_tags  
0  nd century paraplegic marine dispatched moon p...  
1  captain barbossa long believed dead come back ...  
2  cryptic message bonds past sends trail uncover...  
3  following death district attorney harvey dent ...  
4  john carter warweary former military captain w...  


In [10]:
%pip install sentence-transformers torch
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

# 1. Load the processed data you just created
df = pd.read_csv('processed_movies.csv')

# Handle any potential empty tags that might have resulted from cleaning
df.dropna(subset=['cleaned_tags'], inplace=True)


# 2. Load a powerful pre-trained model
print("Loading the sentence transformer model...")
model = SentenceTransformer('all-MiniLM-L6-v2')


# 3. Generate the embeddings for all movies
print("Generating vector embeddings... (This may take a few minutes)")
embeddings = model.encode(df['cleaned_tags'].tolist(), show_progress_bar=True)


# 4. Save the embeddings to a file
np.save('movie_embeddings.npy', embeddings)

print(f"\nSuccessfully generated and saved embeddings with shape: {embeddings.shape}")
print("The embeddings are saved in 'movie_embeddings.npy'")

Defaulting to user installation because normal site-packages is not writeable
Collecting sentence-transformers
  Downloading sentence_transformers-5.1.0-py3-none-any.whl (483 kB)
[K     |████████████████████████████████| 483 kB 1.7 MB/s eta 0:00:01
[?25hCollecting torch
  Downloading torch-2.8.0-cp39-none-macosx_11_0_arm64.whl (73.6 MB)
[K     |████████████████████████████████| 73.6 MB 4.8 MB/s eta 0:00:01
Collecting huggingface-hub>=0.20.0
  Downloading huggingface_hub-0.34.4-py3-none-any.whl (561 kB)
[K     |████████████████████████████████| 561 kB 12.7 MB/s eta 0:00:01
Collecting transformers<5.0.0,>=4.41.0
  Downloading transformers-4.56.0-py3-none-any.whl (11.6 MB)
[K     |████████████████████████████████| 11.6 MB 6.1 MB/s eta 0:00:01
Collecting sympy>=1.13.3
  Using cached sympy-1.14.0-py3-none-any.whl (6.3 MB)
Collecting networkx
  Downloading networkx-3.2.1-py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 10.1 MB/s eta 0:00:01
[?25hCollecting fi

  from .autonotebook import tqdm as notebook_tqdm


Loading the sentence transformer model...
Generating vector embeddings... (This may take a few minutes)


Batches: 100%|██████████| 151/151 [00:10<00:00, 13.91it/s]


Successfully generated and saved embeddings with shape: (4803, 384)
The embeddings are saved in 'movie_embeddings.npy'





In [11]:
%pip install faiss-cpu
import numpy as np
import faiss

# 1. Load the embeddings you just created
embeddings = np.load('movie_embeddings.npy')

# 2. Get the dimension of the vectors (it should be 384 for our model)
d = embeddings.shape[1]

# 3. Create a FAISS index. IndexFlatL2 is a standard, accurate index.
index = faiss.IndexFlatL2(d)

# 4. Add all of our movie embeddings to this index
index.add(embeddings)

# 5. Save the finished index to a file. This is our search engine!
faiss.write_index(index, 'movie_index.faiss')

print(f"Successfully created and saved a FAISS index with {index.ntotal} movies.")
print("The index is saved in 'movie_index.faiss'")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp39-cp39-macosx_14_0_arm64.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 1.8 MB/s eta 0:00:01
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
Successfully created and saved a FAISS index with 4803 movies.
The index is saved in 'movie_index.faiss'
