# Movie Galaxy — Semantic Movie Exploration


## Imports

In [None]:
import os
import pandas as pd
import numpy as np

## Loading Dataset

In [None]:
import kagglehub

# Fetch the TMDB movies dataset through kagglehub and keep the location it
# reports; the following cells read the files from there.
dataset_handle = "alanvourch/tmdb-movies-daily-updates"
movies_path = kagglehub.dataset_download(dataset_handle)

In [None]:
# List the files found at the downloaded dataset location (shown as the cell output).
os.listdir(movies_path)

In [None]:
# Load the full movie table.  os.path.join builds the path with the correct
# separator for the current platform instead of hard-coding "/".
movies_df = pd.read_csv(os.path.join(movies_path, "TMDB_all_movies.csv"))

## Exploring Dataset

In [None]:
# Download sample data to view in Excel
# sample_movies = movies_df.sample(n=100, random_state=42)
# sample_movies.to_csv("movies_sample_100.csv", index=False)

In [None]:
# Structural overview of the raw dataset: size, column names, a short preview,
# dtypes, per-column missing-value percentages and summary statistics.
n_rows = len(movies_df)
column_names = movies_df.columns.tolist()
print("Shape:", movies_df.shape)
print("\nColumns:", column_names)

print("\nFirst 3 rows:")
display(movies_df.head(3))

print("\nInfo:")
movies_df.info()

# Percentage of null cells per column, largest first.
missing_pct = movies_df.isnull().sum() / n_rows * 100
missing_pct = missing_pct.round(2).sort_values(ascending=False)
print("\nMissing values (%):")
print(missing_pct)

print("\nBasic stats:")
display(movies_df.describe())

In [None]:
# Key columns for the embedding text: title, overview, genres

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import ast

# Use seaborn's "whitegrid" style for the charts below.
sns.set_style("whitegrid")


def _finish_plot(title, xlabel, ylabel):
    """Title and label the current figure, rotate the x ticks and show it."""
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


# Plot 1: number of movies released each year.
print("\n--- Movies Released Each Year ---")
# errors='coerce' turns unparseable release dates into missing values.
release_dates = pd.to_datetime(movies_df['release_date'], errors='coerce')
movies_df['release_year'] = release_dates.dt.year
year_counts = movies_df['release_year'].value_counts().sort_index()

plt.figure(figsize=(15, 7))
sns.lineplot(x=year_counts.index, y=year_counts.values)
_finish_plot('Number of Movies Released Each Year', 'Year', 'Number of Movies')

# Plot 2: movie counts for the 20 most frequent original languages.
# Author's observation: most of the movies are in English, so this feature
# is ignored in the rest of the notebook.
print("\n--- Movies by Original Language ---")
language_counts = movies_df['original_language'].value_counts().head(20)

plt.figure(figsize=(12, 7))
sns.barplot(
    x=language_counts.index,
    y=language_counts.values,
    hue=language_counts.index,
    palette='viridis',
    legend=False,
)
_finish_plot('Top 20 Original Languages of Movies', 'Original Language', 'Number of Movies')

In [None]:
# Count the movies that nobody has voted on.
# (Author's note: the vote count is 0 for about 66% of the data.)
has_no_votes = movies_df['vote_count'] == 0
zero_vote_count = movies_df[has_no_votes]
print(f"Number of movies with 0 vote count: {len(zero_vote_count)}")

# Express the same count as a share of all movies.
total_movies = len(movies_df)
percentage_zero_votes = len(zero_vote_count) / total_movies * 100
print(f"Percentage of movies with 0 vote count: {percentage_zero_votes:.2f}%")

In [None]:
# Row and column counts of the full dataset (shown as the cell output).
movies_df.shape

## Clean Data

In [None]:
# Keep movies with at least 30 votes, then the 5000 most popular of those.
has_enough_votes = movies_df['vote_count'] >= 30
df = (
    movies_df[has_enough_votes]
    .sort_values('popularity', ascending=False)
    .head(5000)
    .copy()
)
df.shape

In [None]:
# Null counts for the three text columns the embedding text is built from.
key_columns = ['title', 'genres', 'overview']
print("Missing values for specified columns in df:")
print(df[key_columns].isnull().sum())

In [None]:
# Placeholder written into each text column wherever the value is missing.
fill_values = {'genres': 'Unknown', 'cast': '', 'director': 'Unknown'}
for column_name, placeholder in fill_values.items():
    df[column_name] = df[column_name].fillna(placeholder)

# Store the release year as an integer 'year' column (0 marks an unknown
# year), then remove the 'release_year' column it was derived from.
df['year'] = df['release_year'].fillna(0).astype(int)
del df['release_year']


In [None]:
# Keep only the rows that have both an overview and a genres value.
required_columns = ['overview', 'genres']
df = df.dropna(subset=required_columns).copy()

print("Shape after dropping rows with null 'overview' or 'genres':", df.shape)
print(df[['title', 'genres', 'overview']].isnull().sum())

In [None]:
def _as_clean_text(value):
    """Return ``value`` as stripped text, or "" when it is missing (None/NaN)."""
    if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
        return ""
    return str(value).strip()


def build_natural_text(row):
    """Build the sentence-style text that is embedded for one movie.

    The result has the form
    ``"search_document: <title> (<year>). <overview> Genres: <genres>."``.
    Missing fields (None/NaN) are left out instead of being rendered as the
    literal string "nan", and an empty title no longer produces a lone ".".

    Args:
        row: Mapping-like record (a DataFrame row or a dict) with optional
            'title', 'year', 'overview' and 'genres' entries.

    Returns:
        The text for the row, prefixed with "search_document: ".
    """
    parts = []

    title = _as_clean_text(row.get('title'))
    if title:
        year = row.get('year')
        # Year 0 is the placeholder the cleaning step writes for unknown years.
        has_year = year is not None and pd.notna(year) and year != 0
        year_suffix = f" ({int(year)})" if has_year else ""
        parts.append(f"{title}{year_suffix}.")

    overview = _as_clean_text(row.get('overview'))
    if overview:
        # Make sure the overview reads as a finished sentence.
        if not overview.endswith(('.', '!', '?')):
            overview += "."
        parts.append(overview)

    genres = _as_clean_text(row.get('genres'))
    # Placeholder genre values carry no information for the embedding.
    if genres and genres.lower() not in ('unknown', 'none'):
        parts.append(f"Genres: {genres}.")

    text = " ".join(parts).strip()
    return f"search_document: {text}"

# Build the embedding text for every row with build_natural_text.
df['text_for_embedding'] = df.apply(build_natural_text, axis=1)

In [None]:
# Preview the first generated texts.  head(3) keeps this safe when df has
# fewer than three rows, where positional .iloc[i] would raise IndexError.
preview = df[['title', 'text_for_embedding']].head(3)
for i, (title, txt) in enumerate(zip(preview['title'], preview['text_for_embedding'])):
    print(f"\nRow {i} ({title}):")
    # Cap very long texts at 600 characters for display.
    print(txt[:600] + "..." if len(txt) > 600 else txt)

# Sanity checks on the generated texts: average length and number of blanks.
print("\nAverage text length:", df['text_for_embedding'].str.len().mean().round(0))
print("Any empty texts?", (df['text_for_embedding'].str.strip() == '').sum())

### Create Embeddings

In [None]:
!pip install -q sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
import torch
from tqdm.auto import tqdm

model_name = 'nomic-ai/nomic-embed-text-v1.5'

print(f"Loading model: {model_name}")
model = SentenceTransformer(model_name, trust_remote_code=True)

# Run on the GPU when one is available (Colab → change the runtime to GPU).
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
model = model.to(device)
print(f"Using device: {device}")

# One input string per movie, in DataFrame row order.
texts = list(df['text_for_embedding'])

# Normalised vectors are what makes inner product usable as cosine similarity.
encode_options = dict(
    batch_size=128,
    show_progress_bar=True,
    normalize_embeddings=True,
    convert_to_numpy=True,
)
print(f"Encoding {len(texts)} texts...")
embeddings = model.encode(texts, **encode_options)

print("\nEmbeddings created!")
# The author expected a shape of (5000, 768) here.
print("Shape:", embeddings.shape)

# Write the vectors to disk straight away.
np.save('embeddings_5k_nomic.npy', embeddings)
print("Embeddings saved â†’ embeddings_5k_nomic.npy")

### Visualisation

In [None]:
import numpy as np
import pandas as pd
import os

# The vectors are taken from the `embeddings` variable created earlier.

# Columns carried into the Projector metadata.  'title', 'genres' and 'year'
# are always used; 'cast_top5' and 'director' are added only when df has them.
# NOTE(review): a missing optional column is created on df with a placeholder
# but is not added to metadata_cols, so it never reaches metadata_df.  No
# visible cell creates 'cast_top5' (the cleaning cell fills 'cast') — confirm
# which column was intended.
metadata_cols = ['title', 'genres', 'year']
optional_columns = (('cast_top5', ''), ('director', 'Unknown'))
for optional_col, placeholder in optional_columns:
    if optional_col in df.columns:
        metadata_cols.append(optional_col)
    else:
        df[optional_col] = placeholder

metadata_df = df[metadata_cols].copy()

# Build the single text label shown when hovering over a point in Projector.
def create_label(row):
    """Return the hover label for one movie.

    The label starts with the title.  Release year, genres, cast and director
    are appended only when they carry information: placeholders (year 0,
    'Unknown', empty strings) and missing values (None/NaN) are skipped, so
    the label never shows "(0)", "Cast: nan" or an empty "Director: ".

    Args:
        row: Mapping-like record with a 'title' entry and optional 'year',
            'genres', 'cast_top5' and 'director' entries.

    Returns:
        The label parts joined with ' - '.
    """
    def present(name):
        # Normalise an absent or missing (None/NaN) field to ''.
        value = row.get(name)
        if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
            return ''
        return value

    heading = f"{row['title']}"
    year = present('year')
    # Year 0 is the placeholder used for an unknown release year.
    if year != '' and year != 0:
        heading += f" ({int(year)})"
    parts = [heading]

    genres = present('genres')
    if genres and genres != 'Unknown':
        parts.append(f"Genres: {genres}")

    cast = present('cast_top5')
    if cast:
        parts.append(f"Cast: {cast}")

    director = present('director')
    if director and director != 'Unknown':
        parts.append(f"Director: {director}")

    return ' - '.join(parts)

metadata_df['label'] = metadata_df.apply(create_label, axis=1)

# Save embeddings to TSV: one row per movie, values separated by tabs.
embeddings_tsv_path = 'embeddings.tsv'
np.savetxt(embeddings_tsv_path, embeddings, delimiter='\t')
print(f"Embeddings saved to {embeddings_tsv_path}")

# Save metadata to TSV: one label per line, no header, in the same order as
# the embedding rows.  The lines are written verbatim rather than through
# to_csv, because CSV quoting wraps any label containing a double quote in
# extra quote characters.  Whitespace runs are collapsed so a tab or newline
# inside a label cannot split it into extra columns or rows, which would
# shift every following label away from its embedding.
metadata_tsv_path = 'metadata.tsv'
with open(metadata_tsv_path, 'w', encoding='utf-8') as metadata_file:
    for label in metadata_df['label']:
        metadata_file.write(' '.join(str(label).split()) + '\n')
print(f"Metadata saved to {metadata_tsv_path}")

print("\nNow we can upload these two files to https://projector.tensorflow.org/")

In [None]:
# Bundle both Projector files into one archive.  The standard-library zipfile
# module replaces the `!zip` shell call, so the cell no longer depends on a
# zip binary being installed.
import zipfile

with zipfile.ZipFile('embeddings.zip', 'w', compression=zipfile.ZIP_DEFLATED) as archive:
    for tsv_name in ('embeddings.tsv', 'metadata.tsv'):
        archive.write(tsv_name)

# Trigger a browser download when running in Google Colab; anywhere else the
# archive simply stays in the working directory instead of the cell crashing.
try:
    from google.colab import files
except ImportError:
    print("Not running in Colab - embeddings.zip was written to the working directory.")
else:
    files.download('embeddings.zip')

In [None]:
from sentence_transformers.util import cos_sim

# Row positions of the two movies to compare; edit to try other pairs.
i, j = 0, 15

# The result of cos_sim is indexed as a 1x1 grid; unwrap it to a plain float.
pair_similarity = cos_sim(embeddings[i], embeddings[j])
sim = pair_similarity[0][0].item()
print(f"Similarity between {df['title'].iloc[i]} and {df['title'].iloc[j]}: {sim:.4f}")

## FAISS search

In [None]:
!pip install faiss-cpu

In [None]:
import numpy as np
import faiss
import pandas as pd

# Reload the vectors saved by the embedding step and cast them to float32
# before they are handed to FAISS.
embeddings = np.load('embeddings_5k_nomic.npy').astype('float32')

# `df` is the cleaned DataFrame still in memory.  It is deliberately not
# reloaded from disk, so it cannot drift from the rows that were embedded.

print("Embeddings shape:", embeddings.shape)
print("DataFrame shape:", df.shape)

# Search results are mapped back to movies by row position, so a stale .npy
# file from an earlier run would silently return the wrong titles.
if len(embeddings) != len(df):
    raise ValueError(
        f"embeddings has {len(embeddings)} rows but df has {len(df)}; "
        "re-run the embedding cell so both stay aligned"
    )

In [None]:
# Inner-product FAISS index over all movie vectors.  The embeddings were
# normalised when they were encoded, so inner product equals cosine similarity.
dim = embeddings.shape[1]   # vector width (768 per the author's note)
index = faiss.IndexFlatIP(dim)
index.add(embeddings)

# Also keep the index on disk.
index_file = 'faiss_index_5k.faiss'
faiss.write_index(index, index_file)

print("FAISS index built and saved!")

In [None]:
def find_similar_movies(query_title, top_k=10):
    """Print the ``top_k`` movies whose embeddings are closest to a given movie.

    The query movie is looked up by title: an exact, case-insensitive match is
    preferred, otherwise the first title containing ``query_title`` is used.
    The text is matched literally rather than as a regular expression, so
    titles with metacharacters such as "(500) Days of Summer" can be searched.

    Relies on the module-level ``df``, ``embeddings`` and ``index``; row
    positions in ``df`` must line up with the rows of ``embeddings``.

    Args:
        query_title: Full or partial movie title.
        top_k: Maximum number of similar movies to print.

    Returns:
        None.  Results are printed; "Movie not found!" is printed when the
        query is blank or no title matches.
    """
    query = str(query_title).strip()
    if not query:
        # An empty substring would match every title.
        print("Movie not found!")
        return

    titles = df['title']
    exact_mask = titles.str.strip().str.lower().eq(query.lower()).to_numpy(dtype=bool)
    partial_mask = titles.str.contains(
        query, case=False, na=False, regex=False
    ).to_numpy(dtype=bool)

    # Positions (not index labels) of the candidates, so the lookup stays
    # correct even when df's index labels are not unique.
    positions = np.flatnonzero(exact_mask if exact_mask.any() else partial_mask)
    if positions.size == 0:
        print("Movie not found!")
        return

    # Row position of the query movie; it addresses the same row in embeddings.
    movie_iloc = int(positions[0])

    query_vector = embeddings[movie_iloc].reshape(1, -1)
    # Ask for one extra hit because the query movie itself is normally returned.
    distances, indices = index.search(query_vector, top_k + 1)

    print(f"\nðŸŽ¥ Movies similar to: **{df['title'].iloc[movie_iloc]}**\n")
    rank = 0
    for score, position in zip(distances[0], indices[0]):
        # Drop the query itself wherever it ranks (ties with duplicate texts
        # can push it off first place) and the -1 padding FAISS returns when
        # the index holds fewer vectors than requested.
        if position < 0 or position == movie_iloc:
            continue
        rank += 1
        if rank > top_k:
            break
        title = df['title'].iloc[position]
        print(f"{rank:2d}. {title} (Similarity: {score:.4f})")

# Try the search with two sample titles.
for sample_title in ("Inception", "The Dark Knight"):
    find_similar_movies(sample_title, top_k=8)

## Create 3D Galaxy Visualization

In [None]:
# Reduce to 3D using UMAP (best for galaxy look)
import umap
import plotly.express as px

print("Reducing to 3D... (this may take 30â€“90 seconds)")

# UMAP settings: three output dimensions and a fixed random_state.
umap_settings = {
    'n_components': 3,
    'n_neighbors': 30,
    'min_dist': 0.1,
    'random_state': 42,
}
reducer = umap.UMAP(**umap_settings)
umap_3d = reducer.fit_transform(embeddings)

# Store each projected axis as its own DataFrame column.
for axis_position, column_name in enumerate(('umap_x', 'umap_y', 'umap_z')):
    df[column_name] = umap_3d[:, axis_position]

print("3D coordinates created!")

In [None]:
# Interactive 3D scatter of the UMAP coordinates: one marker per movie,
# coloured by release year, with title, year and genres shown on hover.
# The title reports the actual number of plotted movies instead of a fixed
# "5,000", because rows dropped during cleaning can leave fewer than that.
fig = px.scatter_3d(
    df,
    x='umap_x',
    y='umap_y',
    z='umap_z',
    hover_name='title',
    hover_data=['year', 'genres'],
    color='year',                    # colour gradient by release year
    # color='genres',                # alternative: colour by genre
    title=f"ðŸŽ¥ Movie Galaxy - 3D Interactive ({len(df):,} Movies)",
    opacity=0.8,
    width=1000,
    height=800
)

# Small markers keep dense regions readable.
fig.update_traces(marker=dict(size=4))
fig.update_layout(scene=dict(
    xaxis_title='UMAP 1',
    yaxis_title='UMAP 2',
    zaxis_title='UMAP 3'
))

fig.show()

In [None]:
# Export the interactive figure to an HTML file.
fig.write_html("movie_galaxy_3d.html")
print("Saved as movie_galaxy_3d.html â€” open in browser!")