# Movie Galaxy — Semantic Movie Exploration


## Imports

In [None]:
import os
import pandas as pd
import numpy as np

## Loading Dataset

In [None]:
# kagglehub is imported in this cell, right before it is used.
import kagglehub
# Download the TMDB movies dataset and keep the path returned by kagglehub.
movies_path = kagglehub.dataset_download("alanvourch/tmdb-movies-daily-updates")

# Disabled: download of a second (anime) dataset; `animes_path` is not
# referenced anywhere in the visible cells.
# animes_path = kagglehub.dataset_download("calebmwelsh/anilist-anime-dataset")

In [None]:
# List the entries under the download path to see which files are available.
os.listdir(movies_path)

In [None]:
# Read the full TMDB movie table from the downloaded dataset folder.
movies_csv_path = os.path.join(movies_path, "TMDB_all_movies.csv")
movies_df = pd.read_csv(movies_csv_path)

## Exploring Dataset

In [None]:
# Download sample data to view in Excel
# sample_movies = movies_df.sample(n=100, random_state=42)
# sample_movies.to_csv("movies_sample_100.csv", index=False)

In [None]:
# Quick structural overview of the raw movies table.
print("Shape:", movies_df.shape)

column_names = movies_df.columns.tolist()
print("\nColumns:", column_names)

print("\nFirst 3 rows:")
display(movies_df.head(3))

print("\nInfo:")
movies_df.info()

# Share of missing values per column, in percent, largest first.
print("\nMissing values (%):")
missing_counts = movies_df.isnull().sum()
missing_pct = (missing_counts / len(movies_df) * 100).round(2)
print(missing_pct.sort_values(ascending=False))

print("\nBasic stats:")
display(movies_df.describe())

In [None]:
# Fields used later to build the embedding text: title, overview, genres, cast, director, year

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import ast

# Use seaborn's "whitegrid" style for the figures drawn below.
sns.set_style("whitegrid")

# ── Plot 1: number of movies released per year ──
print("\n--- Movies Released Each Year ---")
# errors='coerce' turns unparseable release dates into NaT instead of raising.
parsed_release_dates = pd.to_datetime(movies_df['release_date'], errors='coerce')
movies_df['release_year'] = parsed_release_dates.dt.year
year_counts = movies_df['release_year'].value_counts().sort_index()

fig_years, ax_years = plt.subplots(figsize=(15, 7))
sns.lineplot(x=year_counts.index, y=year_counts.values, ax=ax_years)
ax_years.set_title('Number of Movies Released Each Year')
ax_years.set_xlabel('Year')
ax_years.set_ylabel('Number of Movies')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# ── Plot 2: movies per original language (20 most frequent) ──
# Original author's note: most movies are in English, so this feature is ignored.
print("\n--- Movies by Original Language ---")
language_counts = movies_df['original_language'].value_counts().head(20)

fig_lang, ax_lang = plt.subplots(figsize=(12, 7))
sns.barplot(
    x=language_counts.index,
    y=language_counts.values,
    hue=language_counts.index,
    palette='viridis',
    legend=False,
    ax=ax_lang,
)
ax_lang.set_title('Top 20 Original Languages of Movies')
ax_lang.set_xlabel('Original Language')
ax_lang.set_ylabel('Number of Movies')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# How many movies received no votes at all?
# (Original author's note: for about 66% of the data the count is 0.)
has_no_votes = movies_df['vote_count'] == 0
zero_vote_count = movies_df[has_no_votes]
print(f"Number of movies with 0 vote count: {len(zero_vote_count)}")

# Same figure expressed as a share of all rows.
total_movies = len(movies_df)
percentage_zero_votes = len(zero_vote_count) / total_movies * 100
print(f"Percentage of movies with 0 vote count: {percentage_zero_votes:.2f}%")

## Clean Data

In [None]:
# Columns carried forward into the working table.
keep_cols = [
    'id',
    'title',
    'overview',
    'genres',
    'cast',
    'director',
    'release_date',
    'poster_path',
    'popularity',
]

# Work on an independent copy so the raw frame is left untouched.
df = movies_df.loc[:, keep_cols].copy()

# Release year as a nullable integer; unparseable dates are coerced to a
# missing value rather than raising.
release_dates_parsed = pd.to_datetime(df['release_date'], errors='coerce')
df['year'] = release_dates_parsed.dt.year.astype('Int64')

print("Shape after selecting columns:", df.shape)
print(df.dtypes)

### Handle missing values

In [None]:
# Placeholder written into each text column wherever the value is missing.
fill_values = {'genres': 'Unknown', 'cast': '', 'director': 'Unknown'}
for column, placeholder in fill_values.items():
    df[column] = df[column].fillna(placeholder)

# A missing year becomes 0, after which the column is cast to a plain int.
df['year'] = df['year'].fillna(0).astype(int)

print("Missing values after filling:")
print(df.isnull().sum())

In [None]:
# A row is kept only if it has a non-blank title and an overview that is
# present, non-empty and at least 20 characters long.
has_title = df['title'].notna() & (df['title'].str.strip() != '')
has_overview = df['overview'].notna() & (df['overview'] != '')
overview_long_enough = df['overview'].str.len() >= 20

df = df[has_title & has_overview & overview_long_enough].copy()

print("Shape after dropping bad overview rows:", df.shape)
print(df.isnull().sum())

In [None]:
import ast

def get_top_5_cast(cast_value):
    """Return at most five cast names as a single ', '-separated string.

    Accepted inputs:
      * a stringified Python list, e.g. "['Tom Hanks', 'Meg Ryan']";
      * a comma-separated string, e.g. "Tom Hanks, Meg Ryan";
      * an actual list or tuple of names;
      * a single name without commas (returned stripped, capped at 150 chars).

    Blank entries and None entries are dropped *before* the five-name limit is
    applied, so "A, , B" yields "A, B" rather than "A, , B".

    Returns '' for missing input (None, NaN, empty or whitespace-only string),
    for any other non-string scalar, and for a bracketed string that cannot
    be parsed as a literal.
    """
    if isinstance(cast_value, (list, tuple)):
        raw_names = list(cast_value)
    elif isinstance(cast_value, str):
        text = cast_value.strip()
        if not text:
            return ''

        if text.startswith('['):
            try:
                parsed = ast.literal_eval(text)
            except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
                # Malformed literal: treat the cast as unknown.
                return ''
            if not isinstance(parsed, list):
                # e.g. "[...], x" parses to a tuple; keep a short raw excerpt.
                return text[:150]
            raw_names = parsed
        elif ',' in text:
            raw_names = text.split(',')
        else:
            # Single name (or unknown format): keep it, but bounded in length.
            return text[:150]
    else:
        # None, NaN and any other non-string scalar count as "no cast".
        return ''

    names = [str(name).strip() for name in raw_names if name is not None]
    names = [name for name in names if name]
    return ', '.join(names[:5])


# Derive the shortened cast column, then eyeball eight random rows.
top_cast = df['cast'].apply(get_top_5_cast)
df['cast_top5'] = top_cast

print("Sample cast cleaning:")
cast_preview = df[['title', 'cast', 'cast_top5']].sample(8)
display(cast_preview)

In [None]:
# Show the (rows, columns) of the working table at this point.
df.shape

In [None]:
from datetime import datetime

# Restrict the table to rows whose year is at or after (this year - 5).
# NOTE(review): year_cutoff..current_year is inclusive on both ends, i.e. six
# calendar years, and nothing excludes years later than current_year.
# The result also depends on the date the cell is run. Confirm this is intended.
current_year = datetime.now().year
year_cutoff = current_year - 5

is_recent = df['year'] >= year_cutoff
df = df[is_recent].copy()

recent_count = df.shape[0]
print(f"Number of movies in the last 5 years (from {year_cutoff} to {current_year}): {recent_count}")
print("Shape of the DataFrame with recent movies:", df.shape)

In [None]:
def build_text(row):
    """Compose the text that will be embedded for one movie row.

    `row` is indexed by 'title', 'overview', 'genres', 'cast_top5',
    'director' and 'year'. Title and overview are always included; the other
    fields are skipped when they hold their placeholder ('Unknown' for genres
    and director, an empty value for cast_top5, 0 for year).

    Returns the sentences joined by single spaces and prefixed with
    "passage: " (the E5 prefix).
    """
    # Cap the overview at 650 characters and drop trailing spaces and . ! ?
    # so the period appended below is the only terminator.
    overview = row['overview'][:650].rstrip(" .!?")

    sentences = [
        f"Title: {row['title']}.",
        f"Overview: {overview}.",
    ]

    # (include?, sentence) pairs, in the order they appear in the output.
    optional_sentences = [
        (row['genres'] != 'Unknown', f"Genres: {row['genres']}."),
        (bool(row['cast_top5']), f"Cast: {row['cast_top5']}."),
        (row['director'] != 'Unknown', f"Director: {row['director']}."),
        (row['year'] != 0, f"Year: {row['year']}."),
    ]
    sentences.extend(sentence for include, sentence in optional_sentences if include)

    return "passage: " + " ".join(sentences)


from tqdm.auto import tqdm

# Register tqdm's pandas integration; this must run before `progress_apply`
# is called below.
tqdm.pandas(desc="Building embedding text")
# Build one embedding string per row (axis=1 passes each row to build_text).
df['text_to_embed'] = df.progress_apply(build_text, axis=1)

# Eyeball three random rows of the result.
print("Sample text to embed:")
display(df[['title', 'text_to_embed']].sample(3))


In [None]:
# Show the complete embedding text of the first row.
df['text_to_embed'].iloc[0]

In [None]:
# Show the (rows, columns) of the table that will be saved and embedded.
df.shape

In [None]:
# Save the cleaned version
# Write the cleaned table to a Parquet file in the working directory;
# index=False leaves the DataFrame index out of the file.
df.to_parquet('movies_cleaned_for_embedding.parquet', index=False)

### Create Embeddings

In [None]:
!pip install -q sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
from tqdm.auto import tqdm
import os
import torch

# ────────────────────────────────────────────────
# CONFIG
# ────────────────────────────────────────────────
model_name = "intfloat/multilingual-e5-large"
batch_size = 192
checkpoint_every = 5000  # approximate number of rows between checkpoints
output_dir = "./checkpoints"

# NOTE: despite the .npy suffix, the embeddings file is a raw float32 buffer
# written through np.memmap (no .npy header). It has to be read back with
# np.memmap and a known shape, not with np.load.
embeddings_file = "movie_embeddings.npy"
progress_file = "embedding_progress.npy"

os.makedirs(output_dir, exist_ok=True)

full_embeddings_path = os.path.join(output_dir, embeddings_file)
progress_path = os.path.join(output_dir, progress_file)

# ────────────────────────────────────────────────
# DATA
# ────────────────────────────────────────────────
texts = df["text_to_embed"].tolist()
total = len(texts)

if total == 0:
    # Fail early with a clear message instead of inside np.memmap.
    raise ValueError("No texts to embed: df['text_to_embed'] is empty")

# Resume position.
# A progress file is only trusted when the embeddings file it refers to is
# still present; otherwise resuming would leave the skipped rows as zeros in
# a freshly created array.
if os.path.exists(progress_path) and os.path.exists(full_embeddings_path):
    start_idx = int(np.load(progress_path)[0])
    if 0 <= start_idx <= total:
        print(f"Resuming from index {start_idx} / {total}")
    else:
        print(f"Ignoring out-of-range progress index {start_idx}; starting from beginning")
        start_idx = 0
else:
    if os.path.exists(progress_path):
        print("Progress file found but embeddings file is missing; ignoring it")
    start_idx = 0
    print("Starting from beginning")

# ────────────────────────────────────────────────
# MODEL
# ────────────────────────────────────────────────
print(f"Loading {model_name}")
model = SentenceTransformer(model_name)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

# speed + stability improvements
torch.set_float32_matmul_precision("high")
if device == "cuda":
    torch.cuda.empty_cache()

print(f"Using device: {device}")

# Automatically detect embedding dimension
embedding_dim = model.get_sentence_embedding_dimension()
print(f"Embedding dimension: {embedding_dim}")

# ────────────────────────────────────────────────
# CREATE / LOAD MEMORY-MAPPED ARRAY
# ────────────────────────────────────────────────
bytes_expected = total * embedding_dim * np.dtype("float32").itemsize

if start_idx > 0:
    # Resuming: the file on disk must have exactly the size implied by the
    # current data and model, otherwise rows would be misaligned.
    bytes_on_disk = os.path.getsize(full_embeddings_path)
    if bytes_on_disk != bytes_expected:
        raise ValueError(
            f"Existing embeddings file {full_embeddings_path} is {bytes_on_disk} bytes, "
            f"but {bytes_expected} bytes are needed for shape ({total}, {embedding_dim}). "
            "The checkpoint was probably written for different data or a different model; "
            "delete the checkpoint directory to start over."
        )
    embeddings = np.memmap(
        full_embeddings_path,
        dtype="float32",
        mode="r+",
        shape=(total, embedding_dim),
    )
    print("Loaded existing embedding memmap")
else:
    # Starting from scratch: (re)create the file with the right size.
    embeddings = np.memmap(
        full_embeddings_path,
        dtype="float32",
        mode="w+",
        shape=(total, embedding_dim),
    )
    print("Created new embedding memmap")

# ────────────────────────────────────────────────
# EMBEDDING LOOP (RESUMABLE)
# ────────────────────────────────────────────────
print("Starting / resuming embedding...")

# Number of batches between checkpoints (at least one).
save_interval_batches = max(1, checkpoint_every // batch_size)

for step, i in enumerate(
    tqdm(range(start_idx, total, batch_size), desc="Embedding batches")
):
    end = min(i + batch_size, total)
    chunk_texts = texts[i:end]

    try:
        chunk_emb = model.encode(
            chunk_texts,
            batch_size=batch_size,
            show_progress_bar=False,
            normalize_embeddings=True,
            convert_to_numpy=True,
        )

        embeddings[i:end] = chunk_emb

        # periodic checkpoint: data is flushed first so the recorded index
        # never points past what has been written
        if step % save_interval_batches == 0 or end == total:
            embeddings.flush()  # write memmap to disk
            np.save(progress_path, np.array([end]))
            print(f"Checkpoint saved at {end}/{total}")

    except Exception as e:
        print(f"Error in batch {i}-{end}: {e}")
        print("Saving progress before exit...")
        embeddings.flush()
        # Rows before i are complete; the failed batch starts at i.
        np.save(progress_path, np.array([i]))
        raise  # re-raise with the original traceback

# Final save
embeddings.flush()
np.save(progress_path, np.array([total]))

print("\n✅ Embedding complete!")
print(f"Saved embeddings at: {full_embeddings_path}")
print(f"Final shape: ({total}, {embedding_dim})")


### Visualisation

In [None]:
# Bundle the checkpoint directory into checkpoints.zip (shell command).
!zip -r checkpoints.zip ./checkpoints
# google.colab is imported here, so this cell presumably only runs on Colab — confirm.
from google.colab import files
# Pass the archive to files.download.
files.download('checkpoints.zip')

In [None]:
import numpy as np
import pandas as pd
import os

# Load embeddings
embeddings_path = './checkpoints/movie_embeddings.npy'
# The file is a raw float32 buffer, so its shape has to be supplied. The
# dimension is fixed here; the row count is derived from the file size rather
# than hard-coded, so it cannot drift from what is actually on disk.
embedding_dimension = 1024  # must equal the dimension used when the file was written
bytes_per_row = embedding_dimension * np.dtype('float32').itemsize
embeddings_file_size = os.path.getsize(embeddings_path)
if embeddings_file_size == 0 or embeddings_file_size % bytes_per_row != 0:
    raise ValueError(
        f"{embeddings_path} is {embeddings_file_size} bytes, which is not a whole number "
        f"of {embedding_dimension}-dimensional float32 rows"
    )
total_embeddings = embeddings_file_size // bytes_per_row

# Row i of the embeddings must describe row i of df; refuse to write files
# whose rows would not line up.
if total_embeddings != len(df):
    raise ValueError(
        f"Embeddings file holds {total_embeddings} rows but df has {len(df)} rows; "
        "the metadata would not line up with the vectors"
    )

embeddings = np.memmap(embeddings_path, dtype='float32', mode='r', shape=(total_embeddings, embedding_dimension))

# Prepare metadata
# 'df' must be the same filtered DataFrame, in the same order, that was used
# to create the embeddings.
metadata_df = df[['title', 'genres', 'cast_top5', 'director', 'year']].copy()

# Create a single label for each movie by concatenating relevant metadata
# This will be displayed when hovering over points in Projector
metadata_df['label'] = metadata_df.apply(
    lambda row: f"{row['title']} ({row['year']}) - {row['genres']} - {row['cast_top5']}",
    axis=1
)
# Collapse tabs/newlines inside a label to single spaces so every label
# occupies exactly one line of the TSV file.
metadata_df['label'] = (
    metadata_df['label'].str.replace(r'\s+', ' ', regex=True).str.strip()
)

# Save embeddings to TSV
# NOTE(review): savetxt writes every value as text, so for a large array this
# file will be very large — confirm the target tool can load it.
embeddings_tsv_path = 'embeddings.tsv'
np.savetxt(embeddings_tsv_path, embeddings, delimiter='\t')
print(f"Embeddings saved to {embeddings_tsv_path}")

# Save metadata to TSV (single column, no header row)
metadata_tsv_path = 'metadata.tsv'
metadata_df['label'].to_csv(metadata_tsv_path, sep='\t', index=False, header=False)
print(f"Metadata saved to {metadata_tsv_path}")

print("\nNow you can upload these two files to https://projector.tensorflow.org/")

In [None]:
# Extract checkpoints.zip into the current working directory (shell command).
# NOTE(review): this cell sits after the cell that reads ./checkpoints; when
# restoring from the archive it presumably has to be run first — confirm.
!unzip checkpoints.zip