# Data preprocessing

This notebook preprocesses MovieLens 32M and TMDB datasets to create three model-specific datasets:
1. **Collaborative Filtering**: User-item interaction matrix
2. **Content-Based Filtering**: Movie metadata with embeddings
3. **Two-Tower Model**: Combined ratings and movie features

## 1. Data Fetching

In [None]:
# Configure kagglehub. No login happens in this cell: it only sets the
# KAGGLEHUB_CACHE environment variable so that downloads land under ./data.
# NOTE(review): presumably kagglehub reads this variable when a download is
# requested — confirm against the kagglehub documentation.
import kagglehub as kh
import os

os.environ["KAGGLEHUB_CACHE"] = "./data"

In [None]:
# Fetch both Kaggle datasets. Each call returns the local directory that holds
# the downloaded files; those directories are used as path prefixes when the
# CSVs are loaded later in the notebook.
TMDB_DATASET_HANDLE = "asaniczka/tmdb-movies-dataset-2023-930k-movies"
MOVIELENS_DATASET_HANDLE = "justsahil/movielens-32m"

movie_path = kh.dataset_download(TMDB_DATASET_HANDLE)
ratings_path = kh.dataset_download(MOVIELENS_DATASET_HANDLE)

print(f"Movies path: {movie_path}")
print(f"Ratings path: {ratings_path}")

## 2. Load Data with Polars


In [None]:
import polars as pl
import numpy as np

# TMDB metadata sits directly inside its download folder.
movie_df = pl.read_csv(f"{movie_path}/TMDB_movie_dataset_v11.csv")

# The MovieLens ratings and the MovieLens->TMDB id links share the ml-32m subfolder.
movielens_dir = f"{ratings_path}/ml-32m"
ratings_df = pl.read_csv(f"{movielens_dir}/ratings.csv")
links_df = pl.read_csv(f"{movielens_dir}/links.csv")

# Report (rows, columns) for each table as a quick sanity check.
print(f"Movies: {movie_df.shape}")
print(f"Ratings: {ratings_df.shape}")
print(f"Links: {links_df.shape}")

## 3. Early Column Filtering


In [None]:
# Keep only the metadata columns used by the rest of the notebook; dropping the
# others early keeps every later join and conversion smaller.
movie_columns = [
    "id",
    "title",
    "overview",
    "tagline",
    "genres",
    "keywords",
    "vote_average",
    "vote_count",
    "runtime",
    "release_date",
    "original_language",
]
movie_df = movie_df.select(movie_columns)

print(f"Columns: {movie_df.columns}")

## 4. Handle Invalid Ratings

Ratings below 0.5 indicate that no rating was given (0.5 itself is a valid rating), so they are replaced with null.


In [None]:
# A rating strictly below 0.5 is treated as "no rating" rather than a real score.
is_invalid_rating = pl.col("rating") < 0.5

# Count the affected rows before rewriting the column, so the number can be reported.
invalid_count = ratings_df.filter(is_invalid_rating).shape[0]
print(f"Invalid ratings: {invalid_count}")

# Replace the invalid values with null and leave every other rating untouched.
cleaned_rating = (
    pl.when(is_invalid_rating).then(None).otherwise(pl.col("rating")).alias("rating")
)
ratings_df = ratings_df.with_columns(cleaned_rating)

print(f"Set to null: {invalid_count}")

## 5. Merge Datasets


In [None]:
# Two inner joins in sequence:
#   1. ratings -> links on movieId, which attaches the tmdbId of each rated movie;
#   2. the result -> TMDB metadata on tmdbId == id.
# Because both joins are inner, ratings without a matching link or TMDB row are dropped.
merged_df = (
    ratings_df
    .join(links_df, on="movieId", how="inner")
    .join(movie_df, left_on="tmdbId", right_on="id", how="inner")
)

In [None]:
# Display the merged frame (ratings joined to links and TMDB metadata) for a visual check.
merged_df

## 6. Data Cleaning


In [None]:
# Fill gaps so later steps never see nulls in the feature columns:
#   - free-text fields become empty strings,
#   - runtime falls back to the median runtime over all merged rows,
#   - vote statistics fall back to 0.
text_columns = ["overview", "tagline", "genres", "keywords"]
merged_df = merged_df.with_columns(
    *[pl.col(name).fill_null("") for name in text_columns],
    pl.col("runtime").fill_null(pl.col("runtime").median()),
    pl.col("vote_average").fill_null(0),
    pl.col("vote_count").fill_null(0),
)

# Rows without a movieId or a title cannot be used; drop them and report how many went.
initial_rows = merged_df.shape[0]
merged_df = merged_df.drop_nulls(subset=["movieId", "title"])
dropped_rows = initial_rows - merged_df.shape[0]
print(f"Dropped {dropped_rows} rows")

## 7. Create Dataset 1: Collaborative Filtering


In [None]:
# Collaborative filtering needs only the interaction columns, and only rows
# that carry a real rating (nulls were introduced for invalid ratings earlier).
ratings_cf = (
    merged_df
    .filter(pl.col("rating").is_not_null())
    .select(["userId", "movieId", "tmdbId", "rating", "timestamp"])
)

# Sparsity = share of the user x movie grid that has no rating.
n_users = ratings_cf["userId"].n_unique()
n_movies = ratings_cf["tmdbId"].n_unique()
n_cells = n_users * n_movies
sparsity = 1 - (ratings_cf.shape[0] / n_cells)
print(f"Matrix sparsity: {sparsity:.4%}")

In [None]:
# Summary statistics for every column of the collaborative-filtering dataset.
ratings_cf.describe()

In [None]:
# Display the collaborative-filtering dataset for a visual check.
ratings_cf

## 8. Create Dataset 2: Content-Based Filtering

Process movie metadata and generate embeddings


In [None]:
# Reduce the TMDB metadata to one row per movie id. This starts from movie_df
# (all TMDB movies), not from the ratings-joined merged_df.
movies_cb = movie_df.unique(subset=["id"])
unique_movie_count = movies_cb.shape[0]

print(f"Unique movies: {unique_movie_count:,}")
print(f"Columns: {movies_cb.columns}")

### Merging the feature columns

We merge the feature columns into a single text column so that each movie has one string to generate embeddings from.
I've decided to merge the `title`, `overview`, `tagline`, `genres`, and `keywords` columns, since they will be the most important for content-based filtering.

In [None]:
import pandas as pd

# Convert to pandas: the text cleaning and column concatenation that follow
# are written with pandas string operations.
movies_cb_pd = movies_cb.to_pandas()
# Preview the first rows of the converted frame.
movies_cb_pd.head()

In [None]:
import re

# Strip non-ASCII characters from the free-text fields. The pattern removes
# emojis, but it also removes every other non-ASCII character (accented
# letters, non-Latin scripts, typographic quotes, ...).
#
# Missing values are blanked BEFORE the substitution: movies_cb_pd is built
# from movie_df, which never received the fill_null("") applied to merged_df,
# so overview/tagline can still be null here. Running a regex substitution on a
# missing value raises TypeError, which is why fillna("") comes first.
for text_col in ("overview", "tagline"):
    movies_cb_pd[text_col] = (
        movies_cb_pd[text_col]
        .fillna("")
        .str.replace(r"[^\x00-\x7F]+", "", regex=True)
    )

# Concatenate the text features, separated by single spaces, into one string
# per movie. Missing parts contribute an empty string.
merged_text = movies_cb_pd["title"].fillna("")
for text_col in ("overview", "tagline", "genres", "keywords"):
    merged_text = merged_text + " " + movies_cb_pd[text_col].fillna("")

# Trim the ends, then collapse any run of whitespace (including the doubled
# spaces left behind by empty parts) into a single space.
movies_cb_pd["merged_text"] = merged_text.str.strip().str.replace(
    r"\s+", " ", regex=True
)

print(f"Average text length: {movies_cb_pd['merged_text'].str.len().mean():.2f}")

In [None]:
# Preview the frame again, now including the merged_text column.
movies_cb_pd.head()

### Generate TF-IDF Embeddings


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse as sp

# Sparse TF-IDF representation of each movie's merged text.
#   max_features=10000  -> cap on the vocabulary size
#   stop_words="english" -> drop common English stop words
#   ngram_range=(1, 2)  -> use unigrams and bigrams
#   min_df=2            -> ignore terms that appear in fewer than 2 documents
#   max_df=0.8          -> ignore terms that appear in more than 80% of documents
tfidf_vectorizer = TfidfVectorizer(
    max_features=10000, stop_words="english", ngram_range=(1, 2), min_df=2, max_df=0.8
)

tfidf_embeddings = tfidf_vectorizer.fit_transform(movies_cb_pd["merged_text"])

# Density is the share of stored (non-zero) entries; sparsity is its complement.
# Sparsity is reported as 1 - density, matching how the rating-matrix sparsity
# is computed earlier in the notebook.
n_docs, n_terms = tfidf_embeddings.shape
tfidf_density = tfidf_embeddings.nnz / (n_docs * n_terms)

print(f"TF-IDF shape: {tfidf_embeddings.shape}")
print(f"Vocabulary size: {len(tfidf_vectorizer.vocabulary_)}")
print(f"Sparsity: {1 - tfidf_density:.4%}")

### Generate BERT Embeddings


In [None]:
import torch

# Probe for the MPS backend (macOS). This cell is informational: it only prints
# the outcome, and no other cell in this notebook reads mps_device.
if not torch.backends.mps.is_available():
    print("MPS device not found.")
else:
    mps_device = torch.device("mps")
    # Smoke test: allocate a one-element tensor on the device and show it.
    x = torch.ones(1, device=mps_device)
    print(x)

In [None]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model by its Hugging Face identifier.
# No device argument is passed, so the library's own default device choice applies.
bert_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Encode every movie's merged text, in row order, into one NumPy array
# (one row per movie), processing 32 texts per batch with a progress bar.
corpus = movies_cb_pd["merged_text"].tolist()
bert_embeddings = bert_model.encode(
    corpus,
    batch_size=32,
    show_progress_bar=True,
    convert_to_numpy=True,
)

print(f"BERT embeddings shape: {bert_embeddings.shape}")
print(f"Embedding dimension: {bert_embeddings.shape[1]}")

In [None]:
# Columns kept in the stored content-based dataset. The raw keywords column is
# left out; its text is still part of merged_text.
content_based_columns = [
    "id",
    "title",
    "overview",
    "tagline",
    "genres",
    "vote_average",
    "vote_count",
    "runtime",
    "release_date",
    "original_language",
    "merged_text",
]
movies_cb_final = movies_cb_pd[content_based_columns]

# Back to Polars for storage. Row order is unchanged from movies_cb_pd, which
# is the order the embeddings were computed in.
movies_cb = pl.from_pandas(movies_cb_final)

print(f"Content-based dataset ready: {movies_cb.shape}")

## 9. Create Dataset 3: Two-Tower Model Training Data


In [None]:
# Display the content-based movie dataset before joining it to the ratings.
movies_cb

In [None]:
# Item-tower inputs: the movie id plus its text and numeric features.
item_tower_columns = [
    "id",
    "merged_text",
    "genres",
    "vote_average",
    "vote_count",
    "runtime",
]
movie_features = movies_cb.select(item_tower_columns)

# Attach the movie features to every rating (tmdbId == id). The join is inner,
# so ratings whose movie has no feature row are dropped.
two_tower_df = ratings_cf.join(
    movie_features,
    left_on="tmdbId",
    right_on="id",
    how="inner",
)

print(f"Two-tower training samples: {two_tower_df.shape[0]:,}")
print(f"Unique users: {two_tower_df['userId'].n_unique():,}")
print(f"Unique movies: {two_tower_df['tmdbId'].n_unique():,}")
print(f"\nColumns: {two_tower_df.columns}")

In [None]:
# Preview the first rows of the two-tower training data.
two_tower_df.head()

## 10. Data Validation & Statistics


In [None]:
import matplotlib.pyplot as plt

# One row of three histograms summarising the collaborative-filtering ratings.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# The plotting and groupby code below works on pandas objects, so convert once.
ratings_cf_pd = ratings_cf.to_pandas()

# Panel 1: distribution of the rating values themselves.
rating_ax = axes[0]
rating_ax.hist(ratings_cf_pd["rating"], bins=10, edgecolor="black", color="skyblue")
rating_ax.set_title("Rating Distribution")
rating_ax.set_xlabel("Rating")
rating_ax.set_ylabel("Count")

# Number of ratings contributed by each user / received by each movie.
user_counts = ratings_cf_pd.groupby("userId").size()
movie_counts = ratings_cf_pd.groupby("tmdbId").size()

# Panels 2 and 3 share the same layout: histogram of counts with a log-scaled
# y-axis, since a few users/movies have very many ratings.
count_panels = [
    (axes[1], user_counts, "lightcoral", "Ratings per User", "Number of Users"),
    (axes[2], movie_counts, "lightgreen", "Ratings per Movie", "Number of Movies"),
]
for panel_ax, panel_counts, panel_color, panel_title, panel_ylabel in count_panels:
    panel_ax.hist(panel_counts, bins=50, edgecolor="black", color=panel_color)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Number of Ratings")
    panel_ax.set_ylabel(panel_ylabel)
    panel_ax.set_yscale("log")

plt.tight_layout()
plt.show()

print(
    f"\nRatings per user - Min: {user_counts.min()}, Max: {user_counts.max()}, Median: {user_counts.median():.0f}"
)
print(
    f"Ratings per movie - Min: {movie_counts.min()}, Max: {movie_counts.max()}, Median: {movie_counts.median():.0f}"
)

## 11. Save Processed Datasets


In [None]:
import joblib
import json

# All processed artefacts are written under data/processed (created if missing).
output_dir = "data/processed"
os.makedirs(output_dir, exist_ok=True)

# 1. Collaborative Filtering
ratings_cf.write_parquet(f"{output_dir}/ratings_cf.parquet")

# 2. Content-Based Filtering
# The movie table and both embedding matrices are saved separately; rows of the
# embedding matrices follow the row order of movies_cb.
movies_cb.write_parquet(f"{output_dir}/movies_cb.parquet")
np.save(f"{output_dir}/bert_embeddings_cb.npy", bert_embeddings)
sp.save_npz(f"{output_dir}/tfidf_embeddings_cb.npz", tfidf_embeddings)
# The fitted vectorizer is saved with joblib so new text can be transformed
# later with the same vocabulary.
joblib.dump(tfidf_vectorizer, f"{output_dir}/tfidf_vectorizer.pkl")

# 3. Two-Tower Model
two_tower_df.write_parquet(f"{output_dir}/two_tower_train.parquet")

# 4. Metadata
# Every value is cast to a plain int/float so the dictionary is JSON-serialisable.
metadata = {
    "collaborative_filtering": {
        "num_ratings": int(ratings_cf.shape[0]),
        "num_users": int(ratings_cf["userId"].n_unique()),
        "num_movies": int(ratings_cf["tmdbId"].n_unique()),
        "avg_rating": float(ratings_cf["rating"].mean()),
        "sparsity": float(sparsity),
    },
    "content_based": {
        "num_movies": int(movies_cb.shape[0]),
        "tfidf_dim": int(tfidf_embeddings.shape[1]),
        "bert_dim": int(bert_embeddings.shape[1]),
        "bert_model": "sentence-transformers/all-MiniLM-L6-v2",
    },
    "two_tower": {
        "num_samples": int(two_tower_df.shape[0]),
        "num_users": int(two_tower_df["userId"].n_unique()),
        "num_movies": int(two_tower_df["tmdbId"].n_unique()),
    },
}

# The same metadata is written twice: next to the processed data and into the
# current working directory.
# NOTE(review): the working-directory copy looks like a leftover — confirm
# whether anything reads ./metadata.json before removing it.
for metadata_path in (f"{output_dir}/metadata.json", "metadata.json"):
    with open(metadata_path, "w") as f:
        json.dump(metadata, f, indent=2)