# Data Collection

The first step is to get the data we will be working with from Kaggle.

We will use this dataset:

https://www.kaggle.com/datasets/alanvourch/tmdb-movies-daily-updates

It contains a lot of data (about 240 MB), so there is plenty to work with.

In [None]:
%pip -q install pandas openai python-dotenv chromadb

In [None]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is where the API key used by `OpenAI()` in a
# later cell comes from — confirm.
load_dotenv()


In [None]:
# pandas is used to load ../raw/TMDB_all_movies.csv in the next cell
import pandas as pd


In [None]:

# Read the raw TMDB export into a DataFrame.
df = pd.read_csv('../raw/TMDB_all_movies.csv')

# Preview the first rows.
df.head()

In [None]:
# Print the shape of the dataframe as (number of rows, number of columns)
print(df.shape)

In [None]:
# Print the columns of the dataframe
print(df.columns)
# Print the dtypes of the dataframe
print(df.dtypes)
# Print the info of the dataframe.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() — that would emit a stray "None" line.
df.info()
# Print the description (summary statistics) of the dataframe
print(df.describe())

In [None]:
# Print the first row (selected by position) to see one full record
print(df.iloc[0])


In [None]:
# Sort by release_date in descending order and preview the top rows, i.e. the
# latest dates. The earliest dates are not displayed by this cell.
# NOTE(review): if release_date is stored as strings the ordering is
# lexicographic — confirm the column dtype.
df = df.sort_values(by='release_date', ascending=False)
df.head()

In [None]:
# List the distinct values that occur in the status column
df['status'].unique()

In [None]:
# Keep only the movies whose status is exactly 'Released'
released = df.loc[df['status'] == 'Released']
print(released.shape)
released.head(100)


In [None]:
# Released movies whose release_date compares below '2025-01-01'
before_2025 = released.loc[released['release_date'] < '2025-01-01']


In [None]:
# Of those, keep the rows whose release_date compares above '1950-01-01'
after_1950 = before_2025.loc[before_2025['release_date'] > '1950-01-01']

print(after_1950.shape)
after_1950.head()

In [None]:
# Drop movies that nobody has voted on (vote_count must be greater than 0)
voted_on = after_1950.loc[after_1950['vote_count'] > 0]
print(voted_on.shape)
voted_on.head()

In [None]:
# Order by vote count, most-voted first, and rebind voted_on to the sorted frame
voted_on = voted_on.sort_values(by='vote_count', ascending=False)
voted_on.head()


In [None]:
# Keep at most the first 150,000 rows of voted_on
top_by_vote_count = voted_on.iloc[:150000]

### Embeddings

Next, we need to generate the embeddings.

For now we embed only each movie's title and overview, using OpenAI embeddings.

In [None]:
# From here on `df` refers to the filtered frame (same object, not a copy)
df = top_by_vote_count

In [None]:
# Build one "<title> <overview>" string per movie; these are the texts that
# get embedded.
# Missing values come back from pandas as NaN, which an f-string would render
# as the literal text "nan", so blank them out before formatting.
titles = df['title'].fillna('').astype(str).tolist()
overviews = df['overview'].fillna('').astype(str).tolist()

# strip() removes the dangling separator left when one of the two parts is empty.
titles_and_overviews = [
    f"{title} {overview}".strip() for title, overview in zip(titles, overviews)
]

# NOTE(review): a row with neither a title nor an overview now yields "", which
# the embeddings endpoint may reject — filter such rows out upstream if any exist.

print(titles_and_overviews[0])


In [None]:
from openai import OpenAI
import openai
from tqdm.notebook import tqdm
from math import ceil

client = OpenAI()

def chunkify(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    offsets = range(0, len(lst), n)
    yield from (lst[offset:offset + n] for offset in offsets)


In [None]:
# Parameters
chunk_size = 2048  # number of texts sent per embeddings request
model_name = "text-embedding-3-large"  # Ensure this is the correct model name

# Initialize list to store embeddings
all_embeddings = []

# Calculate total number of chunks (used for progress reporting below)
total_chunks = ceil(len(titles_and_overviews) / chunk_size)

# Process each chunk
for chunk_number, chunk in enumerate(chunkify(titles_and_overviews, chunk_size), start=1):
    # Only the network call sits inside the try so unrelated errors are not
    # reported as API failures.
    try:
        response = client.embeddings.create(
            input=chunk,
            model=model_name
        )
    except openai.OpenAIError as e:
        print(f"An error occurred: {e}")
        # Optionally, implement retry logic or handle the error as needed.
        # A bare `raise` re-raises the active exception with its traceback intact.
        raise

    # Each returned item carries the position of its input in `index`; order by
    # it explicitly so all_embeddings stays aligned with titles_and_overviews.
    ordered_items = sorted(response.data, key=lambda item: item.index)
    all_embeddings.extend(item.embedding for item in ordered_items)
    print(
        f"Processed {len(all_embeddings)} embeddings out of {len(titles_and_overviews)} "
        f"(chunk {chunk_number}/{total_chunks})"
    )

# Verify the number of embeddings matches the number of documents
assert len(all_embeddings) == len(
    titles_and_overviews), "Mismatch between embeddings and documents."

# Now, all_embeddings[i] corresponds to titles_and_overviews[i]
print("All embeddings generated successfully.")

In [None]:
# Build a copy of df that carries one extra column holding each row's embedding
df_with_embeddings = df.assign(embedding=all_embeddings)




In [None]:
# Drop the TMDB id column and replace the old row labels with a fresh
# 0..n-1 index that follows the current row order.
# reset_index(drop=True) discards the old index directly instead of first
# materialising it as an 'index' column and dropping that column by name.
ided_df = df_with_embeddings.reset_index(drop=True).drop(columns=['id'])
ided_df.head()

In [None]:
# Pickle the dataframe (embedding column included); this is the default path
# that load_df() reads back.
ided_df.to_pickle('../raw/df_with_embeddings.pkl')

In [None]:
# Assuming you're using a chroma db client for semantic search
from chromadb import Client


def get_embeddings(
    inputs: list[str], model_name: str = "text-embedding-3-large"
) -> list[list[float]]:
    """Return one embedding vector per input text, in input order.

    Args:
        inputs: Texts to embed in a single request.
        model_name: Embedding model passed to the API.

    Returns:
        A list where element ``i`` is the embedding of ``inputs[i]``. An empty
        ``inputs`` returns ``[]`` without calling the API.
    """
    if not inputs:
        # Nothing to embed; skip the request entirely.
        return []

    response = client.embeddings.create(
        input=inputs,
        model=model_name
    )
    # Order by each item's `index` so the output lines up with `inputs`.
    ordered_items = sorted(response.data, key=lambda item: item.index)
    return [item.embedding for item in ordered_items]


def load_df(file_path: str = '../raw/df_with_embeddings.pkl'):
    """Unpickle and return the DataFrame stored at ``file_path``."""
    frame = pd.read_pickle(file_path)
    return frame


class MovieData:
    """Semantic search over a movie DataFrame backed by a Chroma collection.

    The DataFrame's index labels (converted to strings) are used as the
    collection ids, and the vectors come from the frame's ``embedding`` column.
    Search results are mapped back to rows with ``int(id)``, so the index
    labels must be integers.
    """

    COLLECTION_NAME = "movies"

    def __init__(
        self,
        movies_df: pd.DataFrame,
        batch_size: int = 41666,
        embedding_model: str = "text-embedding-3-large",
    ):
        """Attach to the "movies" collection, (re)building it when needed.

        Args:
            movies_df: Frame with an ``embedding`` column and integer index.
            batch_size: Number of rows sent per ``upsert`` call.
                NOTE(review): 41666 presumably matches Chroma's maximum batch
                size — confirm for the installed version.
            embedding_model: Model used to embed search queries; it should be
                the model that produced the ``embedding`` column.
        """
        self.df = movies_df
        self.embedding_model = embedding_model

        print("Loading vector db")
        self.db_client = Client()  # Assuming chromadb client initialization

        # Reuse an existing collection only when its size matches the frame.
        existing = self._find_existing_collection()
        if existing is not None and existing.count() == len(self.df):
            self.collection = existing
            return

        if existing is not None:
            print("Collection does not match the number of movies")
            # Start from scratch: upserting into the old collection would leave
            # behind ids that no longer exist in the frame.
            self.db_client.delete_collection(name=self.COLLECTION_NAME)

        self.collection = self.db_client.get_or_create_collection(
            name=self.COLLECTION_NAME)
        self._populate(batch_size)

    def _find_existing_collection(self):
        """Return the existing "movies" collection, or None if it is missing."""
        try:
            return self.db_client.get_collection(name=self.COLLECTION_NAME)
        except Exception:
            # NOTE(review): the exception raised for a missing collection
            # appears to differ between chromadb releases, hence the broad
            # (but no longer bare) except — narrow it once the version is pinned.
            return None

    def _populate(self, batch_size: int) -> None:
        """Upsert every row's embedding into the collection in batches."""
        id_strings = [str(label) for label in self.df.index.tolist()]
        embeddings = self.df['embedding'].tolist()

        for start in range(0, len(id_strings), batch_size):
            print(f"Processing batch {start} of {len(id_strings)}")
            stop = start + batch_size
            self.collection.upsert(
                ids=id_strings[start:stop], embeddings=embeddings[start:stop])

    def semantic_search(self, query: str, k: int = 10):
        """Return the ``k`` rows nearest to ``query`` with a ``distance`` column.

        The query is embedded with ``self.embedding_model`` instead of a
        notebook-level global, so the method works regardless of which other
        cells have been run.
        """
        query_embedding = get_embeddings(
            [query], model_name=self.embedding_model)[0]
        results = self.collection.query(
            query_embeddings=[query_embedding], n_results=k)

        # Collection ids are the stringified index labels of self.df.
        row_labels = [int(doc_id) for doc_id in results['ids'][0]]

        # assign() returns a new frame, so no column is written onto a slice
        # of self.df.
        return self.df.loc[row_labels].assign(distance=results['distances'][0])


# movie_data = MovieData(ided_df)

In [None]:

# NOTE(review): `movie_data` is not assigned in any visible cell — the only
# `MovieData(ided_df)` construction is commented out in the previous cell —
# so it must be created before this cell can run.
movie_data.semantic_search("Ghosts of a relative")

In [None]:
# Reload the pickled frame from load_df's default path
# (../raw/df_with_embeddings.pkl) and preview it.
full_df = load_df()
full_df.head()

In [None]:
# Keep the first 50,000 rows and save them as a separate pickle.
# NOTE(review): the variable name `top_1500` is a leftover — the slice holds up
# to 50,000 rows, matching the file name.
top_1500 = full_df.head(50000)
top_1500.to_pickle('../raw/top_50000.pkl')


In [None]:
# Read the 50,000-row subset back from disk and preview it
top: pd.DataFrame = pd.read_pickle('../raw/top_50000.pkl')
top.head()


In [None]:

# Precompute norms for the movie embeddings to speed up cosine similarity
import numpy as np

# Convert each stored embedding to a NumPy array, then store the value of
# np.linalg.norm (called with its default settings) for every row.
top['embedding'] = top['embedding'].apply(np.array)
top['embedding_norm'] = top['embedding'].apply(np.linalg.norm)

top.head()


In [None]:
# Write `top` to ../raw/top_50000.pkl, the same path it was read from
top.to_pickle('../raw/top_50000.pkl')