In [4]:
import pandas as pd
import numpy as np

In [5]:
def create_score(df):
    """
    Score every movie and keep only those at or above the 25th percentile.

    The score is ``vote_average * ln(vote_count + 1) + popularity / 10``.
    The input dataframe is left untouched; the score is attached to a new
    dataframe, which is then filtered.

    Args:
        df (pd.DataFrame): Movie data with ``vote_average``, ``vote_count``
            and ``popularity`` columns.

    Returns:
        pd.DataFrame: A new dataframe with an added ``score`` column,
        restricted to rows whose score is >= the 0.25 quantile of all scores.
        Rows whose score is NaN fail that comparison and are dropped as well.

    Raises:
        KeyError: If any of the three required columns is missing.
    """
    score = (
        df["vote_average"] * np.log(df["vote_count"] + 1)
        + df["popularity"] / 10
    )

    # assign() returns a new frame, so the caller's dataframe is not mutated
    # (the original in-place column write leaked a "score" column into it).
    scored = df.assign(score=score)

    threshold = scored["score"].quantile(0.25)
    return scored[scored["score"] >= threshold]

In [6]:
def remove_short(df):
    """
    Keep only the rows whose ``runtime`` is at least 60.

    Args:
        df (pd.DataFrame): Movie data with a ``runtime`` column.

    Returns:
        pd.DataFrame: The rows of ``df`` where ``runtime >= 60``. Rows with a
        missing runtime fail the comparison and are therefore excluded.
    """
    long_enough = df["runtime"] >= 60
    return df[long_enough]

In [7]:
def handling_missing_values(df):
    """
    Drop rows without an overview and blank out gaps in optional text columns.

    Rows whose ``overview`` is missing are removed. In the remaining rows,
    missing values in ``tagline``, ``director``, ``main_cast``, ``keywords``
    and ``poster_path`` are replaced by empty strings; any of those columns
    that is absent from the dataframe is simply skipped.

    Args:
        df (pd.DataFrame): Movie data containing at least an ``overview`` column.

    Returns:
        pd.DataFrame: A cleaned copy; the input dataframe is not modified.
    """
    cleaned = df.dropna(subset=["overview"]).copy()

    for name in ("tagline", "director", "main_cast", "keywords", "poster_path"):
        if name not in cleaned.columns:
            continue
        cleaned[name] = cleaned[name].fillna("")

    return cleaned

In [8]:
def build_embedding_text(df):
    """
    Combine a movie's text fields into a single string for text embedding.

    Each field that is present and non-empty contributes one labelled
    segment; segments are joined with ``" | "`` in the order overview,
    tagline, keywords. Fields that are absent, ``None``/NaN, or empty are
    skipped.

    Args:
        df (pd.Series): A single row of the movie dataframe (any mapping-like
            object with a ``.get`` method also works).

    Returns:
        str: The labelled segments joined by ``" | "``, or an empty string
        when no field has usable content.
    """
    labelled_fields = (
        ("overview", "Plot"),
        ("tagline", "Main theme and tone"),
        ("keywords", "Core themes and concepts"),
    )

    parts = []
    for field, label in labelled_fields:
        # .get() tolerates a missing column instead of raising KeyError.
        value = df.get(field)

        # NaN is truthy in Python, so a plain truthiness test would emit
        # e.g. "Plot: nan"; filter missing scalars out explicitly.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            continue

        if value:
            parts.append(f"{label}: {value}")

    return " | ".join(parts)

In [9]:
def normalize_text(text):
    """
    Lowercase a string and squeeze every run of whitespace to one space.

    Leading and trailing whitespace is removed as a consequence of splitting
    on whitespace and re-joining.

    Args:
        text (str): The string to normalize.

    Returns:
        str: The lowercased, whitespace-collapsed string.
    """
    tokens = text.lower().split()
    return " ".join(tokens)

In [15]:
def get_data_ready(path, output_path="movies-processed.csv"):
    """
    Run the full preprocessing pipeline and write the result to a CSV file.

    Steps, in order: read the raw CSV, drop short movies, compute scores and
    drop the lowest-scoring quartile, handle missing values, build a
    normalized ``search_metadata`` text column, and remove the ``tagline``,
    ``keywords`` and ``overview`` columns that were folded into it.

    Args:
        path (str): The file path to the raw movie data CSV.
        output_path (str): Where to write the processed CSV. Defaults to
            ``"movies-processed.csv"`` in the current working directory,
            which was previously the hard-coded destination.

    Returns:
        None: The processed data is written to ``output_path``.
    """
    df = pd.read_csv(path)

    # Copy after each filter so later column writes act on an independent
    # frame rather than on a slice of the previous one.
    df = remove_short(df).copy()
    df = create_score(df).copy()
    df = handling_missing_values(df)

    # Build the embedding text row by row, then normalize case and whitespace.
    df["search_metadata"] = df.apply(build_embedding_text, axis=1)
    df["search_metadata"] = df["search_metadata"].apply(normalize_text)

    # The source text columns are now redundant with search_metadata.
    df = df.drop(columns=["tagline", "keywords", "overview"])

    df.to_csv(output_path, index=False)

In [16]:
# Location of the raw movie CSV, given as a relative path.
movie_raw_path = "../raw/movies-raw.csv"

# Run the preprocessing pipeline on the raw file.
get_data_ready(movie_raw_path)