## Imports

In [1]:
import pandas as pd 
import numpy as np
import heapq
from typing import List, Optional

## Script variables

In [2]:
# Maximum number of actors kept per movie; passed as `max_actors` when the data
# is pivoted to one row per movie.
num_actors = 10

### Load data from the merged TSV files

In [3]:
# Switch between the backslash and the forward-slash spelling of the data paths.
# The same flag is read again when the result is written to disk.
useBackslash = True

if useBackslash:
    pathToMergedFiles = r'..\data\merging_tsv_files_data.csv'
else:
    pathToMergedFiles = r'../data/merging_tsv_files_data.csv'

# The merged file uses ';' as its field separator.
merging_tsv_files_data = pd.read_csv(filepath_or_buffer=pathToMergedFiles, sep=';')


  merging_tsv_files_data = pd.read_csv(pathToMergedFiles, sep=';')


### Functions

In [4]:
def cumulative_top3_prior(
    group: pd.DataFrame,
    director: bool = False,
    key_cols: tuple = ("directors", "tconst"),   # or ("tconst", "nconst")
):
    """
    For each row in a group, emit the top-3 *prior* movie_score values before this row.

    "Prior" is defined purely by the row order of `group`: the caller must have
    sorted the rows chronologically before calling this function.

    Parameters
    ----------
    group : pd.DataFrame
        Rows belonging to one person. Must contain a "movie_score" column and
        every column named in `key_cols`.
    director : bool
        Selects the output column suffix: "_director" when True, "_actor" otherwise.
    key_cols : tuple
        Columns copied unchanged into the result so it can be merged back safely.

    Returns
    -------
    pd.DataFrame
        Indexed like `group`, holding `key_cols` plus
        prior1_rating{suffix}, prior2_rating{suffix}, prior3_rating{suffix}
        (highest first, NaN where fewer than three earlier scores exist).

    Raises
    ------
    KeyError
        If a column listed in `key_cols` is missing from `group`.
    """

    # Fail fast on missing key columns, before doing any per-row work.
    for c in key_cols:
        if c not in group.columns:
            raise KeyError(
                f"Key column '{c}' not found in group columns: {list(group.columns)}"
            )

    # Min-heap with at most 3 entries: the three highest scores seen so far.
    # Only the scores are stored, so equal scores never fall back to comparing ids.
    best_so_far: List[float] = []

    # One [top1, top2, top3] triple per input row, captured BEFORE the row is added.
    prior_rows = []

    for score in group["movie_score"].tolist():
        # Snapshot of the current top scores, highest first, padded with NaN to length 3.
        ranked = sorted(best_so_far, reverse=True)
        ranked.extend([np.nan] * (3 - len(ranked)))
        prior_rows.append(ranked)

        # A missing score carries no information and must not enter the heap:
        # NaN compares False against everything, which would corrupt the heap
        # order and let NaN displace real scores.
        if pd.isna(score):
            continue

        if len(best_so_far) < 3:
            heapq.heappush(best_so_far, score)
        else:
            # Push the new score and drop the smallest, keeping exactly three.
            heapq.heappushpop(best_so_far, score)

    # Same index as the input so the rows stay aligned with `group`.
    out = pd.DataFrame(index=group.index)

    # Key columns are copied positionally to allow merging back on them.
    for c in key_cols:
        out[c] = group[c].values

    suffix = "_director" if director else "_actor"

    for rank in range(3):
        out[f"prior{rank + 1}_rating{suffix}"] = [triple[rank] for triple in prior_rows]

    return out

In [5]:
def make_movie_actor_wide(
    df: pd.DataFrame,
    *,
    movie_id_col: str = "tconst",
    actor_order_col: str = "ordering",
    max_actors: int = 15,
    actor_categories: tuple = ("actor", "actress", "self"),
    include_actor_text_cols: bool = True,
    primary_name_col: str = "primaryName",
) -> pd.DataFrame:
    """
    Reshape a long movie-actor table into one row per movie.

    Each movie keeps its movie-level columns once, followed by per-actor column
    blocks named ``actor{k}_{feature}`` for the first ``max_actors`` actors in
    ``actor_order_col`` order.

    Parameters
    ----------
    df : long table with one row per (movie, person); it is not modified.
    movie_id_col : column identifying the movie.
    actor_order_col : billing order; created from row order per movie if absent.
    max_actors : number of actor slots kept per movie.
    actor_categories : values of "category" to keep (skipped if falsy or if the
        column does not exist).
    include_actor_text_cols : also pivot "nconst" and the actor name column.
    primary_name_col : actor name column; a unique look-alike such as
        "primary_name" is used when the exact name is missing.

    Returns
    -------
    pd.DataFrame with movie-level columns first, then the actor columns ordered
    by slot number and, within a slot, by column name.
    """
    long_df = df.copy()  # never touch the caller's frame

    # Fall back to a single unambiguous look-alike of the name column, if any.
    if primary_name_col not in long_df.columns:
        name_like = [
            col for col in long_df.columns
            if col.lower().replace("_", "") == "primaryname"
        ]
        if len(name_like) == 1:
            (primary_name_col,) = name_like

    # Without a billing column, number the rows 1..N inside each movie.
    if actor_order_col not in long_df.columns:
        long_df[actor_order_col] = long_df.groupby(movie_id_col).cumcount() + 1

    # Restrict to the requested cast categories when that information exists.
    if "category" in long_df.columns and actor_categories:
        in_cast = long_df["category"].isin(actor_categories)
        long_df = long_df[in_cast].copy()

    # Stable sort: movie first, then billing position (ties keep input order).
    long_df = long_df.sort_values([movie_id_col, actor_order_col], kind="mergesort")

    # History-only actor features; these are the ones pivoted into actor slots.
    prior_only_features = [
        "prior_movie_actor_sentiment",
        "prior1_rating_actor",
        "prior2_rating_actor",
        "prior3_rating_actor",
    ]

    # Columns that vary per actor. "actor_sentiment" describes the current movie,
    # so it is classed as actor-level here and never pivoted (leakage guard).
    per_actor_cols = {
        "nconst",
        primary_name_col,
        "actor_sentiment",
        actor_order_col,
        *prior_only_features,
    }

    # Everything else is treated as movie-level and collapsed to one row per
    # movie (GroupBy.first takes the first non-null value of each column).
    per_movie_cols = [col for col in long_df.columns if col not in per_actor_cols]
    movie_base = long_df.groupby(movie_id_col, as_index=False)[per_movie_cols].first()

    # Slot number of each actor inside its movie, then cap at max_actors.
    long_df["actor_idx"] = long_df.groupby(movie_id_col).cumcount() + 1
    long_df = long_df[long_df["actor_idx"] <= max_actors].copy()

    wanted_features = list(prior_only_features)
    if include_actor_text_cols:
        wanted_features = ["nconst", primary_name_col, *wanted_features]

    # Only pivot the features that are actually present.
    pivot_input_cols = [movie_id_col, "actor_idx"]
    pivot_input_cols += [col for col in wanted_features if col in long_df.columns]
    actors_narrow = long_df[pivot_input_cols].copy()

    # Long -> wide: columns become (feature, slot) pairs.
    indexed = actors_narrow.set_index([movie_id_col, "actor_idx"])
    actors_wide = indexed.unstack("actor_idx")

    # (feature, slot) -> "actor{slot}_{feature}", e.g. ("nconst", 2) -> "actor2_nconst"
    actors_wide.columns = [
        f"actor{slot}_{feature}" for feature, slot in actors_wide.columns
    ]
    actors_wide = actors_wide.reset_index()

    # Left join so every movie in movie_base is kept.
    wide_df = movie_base.merge(actors_wide, on=movie_id_col, how="left")

    def _slot_then_name(column_name):
        """Sort key: numeric actor slot first, full column name second."""
        slot_token = column_name.split("_")[0]
        return int(slot_token.replace("actor", "")), column_name

    leading = list(movie_base.columns)
    trailing = sorted(
        (col for col in wide_df.columns if col not in leading),
        key=_slot_then_name,
    )
    return wide_df[leading + trailing]

# Feature engineering

#### Drop unwanted columns

In [6]:
# The first column is removed by position; all others are removed by name.
simpel_dataset = merging_tsv_files_data.drop(
    columns=[
        merging_tsv_files_data.columns[0],
        'writers',
        'characters',
        'primaryProfession',
        'birthYear',
        'deathYear',
        'known_for_movie_1',
        'known_for_movie_2',
        'known_for_movie_3',
        'known_for_movie_4',
    ]
)

#### Calculating movie score and an actor sentiment score

In [7]:
# movie_score: average rating weighted by the number of votes.
simpel_dataset['movie_score'] = simpel_dataset['averageRating'].mul(simpel_dataset['numVotes'])
# actor_sentiment: the movie score divided by the actor's billing position.
simpel_dataset['actor_sentiment'] = simpel_dataset['movie_score'].div(simpel_dataset['ordering'])

### Log-transform the data for a better distribution

In [8]:
# log(1 + x) on both score columns.
# NOTE: this overwrites the columns in place, so re-running only this cell
# applies the transform a second time.
simpel_dataset['movie_score'] = simpel_dataset['movie_score'].pipe(np.log1p)
simpel_dataset['actor_sentiment'] = simpel_dataset['actor_sentiment'].pipe(np.log1p)

#### Converting strings to int 

In [9]:
# startYear: '\N' placeholder -> NaN, parse as number (unparseable -> NaN),
# round down, and store as nullable integer.
simpel_dataset['startYear'] = (
    simpel_dataset['startYear']
    .replace(r'\N', np.nan)
    .pipe(pd.to_numeric, errors='coerce')
    .pipe(np.floor)
    .astype('Int64')
)

# runtimeMinutes: identical treatment.
simpel_dataset['runtimeMinutes'] = (
    simpel_dataset['runtimeMinutes']
    .replace(r'\N', np.nan)
    .pipe(pd.to_numeric, errors='coerce')
    .pipe(np.floor)
    .astype('Int64')
)


#### Creating a variable to break same year ties
An exact release date is not available in the dataset, so the original row order is used as an arbitrary but deterministic tie-breaker between movies released in the same year.

In [10]:
# Running row number (0..n-1) in the current row order; used later as the
# tie-breaker when sorting rows that share the same startYear.
simpel_dataset['_orig_order'] = np.arange(simpel_dataset.shape[0])

#### Calculating directors prior movie sentiment and top 3 prior movies

In [None]:
# Keep only the first-listed director of each movie.
simpel_dataset['directors'] = simpel_dataset['directors'].str.split(',').str[0]

# One row per (movie, director), in chronological order within each director.
# Rows whose director is the '\N' placeholder are removed with a boolean filter;
# DataFrame.where would only blank them out, leaving an all-NaN row behind and
# upcasting the integer columns.
director_df = (
    simpel_dataset.loc[
        simpel_dataset['directors'] != r'\N',
        ['tconst', 'directors', 'startYear', '_orig_order', 'movie_score'],
    ]
    .drop_duplicates(subset=['tconst', 'directors'])
    .sort_values(['directors', 'startYear', '_orig_order'], kind='mergesort')
)

# Sum of the director's movie scores BEFORE the current movie
# (running total minus the current value); missing results become 0.
director_df['prior_movie_director_sentiment'] = (
    director_df.groupby('directors')['movie_score'].cumsum() - director_df['movie_score']
).fillna(0)

# Top-3 prior movie scores per director. The groups are iterated explicitly so
# each group still carries its 'directors' column, which cumulative_top3_prior
# copies as a merge key; GroupBy.apply is phasing out passing the grouping
# column to the applied function.
director_aug = pd.concat(
    [
        cumulative_top3_prior(director_group, director=True, key_cols=("directors", "tconst"))
        for _, director_group in director_df.groupby("directors", sort=False)
    ],
    ignore_index=True,
)

# Attach the top-3 columns to the per-director table.
director_df = director_df.merge(
    director_aug,
    on=["directors", "tconst"],
    how="left"
)

# Bring the director features back onto the full dataset. Any copies left over
# from a previous run of this cell are dropped first, so re-running the cell
# does not create suffixed duplicate columns.
director_feature_cols = [
    'prior_movie_director_sentiment',
    'prior1_rating_director',
    'prior2_rating_director',
    'prior3_rating_director',
]
simpel_dataset = (
    simpel_dataset
    .drop(columns=director_feature_cols, errors='ignore')
    .merge(
        director_df[['tconst', 'directors'] + director_feature_cols],
        on=['tconst', 'directors'],
        how='left',
    )
)

### Remove duplicate actors per movie

In [12]:
# Keep one row per (movie, person), then renumber the billing positions 1..N
# inside each movie so there are no gaps while the relative order is preserved.
simpel_dataset = (
    simpel_dataset
    .drop_duplicates(subset=['tconst', 'nconst'])
    .sort_values(['tconst', 'ordering'])
    .assign(ordering=lambda frame: frame.groupby('tconst').cumcount() + 1)
)

#### Calculating actors prior movie sentiment scores

In [13]:
# Put each person's credits in chronological order. People are identified by
# their id (nconst) rather than by primaryName, because different people can
# share a name and would otherwise be pooled into one history.
# Rows with a missing startYear sort last within a person.
simpel_dataset = simpel_dataset.sort_values(['nconst', 'startYear', '_orig_order'], kind='mergesort')

# Sum of the person's actor_sentiment over all EARLIER rows:
# running total per person minus the current row's own value.
simpel_dataset['prior_movie_actor_sentiment'] = (
    simpel_dataset.groupby('nconst')['actor_sentiment'].cumsum()
    - simpel_dataset['actor_sentiment']
)

#### Calculating prior known for movies per actor

In [None]:
# Chronological order per person. People are keyed by nconst, not primaryName,
# so that two different people with the same name do not share one history.
simpel_dataset = simpel_dataset.sort_values(['nconst', 'startYear', '_orig_order'], kind='mergesort')

# Top-3 prior movie scores for every person. The groups are iterated explicitly
# so each group keeps its 'nconst' column, which cumulative_top3_prior copies as
# a merge key; GroupBy.apply is phasing out passing the grouping column to the
# applied function.
aug = pd.concat(
    [
        cumulative_top3_prior(person_rows, director=False, key_cols=('nconst', 'tconst'))
        for _, person_rows in simpel_dataset.groupby('nconst', sort=False)
    ]
)

# Merge the new columns back on (movie, person). Copies left over from an
# earlier run of this cell are dropped first so re-running stays clean.
actor_prior_cols = ['prior1_rating_actor', 'prior2_rating_actor', 'prior3_rating_actor']
simpel_dataset = (
    simpel_dataset
    .drop(columns=actor_prior_cols, errors='ignore')
    .merge(aug[['tconst', 'nconst'] + actor_prior_cols], on=['tconst', 'nconst'], how='left')
)

## One row per movie

In [15]:
# Pivot to one row per movie with `num_actors` actor slots, then make sure the
# current-movie actor_sentiment column is absent (no error if it already is).
wide_df = (
    make_movie_actor_wide(simpel_dataset, max_actors=num_actors)
    .drop(columns=["actor_sentiment"], errors="ignore")
)


### Calculate years since release

In [16]:
# Age of each movie relative to 2024, the year of the dataset.
wide_df['years_since_release'] = wide_df['startYear'].rsub(2024)

# Squared age as an additional feature.
wide_df['years_since_release_squared'] = wide_df['years_since_release'].pow(2)

In [17]:
# Output location, chosen with the same flag that selected the input path.
# NOTE(review): the two branches point at different directories
# ("common_datasets" vs "common_data") - confirm which one is intended.
if useBackslash:
    saveDatasetAt = r'.\common_datasets\original_dataset_log_scaled.csv'
else:
    saveDatasetAt = r'./common_data/original_dataset_log_scaled.csv'

# Written with ';' as separator, matching the input file.
wide_df.to_csv(saveDatasetAt, sep=';')