## Imports

In [49]:
import pandas as pd 
import numpy as np
import heapq
from typing import List, Optional
import sklearn.preprocessing

### Load data from the merged TSV files

In [None]:
# Load the merged, semicolon-separated table produced from the TSV files.
# Forward slashes keep the relative path valid on Windows *and* on Linux/macOS
# (a backslash is an ordinary filename character outside Windows).
merging_tsv_files_data = pd.read_csv('../data/merging_tsv_files_data.csv', sep=';')

### Functions

In [51]:
def cumulative_top3_prior(group: pd.DataFrame, director = False):
    """
    For each row of ``group``, emit the three highest ``movie_score`` values
    found in the rows *before* it (rows are processed in their current order).

    Parameters
    ----------
    group : pd.DataFrame
        Rows of one person, already sorted in the order in which "prior"
        should be understood. Must contain a ``movie_score`` column.
    director : bool, default False
        When truthy the output columns carry a ``_director`` suffix.

    Returns
    -------
    pd.DataFrame
        Indexed exactly like ``group`` (for both values of ``director``) with
        columns ``prior1_rating``, ``prior2_rating``, ``prior3_rating``
        (optionally suffixed), best prior score first. Slots without a prior
        score are ``NaN``.

    Raises
    ------
    KeyError
        If ``group`` has no ``movie_score`` column.
    """
    # Min-heap holding at most the three best scores seen so far; the smallest
    # of them sits on top so it can be evicted in O(log 3) when a new one comes.
    top_heap = []
    prior1_rating, prior2_rating, prior3_rating = [], [], []

    for score in group['movie_score']:
        # Snapshot of the best prior scores, highest first, padded to length 3.
        ratings = sorted(top_heap, reverse=True)
        ratings.extend([np.nan] * (3 - len(ratings)))

        prior1_rating.append(ratings[0])
        prior2_rating.append(ratings[1])
        prior3_rating.append(ratings[2])

        # A missing score carries no information and, because NaN compares
        # False against everything, would corrupt the heap/sort order.
        if pd.isna(score):
            continue

        # Now add the current row to the history used by later rows.
        heapq.heappush(top_heap, score)
        if len(top_heap) > 3:
            heapq.heappop(top_heap)

    suffix = '_director' if director else ''
    out = pd.DataFrame(
        {
            f'prior1_rating{suffix}': prior1_rating,
            f'prior2_rating{suffix}': prior2_rating,
            f'prior3_rating{suffix}': prior3_rating,
        },
        index=group.index,  # keep row alignment with the input in every case
    )

    return out

In [52]:
def cumulative_top3_prior_test(
    group: pd.DataFrame,
    director: bool = False,
    key_cols: tuple = ("directors", "tconst"),   # or ("tconst", "nconst")
):
    """
    For each row of ``group``, emit the three highest ``movie_score`` values
    found in the rows *before* it (rows are processed in their current order).

    Parameters
    ----------
    group : pd.DataFrame
        Rows of one person, already sorted in the order in which "prior"
        should be understood. Must contain ``movie_score`` and every column
        named in ``key_cols``.
    director : bool, default False
        Selects the column suffix: ``_director`` when truthy, else ``_actor``.
    key_cols : tuple of str
        Columns copied from ``group`` into the result so the caller can merge
        on them.

    Returns
    -------
    pd.DataFrame
        Indexed like ``group``; the ``key_cols`` columns followed by
        ``prior1_rating{suffix}``, ``prior2_rating{suffix}``,
        ``prior3_rating{suffix}`` (best prior score first, ``NaN`` where fewer
        than three prior scores exist).

    Raises
    ------
    KeyError
        If a key column or ``movie_score`` is missing from ``group``.
    """
    # Fail fast on a bad key column instead of after the whole pass.
    for c in key_cols:
        if c not in group.columns:
            raise KeyError(f"Key column '{c}' not found in group columns: {list(group.columns)}")

    # Min-heap with at most the three best scores seen so far (smallest on top).
    top_heap = []
    prior1, prior2, prior3 = [], [], []

    # Iterating the column keeps the row order, which defines what "prior" means.
    for score in group["movie_score"]:
        ratings = sorted(top_heap, reverse=True)
        ratings.extend([np.nan] * (3 - len(ratings)))

        prior1.append(ratings[0])
        prior2.append(ratings[1])
        prior3.append(ratings[2])

        # Missing scores are not part of anyone's history; NaN also compares
        # False against everything and would break the heap ordering.
        if pd.isna(score):
            continue

        heapq.heappush(top_heap, score)
        if len(top_heap) > 3:
            heapq.heappop(top_heap)

    # Always align the output to the same rows as the input.
    out = pd.DataFrame(index=group.index)

    # Join keys are copied positionally so they line up 1:1 with the rows.
    for c in key_cols:
        out[c] = group[c].values

    # Column names depend on the director flag.
    suffix = "_director" if director else "_actor"
    out[f"prior1_rating{suffix}"] = prior1
    out[f"prior2_rating{suffix}"] = prior2
    out[f"prior3_rating{suffix}"] = prior3

    return out


In [53]:

def make_movie_actor_wide(
    df: pd.DataFrame,
    *,
    movie_id_col: str = "tconst",
    actor_order_col: str = "ordering",
    max_actors: int = 15,
    actor_categories: tuple = ("actor", "actress", 'self'),
    include_actor_text_cols: bool = True,
    primary_name_col: str = "primaryName",
) -> pd.DataFrame:
    """
    Reshape a long movie/actor table into one row per movie.

    Every movie keeps its movie-level columns once; the first ``max_actors``
    cast rows (ordered by ``actor_order_col``) are spread into columns named
    ``actor{i}_{feature}``.

    Parameters
    ----------
    df : input table with one row per (movie, person); it is not modified.
    movie_id_col : column identifying a movie.
    actor_order_col : billing order; created from row order when absent.
    max_actors : number of actor slots kept per movie.
    actor_categories : values of ``category`` that count as cast; the filter
        is skipped when the column is absent or the tuple is empty.
    include_actor_text_cols : also pivot ``nconst`` and the name column.
    primary_name_col : name column; if absent, a single column whose name
        matches "primaryname" ignoring case/underscores is used instead.

    Returns
    -------
    Wide DataFrame: movie-level columns first, then actor columns ordered by
    slot number and, within a slot, by column name.
    """
    frame = df.copy()

    # Fall back to a look-alike name column when the exact one is missing.
    if primary_name_col not in frame.columns:
        lookalikes = [
            col for col in frame.columns
            if col.lower().replace("_", "") == "primaryname"
        ]
        if len(lookalikes) == 1:
            primary_name_col = lookalikes[0]

    # Synthesize a billing order from row order if none was supplied.
    if actor_order_col not in frame.columns:
        frame[actor_order_col] = frame.groupby(movie_id_col).cumcount() + 1

    # Restrict to cast rows when a category column is available.
    if "category" in frame.columns and actor_categories:
        is_cast = frame["category"].isin(actor_categories)
        frame = frame.loc[is_cast].copy()

    # Stable sort: the first row of every movie is its first-billed actor.
    frame = frame.sort_values(by=[movie_id_col, actor_order_col], kind="mergesort")

    sentiment_cols = ["actor_sentiment", "prior_movie_actor_sentiment"]
    rating_cols = ["prior1_rating", "prior2_rating", "prior3_rating"]

    # Everything that is not per-actor is treated as a movie-level column.
    per_actor = {"nconst", primary_name_col, actor_order_col}
    per_actor.update(sentiment_cols)
    per_actor.update(rating_cols)
    per_movie = [col for col in frame.columns if col not in per_actor]
    movie_base = frame.groupby(movie_id_col, as_index=False)[per_movie].first()

    # Number the actors of each movie 1..k and keep only the first max_actors.
    frame["actor_idx"] = frame.groupby(movie_id_col).cumcount() + 1
    frame = frame.loc[frame["actor_idx"] <= max_actors].copy()

    # Per-actor features to pivot (only those actually present).
    wanted = sentiment_cols + rating_cols
    if include_actor_text_cols:
        wanted = ["nconst", primary_name_col] + wanted
    present = [col for col in wanted if col in frame.columns]
    long_actors = frame[[movie_id_col, "actor_idx"] + present].copy()

    # Long -> wide: one column per (feature, actor slot).
    wide_actors = long_actors.set_index([movie_id_col, "actor_idx"]).unstack("actor_idx")
    flat_names = []
    for feature, slot in wide_actors.columns:
        flat_names.append(f"actor{slot}_{feature}")
    wide_actors.columns = flat_names
    wide_actors = wide_actors.reset_index()

    merged = movie_base.merge(wide_actors, on=movie_id_col, how="left")

    def _slot_then_name(col_name):
        # "actor12_nconst" -> (12, "actor12_nconst")
        slot_text = col_name.split("_")[0].replace("actor", "")
        return int(slot_text), col_name

    leading = list(movie_base.columns)
    trailing = sorted(
        (col for col in merged.columns if col not in leading),
        key=_slot_then_name,
    )
    return merged[leading + trailing]


In [54]:
def make_movie_actor_wide_test(
    df: pd.DataFrame,
    *,
    movie_id_col: str = "tconst",
    actor_order_col: str = "ordering",
    max_actors: int = 15,
    actor_categories: tuple = ("actor", "actress", 'self'),
    include_actor_text_cols: bool = True,
    primary_name_col: str = "primaryName",
) -> pd.DataFrame:
    """
    Turn a long movie/actor table into a wide one (one row per movie).

    Movie-level columns are kept once per movie. The first ``max_actors`` cast
    rows of each movie, ordered by ``actor_order_col``, become columns named
    ``actor{i}_{feature}``. The per-actor prior-rating columns this variant
    looks for are ``prior{1,2,3}_rating_actor``.

    Parameters
    ----------
    df : long table, one row per (movie, person); left untouched.
    movie_id_col : movie identifier column.
    actor_order_col : billing order column; derived from row order if missing.
    max_actors : maximum number of actor slots per movie.
    actor_categories : ``category`` values treated as cast (filter skipped
        when there is no ``category`` column or the tuple is empty).
    include_actor_text_cols : whether ``nconst`` and the name column are
        pivoted as well.
    primary_name_col : name column; when absent, a unique column matching
        "primaryname" (case/underscore-insensitive) is substituted.

    Returns
    -------
    DataFrame with movie-level columns first, followed by the actor columns
    sorted by slot number and then by column name.
    """
    work = df.copy()
    columns_now = work.columns

    # Best-effort detection of the name column.
    if primary_name_col not in columns_now:
        matches = [c for c in columns_now if c.lower().replace("_", "") == "primaryname"]
        if len(matches) == 1:
            primary_name_col = matches[0]

    # Make sure a billing order exists.
    if actor_order_col not in work.columns:
        work[actor_order_col] = work.groupby(movie_id_col).cumcount() + 1

    # Keep cast rows only (when the category column exists).
    if "category" in work.columns and actor_categories:
        work = work[work["category"].isin(actor_categories)].copy()

    # Stable sort so that each movie's first row is its first-billed actor.
    sort_keys = [movie_id_col, actor_order_col]
    work = work.sort_values(sort_keys, kind="mergesort")

    numeric_actor_features = [
        "actor_sentiment",
        "prior_movie_actor_sentiment",
        "prior1_rating_actor",
        "prior2_rating_actor",
        "prior3_rating_actor",
    ]

    # Columns that vary per actor; all remaining columns are movie-level.
    actor_only = set(numeric_actor_features) | {"nconst", primary_name_col, actor_order_col}
    movie_only = [c for c in work.columns if c not in actor_only]

    # One row per movie holding the movie-level fields.
    by_movie = work.groupby(movie_id_col, as_index=False)
    movie_base = by_movie[movie_only].first()

    # Slot number of each actor inside its movie, capped at max_actors.
    work["actor_idx"] = work.groupby(movie_id_col).cumcount() + 1
    within_cap = work["actor_idx"] <= max_actors
    work = work[within_cap].copy()

    # Features to spread across the actor slots.
    if include_actor_text_cols:
        candidate_features = ["nconst", primary_name_col] + numeric_actor_features
    else:
        candidate_features = numeric_actor_features
    selected = [movie_id_col, "actor_idx"]
    selected += [c for c in candidate_features if c in work.columns]
    narrow = work[selected].copy()

    # Pivot: index = movie, columns = (feature, slot).
    indexed = narrow.set_index([movie_id_col, "actor_idx"])
    pivoted = indexed.unstack("actor_idx")

    # (feature, slot) -> "actor{slot}_{feature}"
    pivoted.columns = [f"actor{slot}_{feature}" for feature, slot in pivoted.columns]
    pivoted = pivoted.reset_index()

    # Attach the actor columns to the movie table.
    result = movie_base.merge(pivoted, on=movie_id_col, how="left")

    def _order_key(label):
        # Numeric slot first, then the full label for a deterministic order.
        prefix = label.split("_")[0]
        return (int(prefix.replace("actor", "")), label)

    movie_part = list(movie_base.columns)
    actor_part = [c for c in result.columns if c not in movie_part]
    actor_part = sorted(actor_part, key=_order_key)

    return result[movie_part + actor_part]


# Feature engineering

In [55]:
# Drop the first two columns (addressed by position) together with the
# crew/person detail columns that are not carried into the feature table.
_unused_columns = [
    merging_tsv_files_data.columns[0],
    merging_tsv_files_data.columns[1],
    'writers',
    'characters',
    'primaryProfession',
    'birthYear',
    'deathYear',
    'known_for_movie_1',
    'known_for_movie_2',
    'known_for_movie_3',
    'known_for_movie_4',
]
simpel_dataset = merging_tsv_files_data.drop(columns=_unused_columns)

#### Calculating movie score and an actor sentiment score

In [56]:
# movie_score: average rating weighted by the number of votes.
simpel_dataset['movie_score'] = simpel_dataset['averageRating'].mul(simpel_dataset['numVotes'])
# actor_sentiment: the movie score divided by the actor's billing position.
simpel_dataset['actor_sentiment'] = simpel_dataset['movie_score'].div(simpel_dataset['ordering'])

#### Converting strings to int 

In [57]:
# Both columns get the same treatment: the r'\N' placeholder becomes missing,
# the rest is parsed as a number (unparseable -> NaN), rounded down and stored
# as nullable integers.
for _numeric_col in ('startYear', 'runtimeMinutes'):
    _without_placeholder = simpel_dataset[_numeric_col].replace(r'\N', np.nan)
    _as_number = pd.to_numeric(_without_placeholder, errors='coerce')
    simpel_dataset[_numeric_col] = np.floor(_as_number).astype('Int64')


In [58]:
# Running row number, used further down as the last sort key so that rows with
# the same startYear keep a deterministic order.
simpel_dataset['_orig_order'] = np.arange(simpel_dataset.shape[0])

#### Calculating each director's prior movie sentiment and top-3 prior movies

In [None]:
# Keep only the first director listed for each movie.
simpel_dataset['directors'] = simpel_dataset['directors'].str.split(',').str[0]

# One row per (movie, director), in chronological order within each director.
# Rows whose director is the r'\N' placeholder are filtered out with a boolean
# mask; DataFrame.where() would keep them as all-NaN rows instead.
has_director = simpel_dataset['directors'] != r'\N'
director_df = (
    simpel_dataset.loc[
        has_director,
        ['tconst', 'directors', 'startYear', '_orig_order', 'movie_score'],
    ]
    .drop_duplicates(subset=['tconst', 'directors'])
    .sort_values(['directors', 'startYear', '_orig_order'], kind='mergesort')
)

# Sum of the director's movie_score over all strictly earlier rows
# (running total minus the current row); missing values become 0.
director_df['prior_movie_director_sentiment'] = (
    director_df.groupby('directors')['movie_score']
    .transform(lambda x: x.cumsum() - x)
    .fillna(0)
)

# Top-3 prior movie scores per director. The columns handed to the function
# are selected explicitly (grouping column included) so that each group still
# contains 'directors' regardless of how pandas treats grouping columns in
# groupby.apply.
director_aug = (
    director_df
    .groupby("directors", group_keys=False)[["directors", "tconst", "movie_score"]]
    .apply(cumulative_top3_prior_test, director=True, key_cols=("directors", "tconst"))
    .reset_index(drop=True)
)
director_df = director_df.merge(
    director_aug,
    on=["directors", "tconst"],
    how="left"
)

# Attach the director features to every row of the main table. Rows without a
# matching director keep NaN in these columns.
director_feature_cols = [
    'tconst',
    'directors',
    'prior_movie_director_sentiment',
    'prior1_rating_director',
    'prior2_rating_director',
    'prior3_rating_director',
]
simpel_dataset = simpel_dataset.merge(
    director_df[director_feature_cols],
    on=['tconst', 'directors'],
    how='left',
)

### Remove duplicate actors per movie

In [60]:
# Keep one row per (movie, person), order each movie's rows by billing
# position, then renumber the positions 1..k without gaps.
simpel_dataset = (
    simpel_dataset
    .drop_duplicates(subset=['tconst', 'nconst'])
    .sort_values(['tconst', 'ordering'])
)
simpel_dataset['ordering'] = simpel_dataset.groupby('tconst').cumcount().add(1)

#### Calculating each actor's prior movie sentiment score

In [61]:
# Chronological order per person (ties broken by the original row order).
simpel_dataset = simpel_dataset.sort_values(['primaryName', 'startYear', '_orig_order'], kind='mergesort')

# Sum of actor_sentiment over the person's strictly earlier rows (running total
# minus the current row). Grouping is done on the person id 'nconst' - the key
# also used for de-duplication and merging in this file - rather than on
# 'primaryName', so two different people who share a name do not share a history.
# NOTE(review): assumes each nconst maps to exactly one primaryName, so the
# sort above leaves every nconst group in (startYear, _orig_order) order.
simpel_dataset['prior_movie_actor_sentiment'] = (
    simpel_dataset
    .groupby('nconst')['actor_sentiment']
    .transform(lambda x: x.cumsum() - x)
)

#### Calculating the top-3 prior movie scores per actor

In [None]:
# Chronological order per person (ties broken by the original row order).
simpel_dataset = simpel_dataset.sort_values(['primaryName', 'startYear', '_orig_order'], kind='mergesort')

# Top-3 prior movie scores per person. Groups are formed on the person id
# 'nconst' (not the display name, which different people can share), and only
# the columns the function needs are passed - with the grouping column selected
# explicitly so it is present in each group.
# NOTE(review): assumes each nconst maps to exactly one primaryName, so the
# sort above leaves every nconst group in (startYear, _orig_order) order.
aug = (
    simpel_dataset
    .groupby('nconst', group_keys=False)[['nconst', 'tconst', 'movie_score']]
    .apply(cumulative_top3_prior_test, director=False, key_cols=('nconst', 'tconst'))
)

# Bring the three new columns back onto the main table via the (movie, person) key.
simpel_dataset = simpel_dataset.merge(
    aug[['tconst', 'nconst', 'prior1_rating_actor', 'prior2_rating_actor', 'prior3_rating_actor']],
    on=['tconst', 'nconst'],
    how='left',
)

In [63]:
# Persist the long (one row per movie/person) table. Forward slashes keep the
# relative path valid on Windows as well as on Linux/macOS.
simpel_dataset.to_csv('../data/interrim_data1.csv', sep=';')

## One row per movie

In [64]:
# Reload the long table from disk. Forward slashes keep the relative path valid
# on Windows as well as on Linux/macOS.
# NOTE(review): the file was written with the DataFrame index included, so it
# comes back here as an extra unnamed first column - confirm whether downstream
# steps rely on that column before dropping it.
simpel_dataset = pd.read_csv('../data/interrim_data1.csv', sep=';')

In [65]:
# Number of actor slots (actor1_*, actor2_*, ...) to keep per movie in the wide table.
num_actors = 10

In [66]:
# Pivot the long table to one row per movie with up to `num_actors` actor slots.
wide_df = make_movie_actor_wide_test(
    simpel_dataset,
    max_actors=num_actors,
)


In [67]:
# Write the final one-row-per-movie dataset. Forward slashes keep the relative
# path valid on Windows as well as on Linux/macOS.
wide_df.to_csv('../data/dataset.csv', sep=';')