## Imports

In [1]:
import pandas as pd 
import numpy as np
import heapq
from typing import List, Optional
import sklearn.preprocessing

### Load data from the merged TSV files

In [2]:
# Load the merged dataset (semicolon-separated CSV). Forward slashes keep the
# relative path valid on Windows, Linux and macOS; the previous backslash form
# only resolved on Windows.
merging_tsv_files_data = pd.read_csv('../data/merging_tsv_files_data.csv', sep=';')

  merging_tsv_files_data = pd.read_csv('..\\data\\merging_tsv_files_data.csv', sep=';')


### Functions

In [2]:
def cumulative_top3_prior(group: pd.DataFrame) -> pd.DataFrame:
    """
    For each row of an actor group, emit the three highest ``movie_score`` values
    found in the rows *before* it.

    Rows are processed in the order they appear in ``group``; the caller is
    responsible for sorting the group (e.g. chronologically) beforehand.

    Parameters
    ----------
    group : pd.DataFrame
        Rows belonging to one actor. Must contain a ``movie_score`` column.

    Returns
    -------
    pd.DataFrame
        Columns ``prior1_rating``, ``prior2_rating`` and ``prior3_rating``
        (best, second-best and third-best prior score), indexed like ``group``.
        Slots for which no prior score exists yet are NaN.

    Notes
    -----
    Missing scores are never added to the history. NaN does not order
    consistently against numbers, so letting it into the heap would make the
    reported priors unreliable for every later row. Only the scores are kept in
    the heap, so tied scores never fall back to comparing other row values.
    """
    slot_names = ('prior1_rating', 'prior2_rating', 'prior3_rating')
    priors = {name: [] for name in slot_names}
    top_heap = []  # min-heap of the best prior scores; trimmed back to 3 after each push

    for score in group['movie_score']:
        # Snapshot the history *before* the current row joins it (best first),
        # padded with NaN up to three slots.
        best = sorted(top_heap, reverse=True)
        best.extend([np.nan] * (len(slot_names) - len(best)))
        for name, value in zip(slot_names, best):
            priors[name].append(value)

        # A missing score tells us nothing about the actor's track record.
        if pd.isna(score):
            continue

        # Add the current row to the history, then evict the smallest entry
        # if that made the heap larger than the number of reported slots.
        heapq.heappush(top_heap, score)
        if len(top_heap) > len(slot_names):
            heapq.heappop(top_heap)

    return pd.DataFrame(priors, index=group.index)

In [3]:
import pandas as pd

def make_movie_actor_wide(
    df: pd.DataFrame,
    *,
    movie_id_col: str = "tconst",
    actor_order_col: str = "ordering",
    max_actors: int = 15,
    actor_categories: tuple = ("actor", "actress", 'self'),
    include_actor_text_cols: bool = True,
    primary_name_col: str = "primaryName",
) -> pd.DataFrame:
    """
    Reshape a long movie-actor table (one row per movie/actor pair) into a wide
    table with one row per movie and ``actor{idx}_{feature}`` columns.

    Parameters
    ----------
    df : pd.DataFrame
        Long-format input. It is copied, never modified.
    movie_id_col : str
        Column identifying the movie; one output row is produced per value.
    actor_order_col : str
        Column giving the actor order within a movie. If absent, it is created
        from the row order within each movie (starting at 1).
    max_actors : int
        Only the first ``max_actors`` actors of each movie (after sorting by
        ``actor_order_col``) are turned into columns.
    actor_categories : tuple
        When a ``category`` column exists and this is non-empty, only rows
        whose category is listed here are kept.
    include_actor_text_cols : bool
        Whether ``nconst`` and the name column are pivoted alongside the
        numeric actor features.
    primary_name_col : str
        Name column. If it is missing, a single column whose lowercased,
        underscore-free name equals ``primaryname`` is used instead.

    Returns
    -------
    pd.DataFrame
        Movie-level columns first (first non-null value per movie), followed by
        the actor columns ordered by actor index and then by column name.
    """
    frame = df.copy()

    # Fall back to a uniquely matching column when the configured name column is absent
    if primary_name_col not in frame.columns:
        lookalikes = [
            col for col in frame.columns
            if col.lower().replace("_", "") == "primaryname"
        ]
        if len(lookalikes) == 1:
            (primary_name_col,) = lookalikes

    # Synthesize an ordering from row position when none was supplied
    if actor_order_col not in frame.columns:
        frame[actor_order_col] = frame.groupby(movie_id_col).cumcount() + 1

    # Restrict to the requested cast categories, when that information is available
    if "category" in frame.columns and actor_categories:
        keep_row = frame["category"].isin(actor_categories)
        frame = frame[keep_row].copy()

    # Stable sort so that, within a movie, rows follow the actor ordering
    frame = frame.sort_values(by=[movie_id_col, actor_order_col], kind="mergesort")

    # Everything not listed here is treated as a movie-level field
    per_actor_cols = {
        "nconst",
        primary_name_col,
        "actor_sentiment",
        "prior_movie_actor_sentiment",
        "prior1_rating",
        "prior2_rating",
        "prior3_rating",
        actor_order_col,
    }
    shared_cols = [col for col in frame.columns if col not in per_actor_cols]
    movie_base = frame.groupby(movie_id_col, as_index=False)[shared_cols].first()

    # Number the actors of each movie 1..k and keep at most max_actors of them
    frame["actor_idx"] = frame.groupby(movie_id_col).cumcount() + 1
    frame = frame[frame["actor_idx"] <= max_actors].copy()

    # Per-actor features to pivot (text identifiers first, when requested)
    numeric_features = [
        "actor_sentiment",
        "prior_movie_actor_sentiment",
        "prior1_rating", "prior2_rating", "prior3_rating",
    ]
    text_features = ["nconst", primary_name_col] if include_actor_text_cols else []
    available = [col for col in text_features + numeric_features if col in frame.columns]
    actors_narrow = frame[[movie_id_col, "actor_idx", *available]].copy()

    # Long -> wide: one column per (feature, actor index) pair
    pivot_keys = [movie_id_col, "actor_idx"]
    actors_wide = actors_narrow.set_index(pivot_keys).unstack("actor_idx")
    actors_wide.columns = [
        f"actor{slot}_{feature}" for feature, slot in actors_wide.columns
    ]
    actors_wide = actors_wide.reset_index()

    wide_df = movie_base.merge(actors_wide, on=movie_id_col, how="left")

    def _slot_then_name(column):
        # "actor12_nconst" -> (12, "actor12_nconst")
        prefix = column.split("_")[0]
        return int(prefix.replace("actor", "")), column

    leading = list(movie_base.columns)
    trailing = sorted(
        (col for col in wide_df.columns if col not in leading),
        key=_slot_then_name,
    )
    return wide_df[leading + trailing]


In [4]:
def one_hot_encode_columns(df, columns):
    """
    Replace each listed column of ``df`` with its one-hot indicator columns.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame. It is not modified; a new frame is returned.
    columns : iterable of str
        Names of the columns to encode. Each is fitted with its own
        ``OneHotEncoder`` and then dropped.

    Returns
    -------
    pd.DataFrame
        The remaining columns of ``df`` followed by the indicator columns
        (named by ``OneHotEncoder.get_feature_names_out``), in the order the
        columns were listed. Row labels are those of ``df``.
    """
    for col in columns:
        encoder = sklearn.preprocessing.OneHotEncoder(sparse_output=False)
        encoded = encoder.fit_transform(df[[col]])
        # Reuse df's index: concat(axis=1) aligns on index labels, so a default
        # RangeIndex here would pair indicators with the wrong rows (and append
        # NaN-filled rows) whenever df is not indexed 0..n-1.
        encoded_df = pd.DataFrame(
            encoded,
            columns=encoder.get_feature_names_out([col]),
            index=df.index,
        )
        df = pd.concat([df.drop(col, axis=1), encoded_df], axis=1)
    return df

In [5]:
def categorize_runtime(df, column_name='runtime'):
    """
    Add a ``runtime_category`` column that buckets ``df[column_name]`` by length.

    Buckets are left-closed, in minutes: [0, 90) short, [90, 120) medium,
    [120, 150) long and [150, inf) very_long. Values that fall in no bucket
    (negative or missing) become NaN.

    ``df`` is modified in place and is also returned.
    """
    # Lower edge (minutes) of each bucket, in ascending order
    lower_edges = {'short': 0, 'medium': 90, 'long': 120, 'very_long': 150}

    df['runtime_category'] = pd.cut(
        df[column_name],
        bins=[*lower_edges.values(), float('inf')],
        labels=list(lower_edges),
        right=False,
    )
    return df

## Feature engineering

In [9]:
# Start the working dataset from the raw load, minus the first two columns
# (selected by position) and the listed writer/character/person-metadata columns.
simpel_dataset = merging_tsv_files_data.drop(
    columns=[
        merging_tsv_files_data.columns[0],
        merging_tsv_files_data.columns[1],
        'writers',
        'characters',
        'primaryProfession',
        'birthYear',
        'deathYear',
        'known_for_movie_1',
        'known_for_movie_2',
        'known_for_movie_3',
        'known_for_movie_4',
    ]
)

#### Calculating movie score and an actor sentiment score

In [10]:
# movie_score: average rating weighted by the number of votes behind it.
simpel_dataset['movie_score'] = simpel_dataset['averageRating'].mul(simpel_dataset['numVotes'])
# actor_sentiment: the movie score divided by the row's 'ordering' value.
# NOTE(review): 'ordering' is renumbered in a later cell (after duplicates are
# removed), so this uses the ordering as loaded — confirm that is intended.
simpel_dataset['actor_sentiment'] = simpel_dataset['movie_score'].div(simpel_dataset['ordering'])

### Remove duplicate actors per movie

In [12]:
# Keep one row per (movie, person) pair, order each movie's rows by the
# existing 'ordering', then renumber that ordering 1..k without gaps.
simpel_dataset = (
    simpel_dataset
    .drop_duplicates(subset=['tconst', 'nconst'])
    .sort_values(['tconst', 'ordering'])
    .assign(ordering=lambda frame: frame.groupby('tconst').cumcount() + 1)
)

#### Calculating actors' prior movie sentiment scores

In [13]:
# Remember the current row order so rows with the same startYear keep a stable order.
simpel_dataset['_orig_order'] = np.arange(len(simpel_dataset))

# Identify people by 'nconst' rather than by display name: different people can
# share a primaryName, and grouping on the name would add their histories together.
# (nconst is presumably the unique person id, as in the drop_duplicates key used
# earlier in this notebook — confirm against the data source.)
simpel_dataset = simpel_dataset.sort_values(['nconst', 'startYear', '_orig_order'], kind='mergesort')

# Sum of the person's actor_sentiment over all *earlier* rows:
# running total within the person minus the current row's own value.
simpel_dataset['prior_movie_actor_sentiment'] = (
    simpel_dataset.groupby('nconst')['actor_sentiment'].cumsum()
    - simpel_dataset['actor_sentiment']
)

#### Converting year and runtime strings to nullable integers

In [14]:
# startYear: '\N' placeholders and anything unparseable become missing; the
# remaining values are floored and stored as nullable integers (Int64).
simpel_dataset['startYear'] = np.floor(
    pd.to_numeric(
        simpel_dataset['startYear'].replace('\\N', np.nan),
        errors='coerce',
    )
).astype('Int64')

# runtimeMinutes: same treatment.
simpel_dataset['runtimeMinutes'] = np.floor(
    pd.to_numeric(
        simpel_dataset['runtimeMinutes'].replace('\\N', np.nan),
        errors='coerce',
    )
).astype('Int64')


#### Calculating each actor's top-3 prior movie scores

In [16]:
# Order each person's rows chronologically (ties broken by the saved row order).
# People are identified by 'nconst' rather than by primaryName, because distinct
# people can share a name and would otherwise be given a common history.
simpel_dataset = simpel_dataset.sort_values(['nconst', 'startYear', '_orig_order'], kind='mergesort')

# Hand only 'movie_score' to the helper — it is the only column that affects
# the result. Selecting it explicitly also keeps the grouping column out of
# the frames passed to apply().
aug = (
    simpel_dataset
    .groupby('nconst', group_keys=False)[['movie_score']]
    .apply(cumulative_top3_prior)
)

# aug carries the original row labels, so the join lines up row by row.
simpel_dataset = simpel_dataset.join(aug)

  aug = simpel_dataset.groupby('primaryName', group_keys=False).apply(cumulative_top3_prior)


In [18]:
# Persist the engineered per-actor rows (the index is written as the first column).
# Forward slashes keep the relative path valid on every OS; the previous
# backslash form only resolved on Windows.
simpel_dataset.to_csv('../data/interrim_data1.csv', sep=';')

## One row per movie

In [6]:
# Reload the interim file written by the feature-engineering section above.
# The file name now matches the one that section saves (interrim_data1.csv), so
# a fresh Restart & Run All no longer depends on a stale interrim_data.csv, and
# forward slashes keep the relative path valid on every OS.
simpel_dataset = pd.read_csv('../data/interrim_data1.csv', sep=';')

In [7]:
# Number of actor slots per movie: passed as max_actors when pivoting and used
# as the loop bound when filling and dropping the actor{i}_* columns below.
num_actors = 10

In [8]:
# Pivot to one row per movie, keeping at most num_actors actors as actor{i}_* columns.
wide_df = make_movie_actor_wide(simpel_dataset, max_actors=num_actors)


In [9]:
# Sentinel convention for the actor{i}_* columns:
#   -1 -> the movie has no i-th actor at all (the slot's nconst is missing)
#    0 -> the actor exists but this particular feature is missing
feature_suffixes = (
    '_actor_sentiment',
    '_prior1_rating',
    '_prior2_rating',
    '_prior3_rating',
    '_prior_movie_actor_sentiment',
)

for i in range(1, num_actors + 1):
    id_col = f'actor{i}_nconst'
    # Decide which slots are empty *before* the id column is overwritten
    slot_empty = wide_df[id_col].isna()
    slot_filled = ~slot_empty
    wide_df.loc[slot_empty, id_col] = -1

    for suffix in feature_suffixes:
        feature_col = f'actor{i}{suffix}'
        if feature_col not in wide_df.columns:
            continue
        wide_df.loc[slot_empty, feature_col] = -1
        wide_df.loc[slot_filled & wide_df[feature_col].isna(), feature_col] = 0

### One hot encode

In [10]:
# Mean runtime of the movies sharing each row's startYear, aligned row by row.
mean_runtime_by_year = wide_df.groupby('startYear')['runtimeMinutes'].transform('mean')
# Fill missing runtimes with that per-year mean.
wide_df['runtimeMinutes'] = wide_df['runtimeMinutes'].fillna(mean_runtime_by_year)

In [11]:
# Bucket runtimeMinutes into the 'runtime_category' column (short / medium / long / very_long).
wide_df = categorize_runtime(wide_df, 'runtimeMinutes')

In [12]:
# Replace the three genre columns and the runtime bucket with one-hot indicator columns.
wide_df = one_hot_encode_columns(wide_df, ['genre_1', 'genre_2', 'genre_3', 'runtime_category'])

## Removing columns

In [13]:
# Drop the per-actor identifier and name columns for every slot in one pass.
actor_text_cols = [
    name
    for i in range(1, num_actors + 1)
    for name in (f'actor{i}_nconst', f'actor{i}_primaryName')
]
wide_df = wide_df.drop(columns=actor_text_cols)

In [14]:
# Remaining identifier / free-text columns to remove. 'Unnamed: 0' is presumably
# the index column that to_csv wrote and read_csv loaded back as data.
columns_to_drop = [
    'Unnamed: 0',
    'tconst',
    'titleType',
    'primaryTitle',
    'originalTitle',
    'directors',
    'category',
    'job',
]
wide_df = wide_df.drop(columns_to_drop, axis=1)