In [1]:
import pandas as pd 
import numpy as np
import heapq
from typing import List, Optional
import sklearn.preprocessing
import matplotlib.pyplot as plt

In [2]:
def handle_missing_actor_value(df, num_actors):
    """Replace missing per-actor values with sentinel numbers, in place.

    For every actor slot ``i`` in ``1..num_actors``:

    * rows whose ``actor{i}_nconst`` is missing get ``-1`` in that column and
      in every per-actor feature column of the slot that exists in ``df``;
    * rows that do have an actor but lack a feature value get ``0`` for that
      feature.

    Args:
        df: DataFrame containing an ``actor{i}_nconst`` column for every slot.
            Feature columns are optional and skipped when absent.
        num_actors: Number of actor slots to process (slots are 1-based).

    Returns:
        The same ``df`` object, modified in place.
    """
    feature_suffixes = (
        '_actor_sentiment',
        '_prior1_rating',
        '_prior2_rating',
        '_prior3_rating',
        '_prior_movie_actor_sentiment',
    )

    for slot in range(1, num_actors + 1):
        id_col = f'actor{slot}_nconst'
        no_actor = df[id_col].isna()
        has_actor = ~no_actor

        # Slot is empty: mark the id itself with the -1 sentinel.
        df.loc[no_actor, id_col] = -1

        slot_features = [f'actor{slot}{suffix}' for suffix in feature_suffixes]
        for feature_col in slot_features:
            if feature_col not in df.columns:
                continue
            # -1 means "no actor in this slot"; 0 means "actor known, value unknown".
            df.loc[no_actor, feature_col] = -1
            df.loc[has_actor & df[feature_col].isna(), feature_col] = 0

    return df

In [3]:
def fill_mean (df, column, groupby):
    """Fill missing values of ``column`` with the mean of their group, in place.

    Each missing value is replaced by the mean of ``column`` over the rows
    that share the same ``groupby`` key. When no group mean is available
    (for example, every value in the group is missing) the value falls back
    to the mean of the whole column, so such rows are not left as NaN.

    Args:
        df: DataFrame to modify.
        column: Name of the numeric column to impute.
        groupby: Column name (or list of names) defining the groups.

    Returns:
        The same ``df`` object with ``column`` imputed. Values remain missing
        only if the column has no observed value at all.
    """
    # Computed before any filling so it reflects observed values only.
    overall_mean = df[column].mean()
    group_means = df.groupby(groupby)[column].transform('mean')
    df[column] = df[column].fillna(group_means).fillna(overall_mean)
    return df

In [4]:
def categorize_runtime(df, column_name='runtime'):
    """Add a ``runtime_category`` column that buckets runtimes given in minutes.

    Buckets are left-closed / right-open:
    ``[0, 90)`` short, ``[90, 120)`` medium, ``[120, 150)`` long and
    ``[150, inf)`` very_long. Values outside every bucket (missing or
    negative) get a missing category.

    Args:
        df: DataFrame to modify in place.
        column_name: Name of the column holding the runtime in minutes.

    Returns:
        The same ``df`` object with the new categorical column added.
    """
    # Label -> exclusive upper bound (minutes); insertion order defines the bins.
    upper_bound_by_label = {
        'short': 90,
        'medium': 120,
        'long': 150,
        'very_long': float('inf'),
    }
    edges = [0, *upper_bound_by_label.values()]

    df['runtime_category'] = pd.cut(
        df[column_name],
        bins=edges,
        labels=list(upper_bound_by_label),
        right=False,
    )
    return df

In [5]:
def one_hot_encode_columns(df, columns):
    """Replace each listed column with its one-hot encoded indicator columns.

    A fresh ``OneHotEncoder`` is fitted per column on every call, so the
    produced indicator columns depend on the categories present in ``df``.
    The encoded columns are appended to the right of the frame, in the order
    the source columns are listed.

    Args:
        df: Input DataFrame; it is not modified.
        columns: Iterable of column names to encode.

    Returns:
        A new DataFrame without the original ``columns`` and with the
        indicator columns added. Row index and row order match ``df``.
    """
    for col in columns:
        encoder = sklearn.preprocessing.OneHotEncoder(sparse_output=False)
        encoded = encoder.fit_transform(df[[col]])
        encoded_df = pd.DataFrame(
            encoded,
            columns=encoder.get_feature_names_out([col]),
            # Reuse the source index: concat(axis=1) aligns on index labels, so
            # a default RangeIndex here would misalign rows (and introduce NaN
            # rows) whenever df is filtered, shuffled or otherwise re-indexed.
            index=df.index,
        )
        df = pd.concat([df.drop(col, axis=1), encoded_df], axis=1)
    return df

In [6]:
def remove_actors_columns(df, num_actors):
    """Drop the id and name columns of actor slots ``1..num_actors``.

    All ``actor{i}_nconst`` / ``actor{i}_primaryName`` columns are removed in
    a single ``drop`` call rather than one full-frame copy per actor slot.

    Args:
        df: Input DataFrame; it is not modified.
        num_actors: Number of actor slots (1-based) whose columns are removed.

    Returns:
        A new DataFrame without those columns.

    Raises:
        KeyError: If any of the expected actor columns is missing from ``df``.
    """
    actor_columns = [
        f'actor{i}_{field}'
        for i in range(1, num_actors + 1)
        for field in ('nconst', 'primaryName')
    ]
    return df.drop(columns=actor_columns)

# Load data

In [7]:
# Number of actor slots per row: the helper functions iterate over the
# columns actor1_* .. actor{num_actors}_*.
num_actors = 10

In [8]:
# Semicolon-separated source dataset. Forward slashes keep the relative path
# valid on Windows, Linux and macOS (a backslash path only resolves on Windows).
data = pd.read_csv('../data/dataset.csv', sep=';')

# Preparing training set

### Handle missing values

In [None]:
# Sentinel-fill the per-actor columns, then impute missing 'runtimeMinutes'
# values from the rows that share the same 'startYear'.
data = (
    data
    .pipe(handle_missing_actor_value, num_actors)
    .pipe(fill_mean, 'runtimeMinutes', 'startYear')
)

### Categorize runtime and one-hot encode

In [None]:
# Bucket the runtimes first: the derived 'runtime_category' column is one of
# the columns that gets one-hot encoded in the second step.
data = (
    data
    .pipe(categorize_runtime, 'runtimeMinutes')
    .pipe(one_hot_encode_columns, ['genre_1', 'genre_2', 'genre_3', 'runtime_category'])
)

### Drop columns

In [10]:
# Columns removed in addition to the per-actor id/name columns.
# 'Unnamed: 0' is the name pandas assigns to a header-less first column --
# presumably an index saved by an earlier export (TODO confirm).
columns_to_drop = [
    'Unnamed: 0',
    'tconst',
    'titleType',
    'primaryTitle',
    'originalTitle',
    'directors',
    'category',
    'job',
]

data = remove_actors_columns(data, num_actors)
data = data.drop(columns=columns_to_drop)

In [None]:
# Forward slashes keep the relative path valid on Windows, Linux and macOS
# (a backslash path only resolves on Windows).
# NOTE(review): the row index is still written as the first column (pandas
# default) and will be read back as 'Unnamed: 0'; left unchanged here because
# downstream readers may depend on it.
data.to_csv('../data/training_dataset.csv', sep=';')

# Preparing test set