In [1]:
import pandas as pd 
import numpy as nps
import sklearn.preprocessing

In [12]:
def handle_missing_actor_value (df, num_actors):
    for i in range (1, num_actors + 1):
        actor_pos = f'actor{i}_nconst'
        actor_null_mask = df[actor_pos].isna()
        actor_non_null_mask = ~actor_null_mask
        df.loc[actor_null_mask, actor_pos] = -1
        for suffix in ['_actor_sentiment', '_prior1_rating_actor', '_prior2_rating_actor', '_prior3_rating_actor', '_prior_movie_actor_sentiment']:
            col_name = f'actor{i}{suffix}'
            if col_name in df.columns:
                df.loc[actor_null_mask, col_name] = -1
                df.loc[actor_non_null_mask & df[col_name].isna(), col_name] = 0
    return df

In [16]:
def handle_missing_director_value (df):
    actor_null_mask = df['directors'].isna()
    actor_non_null_mask = ~actor_null_mask
    df.loc[actor_null_mask] = -1
    for col_name in ['prior_movie_director_sentiment', 'prior1_rating_director', 'prior2_rating_director', 'prior3_rating_director']:
        if col_name in df.columns:
            df.loc[actor_null_mask, col_name] = -1
            df.loc[actor_non_null_mask & df[col_name].isna(), col_name] = 0
    return df

In [3]:
def fill_mean (df, column, groupby):
    df[column] = df[column].fillna(df.groupby(groupby)[column].transform('mean'))
    return df

In [4]:
def categorize_runtime(df, column_name='runtime'):
    # Define bins (in minutes)
    bins = [0, 90, 120, 150, float('inf')]
    labels = ['short', 'medium', 'long', 'very_long']

    # Create a new column for runtime category
    df['runtime_category'] = pd.cut(df[column_name], bins=bins, labels=labels, right=False)

    return df

In [5]:
def one_hot_encode_columns(df, columns):
    for col in columns:
        encoder = sklearn.preprocessing.OneHotEncoder(sparse_output=False)
        encoded = encoder.fit_transform(df[[col]])
        # Preserve original index to avoid new rows being created
        encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out([col]), index=df.index)
        # Concatenate safely
        df = pd.concat([df.drop(col, axis=1), encoded_df], axis=1)

    return df


In [6]:
def remove_actors_columns(df, num_actors):
    for i in range (1, num_actors + 1):
        actor_id = f'actor{i}_nconst'
        actor_name = f'actor{i}_primaryName'
        df = df.drop(columns=[actor_id, actor_name])
    return df

In [7]:
def align_columns(df_train, df_test):
    for col in df_train.columns:
        if col not in df_test.columns:
            df_test[col] = 0
    df_test = df_test[df_train.columns]
    return df_test

# Split data

## Variables

In [None]:
num_actors = 10
split_ratio = 0.8
columns_to_drop = ['Unnamed: 0.1', 'Unnamed: 0', 'tconst', 'titleType', 'primaryTitle', 'originalTitle', 'directors', 'category', 'job']

In [18]:
data = pd.read_csv('..\\data\\dataset.csv', sep=';', na_values=[r'\N'])

In [14]:
data = data.sort_values(by=['startYear', '_orig_order'])
# Calculate the split index
split_index = int(len(data) * split_ratio)
# Determine the split year
split_year = data.iloc[split_index]['startYear']
# Split the dataframe
df_train = data.iloc[:split_index]
df_test = data.iloc[split_index:]

# Preparing training set

### Handle missing values

In [17]:
df_train = handle_missing_actor_value(df_train, num_actors)
df_train = handle_missing_director_value(df_train)
df_train = fill_mean(df_train, 'runtimeMinutes', 'startYear')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].fillna(df.groupby(groupby)[column].transform('mean'))


### One hot encode

In [12]:
df_train = categorize_runtime(df_train, 'runtimeMinutes')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['runtime_category'] = pd.cut(df[column_name], bins=bins, labels=labels, right=False)


In [13]:
df_train = one_hot_encode_columns(df_train, ['genre_1', 'genre_2', 'genre_3', 'runtime_category'])

In [14]:
df_train = df_train.sort_values(by=['startYear', '_orig_order'])

In [15]:
df_train = df_train.dropna(subset=['tconst'])

### Drop columns

In [16]:
df_train = remove_actors_columns(df_train, num_actors)
df_train = df_train.drop(columns=columns_to_drop)

In [17]:
df_train.to_csv('..\\data\\training_dataset.csv', sep =';')

# Preparing test set

### Handle missing values

In [17]:
df_test = handle_missing_actor_value(df_test, num_actors)
df_test = fill_mean(df_test, 'runtimeMinutes', 'startYear')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].fillna(df.groupby(groupby)[column].transform('mean'))


### One hot encode

In [18]:
df_test = categorize_runtime(df_test, 'runtimeMinutes')
df_test = one_hot_encode_columns(df_test, ['genre_1', 'genre_2', 'genre_3', 'runtime_category'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['runtime_category'] = pd.cut(df[column_name], bins=bins, labels=labels, right=False)


In [19]:
df_test = df_test.dropna(subset=['tconst'])

### Drop columns

In [20]:
df_test = remove_actors_columns(df_test, num_actors)
df_test = df_test.drop(columns=columns_to_drop)

In [21]:
df_test = align_columns(df_train, df_test)

In [22]:
df_test.to_csv('..\\data\\test_dataset.csv', sep =';')