## Imports

In [1]:
import pandas as pd 
import numpy as np
import sklearn.preprocessing
from sklearn.preprocessing import MultiLabelBinarizer
import ast

## Functions

In [2]:
def handle_missing_actor_value(df, num_actors):
    """Fill the gaps in the actor id columns and in the per-actor feature columns.

    For every actor slot ``actor{i}`` (``i`` from 1 to ``num_actors``) the rows are
    split on whether ``actor{i}_nconst`` is null:

    * no actor in the slot: the id and every related feature value become -1
    * actor present but a feature value is null: that value becomes 0

    Related feature columns that do not exist in ``df`` are skipped. The dataframe is
    modified in place and is also returned.

    Args:
        df (DataFrame): Dataframe with actor columns
        num_actors (Int): The amount of actor slots in the dataframe

    Returns:
        DataFrame: The same dataframe, with the missing values handled
    """
    feature_suffixes = (
        '_actor_sentiment',
        '_prior1_rating_actor',
        '_prior2_rating_actor',
        '_prior3_rating_actor',
        '_prior_movie_actor_sentiment',
    )

    for slot in range(1, num_actors + 1):
        id_column = f'actor{slot}_nconst'
        # Both masks are taken before the id column itself is overwritten below
        slot_is_empty = df[id_column].isna()
        slot_is_filled = ~slot_is_empty
        # Empty slot: mark the id as -1
        df.loc[slot_is_empty, id_column] = -1

        for suffix in feature_suffixes:
            feature = f'actor{slot}{suffix}'
            if feature not in df.columns:
                continue
            # No actor at all: the feature is marked with -1
            df.loc[slot_is_empty, feature] = -1
            # Actor exists but the feature was never recorded: fall back to 0
            df.loc[df[feature].isna() & slot_is_filled, feature] = 0
    return df

In [3]:
def handle_missing_director_value(df):
    """Fill the gaps in the director column and in the director feature columns.

    Rows are split on whether ``directors`` is null:

    * no director: ``directors`` and every director feature value become -1
    * director present but a feature value is null: that value becomes 0

    Director feature columns that do not exist in ``df`` are skipped. The dataframe
    is modified in place and is also returned.

    Args:
        df (DataFrame): Dataframe with director columns

    Returns:
        DataFrame: The same dataframe, with the missing values handled
    """
    feature_columns = (
        'prior_movie_director_sentiment',
        'prior1_rating_director',
        'prior2_rating_director',
        'prior3_rating_director',
    )

    # Both masks are taken before the directors column itself is overwritten
    no_director = df['directors'].isna()
    has_director = ~no_director
    df.loc[no_director, 'directors'] = -1

    for feature in feature_columns:
        if feature not in df.columns:
            continue
        # No director on the row: the feature is marked with -1
        df.loc[no_director, feature] = -1
        # Director exists but the feature value is missing: fall back to 0
        df.loc[df[feature].isna() & has_director, feature] = 0
    return df

In [4]:
def fill_mean(df, column, groupby):
    """Replace the null values of a column with the mean of the row's group.

    Values that are already present are kept. A null value stays null when its
    group has no known value to average. The dataframe is modified in place and
    is also returned.

    Args:
        df (DataFrame): The dataframe that needs to be filled
        column (String): The column whose null values are filled
        groupby (String): The column that defines the groups

    Returns:
        DataFrame: Updated dataframe
    """
    # Per-row mean of the row's own group, aligned with the index of df
    group_means = df.groupby(groupby)[column].transform('mean')
    # Only the null cells pick up the group mean
    df[column] = df[column].fillna(group_means)
    return df

In [5]:
def categorize_runtime(df, column_name='runtime'):
    """Add a ``runtime_category`` column that buckets a runtime given in minutes.

    The buckets are left-closed intervals:
    [0, 90) 'short', [90, 120) 'medium', [120, 150) 'long', [150, inf) 'very_long'.
    Values that fall outside every bucket (negative or null) get a null category.
    The dataframe is modified in place and is also returned.

    Args:
        df (DataFrame): Dataframe that holds the runtime column
        column_name (str, optional): Name of the runtime column. Defaults to 'runtime'.

    Returns:
        DataFrame: The same dataframe with the added categorical column
    """
    # Lower bound (in minutes) of each bucket, in ascending order
    lower_bounds = {'short': 0, 'medium': 90, 'long': 120, 'very_long': 150}
    edges = [*lower_bounds.values(), float('inf')]
    names = list(lower_bounds)

    # right=False makes every bucket include its lower bound and exclude its upper bound
    df['runtime_category'] = pd.cut(df[column_name], bins=edges, labels=names, right=False)
    return df

In [6]:
def one_hot_encode_columns(df, columns, encoder=None):
    """One-hot encode columns, reusing already fitted encoders where available.

    For each column, the encoder found in ``encoder`` is used for transform only.
    Columns without an entry in ``encoder`` get a new ``OneHotEncoder`` fitted on
    ``df``. Each encoded column is dropped and replaced by its one-hot columns,
    which are appended at the end of the dataframe.

    Args:
        df (DataFrame): Dataframe that holds the columns to encode
        columns (list): Names of the columns to encode
        encoder (dict, optional): Maps column names to fitted encoders. Defaults to None.

    Returns:
        tuple: (df, encoder_dict) where encoder_dict maps every encoded column
        name to the encoder that was used for it, whether reused or newly fitted
    """
    encoder_dict = {}
    for col in columns:
        if encoder is not None and col in encoder:
            # Use provided encoder (transform only, so the layout matches the fitted data)
            enc = encoder[col]
            encoded = enc.transform(df[[col]])
        else:
            # Fit new encoder
            enc = sklearn.preprocessing.OneHotEncoder(sparse_output=False, handle_unknown='ignore')
            encoded = enc.fit_transform(df[[col]])
        # Record the encoder in both cases so the caller can keep passing the result on
        encoder_dict[col] = enc
        # Preserve original index to avoid new rows being created by the concat
        encoded_df = pd.DataFrame(encoded, columns=enc.get_feature_names_out([col]), index=df.index)
        df = pd.concat([df.drop(col, axis=1), encoded_df], axis=1)

    return df, encoder_dict


In [7]:
def remove_actors_columns(df, num_actors):
    """Removes the actor identifier columns (nconst and primaryName) from the dataframe.

    The other per-actor columns (sentiment, prior ratings) are kept. The input
    dataframe is not modified.

    Args:
        df (DataFrame): A dataframe with unwanted actor columns
        num_actors (Int): The amount of actors in the dataframe

    Returns:
        DataFrame: A new dataframe without the actor identifier columns

    Raises:
        KeyError: If one of the expected actor columns is not in the dataframe
    """
    # Collect every identifier column first so the frame is copied once, not once per actor
    unwanted = [
        f'actor{i}_{field}'
        for i in range(1, num_actors + 1)
        for field in ('nconst', 'primaryName')
    ]
    return df.drop(columns=unwanted)

In [8]:
def align_columns(df_train, df_test):
    """Gives the testing dataset exactly the columns of the training dataset.

    Columns that exist in the training data but not in the testing data are added
    filled with 0, columns that only exist in the testing data are dropped, and the
    result follows the column order of the training data. The input testing
    dataframe is not modified.

    Args:
        df_train (DataFrame): Dataframe with training data
        df_test (DataFrame): Dataframe with testing data

    Returns:
        DataFrame: New testing dataframe with the same columns, in the same order,
        as the training dataframe
    """
    # reindex adds, drops and reorders in one step without touching the caller's frame
    return df_test.reindex(columns=df_train.columns, fill_value=0)

# Split data

## Variables

In [9]:
# Number of actor slots (actor1_..., actor2_..., ...) that are processed per row
num_actors = 10
# Fraction of the year-sorted rows used to locate the year on which train and test are split
split_ratio = 0.8
# Columns removed from both the training and the test data before they are saved
columns_to_drop = ['Unnamed: 0', 'tconst', 'titleType', 'primaryTitle', 'originalTitle', 'directors', 'category', 'job']
# NOTE(review): not referenced in the visible part of this notebook - confirm it is still needed
votes_max = 1000

### Retrieve data from the prepared dataset

In [10]:
import os

# Windows-style separators are only used where the OS path separator is a backslash,
# so the notebook runs unchanged on Windows, Linux and macOS
useBackslash = os.sep == '\\'
retrieveDatasetFrom = r'..\common_datasets\number_votes_dataset_log_scaled.csv' if useBackslash else r'../common_datasets/number_votes_dataset_log_scaled.csv'
data = pd.read_csv(retrieveDatasetFrom, sep=';')
# Cells that were read as text are parsed back into Python literals
# (presumably lists of genre names - confirm against the dataset); other cells are left as they are
data["genres_list"] = data["genres_list"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

Replaces the `\N` placeholder in the `directors` column with NaN, so that missing directors can be detected as null values

In [11]:
# '\N' is the placeholder used for "no director"; turning it into NaN lets isna() find those rows
data['directors'] = data['directors'].replace({r'\N': np.nan})

### Splits the data on a given year determined by the split ratio

In [12]:
# Order the rows chronologically so that a cut by position is also a cut in time
data = data.sort_values(by=['startYear', '_orig_order'])
# Calculate the split index
split_index = int(len(data) * split_ratio)
# Determine the split year
split_year = data.iloc[split_index]['startYear']
# Split the dataframe on the year boundary: the whole split year goes to the test set, so a
# year is never shared between the two sets (rows with a missing startYear match neither
# comparison and end up in neither set).
# .copy() gives each set its own data; the later in-place fills would otherwise operate on
# filtered views of `data` and trigger SettingWithCopyWarning.
df_train = data[data['startYear'] < split_year].copy()
df_test  = data[data['startYear'] >= split_year].copy()

# Preparing training set

### Handle missing values

In [None]:
# Director gaps, then actor gaps, then missing runtimes take the mean runtime of their startYear
df_train = (
    df_train
    .pipe(handle_missing_director_value)
    .pipe(handle_missing_actor_value, num_actors)
    .pipe(fill_mean, 'runtimeMinutes', 'startYear')
)
# A year without any known runtime has no mean to fall back on; those rows get 0 (2 rows)
df_train['runtimeMinutes'] = df_train['runtimeMinutes'].fillna(0)

### One hot encode

In [None]:
# Bucket the runtime into the bins defined in categorize_runtime
df_train = categorize_runtime(df_train, 'runtimeMinutes')

# Drop the "\N" placeholder from every genre list before encoding
df_train["genres_list"] = df_train["genres_list"].apply(
    lambda genres: [genre for genre in genres if genre != "\\N"]
)

# Fit the MultiLabelBinarizer on the training data; the fitted `mlb` is reused for the test data
mlb = MultiLabelBinarizer()
genre_encoded = mlb.fit_transform(df_train["genres_list"])

# One indicator column per genre, named genre_<lowercase name with underscores>
genre_columns = [f"genre_{g.lower().replace(' ', '_')}" for g in mlb.classes_]
genre_df = pd.DataFrame(genre_encoded, index=df_train.index, columns=genre_columns)

# Swap the list column for its indicator columns
df_train = pd.concat([df_train.drop(columns=["genres_list"]), genre_df], axis=1)
# One-hot encode the runtime bucket; the fitted encoder is kept for the test data
df_train, runtime_encoder = one_hot_encode_columns(df_train, ['runtime_category'])


In [15]:
# Keep the rows in temporal order: by year first, then by their original position
df_train = df_train.sort_values(['startYear', '_orig_order'])

In [16]:
# Keep only the rows that have a movie id (tconst)
df_train = df_train[df_train['tconst'].notna()]

### Drop columns

In [17]:
# Remove the actor id/name columns first, then the remaining columns listed in columns_to_drop
df_train = remove_actors_columns(df_train, num_actors).drop(columns=columns_to_drop)

Saves the training data to the `data` folder inside the current folder

In [18]:
# Pick the path spelling that matches the separator style selected by useBackslash
if useBackslash:
    saveTrainingDatasetAt = r'.\data\training_dataset.csv'
else:
    saveTrainingDatasetAt = r'./data/training_dataset.csv'
# Written with ';' as separator; the dataframe index is written as the first column
df_train.to_csv(saveTrainingDatasetAt, sep=';')

# Preparing test set

### Handle missing values

In [None]:
# Fills missing director values
df_test = handle_missing_director_value(df_test)
# Fills missing actor values
df_test = handle_missing_actor_value(df_test, num_actors)
# Fills the missing runtimeMinutes with the mean of the whole startYear
df_test = fill_mean(df_test, 'runtimeMinutes', 'startYear')
# Fills in missing values with zero if there aren't any movies from the year with a runtime.
# This must target df_test: a runtime left as NaN here would get no runtime category below
# and would be written to the test dataset as a missing value.
df_test['runtimeMinutes'] = df_test['runtimeMinutes'].fillna(0)

### One hot encode

In [None]:
# Bucket the runtime into the bins defined in categorize_runtime
df_test = categorize_runtime(df_test, 'runtimeMinutes')

# Drop the "\N" placeholder from every genre list before encoding
df_test["genres_list"] = df_test["genres_list"].apply(
    lambda genres: [genre for genre in genres if genre != "\\N"]
)

# Transform only (no fit): the binarizer fitted on the training data defines the genre columns
genre_encoded = mlb.transform(df_test["genres_list"])

# One indicator column per genre, named genre_<lowercase name with underscores>
genre_columns = [f"genre_{g.lower().replace(' ', '_')}" for g in mlb.classes_]
genre_df_test = pd.DataFrame(genre_encoded, index=df_test.index, columns=genre_columns)

# Swap the list column for its indicator columns
df_test = pd.concat([df_test.drop(columns=["genres_list"]), genre_df_test], axis=1)
# Reuse the runtime encoder fitted on the training data; the returned encoder mapping is not needed
df_test, _ = one_hot_encode_columns(df_test, ['runtime_category'], encoder=runtime_encoder)


In [21]:
# Keep only the rows that have a movie id (tconst)
df_test = df_test[df_test['tconst'].notna()]

### Drop columns

In [22]:
# Remove the actor id/name columns first, then the remaining columns listed in columns_to_drop
df_test = remove_actors_columns(df_test, num_actors).drop(columns=columns_to_drop)

### Align the columns from the training data with the test data
This is done so that the two dataframes have the same columns in the same order

In [23]:
# Give the test data exactly the training columns, in the training order: columns missing
# from the test data are added filled with 0, columns only present in the test data are dropped
df_test = align_columns(df_train, df_test)

### Saves the data

In [24]:
# Pick the path spelling that matches the separator style selected by useBackslash
if useBackslash:
    saveTestDatasetAt = r'.\data\test_dataset.csv'
else:
    saveTestDatasetAt = r'./data/test_dataset.csv'
# Written with ';' as separator; the dataframe index is written as the first column
df_test.to_csv(saveTestDatasetAt, sep=';')