In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): a blanket "ignore" also hides any deprecation or dtype
# warnings the cells below might raise — consider narrowing the filter.
warnings.filterwarnings("ignore")

# Use seaborn's "whitegrid" style for any plots drawn after this point.
sns.set_style("whitegrid")

In [2]:
# Load the training split into `data`; every later cell transforms this frame.
# (Judging by the path, the file comes from the "2. Train Test Split" step.)
data=pd.read_csv("../2. Train Test Split/training_data.csv")

1. To Prevent Overfitting:
Since a model can easily overfit to the noise in categories that have only a few samples, we identify these rare categories using a frequency threshold and group them into a single, more statistically stable "other" category.

2. To Prevent Multicollinearity:
We do this by dropping one dummy column (via `drop_first=True`) when building the `temp` dataframe.

In [3]:
def get_dummies_and_sift(x, df, percent_threshold=0.01):
    """
    Creates dummy variables for a column after grouping rare categories.

    Categories that occur in fewer than ``len(df) * percent_threshold`` rows
    are replaced by the single label ``f'Other_{x}'``. The column is then
    one-hot encoded with the first level dropped (``drop_first=True``) and
    the original column ``x`` is removed from the result.

    The input DataFrame is left untouched: all work is done on derived
    objects and a new DataFrame is returned.

    Args:
        x (str): The name of the column to process.
        df (pd.DataFrame): The DataFrame containing the data.
        percent_threshold (float): Fraction of the total number of rows.
            Categories with a count below ``len(df) * percent_threshold``
            are grouped together. Defaults to 0.01 (1% of the rows).

    Returns:
        tuple: A tuple containing:
            - pd.DataFrame: A new DataFrame without column ``x`` and with the
              integer (0/1) dummy columns appended on the right.
            - list: A list of the new dummy column names that were created.
              The names are the category values themselves (no prefix).
    """

    threshold = len(df) * percent_threshold
    counts = df[x].value_counts()
    categories_to_lump = counts[counts < threshold].index

    # Build the lumped column as a separate Series rather than assigning into
    # `df`, so the caller's DataFrame is never modified.
    lumped = df[x].mask(df[x].isin(categories_to_lump), f'Other_{x}')

    # dtype=int yields 0/1 columns directly instead of booleans.
    temp = pd.get_dummies(lumped, drop_first=True, dtype=int)

    new_column_names = temp.columns.tolist()

    # Same column order as before: the remaining original columns, then dummies.
    result = pd.concat([df.drop(columns=[x]), temp], axis=1)

    return result, new_column_names

In [4]:
# Start the feature list with age_at_baseline and visit_month.
initial_feature_cols = ["age_at_baseline", "visit_month"]

# Encode education level; its dummy columns are added to the feature list.
data, new_cols_edu_hist = get_dummies_and_sift(x='education_level_years', df=data)
initial_feature_cols.extend(new_cols_edu_hist)

# Encode race and sex as well.
# NOTE(review): unlike education, these dummy column names are not appended
# to initial_feature_cols — confirm that leaving them out is intentional.
data, new_cols_race = get_dummies_and_sift(x='race', df=data)
data, new_cols = get_dummies_and_sift(x='sex', df=data)

Rename the target columns so that they can be more easily referred to.

In [5]:
# Give the three MDS-UPDRS summary-score target columns short names.
data = data.rename(
    columns={
        "mds_updrs_part_i_summary_score": "updrs_1",
        "mds_updrs_part_ii_summary_score": "updrs_2",
        "mds_updrs_part_iii_summary_score": "updrs_3",
    }
)

These columns contain only "Yes", "No" or NaN values. We replace "Yes" with 1 and "No" with 0, set their data type to "float64", and add all of them to `initial_feature_cols`.

In [6]:
# Columns whose raw values are "Yes", "No" or NaN.
map_cols = ["caff_drinks_current_use",
            "caff_drinks_ever_used_regularly",
            "biological_mother_with_pd",
            "biological_father_with_pd",
            "other_relative_with_pd"]

# Only columns that actually exist in `data` are encoded, and only those are
# registered as features, so the feature list never names a missing column.
present_map_cols = [col for col in map_cols if col in data.columns]

for col in present_map_cols:
    # 'Yes' -> 1, 'No' -> 0; any other value (including NaN) becomes NaN.
    # astype returns a new Series, so the converted result is assigned back
    # (the bare `data[col].astype("float64")` call discarded it).
    data[col] = data[col].map({'Yes': 1, 'No': 0}).astype("float64")

initial_feature_cols += present_map_cols


The dataframe contains both time-dependent features and static (non-time-dependent) features. We classify them by collecting each kind in its own list, then write the lists to disk so that later notebooks can refer to them.

According to our analysis in `Data Preparation`, only the features from `releases_2023_v4release_1027_clinical_DaTSCAN_SBR.csv` are time-dependent; all other features are static.

In [7]:
# The four SBR columns treated as time-dependent features.
time_dependent_clinical_features = [
    'sbr_caudate_r',
    'sbr_caudate_l',
    'sbr_putamen_r',
    'sbr_putamen_l',
]

# Write one feature name per line.
with open('time_dependent_clinical_features.txt', 'w') as out_file:
    out_file.writelines(
        name + '\n' for name in time_dependent_clinical_features
    )


# Static features: the columns collected so far plus every column whose name
# mentions "Eigenvalue" or "Fractional Anisotropy".
static_clinical_features = initial_feature_cols + [
    col
    for col in data.columns
    if "Eigenvalue" in col or "Fractional Anisotropy" in col
]

# Write one feature name per line.
with open('static_clinical_features.txt', 'w') as out_file:
    out_file.writelines(name + '\n' for name in static_clinical_features)


In [8]:
# Write the preprocessed training frame to CSV; index=False leaves the row
# index out of the file.
data.to_csv("preprocessed_training_data.csv", index=False)