In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from utils import delete_patients_with_the_same_GUID



sns.set_style("whitegrid")

We load each data table into an aptly named variable and remove patients who appear under different participant IDs but share the same GUID, to ensure that there is no data leakage.

In [75]:
# MDS-UPDRS Part I: read the raw release table, then remove participants that share a GUID.
updrs_1 = pd.read_csv("../Data/releases_2023_v4release_1027_clinical_MDS_UPDRS_Part_I.csv")
updrs_1_cleaned = delete_patients_with_the_same_GUID(updrs_1)

In [76]:
# MDS-UPDRS Part II: read the raw release table, then remove participants that share a GUID.
updrs_2 = pd.read_csv("../Data/releases_2023_v4release_1027_clinical_MDS_UPDRS_Part_II.csv")
updrs_2_cleaned = delete_patients_with_the_same_GUID(updrs_2)

In [77]:
# MDS-UPDRS Part III: read the raw release table, then remove participants that share a GUID.
updrs_3 = pd.read_csv("../Data/releases_2023_v4release_1027_clinical_MDS_UPDRS_Part_III.csv")
updrs_3_cleaned = delete_patients_with_the_same_GUID(updrs_3)


In [78]:
# Demographics: read the raw release table, then remove participants that share a GUID.
demographic_data = pd.read_csv("../Data/releases_2023_v4release_1027_clinical_Demographics.csv")
demographic_data_cleaned = delete_patients_with_the_same_GUID(demographic_data)

In [79]:
# Family history of PD: read the raw release table, then remove participants that share a GUID.
family_data = pd.read_csv("../Data/releases_2023_v4release_1027_clinical_Family_History_PD.csv")
family_data_cleaned = delete_patients_with_the_same_GUID(family_data)

In [94]:
# Caffeine history: read the raw release table, then remove participants that share a GUID.
caffeine_data = pd.read_csv("../Data/releases_2023_v4release_1027_clinical_Caffeine_history.csv")
caffeine_data_cleaned = delete_patients_with_the_same_GUID(caffeine_data)


In [96]:
# DTI measurements: read the raw release table, then remove participants that share a GUID.
DTI_data = pd.read_csv("../Data/releases_2023_v4release_1027_clinical_DTI.csv")
DTI_data_cleaned = delete_patients_with_the_same_GUID(DTI_data)


In [80]:
updrs_1_cleaned.columns

Index(['participant_id', 'GUID', 'visit_name', 'visit_month',
       'mds_updrs_part_i_primary_info_source',
       'code_upd2101_cognitive_impairment',
       'code_upd2102_hallucinations_and_psychosis',
       'code_upd2103_depressed_mood', 'code_upd2104_anxious_mood',
       'code_upd2105_apathy',
       'code_upd2106_dopamine_dysregulation_syndrome_features',
       'upd2101_cognitive_impairment', 'upd2102_hallucinations_and_psychosis',
       'upd2103_depressed_mood', 'upd2104_anxious_mood', 'upd2105_apathy',
       'upd2106_dopamine_dysregulation_syndrome_features',
       'mds_updrs_part_i_sub_score',
       'mds_updrs_part_i_pat_quest_primary_info_source',
       'code_upd2107_pat_quest_sleep_problems',
       'code_upd2108_pat_quest_daytime_sleepiness',
       'code_upd2109_pat_quest_pain_and_other_sensations',
       'code_upd2110_pat_quest_urinary_problems',
       'code_upd2111_pat_quest_constipation_problems',
       'code_upd2112_pat_quest_lightheadedness_on_standing',
  

We include as features all the UPDRS scores at baseline except the freezing-of-gait indicators, namely Question 13 of UPDRS Part II and Question 11 of UPDRS Part III.

In [81]:
# Baseline rows (visit_month == 0) of each UPDRS part provide the time-independent
# features. The visit bookkeeping columns are removed, and the freezing-of-gait items
# of Part II and Part III are excluded here because they are used to build the outcome.
# NOTE(review): unlike the FOG extraction further down, this filter does not exclude
# 'SC' or '#' visits; confirm that no participant has more than one visit_month == 0
# row, otherwise the merges on participant_id would duplicate rows.
visit_columns = ['GUID', 'visit_name', 'visit_month']

updrs_1_at_baseline = (
    updrs_1_cleaned.loc[updrs_1_cleaned["visit_month"] == 0]
    .drop(columns=visit_columns)
)
updrs_2_at_baseline = (
    updrs_2_cleaned.loc[updrs_2_cleaned["visit_month"] == 0]
    .drop(columns=["code_upd2213_freezing", *visit_columns])
)
updrs_3_at_baseline = (
    updrs_3_cleaned.loc[updrs_3_cleaned["visit_month"] == 0]
    .drop(columns=["code_upd2311_freezing_of_gait", *visit_columns])
)


In [82]:
# Outer-join the three baseline tables on participant_id, so a participant missing
# from one UPDRS part is still kept (with NaN in that part's columns).
updrs_at_baseline = (
    updrs_1_at_baseline
    .merge(updrs_2_at_baseline, on='participant_id', how='outer')
    .merge(updrs_3_at_baseline, on='participant_id', how='outer')
)

In [83]:
updrs_at_baseline.columns

Index(['participant_id', 'mds_updrs_part_i_primary_info_source',
       'code_upd2101_cognitive_impairment',
       'code_upd2102_hallucinations_and_psychosis',
       'code_upd2103_depressed_mood', 'code_upd2104_anxious_mood',
       'code_upd2105_apathy',
       'code_upd2106_dopamine_dysregulation_syndrome_features',
       'upd2101_cognitive_impairment', 'upd2102_hallucinations_and_psychosis',
       ...
       'upd2317d_rest_tremor_amplitude_left_lower_extremity',
       'upd2317e_rest_tremor_amplitude_lip_or_jaw',
       'upd2318_consistency_of_rest_tremor', 'upd2da_dyskinesias_during_exam',
       'upd2db_movements_interfere_with_ratings',
       'code_upd2hy_hoehn_and_yahr_stage', 'upd2hy_hoehn_and_yahr_stage',
       'upd23a_medication_for_pd', 'upd23b_clinical_state_on_medication',
       'mds_updrs_part_iii_summary_score'],
      dtype='object', length=131)

`updrs_fog`, on the other hand, stores the freezing-of-gait indicator scores for each visit month.

In [84]:
# Keep only visits whose name neither contains '#' nor equals 'SC'
# (presumably repeat and screening visits -- TODO confirm against the data dictionary).
part_2_has_hash = updrs_2_cleaned['visit_name'].str.contains('#')
part_2_is_sc = updrs_2_cleaned['visit_name'] == 'SC'
updrs_2_visitname = updrs_2_cleaned[~(part_2_has_hash | part_2_is_sc)]
updrs_2_fog = updrs_2_visitname[["participant_id", "visit_month", "code_upd2213_freezing"]]

part_3_has_hash = updrs_3_cleaned['visit_name'].str.contains('#')
part_3_is_sc = updrs_3_cleaned['visit_name'] == 'SC'
updrs_3_visitname = updrs_3_cleaned[~(part_3_has_hash | part_3_is_sc)]
updrs_3_fog = updrs_3_visitname[["participant_id", "visit_month", "code_upd2311_freezing_of_gait"]]

# One row per (participant, visit month) carrying both freezing items; the outer join
# keeps visits where only one of the two parts was recorded.
updrs_fog = updrs_2_fog.merge(
    updrs_3_fog,
    on=["participant_id", "visit_month"],
    how="outer",
)

# Attach the baseline features to every visit row of the same participant.
updrs = updrs_at_baseline.merge(updrs_fog, on=['participant_id'], how='outer')

  updrs_fog = updrs_2_fog.merge(updrs_3_fog, on = ["participant_id", "visit_month"], how = "outer")


In [85]:
updrs_at_baseline.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5084 entries, 0 to 5083
Columns: 131 entries, participant_id to mds_updrs_part_iii_summary_score
dtypes: float64(63), object(68)
memory usage: 5.1+ MB


In [86]:
updrs['code_upd2213_freezing'].unique()

array([nan,  0.,  1.,  2.,  3.,  4.])

In [87]:
updrs['code_upd2311_freezing_of_gait'].unique()

array([ 0.,  1.,  2.,  3.,  4., nan])

We define a new time-dependent binary variable, `FOG`, indicating the presence of freezing of gait: `FOG` = 1 if freezing of gait is present, and `FOG` = 0 otherwise. In accordance with established research protocols, freezing of gait is considered present if either Question 13 of UPDRS Part II or Question 11 of UPDRS Part III has a score of at least 1.

In [88]:
# FOG is 1 when either freezing item has a score of at least 1, and 0 otherwise.
# A missing score compares as False, so a visit with both items missing is labelled
# FOG = 0. NOTE(review): confirm that treating "unknown" as "no FOG" is intended.
fog_on_part_ii = updrs['code_upd2213_freezing'] >= 1
fog_on_part_iii = updrs['code_upd2311_freezing_of_gait'] >= 1
updrs['FOG'] = (fog_on_part_ii | fog_on_part_iii).astype(int)


In [89]:
updrs.columns

Index(['participant_id', 'mds_updrs_part_i_primary_info_source',
       'code_upd2101_cognitive_impairment',
       'code_upd2102_hallucinations_and_psychosis',
       'code_upd2103_depressed_mood', 'code_upd2104_anxious_mood',
       'code_upd2105_apathy',
       'code_upd2106_dopamine_dysregulation_syndrome_features',
       'upd2101_cognitive_impairment', 'upd2102_hallucinations_and_psychosis',
       ...
       'upd2db_movements_interfere_with_ratings',
       'code_upd2hy_hoehn_and_yahr_stage', 'upd2hy_hoehn_and_yahr_stage',
       'upd23a_medication_for_pd', 'upd23b_clinical_state_on_medication',
       'mds_updrs_part_iii_summary_score', 'visit_month',
       'code_upd2213_freezing', 'code_upd2311_freezing_of_gait', 'FOG'],
      dtype='object', length=135)

We only consider patients who do not have freezing of gait at the baseline.

In [90]:
# Participants that have a visit_month == 0 row with FOG == 0 ...
fog_free_at_baseline = (updrs['FOG'] == 0) & (updrs['visit_month'] == 0)
participants_fog0_baseline = updrs.loc[fog_free_at_baseline, 'participant_id'].unique()

# ... and every visit row belonging to those participants.
in_cohort = updrs['participant_id'].isin(participants_fog0_baseline)
updrs_no_fog_at_baseline = updrs.loc[in_cohort]

In [91]:
updrs_no_fog_at_baseline.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19301 entries, 0 to 21630
Columns: 135 entries, participant_id to FOG
dtypes: float64(66), int64(1), object(68)
memory usage: 20.0+ MB


In [92]:
len(participants_fog0_baseline)

4316

In [None]:
# The raw freezing items are already summarised by the FOG column, so drop them.
updrs_freezing_dropped = updrs_no_fog_at_baseline.drop(
    columns=[
        'code_upd2213_freezing',
        'code_upd2311_freezing_of_gait',
    ]
)

We process the rest of the data in the same way as for the Parkinson's progression prediction. We only include time-independent features for our Cox model.

In [175]:
# Add the demographic columns to every UPDRS row of the same participant.
# The visit bookkeeping columns are dropped first so they do not collide in the merge.
demographic_data_drop = demographic_data_cleaned.drop(
    columns=["GUID", "visit_name", "visit_month"]
)
data = updrs_freezing_dropped.merge(demographic_data_drop, on="participant_id", how="left")


In [176]:
def consolidate_history(group):
    """
    Collapse all family-history rows of one participant into a single record.

    Each of the three history columns becomes 'Yes' if any row of the group
    holds 'Yes'; otherwise it becomes 'No' (this includes the case where every
    value is missing). `visit_name` and `visit_month` are taken from the row
    with the largest `visit_month`; `participant_id` and `GUID` are taken from
    the first row of the group.

    Parameters
    ----------
    group : pd.DataFrame
        Rows of a single participant, containing the columns `participant_id`,
        `GUID`, `visit_name`, `visit_month` and the three `*_with_pd` columns.

    Returns
    -------
    pd.Series
        The consolidated record, indexed by the seven column names above.
    """
    history_columns = (
        'biological_mother_with_pd',
        'biological_father_with_pd',
        'other_relative_with_pd',
    )

    # Row of the most recent visit (sorted by visit_month, descending).
    newest = group.sort_values('visit_month', ascending=False).iloc[0]

    # "If any is Yes, then Yes" rule, applied column by column.
    any_yes = {
        column: ('Yes' if 'Yes' in group[column].values else 'No')
        for column in history_columns
    }

    return pd.Series({
        'participant_id': group['participant_id'].iloc[0],
        'GUID': group['GUID'].iloc[0],
        'visit_name': newest['visit_name'],
        'visit_month': newest['visit_month'],
        **any_yes,
    })

# Collapse the family-history table to one row per participant.
# Errors are deliberately not caught here: if the consolidation fails, the notebook
# must stop instead of letting the un-consolidated table (several rows per
# participant) flow into the merge that follows. No file is read in this step, so
# there is nothing file-related to handle either.
family_data_cleaned = (
    family_data_cleaned
    .groupby('participant_id')
    .apply(consolidate_history)
    .reset_index(drop=True)
)

  family_data_cleaned = family_data_cleaned.groupby('participant_id').apply(consolidate_history).reset_index(drop=True)


In [177]:
# Add the consolidated family-history columns to every row of the same participant.
family_data_drop = family_data_cleaned.drop(
    columns=["GUID", "visit_name", "visit_month"]
)
data = data.merge(family_data_drop, on="participant_id", how="left")



In [178]:
def appropriate_pivotting(df):
    """
    Reshape the long DTI table into one row per participant.

    The columns `GUID`, `visit_name`, `visit_month` and `dti_brain_tissue` are
    dropped, then the table is pivoted with `participant_id` as the index and
    `dti_measure` as the column key. The resulting two-level column labels are
    flattened to "<value column>_<dti_measure>", and every flattened column
    whose name contains '#' is removed.

    Parameters
    ----------
    df : pd.DataFrame
        Long-format DTI table with the columns named above plus the value columns.

    Returns
    -------
    pd.DataFrame
        Wide table indexed by `participant_id`.
    """
    measurements = df.drop(
        columns=["GUID", "visit_name", "visit_month", "dti_brain_tissue"]
    )
    wide = measurements.pivot(index="participant_id", columns=["dti_measure"])

    # Flatten (value column, dti_measure) label pairs into single strings.
    wide.columns = [
        '_'.join(str(level) for level in label).strip()
        for label in wide.columns.values
    ]

    flagged = [name for name in wide.columns if "#" in name]
    return wide.drop(columns=flagged)
# Wide DTI table: one row per participant, one column per value column / measure pair.
DTI_data_cleaned_and_pivotted = appropriate_pivotting(DTI_data_cleaned)



In [179]:
# Turn the participant_id index into a regular column so the table can be merged on it.
# The guard makes the cell safe to re-run: an unconditional second reset_index() would
# add a spurious `index` column that then leaks into `data` through the merge.
if 'participant_id' not in DTI_data_cleaned_and_pivotted.columns:
    DTI_data_cleaned_and_pivotted = DTI_data_cleaned_and_pivotted.reset_index()
DTI_data_cleaned_and_pivotted.columns

Index(['participant_id', 'roi1_left_rostral_Eigenvalue1',
       'roi1_left_rostral_Eigenvalue2', 'roi1_left_rostral_Eigenvalue3',
       'roi1_left_rostral_Fractional Anisotropy',
       'roi2_left_middle_Eigenvalue1', 'roi2_left_middle_Eigenvalue2',
       'roi2_left_middle_Eigenvalue3',
       'roi2_left_middle_Fractional Anisotropy',
       'roi3_left_caudal_Eigenvalue1', 'roi3_left_caudal_Eigenvalue2',
       'roi3_left_caudal_Eigenvalue3',
       'roi3_left_caudal_Fractional Anisotropy',
       'roi4_right_rostral_Eigenvalue1', 'roi4_right_rostral_Eigenvalue2',
       'roi4_right_rostral_Eigenvalue3',
       'roi4_right_rostral_Fractional Anisotropy',
       'roi5_right_middle_Eigenvalue1', 'roi5_right_middle_Eigenvalue2',
       'roi5_right_middle_Eigenvalue3',
       'roi5_right_middle_Fractional Anisotropy',
       'roi6_right_caudal_Eigenvalue1', 'roi6_right_caudal_Eigenvalue2',
       'roi6_right_caudal_Eigenvalue3',
       'roi6_right_caudal_Fractional Anisotropy',
       '

In [180]:
# Add the wide DTI columns to every row of the same participant; participants
# without DTI measurements keep NaN in these columns.
data = data.merge(DTI_data_cleaned_and_pivotted, on="participant_id", how="left")

In [181]:
data.isna().sum()

participant_id                                    0
mds_updrs_part_i_primary_info_source           1035
code_upd2101_cognitive_impairment              1004
code_upd2102_hallucinations_and_psychosis      1004
code_upd2103_depressed_mood                    1004
                                              ...  
ref1_left_reference_Fractional Anisotropy     16716
ref2_right_reference_Eigenvalue1              16716
ref2_right_reference_Eigenvalue2              16716
ref2_right_reference_Eigenvalue3              16716
ref2_right_reference_Fractional Anisotropy    16716
Length: 172, dtype: int64

We drop the columns with too many missing values and then drop every row that still contains a NaN.


In [182]:
# Columns with more than this many missing entries (counted over all rows of `data`)
# are considered too sparse to keep.
max_missing_per_column = 1500

missing_per_column = data.isna().sum()
cols_more_nan = missing_per_column[missing_per_column > max_missing_per_column].index.tolist()
print(len(cols_more_nan))
print(cols_more_nan)

39
['mds_updrs_part_i_pat_quest_primary_info_source', 'mds_updrs_part_ii_primary_info_source', 'upd2db_movements_interfere_with_ratings', 'upd23b_clinical_state_on_medication', 'biological_mother_with_pd', 'biological_father_with_pd', 'other_relative_with_pd', 'roi1_left_rostral_Eigenvalue1', 'roi1_left_rostral_Eigenvalue2', 'roi1_left_rostral_Eigenvalue3', 'roi1_left_rostral_Fractional Anisotropy', 'roi2_left_middle_Eigenvalue1', 'roi2_left_middle_Eigenvalue2', 'roi2_left_middle_Eigenvalue3', 'roi2_left_middle_Fractional Anisotropy', 'roi3_left_caudal_Eigenvalue1', 'roi3_left_caudal_Eigenvalue2', 'roi3_left_caudal_Eigenvalue3', 'roi3_left_caudal_Fractional Anisotropy', 'roi4_right_rostral_Eigenvalue1', 'roi4_right_rostral_Eigenvalue2', 'roi4_right_rostral_Eigenvalue3', 'roi4_right_rostral_Fractional Anisotropy', 'roi5_right_middle_Eigenvalue1', 'roi5_right_middle_Eigenvalue2', 'roi5_right_middle_Eigenvalue3', 'roi5_right_middle_Fractional Anisotropy', 'roi6_right_caudal_Eigenvalue1', 

In [183]:
# Remove the sparse columns identified above.
data_drop_cols = data.drop(cols_more_nan, axis=1)

In [184]:
# Keep only the rows that have no missing value in any remaining column.
data_final = data_drop_cols.dropna(axis=0, how='any')

In [185]:
data_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18030 entries, 947 to 19300
Columns: 133 entries, participant_id to education_level_years
dtypes: float64(64), int64(2), object(67)
memory usage: 18.4+ MB


In [186]:
data_final['participant_id'].nunique()


3275

In [269]:
# Persist the final table. The DataFrame index is written as the first (unnamed) CSV
# column; it is non-contiguous because rows were dropped along the way.
data_final.to_csv("data_final.csv", index=True)