# Basic Setup and Data Loading

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt

# Select seaborn's "whitegrid" style for figures drawn after this cell runs.
sns.set_style(style="whitegrid")

In [2]:
# Read the MDS-UPDRS Part I clinical CSV into a DataFrame, then print the
# per-column dtype / non-null summary so missing data is visible up front.
clinical_updrs1 = pd.read_csv(
    filepath_or_buffer="new-data/releases_2023_v4release_1027_clinical_MDS_UPDRS_Part_I.csv"
)
clinical_updrs1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20468 entries, 0 to 20467
Data columns (total 35 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                                 --------------  -----  
 0   participant_id                                         20468 non-null  object 
 1   GUID                                                   12648 non-null  object 
 2   visit_name                                             20468 non-null  object 
 3   visit_month                                            20468 non-null  int64  
 4   mds_updrs_part_i_primary_info_source                   20414 non-null  object 
 5   code_upd2101_cognitive_impairment                      20450 non-null  float64
 6   code_upd2102_hallucinations_and_psychosis              20450 non-null  float64
 7   code_upd2103_depressed_mood                            20450 non-null  float64
 8   code_upd2104_anxious_mood                     

In [16]:
# Preview the first five rows of the Part I table.
clinical_updrs1.head(n=5)

Unnamed: 0,participant_id,GUID,visit_name,visit_month,mds_updrs_part_i_primary_info_source,code_upd2101_cognitive_impairment,code_upd2102_hallucinations_and_psychosis,code_upd2103_depressed_mood,code_upd2104_anxious_mood,code_upd2105_apathy,...,code_upd2113_pat_quest_fatigue,upd2107_pat_quest_sleep_problems,upd2108_pat_quest_daytime_sleepiness,upd2109_pat_quest_pain_and_other_sensations,upd2110_pat_quest_urinary_problems,upd2111_pat_quest_constipation_problems,upd2112_pat_quest_lightheadedness_on_standing,upd2113_pat_quest_fatigue,mds_updrs_part_i_pat_quest_sub_score,mds_updrs_part_i_summary_score
0,BF-1002,PDCB969UGG,M0,0,Patient,0.0,0.0,0.0,1.0,0.0,...,1.0,Slight,Mild,Mild,Mild,Normal,Normal,Slight,8.0,9
1,BF-1003,PDLW805AHT,M0,0,Patient,0.0,0.0,0.0,0.0,0.0,...,2.0,Moderate,Slight,Slight,Slight,Normal,Normal,Mild,8.0,8
2,BF-1004,PDKW284DYW,M0,0,Patient,0.0,0.0,0.0,1.0,0.0,...,1.0,Moderate,Slight,Mild,Slight,Normal,Slight,Slight,9.0,10
3,BF-1006,PDKY484YDC,M0,0,Patient,0.0,0.0,0.0,2.0,0.0,...,0.0,Severe,Normal,Moderate,Normal,Normal,Normal,Normal,7.0,9
4,BF-1008,PDEA056CRM,M0,0,Patient,0.0,0.0,0.0,0.0,0.0,...,1.0,Slight,Normal,Slight,Slight,Normal,Normal,Slight,4.0,4


In [5]:
# Count distinct non-null values in every Part I column.
clinical_updrs1.nunique(axis=0, dropna=True)

participant_id                                           4066
GUID                                                     2882
visit_name                                                 22
visit_month                                                22
mds_updrs_part_i_primary_info_source                        3
code_upd2101_cognitive_impairment                           5
code_upd2102_hallucinations_and_psychosis                   5
code_upd2103_depressed_mood                                 5
code_upd2104_anxious_mood                                   5
code_upd2105_apathy                                         5
code_upd2106_dopamine_dysregulation_syndrome_features       5
upd2101_cognitive_impairment                                5
upd2102_hallucinations_and_psychosis                        5
upd2103_depressed_mood                                      5
upd2104_anxious_mood                                        5
upd2105_apathy                                              5
upd2106_

In [6]:
# Distinct Part I visit months, in order of first appearance.
# NOTE(review): the recorded output contains negative months (e.g. -132);
# confirm what those encode before relying on this column.
clinical_updrs1.loc[:, 'visit_month'].unique()

array([   0,   -1,   12,   18,   24,   30,   36,   42,   48,    6,   60,
         54,    3,    9,   72,   84,   96,   -2,  108, -132,  -96, -108])

In [14]:
# Drop every Part I row whose GUID is missing (the source frame is left
# untouched), then recount distinct values per column on the reduced frame.
clinical_updrs1_new = clinical_updrs1.dropna(axis="index", subset=['GUID'])
clinical_updrs1_new.nunique(axis=0, dropna=True)


participant_id                                           2887
GUID                                                     2882
visit_name                                                 20
visit_month                                                18
mds_updrs_part_i_primary_info_source                        3
code_upd2101_cognitive_impairment                           5
code_upd2102_hallucinations_and_psychosis                   5
code_upd2103_depressed_mood                                 5
code_upd2104_anxious_mood                                   5
code_upd2105_apathy                                         5
code_upd2106_dopamine_dysregulation_syndrome_features       5
upd2101_cognitive_impairment                                5
upd2102_hallucinations_and_psychosis                        5
upd2103_depressed_mood                                      5
upd2104_anxious_mood                                        5
upd2105_apathy                                              5
upd2106_

In [3]:
# Read the MDS-UPDRS Part II clinical CSV into a DataFrame, then print the
# per-column dtype / non-null summary.
clinical_updrs2 = pd.read_csv(
    filepath_or_buffer="new-data/releases_2023_v4release_1027_clinical_MDS_UPDRS_Part_II.csv"
)
clinical_updrs2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21385 entries, 0 to 21384
Data columns (total 32 columns):
 #   Column                                           Non-Null Count  Dtype  
---  ------                                           --------------  -----  
 0   participant_id                                   21385 non-null  object 
 1   GUID                                             13574 non-null  object 
 2   visit_name                                       21385 non-null  object 
 3   visit_month                                      21385 non-null  int64  
 4   mds_updrs_part_ii_primary_info_source            13776 non-null  object 
 5   code_upd2201_speech                              21099 non-null  float64
 6   code_upd2202_saliva_and_drooling                 20453 non-null  float64
 7   code_upd2203_chewing_and_swallowing              20453 non-null  float64
 8   code_upd2204_eating_tasks                        21096 non-null  float64
 9   code_upd2205_dressing       

In [7]:
# Count distinct non-null values in every Part II column.
clinical_updrs2.nunique(axis=0, dropna=True)

participant_id                                     4895
GUID                                               3714
visit_name                                           21
visit_month                                          19
mds_updrs_part_ii_primary_info_source                 3
code_upd2201_speech                                   5
code_upd2202_saliva_and_drooling                      5
code_upd2203_chewing_and_swallowing                   5
code_upd2204_eating_tasks                             5
code_upd2205_dressing                                 5
code_upd2206_hygiene                                  5
code_upd2207_handwriting                              5
code_upd2208_doing_hobbies_and_other_activities       5
code_upd2209_turning_in_bed                           5
code_upd2210_tremor                                   5
code_upd2211_get_out_of_bed_car_or_deep_chair         5
code_upd2212_walking_and_balance                      5
code_upd2213_freezing                           

In [8]:
# Distinct Part II visit months, in order of first appearance.
clinical_updrs2.loc[:, 'visit_month'].unique()

array([  0,  -1,  12,  18,  24,  30,  36,  42,  48,   6,  60,  54,   3,
         9,  72,  96,  84,  -2, 108])

In [4]:
# Read the MDS-UPDRS Part III clinical CSV into a DataFrame, then print the
# per-column dtype / non-null summary.
clinical_updrs3 = pd.read_csv(
    filepath_or_buffer="new-data/releases_2023_v4release_1027_clinical_MDS_UPDRS_Part_III.csv"
)
clinical_updrs3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23563 entries, 0 to 23562
Data columns (total 77 columns):
 #   Column                                                     Non-Null Count  Dtype  
---  ------                                                     --------------  -----  
 0   participant_id                                             23563 non-null  object 
 1   GUID                                                       14681 non-null  object 
 2   visit_name                                                 23563 non-null  object 
 3   visit_month                                                23563 non-null  float64
 4   code_upd2301_speech_problems                               23509 non-null  float64
 5   code_upd2302_facial_expression                             22876 non-null  float64
 6   code_upd2303a_rigidity_neck                                22873 non-null  float64
 7   code_upd2303b_rigidity_rt_upper_extremity                  22873 non-null  float64
 8   code_u

In [9]:
# Count distinct non-null values in every Part III column.
clinical_updrs3.nunique(axis=0, dropna=True)


participant_id                         5007
GUID                                   3805
visit_name                               40
visit_month                              38
code_upd2301_speech_problems              5
                                       ... 
code_upd2hy_hoehn_and_yahr_stage          8
upd2hy_hoehn_and_yahr_stage               8
upd23a_medication_for_pd                  2
upd23b_clinical_state_on_medication       2
mds_updrs_part_iii_summary_score         96
Length: 77, dtype: int64

In [10]:
# Distinct Part III visit months, in order of first appearance.
# NOTE(review): the recorded output is float-valued and includes a fractional
# month (0.5) plus negative months; confirm their meaning before aligning
# this column with the other UPDRS tables.
clinical_updrs3.loc[:, 'visit_month'].unique()

array([   0. ,    0.5,   -1. ,   12. ,   18. ,   24. ,   30. ,   36. ,
         42. ,   48. ,    6. ,   60. ,   54. ,    3. ,    9. ,   72. ,
         84. ,   96. ,   -2. ,  108. ,   -4. ,  -35. ,   -5. , -120. ,
        -84. , -142. , -132. ,  -60. , -108. ,  -72. , -156. ,  -96. ,
       -144. , -168. ,  -36. ,  -24. ,  -48. ,  -12. ])