# Basic Setup and Data Loading

In [8]:
import pandas as pd
import numpy as np
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt

# Apply seaborn's "whitegrid" style to every figure drawn later in the notebook.
sns.set_style(style="whitegrid")

In [9]:
# Read the MDS-UPDRS Part I release file and the demographics reference table,
# then print the column / dtype summary of the UPDRS frame.
updrs1_csv = "new-data/releases_2023_v4release_1027_clinical_MDS_UPDRS_Part_I.csv"
demographics_csv = "demographics_new.csv"
clinical_updrs1 = pd.read_csv(updrs1_csv)
demographics_new = pd.read_csv(demographics_csv)
clinical_updrs1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20468 entries, 0 to 20467
Data columns (total 35 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                                 --------------  -----  
 0   participant_id                                         20468 non-null  object 
 1   GUID                                                   12648 non-null  object 
 2   visit_name                                             20468 non-null  object 
 3   visit_month                                            20468 non-null  int64  
 4   mds_updrs_part_i_primary_info_source                   20414 non-null  object 
 5   code_upd2101_cognitive_impairment                      20450 non-null  float64
 6   code_upd2102_hallucinations_and_psychosis              20450 non-null  float64
 7   code_upd2103_depressed_mood                            20450 non-null  float64
 8   code_upd2104_anxious_mood                     

In [10]:
# Preview the first five rows of the UPDRS Part I frame.
clinical_updrs1.head(n=5)

Unnamed: 0,participant_id,GUID,visit_name,visit_month,mds_updrs_part_i_primary_info_source,code_upd2101_cognitive_impairment,code_upd2102_hallucinations_and_psychosis,code_upd2103_depressed_mood,code_upd2104_anxious_mood,code_upd2105_apathy,...,code_upd2113_pat_quest_fatigue,upd2107_pat_quest_sleep_problems,upd2108_pat_quest_daytime_sleepiness,upd2109_pat_quest_pain_and_other_sensations,upd2110_pat_quest_urinary_problems,upd2111_pat_quest_constipation_problems,upd2112_pat_quest_lightheadedness_on_standing,upd2113_pat_quest_fatigue,mds_updrs_part_i_pat_quest_sub_score,mds_updrs_part_i_summary_score
0,BF-1002,PDCB969UGG,M0,0,Patient,0.0,0.0,0.0,1.0,0.0,...,1.0,Slight,Mild,Mild,Mild,Normal,Normal,Slight,8.0,9
1,BF-1003,PDLW805AHT,M0,0,Patient,0.0,0.0,0.0,0.0,0.0,...,2.0,Moderate,Slight,Slight,Slight,Normal,Normal,Mild,8.0,8
2,BF-1004,PDKW284DYW,M0,0,Patient,0.0,0.0,0.0,1.0,0.0,...,1.0,Moderate,Slight,Mild,Slight,Normal,Slight,Slight,9.0,10
3,BF-1006,PDKY484YDC,M0,0,Patient,0.0,0.0,0.0,2.0,0.0,...,0.0,Severe,Normal,Moderate,Normal,Normal,Normal,Normal,7.0,9
4,BF-1008,PDEA056CRM,M0,0,Patient,0.0,0.0,0.0,0.0,0.0,...,1.0,Slight,Normal,Slight,Slight,Normal,Normal,Slight,4.0,4


# Data Preprocessing And Cleaning 

## Checking For Duplicates And NaN Values

We begin our analysis by excluding participants who have missing (NaN) values in the `GUID` column, as well as those with conflicting identifiers — that is, cases where multiple `participant_id`s share the same `GUID`. To do this consistently, we retain only the participants whose `participant_id` appears in the cleaned reference file `demographics_new.csv`, and then verify the result in the cells that follow.

In [11]:
# Keep only the rows whose participant_id also appears in the demographics table.
reference_ids = demographics_new['participant_id']
in_reference = clinical_updrs1['participant_id'].isin(reference_ids)
clinical_updrs1 = clinical_updrs1[in_reference]


In [12]:
# Number of distinct non-null values in each column.
clinical_updrs1.nunique(axis=0, dropna=True)

participant_id                                           2877
GUID                                                     2877
visit_name                                                 20
visit_month                                                18
mds_updrs_part_i_primary_info_source                        3
code_upd2101_cognitive_impairment                           5
code_upd2102_hallucinations_and_psychosis                   5
code_upd2103_depressed_mood                                 5
code_upd2104_anxious_mood                                   5
code_upd2105_apathy                                         5
code_upd2106_dopamine_dysregulation_syndrome_features       5
upd2101_cognitive_impairment                                5
upd2102_hallucinations_and_psychosis                        5
upd2103_depressed_mood                                      5
upd2104_anxious_mood                                        5
upd2105_apathy                                              5
upd2106_

In [13]:
# Number of rows whose GUID is missing.
guid_is_missing = clinical_updrs1['GUID'].isna()
guid_is_missing.sum()

np.int64(0)

We remove the GUID column from the dataset, as it is no longer required for the subsequent steps of our analysis. Then we assess the data for duplicate entries by examining combinations of the `participant_id` and `visit_month` columns to ensure each participant's visit is uniquely represented.

In [14]:
# Remove the GUID column by reassignment rather than in-place mutation.
# errors='ignore' makes the cell idempotent: re-running it after GUID has
# already been dropped is a no-op instead of raising KeyError.
clinical_updrs1 = clinical_updrs1.drop(columns='GUID', errors='ignore')

In [15]:
# Count rows that repeat an earlier (visit_month, participant_id) pair.
visit_key = ['visit_month', 'participant_id']
repeated_visit = clinical_updrs1.duplicated(subset=visit_key)
repeated_visit.sum()

np.int64(5)

Since there are 5 duplicate records based on the combination of `visit_month` and `participant_id`, we need to extract and review all rows where the combination of `visit_month` and `participant_id` is not unique.

In [16]:
# Flag every row whose (visit_month, participant_id) pair occurs more than once;
# keep=False marks all occurrences rather than only the later ones.
UPDRS1_duplicates = clinical_updrs1.duplicated(
    subset=['visit_month', 'participant_id'],
    keep=False,
)
clinical_updrs1.loc[UPDRS1_duplicates]

Unnamed: 0,participant_id,visit_name,visit_month,mds_updrs_part_i_primary_info_source,code_upd2101_cognitive_impairment,code_upd2102_hallucinations_and_psychosis,code_upd2103_depressed_mood,code_upd2104_anxious_mood,code_upd2105_apathy,code_upd2106_dopamine_dysregulation_syndrome_features,...,code_upd2113_pat_quest_fatigue,upd2107_pat_quest_sleep_problems,upd2108_pat_quest_daytime_sleepiness,upd2109_pat_quest_pain_and_other_sensations,upd2110_pat_quest_urinary_problems,upd2111_pat_quest_constipation_problems,upd2112_pat_quest_lightheadedness_on_standing,upd2113_pat_quest_fatigue,mds_updrs_part_i_pat_quest_sub_score,mds_updrs_part_i_summary_score
5547,PP-3066,M18,18,Patient,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,Moderate,Mild,Moderate,Slight,Normal,Normal,Slight,10.0,10
5548,PP-3066,M18#2,18,Patient,0.0,0.0,0.0,0.0,2.0,0.0,...,1.0,Mild,Mild,Slight,Slight,Normal,Normal,Slight,7.0,9
5565,PP-3067,M18,18,Patient,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,Slight,Mild,Moderate,Slight,Normal,Normal,Normal,7.0,10
5566,PP-3067,M18#2,18,Patient,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,Normal,Mild,Moderate,Normal,Normal,Normal,Normal,5.0,6
5675,PP-3078,M18,18,Patient,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,Normal,Slight,Normal,Slight,Normal,Normal,Slight,3.0,5
5676,PP-3078,M18#2,18,Patient,1.0,0.0,0.0,1.0,1.0,0.0,...,1.0,Slight,Slight,Slight,Slight,Normal,Slight,Slight,6.0,9
6919,PP-3251,M3#2,3,Patient,0.0,0.0,3.0,1.0,0.0,0.0,...,3.0,Slight,Mild,Slight,Mild,Slight,Moderate,Moderate,13.0,17
6920,PP-3251,M3,3,Patient and Caregiver - Equal Proportion,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,Mild,Normal,Normal,Normal,Mild,Normal,Slight,5.0,6
6933,PP-3252,M3#2,3,Patient,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,Mild,Slight,Slight,Normal,Normal,Normal,Slight,5.0,6
6934,PP-3252,M3,3,Patient,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,Slight,Slight,Slight,Normal,Normal,Normal,Slight,4.0,4


The analysis indicates that there are no genuine duplicates: the repeated entries are separate visits by the same participant that fall in the same `visit_month`, as the `visit_name` column shows (for example, `M18` and `M18#2`).

In [17]:
# Distinct visit_month values in ascending order.
visit_months = np.sort(clinical_updrs1['visit_month'].unique())
print(visit_months)

[ -1   0   3   6   9  12  18  24  30  36  42  48  54  60  72  84  96 108]


In [18]:
# Write the cleaned table to disk; index=False leaves the row index out of the CSV.
cleaned_csv = 'clinical_updrs1_cleaned.csv'
clinical_updrs1.to_csv(cleaned_csv, index=False)
