# Basic Setup and Data Loading

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt

# Use seaborn's "whitegrid" theme for every figure drawn later in the notebook.
sns.set_style(style="whitegrid")

In [2]:
# Input files (relative paths): the MDS-UPDRS Part II clinical table and the
# demographics table that is used below as the participant reference list.
updrs2_csv_path = "new-data/releases_2023_v4release_1027_clinical_MDS_UPDRS_Part_II.csv"
demographics_csv_path = "demographics_new.csv"

clinical_updrs2 = pd.read_csv(updrs2_csv_path)
demographics_new = pd.read_csv(demographics_csv_path)

# Print column names, non-null counts and dtypes of the clinical table.
clinical_updrs2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21385 entries, 0 to 21384
Data columns (total 32 columns):
 #   Column                                           Non-Null Count  Dtype  
---  ------                                           --------------  -----  
 0   participant_id                                   21385 non-null  object 
 1   GUID                                             13574 non-null  object 
 2   visit_name                                       21385 non-null  object 
 3   visit_month                                      21385 non-null  int64  
 4   mds_updrs_part_ii_primary_info_source            13776 non-null  object 
 5   code_upd2201_speech                              21099 non-null  float64
 6   code_upd2202_saliva_and_drooling                 20453 non-null  float64
 7   code_upd2203_chewing_and_swallowing              20453 non-null  float64
 8   code_upd2204_eating_tasks                        21096 non-null  float64
 9   code_upd2205_dressing       

In [3]:
# Preview the first five rows of the clinical table.
clinical_updrs2.head(n=5)

Unnamed: 0,participant_id,GUID,visit_name,visit_month,mds_updrs_part_ii_primary_info_source,code_upd2201_speech,code_upd2202_saliva_and_drooling,code_upd2203_chewing_and_swallowing,code_upd2204_eating_tasks,code_upd2205_dressing,...,upd2205_dressing,upd2206_hygiene,upd2207_handwriting,upd2208_doing_hobbies_and_other_activities,upd2209_turning_in_bed,upd2210_tremor,upd2211_get_out_of_bed_car_or_deep_chair,upd2212_walking_and_balance,upd2213_freezing,mds_updrs_part_ii_summary_score
0,BF-1002,PDCB969UGG,M0,0,Patient,0.0,3.0,1.0,1.0,1.0,...,Slight,Slight,Normal,Normal,Slight,Slight,Mild,Normal,Normal,11.0
1,BF-1003,PDLW805AHT,M0,0,Patient,1.0,2.0,0.0,0.0,1.0,...,Slight,Slight,Mild,Mild,Normal,Slight,Normal,Slight,Normal,11.0
2,BF-1004,PDKW284DYW,M0,0,Patient,0.0,2.0,0.0,1.0,1.0,...,Slight,Slight,Slight,Slight,Slight,Slight,Slight,Slight,Normal,11.0
3,BF-1006,PDKY484YDC,M0,0,Patient,2.0,1.0,0.0,1.0,2.0,...,Mild,Slight,Severe,Normal,Slight,Slight,Slight,Normal,Normal,14.0
4,BF-1008,PDEA056CRM,M0,0,Patient,0.0,0.0,0.0,0.0,0.0,...,Normal,Normal,Normal,Normal,Slight,Slight,Slight,Normal,Normal,3.0


# Data Preprocessing And Cleaning 

## Checking For Duplicates And NaN Values

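We begin our analysis by excluding patients who have missing (NaN) values in the `GUID` column, as well as those with conflicting identifiers — that is, cases where multiple `participant_id`s share the same `GUID`. Rather than repeating those checks on this table, we retain only the rows whose `participant_id` appears in the cleaned reference file "demographics_new.csv". The two cells that follow the filter verify the result: the number of unique `participant_id`s equals the number of unique `GUID`s, and no `GUID` value is missing.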

In [4]:
# Keep only the rows whose participant_id is listed in the demographics table.
in_reference = clinical_updrs2['participant_id'].isin(demographics_new['participant_id'])
clinical_updrs2 = clinical_updrs2.loc[in_reference]


In [5]:
# Number of distinct non-null values in each column (axis=0 counts per column).
clinical_updrs2.nunique(axis=0)

participant_id                                     3709
GUID                                               3709
visit_name                                           20
visit_month                                          18
mds_updrs_part_ii_primary_info_source                 3
code_upd2201_speech                                   5
code_upd2202_saliva_and_drooling                      5
code_upd2203_chewing_and_swallowing                   5
code_upd2204_eating_tasks                             5
code_upd2205_dressing                                 5
code_upd2206_hygiene                                  5
code_upd2207_handwriting                              5
code_upd2208_doing_hobbies_and_other_activities       5
code_upd2209_turning_in_bed                           5
code_upd2210_tremor                                   5
code_upd2211_get_out_of_bed_car_or_deep_chair         5
code_upd2212_walking_and_balance                      5
code_upd2213_freezing                           

In [6]:
# Count the rows whose GUID is missing after the participant filter.
guid_is_missing = clinical_updrs2['GUID'].isna()
guid_is_missing.sum()

np.int64(0)

We remove the GUID column from the dataset, as it is no longer required for the subsequent steps of our analysis. Then we assess the data for duplicate entries by examining combinations of the `participant_id` and `visit_month` columns to ensure each participant's visit is uniquely represented.

In [7]:
# Drop the GUID column by rebinding the result instead of using inplace=True:
# clinical_updrs2 is the result of a boolean-mask filter, so an in-place
# mutation is avoided here. errors="ignore" keeps the cell idempotent:
# re-running it after GUID is already gone is a no-op instead of a KeyError.
clinical_updrs2 = clinical_updrs2.drop(columns="GUID", errors="ignore")

In [8]:
# Count rows that repeat an earlier (visit_month, participant_id) pair;
# with keep='first' the first occurrence of each pair is not counted.
visit_key = ['visit_month', 'participant_id']
clinical_updrs2.duplicated(subset=visit_key, keep='first').sum()

np.int64(5)

Since 5 rows repeat an earlier combination of `visit_month` and `participant_id`, we extract and review every row whose combination of `visit_month` and `participant_id` is not unique (both members of each repeated pair).

In [10]:
# Boolean mask over clinical_updrs2: True for every row whose
# (visit_month, participant_id) pair occurs more than once. keep=False flags
# all members of a repeated pair, not only the later occurrences.
# NOTE(review): the name says UPDRS1 although this frame is loaded from the
# Part II file; the name is kept unchanged in case later cells reference it.
UPDRS1_duplicates = clinical_updrs2.duplicated(
    subset=['visit_month', 'participant_id'],
    keep=False,
)
# Display all flagged rows for manual review.
clinical_updrs2.loc[UPDRS1_duplicates]

Unnamed: 0,participant_id,visit_name,visit_month,mds_updrs_part_ii_primary_info_source,code_upd2201_speech,code_upd2202_saliva_and_drooling,code_upd2203_chewing_and_swallowing,code_upd2204_eating_tasks,code_upd2205_dressing,code_upd2206_hygiene,...,upd2205_dressing,upd2206_hygiene,upd2207_handwriting,upd2208_doing_hobbies_and_other_activities,upd2209_turning_in_bed,upd2210_tremor,upd2211_get_out_of_bed_car_or_deep_chair,upd2212_walking_and_balance,upd2213_freezing,mds_updrs_part_ii_summary_score
6474,PP-3066,M18,18,Patient,1.0,1.0,0.0,1.0,1.0,1.0,...,Slight,Slight,Slight,Slight,Slight,Mild,Slight,Slight,Slight,13.0
6475,PP-3066,M18#2,18,Patient,1.0,1.0,0.0,1.0,1.0,1.0,...,Slight,Slight,Slight,Mild,Slight,Mild,Mild,Slight,Slight,15.0
6492,PP-3067,M18,18,Patient,2.0,0.0,0.0,1.0,1.0,0.0,...,Slight,Normal,Severe,Moderate,Normal,Mild,Slight,Normal,Normal,14.0
6493,PP-3067,M18#2,18,Patient,2.0,0.0,1.0,1.0,0.0,0.0,...,Normal,Normal,Moderate,Normal,Normal,Mild,Normal,Normal,Normal,9.0
6602,PP-3078,M18,18,Patient,0.0,0.0,0.0,1.0,0.0,0.0,...,Normal,Normal,Slight,Slight,Normal,Mild,Normal,Normal,Normal,5.0
6603,PP-3078,M18#2,18,Patient,0.0,0.0,0.0,1.0,1.0,1.0,...,Slight,Slight,Slight,Slight,Normal,Mild,Normal,Normal,Normal,7.0
7846,PP-3251,M3#2,3,Patient,0.0,0.0,1.0,0.0,1.0,1.0,...,Slight,Slight,Normal,Slight,Slight,Slight,Mild,Slight,Mild,11.0
7847,PP-3251,M3,3,Patient,2.0,0.0,1.0,0.0,0.0,1.0,...,Normal,Slight,Normal,Normal,Normal,Slight,Slight,Slight,Normal,7.0
7860,PP-3252,M3#2,3,Patient,0.0,0.0,0.0,1.0,1.0,1.0,...,Slight,Slight,Slight,Slight,Slight,Slight,Normal,Slight,Slight,9.0
7861,PP-3252,M3,3,Patient,0.0,0.0,0.0,1.0,1.0,1.0,...,Slight,Slight,Slight,Slight,Slight,Slight,Slight,Slight,Normal,9.0


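The review shows that there are no genuine duplicates: within each pair, the two rows carry different `visit_name` values (for example `M18` and `M18#2`) and their item scores differ, so the repeated entries are separate visits by the same participant that were recorded under the same `visit_month`.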