# Basic Setup and Data Loading

In [16]:
import pandas as pd
import numpy as np
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt

sns.set_style("whitegrid")

In [17]:
clinical_updrs3 = pd.read_csv("new-data/releases_2023_v4release_1027_clinical_MDS_UPDRS_Part_III.csv")
demographics_new = pd.read_csv("demographics_new.csv")
clinical_updrs3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23563 entries, 0 to 23562
Data columns (total 77 columns):
 #   Column                                                     Non-Null Count  Dtype  
---  ------                                                     --------------  -----  
 0   participant_id                                             23563 non-null  object 
 1   GUID                                                       14681 non-null  object 
 2   visit_name                                                 23563 non-null  object 
 3   visit_month                                                23563 non-null  float64
 4   code_upd2301_speech_problems                               23509 non-null  float64
 5   code_upd2302_facial_expression                             22876 non-null  float64
 6   code_upd2303a_rigidity_neck                                22873 non-null  float64
 7   code_upd2303b_rigidity_rt_upper_extremity                  22873 non-null  float64
 8   code_u

In [18]:
clinical_updrs3.head()

Unnamed: 0,participant_id,GUID,visit_name,visit_month,code_upd2301_speech_problems,code_upd2302_facial_expression,code_upd2303a_rigidity_neck,code_upd2303b_rigidity_rt_upper_extremity,code_upd2303c_rigidity_left_upper_extremity,code_upd2303d_rigidity_rt_lower_extremity,...,upd2317d_rest_tremor_amplitude_left_lower_extremity,upd2317e_rest_tremor_amplitude_lip_or_jaw,upd2318_consistency_of_rest_tremor,upd2da_dyskinesias_during_exam,upd2db_movements_interfere_with_ratings,code_upd2hy_hoehn_and_yahr_stage,upd2hy_hoehn_and_yahr_stage,upd23a_medication_for_pd,upd23b_clinical_state_on_medication,mds_updrs_part_iii_summary_score
0,BF-1001,PDNW781VHY,M0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Normal,Normal,Normal,No,,0.0,Stage 0,,,0.0
1,BF-1002,PDCB969UGG,M0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,Normal,Slight,Slight,No,,1.0,Stage 1,,,15.0
2,BF-1002,PDCB969UGG,M0_5,0.5,1.0,1.0,0.0,1.0,1.0,1.0,...,Normal,Slight,Normal,No,,1.0,Stage 1,,,16.0
3,BF-1003,PDLW805AHT,M0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,Normal,Normal,Slight,No,,1.0,Stage 1,,,12.0
4,BF-1003,PDLW805AHT,M0_5,0.5,1.0,1.0,0.0,0.0,0.0,0.0,...,Normal,Slight,Moderate,No,,2.0,Stage 2,,,24.0


# Data Preprocessing And Cleaning 

## Checking For Duplicates And Nan Values

We begin our analysis by removing patients who have missing (NaN) values in the `GUID` column, as well as those with conflicting identifiers — that is, cases where multiple `participant_id`s share the same `GUID`. To ensure consistency, we retain only the participants whose `participant_id`s appear in the cleaned reference file "demographics_new.csv".

In [19]:
clinical_updrs3 = clinical_updrs3[clinical_updrs3['participant_id'].isin(demographics_new['participant_id'])]


In [20]:
clinical_updrs3.nunique()

participant_id                         3800
GUID                                   3800
visit_name                               36
visit_month                              20
code_upd2301_speech_problems              5
                                       ... 
code_upd2hy_hoehn_and_yahr_stage          8
upd2hy_hoehn_and_yahr_stage               8
upd23a_medication_for_pd                  2
upd23b_clinical_state_on_medication       2
mds_updrs_part_iii_summary_score         91
Length: 77, dtype: int64

In [21]:
clinical_updrs3['GUID'].isna().sum()

np.int64(0)

We remove the GUID column from the dataset, as it is no longer required for the subsequent steps of our analysis. Then we assess the data for duplicate entries by examining combinations of the `participant_id` and `visit_month` columns to ensure each participant's visit is uniquely represented.

In [22]:
clinical_updrs3.drop('GUID', axis = 1,  inplace = True)

In [23]:
clinical_updrs3.duplicated(subset = ['visit_month','participant_id']).sum()

np.int64(928)

Since there are 928 duplicate records based on the combination of `visit_month` and `participant_id`, we need to extract and review all rows where the combination of `visit_month` and `participant_id` are not unique.

In [24]:
UPDRS3_duplicates =  clinical_updrs3.duplicated(subset=['visit_month', 'participant_id'], keep=False) 
clinical_updrs3[UPDRS3_duplicates]

Unnamed: 0,participant_id,visit_name,visit_month,code_upd2301_speech_problems,code_upd2302_facial_expression,code_upd2303a_rigidity_neck,code_upd2303b_rigidity_rt_upper_extremity,code_upd2303c_rigidity_left_upper_extremity,code_upd2303d_rigidity_rt_lower_extremity,code_upd2303e_rigidity_left_lower_extremity,...,upd2317d_rest_tremor_amplitude_left_lower_extremity,upd2317e_rest_tremor_amplitude_lip_or_jaw,upd2318_consistency_of_rest_tremor,upd2da_dyskinesias_during_exam,upd2db_movements_interfere_with_ratings,code_upd2hy_hoehn_and_yahr_stage,upd2hy_hoehn_and_yahr_stage,upd23a_medication_for_pd,upd23b_clinical_state_on_medication,mds_updrs_part_iii_summary_score
6262,PP-3002,M12,12.0,0.0,2.0,0.0,1.0,1.0,0.0,0.0,...,Mild,Normal,Slight,No,,2.0,Stage 2,Yes,Off,27.0
6263,PP-3002,M12#2,12.0,0.0,2.0,0.0,1.0,1.0,0.0,0.0,...,Slight,Normal,Slight,No,,2.0,Stage 2,Yes,On,23.0
6264,PP-3002,M18,18.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,Slight,Normal,Slight,No,,2.0,Stage 2,Yes,Off,22.0
6265,PP-3002,M18#2,18.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,Normal,Normal,Normal,No,,2.0,Stage 2,Yes,On,20.0
6267,PP-3002,M30#2,30.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,Normal,Normal,Normal,No,,2.0,Stage 2,Yes,On,23.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23431,PP-74199,M0,0.0,3.0,2.0,2.0,2.0,0.0,3.0,0.0,...,Normal,Normal,Severe,No,,1.0,Stage 1,Yes,Off,38.0
23432,PP-74199,M0#2,0.0,2.0,2.0,1.0,1.0,0.0,2.0,0.0,...,Normal,Normal,Mild,No,,1.0,Stage 1,Yes,,23.0
23446,PP-70188,M24#2,24.0,0.0,2.0,1.0,1.0,0.0,0.0,0.0,...,Normal,Normal,Normal,No,,2.0,Stage 2,Yes,,14.0
23447,PP-70188,M0,0.0,2.0,2.0,1.0,2.0,1.0,3.0,2.0,...,Normal,Normal,Normal,No,,2.0,Stage 2,Yes,,30.0


In [25]:
clinical_updrs3.duplicated(subset = ['visit_month','participant_id', 'visit_name']).sum()

np.int64(0)

The analysis indicates that there are no genuine duplicates, as the repeated entries are due to multiple visits by the same participant, which can be seen in the `visit_name` column. 

In [26]:
visit_months = clinical_updrs3['visit_month'].unique()
visit_months.sort()
print(visit_months)

[ -2.   -1.    0.    0.5   3.    6.    9.   12.   18.   24.   30.   36.
  42.   48.   54.   60.   72.   84.   96.  108. ]


In [27]:
clinical_updrs3.to_csv('clinical_updrs3_cleaned.csv', index=False)
