# Basic Setup and Data Loading

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt

# Apply seaborn's "whitegrid" style to all figures drawn later in the notebook.
sns.set_style(style="whitegrid")

In [3]:
# Load the MDS-UPDRS Part IV release file and the demographics reference table.
clinical_updrs4 = pd.read_csv(
    "new-data/releases_2023_v4release_1027_clinical_MDS_UPDRS_Part_IV.csv"
)
demographics_new = pd.read_csv("demographics_new.csv")

# Column overview: dtypes and non-null counts of the UPDRS Part IV table.
clinical_updrs4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13091 entries, 0 to 13090
Data columns (total 17 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   participant_id                                  13091 non-null  object 
 1   GUID                                            9606 non-null   object 
 2   visit_name                                      13091 non-null  object 
 3   visit_month                                     13091 non-null  float64
 4   code_upd2401_time_spent_with_dyskinesias        11984 non-null  float64
 5   code_upd2402_functional_impact_of_dyskinesias   11344 non-null  float64
 6   code_upd2403_time_spent_in_the_off_state        11817 non-null  float64
 7   code_upd2404_functional_impact_of_fluctuations  11340 non-null  float64
 8   code_upd2405_complexity_of_motor_fluctuations   11342 non-null  float64
 9   code_upd2406_painful_off_state_dystonia

In [5]:
# Preview the first five rows of the loaded table.
clinical_updrs4.head(n=5)

Unnamed: 0,participant_id,GUID,visit_name,visit_month,code_upd2401_time_spent_with_dyskinesias,code_upd2402_functional_impact_of_dyskinesias,code_upd2403_time_spent_in_the_off_state,code_upd2404_functional_impact_of_fluctuations,code_upd2405_complexity_of_motor_fluctuations,code_upd2406_painful_off_state_dystonia,upd2401_time_spent_with_dyskinesias,upd2402_functional_impact_of_dyskinesias,upd2403_time_spent_in_the_off_state,upd2404_functional_impact_of_fluctuations,upd2405_complexity_of_motor_fluctuations,upd2406_painful_off_state_dystonia,mds_updrs_part_iv_summary_score
0,BF-1002,PDCB969UGG,M0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,Normal,Normal,Slight,Normal,Slight,Normal,9.0
1,BF-1003,PDLW805AHT,M0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,Normal,Normal,Slight,Normal,Slight,Normal,4.0
2,BF-1004,PDKW284DYW,M0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,Slight,Slight,Slight,Slight,Slight,Slight,6.0
3,BF-1006,PDKY484YDC,M0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,Normal,Normal,Slight,Slight,Slight,Slight,0.0
4,BF-1008,PDEA056CRM,M0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,Normal,Normal,Slight,Slight,Slight,Slight,6.0


# Data Preprocessing And Cleaning 

## Checking For Duplicates And NaN Values

We begin our analysis by excluding patients who have missing (NaN) values in the `GUID` column, as well as those with conflicting identifiers — that is, cases where multiple `participant_id`s share the same `GUID`. To do this consistently, we retain only the rows whose `participant_id` appears in the cleaned reference file `demographics_new.csv`; the cells that follow verify that no missing `GUID` values remain and that the numbers of unique `participant_id`s and `GUID`s match.

In [6]:
# Keep only the rows whose participant_id is listed in the demographics table.
clinical_updrs4 = clinical_updrs4.loc[
    lambda frame: frame['participant_id'].isin(demographics_new['participant_id'])
]


In [7]:
# Number of distinct non-null values in each column (defaults spelled out).
clinical_updrs4.nunique(axis=0, dropna=True)

participant_id                                    3022
GUID                                              3022
visit_name                                          20
visit_month                                         19
code_upd2401_time_spent_with_dyskinesias             5
code_upd2402_functional_impact_of_dyskinesias        5
code_upd2403_time_spent_in_the_off_state             5
code_upd2404_functional_impact_of_fluctuations       5
code_upd2405_complexity_of_motor_fluctuations        5
code_upd2406_painful_off_state_dystonia              5
upd2401_time_spent_with_dyskinesias                  5
upd2402_functional_impact_of_dyskinesias             5
upd2403_time_spent_in_the_off_state                  5
upd2404_functional_impact_of_fluctuations            5
upd2405_complexity_of_motor_fluctuations             5
upd2406_painful_off_state_dystonia                   5
mds_updrs_part_iv_summary_score                     24
dtype: int64

In [8]:
# Count the rows whose GUID is missing.
clinical_updrs4['GUID'].isnull().sum()

np.int64(0)

We remove the GUID column from the dataset, as it is no longer required for the subsequent steps of our analysis. Then we assess the data for duplicate entries by examining combinations of the `participant_id` and `visit_month` columns to ensure each participant's visit is uniquely represented.

In [9]:
# Drop the GUID column by rebinding to a new frame instead of mutating in place.
# errors='ignore' keeps this cell safe to re-run: without it, a second execution
# raises KeyError because 'GUID' has already been removed.
clinical_updrs4 = clinical_updrs4.drop(columns='GUID', errors='ignore')

In [10]:
# Count rows that repeat an earlier (visit_month, participant_id) combination.
clinical_updrs4[['visit_month', 'participant_id']].duplicated().sum()

np.int64(0)

This confirms that there are no repeated (`participant_id`, `visit_month`) entries in the UPDRS4 data.

In [12]:
# Distinct visit_month values present in the data, in ascending order.
visit_months = np.sort(clinical_updrs4['visit_month'].unique())
print(visit_months)

[ -1.    0.    0.5   3.    6.    9.   12.   18.   24.   30.   36.   42.
  48.   54.   60.   72.   84.   96.  108. ]


In [13]:
# Write the table to CSV without the row index.
clinical_updrs4.to_csv(path_or_buf='clinical_updrs4_cleaned.csv', index=False)
