In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from utils import delete_patients_with_the_same_GUID, averaging_scores
# Silence every Python warning for the remainder of the notebook.
warnings.filterwarnings(action='ignore')

# Draw all subsequent seaborn/matplotlib figures with the "whitegrid" style.
sns.set_style(style="whitegrid")

We prepare our cleaned data for clustering. First, we restrict the clinical data to PD patients only. Next, we load the medication data to check the biological relevance of our data.

In [2]:
# Enrollment table, passed through the project helper from utils
# (presumably handles participants that share a GUID — confirm in utils).
enrollment = delete_patients_with_the_same_GUID(
    pd.read_csv("new-data/releases_2023_v4release_1027_clinical_Enrollment.csv")
)

# Only participants enrolled in the PD study arm are kept for clustering.
enrollment_pd = enrollment.loc[enrollment['study_arm'] == 'PD']

# Long MDS-UPDRS summary-score column names -> short names used from here on.
updrs_rename = {
    'mds_updrs_part_i_summary_score': 'updrs_1',
    'mds_updrs_part_ii_summary_score': 'updrs_2',
    'mds_updrs_part_iii_summary_score': 'updrs_3',
}
visit_columns = ['participant_id', 'visit_name', 'visit_month']

# Read the cleaned scores, keep the visit identifiers plus the three summary scores.
updrs_scores = pd.read_csv("cleaned_data.csv")[visit_columns + list(updrs_rename)]

# Restrict to PD participants, then shorten the score column names.
is_pd_participant = updrs_scores['participant_id'].isin(enrollment_pd['participant_id'])
updrs_scores = updrs_scores.loc[is_pd_participant].rename(columns=updrs_rename)

# Project helper; per its printed log it consolidates screening records into
# baseline records and regular visit records by participant and month.
updrs_scores = averaging_scores(updrs_scores, list(updrs_rename.values()))

# visit_name is not needed downstream; persist the result for later notebooks.
updrs_scores = updrs_scores.drop(columns=['visit_name'])
updrs_scores.to_csv("cleaned_updrs_scores.csv", index=False)

Processing screening records by participant...
Consolidated 474 screening records into 457 baseline records.
Processing regular visit records by participant and month...
Consolidated 10864 regular records into 10633 records.


In [3]:
# PD medical-history table, passed through the project helper from utils
# (presumably handles participants that share a GUID — confirm in utils).
med_data = delete_patients_with_the_same_GUID(
    pd.read_csv("new-data/releases_2023_v4release_1027_clinical_PD_Medical_History.csv")
)

# Keep only participants enrolled in the PD study arm.
med_data = med_data[med_data['participant_id'].isin(enrollment_pd['participant_id'])]

med_columns = ['on_levodopa', 'on_dopamine_agonist', 'on_other_pd_medications']

# Encode each medication flag as 1 for 'Yes' and 0 for anything else
# (including missing values). `assign` returns a new frame, so nothing is
# written into the filtered slice above (no chained assignment), and the
# comparison is vectorised instead of a per-row lambda.
med_data = med_data.assign(
    **{col: med_data[col].eq('Yes').astype('int64') for col in med_columns}
)

# Collapse to one row per participant: a flag is 1 if any of the
# participant's records reported 'Yes' for that medication.
med_data = med_data.groupby('participant_id')[med_columns].max().reset_index()

med_data.to_csv("cleaned_med_data.csv", index=False)

In [4]:
# Tally how many rows of med_data have each on_levodopa value (1 vs 0).
med_data['on_levodopa'].value_counts()

on_levodopa
1    1730
0     966
Name: count, dtype: int64