# Basic Setup and Data Loading

In [13]:
import pandas as pd
import numpy as np
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt

# Use seaborn's "whitegrid" theme for every figure drawn in this notebook.
sns.set_style(style="whitegrid")

In [14]:
# Read the family-history release table and the demographics reference table.
Family_history = pd.read_csv(
    "new-data/releases_2023_v4release_1027_clinical_Family_History_PD.csv"
)
demographics_new = pd.read_csv(
    "demographics_new.csv"
)

# Print column names, dtypes and non-null counts of the raw family-history table.
Family_history.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8139 entries, 0 to 8138
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   participant_id             8139 non-null   object 
 1   GUID                       3798 non-null   object 
 2   visit_name                 8139 non-null   object 
 3   visit_month                7014 non-null   float64
 4   biological_mother_with_pd  8087 non-null   object 
 5   biological_father_with_pd  8085 non-null   object 
 6   other_relative_with_pd     8105 non-null   object 
dtypes: float64(1), object(6)
memory usage: 445.2+ KB


In [15]:
# Preview the first five rows to sanity-check the loaded columns and values.
Family_history.head(5)

Unnamed: 0,participant_id,GUID,visit_name,visit_month,biological_mother_with_pd,biological_father_with_pd,other_relative_with_pd
0,BF-1001,PDNW781VHY,M0,0.0,No,No,No
1,BF-1002,PDCB969UGG,M0,0.0,No,No,No
2,BF-1003,PDLW805AHT,M0,0.0,No,No,No
3,BF-1004,PDKW284DYW,M0,0.0,No,No,No
4,BF-1005,PDTM274KX6,M0,0.0,No,No,No


# Data Preprocessing And Cleaning 

## Checking for Duplicates and NaN Values

We begin by excluding participants who have missing (NaN) values in the `GUID` column, as well as those with conflicting identifiers — that is, cases where multiple `participant_id`s share the same `GUID`. To do this consistently with the rest of the analysis, we retain only the participants whose `participant_id` appears in the cleaned reference file `demographics_new.csv`. The checks that follow verify that no missing `GUID`s remain and that the numbers of unique `participant_id`s and `GUID`s match.

In [16]:
# Keep only the rows whose participant_id is present in the demographics reference table.
in_reference = Family_history['participant_id'].isin(demographics_new['participant_id'])

# .copy() makes the filtered result an independent frame rather than a slice of the
# loaded table, so modifying Family_history in later cells cannot raise pandas'
# SettingWithCopyWarning.
Family_history = Family_history.loc[in_reference].copy()


In [17]:
# Number of distinct non-null values in each column; matching counts for
# participant_id and GUID are what we expect after the filtering step.
Family_history.nunique(axis=0, dropna=True)

participant_id               3776
GUID                         3776
visit_name                      6
visit_month                     5
biological_mother_with_pd       3
biological_father_with_pd       3
other_relative_with_pd          3
dtype: int64

In [18]:
# Count the rows that still have a missing GUID (expected: 0).
Family_history.loc[:, 'GUID'].isna().sum()

np.int64(0)

We remove the `GUID` column from the dataset, as it is no longer required for the subsequent steps of our analysis. We then check for duplicate entries by examining combinations of the `participant_id` and `visit_month` columns, to ensure that each participant's visit is represented exactly once.

In [19]:
# GUID is not needed beyond this point, so remove it.
# The result is reassigned instead of using inplace=True, and errors='ignore' is set,
# so the cell is idempotent: running it again after GUID has already been removed
# leaves the frame unchanged instead of raising KeyError.
Family_history = Family_history.drop(columns='GUID', errors='ignore')

In [20]:
# Count rows that repeat an earlier (visit_month, participant_id) combination;
# a result of 0 means no participant has two rows for the same visit month.
visit_key = ['visit_month', 'participant_id']
Family_history.duplicated(subset=visit_key).sum()

np.int64(0)