# Basic Setup and Data Loading

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt

# Use seaborn's "whitegrid" style for the figures drawn in this notebook.
sns.set_style(style="whitegrid")

In [2]:
# Reference table of participants; used below to filter the DTI rows.
demographics_new = pd.read_csv("demographics_new.csv")

# Clinical DTI release table.
DTI = pd.read_csv("new-data/releases_2023_v4release_1027_clinical_DTI.csv")

# Print column dtypes and non-null counts for the DTI table.
DTI.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1196 entries, 0 to 1195
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   participant_id        1196 non-null   object 
 1   GUID                  364 non-null    object 
 2   visit_name            1196 non-null   object 
 3   visit_month           0 non-null      float64
 4   dti_brain_tissue      1196 non-null   object 
 5   dti_measure           1196 non-null   object 
 6   roi1_left_rostral     1196 non-null   float64
 7   roi2_left_middle      1196 non-null   float64
 8   roi3_left_caudal      1196 non-null   float64
 9   roi4_right_rostral    1196 non-null   float64
 10  roi5_right_middle     1196 non-null   float64
 11  roi6_right_caudal     1196 non-null   float64
 12  ref1_left_reference   1196 non-null   float64
 13  ref2_right_reference  1196 non-null   float64
dtypes: float64(9), object(5)
memory usage: 130.9+ KB


In [3]:
# Preview the first five rows of the DTI table.
DTI.head(n=5)

Unnamed: 0,participant_id,GUID,visit_name,visit_month,dti_brain_tissue,dti_measure,roi1_left_rostral,roi2_left_middle,roi3_left_caudal,roi4_right_rostral,roi5_right_middle,roi6_right_caudal,ref1_left_reference,ref2_right_reference
0,PP-3101,,LOG,,Substantia Nigra,Eigenvalue1,0.000926,0.000877,0.000812,0.001246,0.001144,0.001088,0.00147,0.001434
1,PP-3101,,LOG,,Substantia Nigra,Eigenvalue2,0.000604,0.000511,0.000587,0.00112,0.000945,0.000743,0.000537,0.000411
2,PP-3101,,LOG,,Substantia Nigra,Eigenvalue3,0.000387,0.000337,0.000322,0.0009,0.000655,0.000463,0.000258,0.000243
3,PP-3101,,LOG,,Substantia Nigra,Fractional Anisotropy,0.327429,0.364471,0.329588,0.130218,0.214409,0.317038,0.565928,0.603173
4,PP-3102,,LOG,,Substantia Nigra,Eigenvalue1,0.000926,0.000933,0.000967,0.000994,0.000971,0.001101,0.001348,0.001318


# Data Preprocessing And Cleaning 

## Checking For Duplicates And NaN Values

We begin our analysis by excluding participants who have missing (NaN) values in the `GUID` column, as well as those with conflicting identifiers — that is, cases where multiple `participant_id` values share the same `GUID`. To ensure consistency, we retain only the rows whose `participant_id` appears in the cleaned reference file `demographics_new.csv`.

In [4]:
# Keep only the rows whose participant_id is listed in the demographics reference table.
in_reference = DTI['participant_id'].isin(demographics_new['participant_id'])

# Take an explicit copy so DTI owns its data: pandas otherwise tracks the filtered
# result as derived from the loaded frame, and later modifications of DTI can raise
# SettingWithCopyWarning.
DTI = DTI.loc[in_reference].copy()


In [5]:
# Number of distinct non-null values in each column.
DTI.nunique(axis=0, dropna=True)

participant_id           77
GUID                     77
visit_name                1
visit_month               0
dti_brain_tissue          1
dti_measure              12
roi1_left_rostral       332
roi2_left_middle        321
roi3_left_caudal        329
roi4_right_rostral      333
roi5_right_middle       327
roi6_right_caudal       330
ref1_left_reference     315
ref2_right_reference    335
dtype: int64

In [6]:
# Count the rows whose GUID is missing.
DTI.loc[:, 'GUID'].isnull().sum()
   

np.int64(0)

We remove the `GUID` column from the dataset, as it is no longer required for the subsequent steps of our analysis. We then check for duplicate entries across combinations of the `participant_id` and `visit_month` columns to verify that each participant's visit is uniquely represented.

In [7]:
# Drop the GUID column by reassignment rather than in place.
# errors='ignore' keeps the cell idempotent: re-running it after GUID has already
# been removed is a no-op instead of raising KeyError.
DTI = DTI.drop(columns='GUID', errors='ignore')

In [8]:
# Count rows that repeat an earlier (visit_month, participant_id) pair.
# NOTE(review): DTI.info() reports visit_month as entirely null, and the DTI.head()
# preview shows several rows per participant that differ only in dti_measure, so this
# count likely reflects multiple measures per participant rather than true duplicate
# records. Consider adding dti_measure to the key; confirm.
DTI[['visit_month', 'participant_id']].duplicated().sum()

np.int64(287)