In [1]:
import sys 
import os
import pandas as pd  
from pathlib import Path  
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import shapiro
import glob

In [2]:
# Put the parent of the working directory (expected to be the top of the git repo)
# on the import path so that the `src` package can be imported below.
rootfolder = str(Path.cwd().parent)
sys.path.append(rootfolder)

from src.data.data_loader import load_data
import src.utility.ut_general as ut_general
import src.utility.ut_stats as ut_stats

In [3]:
# The data files are looked up in "<current working directory>/data".
# NOTE(review): an earlier cell binds `rootfolder` to the PARENT of the working directory;
# this cell rebinds it to the working directory itself -- confirm which one is the repo root.
rootfolder = Path.cwd()
datafolder = str(rootfolder / "data")

In [4]:
# Load the training split; load_data returns a mapping from which each table
# is pulled out by key and bound to its own name.
train_data_dic = load_data(datafolder, filetype="train")
train_quant, train_outcome, train_cate, train_fmri = (
    train_data_dic[table_key]
    for table_key in ("train_quant", "train_outcome", "train_cate", "train_fmri")
)

In [5]:
# Load the test split; it is read with the same keys as the training split
# except that no outcome table is extracted.
test_data_dic = load_data(datafolder, filetype="test")
test_quant, test_cate, test_fmri = (
    test_data_dic[table_key]
    for table_key in ("test_quant", "test_cate", "test_fmri")
)

In [5]:
# Training split: collect, per loaded table, the list returned by
# ut_general.return_missing_list (the helper also prints a per-column summary).
train_missing_cols = [
    ut_general.return_missing_list(table, table_name)
    for table_name, table in train_data_dic.items()
]
print(f"Missing columns in training dataset: {train_missing_cols}")


col MRI_Track_Age_at_Scan has 29.67848309975268 % missing.
train_outcome has no missing value
col PreInt_Demos_Fam_Child_Ethnicity has 0.9068425391591096 % missing.
train_fmri has no missing value
Missing columns in training dataset: [['MRI_Track_Age_at_Scan'], [], ['PreInt_Demos_Fam_Child_Ethnicity'], []]


In [6]:
# Test split: collect, per loaded table, the list returned by
# ut_general.return_missing_list (the helper also prints a per-column summary).
# The test split has many more columns with missing values than the training split.
test_missing_cols = [
    ut_general.return_missing_list(table, table_name)
    for table_name, table in test_data_dic.items()
]
# ": " separator added so the label no longer runs into the list and matches
# the message printed for the training split.
print(f"Missing columns in testing dataset: {test_missing_cols}")
    

col PreInt_Demos_Fam_Child_Ethnicity has 0.9868421052631579 % missing.
col PreInt_Demos_Fam_Child_Race has 1.9736842105263157 % missing.
col Barratt_Barratt_P1_Edu has 0.3289473684210526 % missing.
col Barratt_Barratt_P1_Occ has 0.3289473684210526 % missing.
col Barratt_Barratt_P2_Edu has 11.842105263157894 % missing.
col Barratt_Barratt_P2_Occ has 13.815789473684212 % missing.
test_fmri has no missing value
col EHQ_EHQ_Total has 0.3289473684210526 % missing.
col ColorVision_CV_Score has 2.9605263157894735 % missing.
col APQ_P_APQ_P_CP has 4.934210526315789 % missing.
col APQ_P_APQ_P_ID has 4.934210526315789 % missing.
col APQ_P_APQ_P_INV has 4.934210526315789 % missing.
col APQ_P_APQ_P_OPD has 4.934210526315789 % missing.
col APQ_P_APQ_P_PM has 4.934210526315789 % missing.
col APQ_P_APQ_P_PP has 4.934210526315789 % missing.
col SDQ_SDQ_Conduct_Problems has 9.868421052631579 % missing.
col SDQ_SDQ_Difficulties_Total has 9.868421052631579 % missing.
col SDQ_SDQ_Emotional_Problems has 9.

In [8]:
# Missingness analysis for the two training columns that contain NaNs
# (MRI_Track_Age_at_Scan and PreInt_Demos_Fam_Child_Ethnicity).
# Goal: judge whether the values are
#   1) missing completely at random (MCAR),
#   2) missing at random (MAR), or
#   3) missing NOT at random (MNAR),
# by checking whether a 0/1 "is missing" indicator is associated with any other column.

# Indicator variables (1 = value is missing). They are derived from the untouched
# source columns, so re-running this cell always reproduces the same indicators.
train_quant["MRI_track_age_missing"] = np.where(train_quant["MRI_Track_Age_at_Scan"].isnull(), 1, 0)
train_cate["child_eth_missing"] = np.where(train_cate["PreInt_Demos_Fam_Child_Ethnicity"].isnull(), 1, 0)

# Stringify the categorical codes for the chi2 tests on a COPY. Casting train_cate itself
# would turn NaN into the literal string "nan": isnull() could then no longer find the
# missing rows (a re-run would yield an all-zero child_eth_missing) and any later
# imputation step would not see them either.
# The first two columns are left untouched (enroll year is categorical but is better
# treated as numerical). The indicator column is stringified as well, which is why it
# is listed among the categorical columns below.
train_cate_str = train_cate.copy()
for col in train_cate_str.columns[2:]:
    train_cate_str[col] = train_cate_str[col].astype(str)

# Join key: the first column (in train_cate order) that both tables share.
# Picked from an ordered scan rather than from a set so the choice is deterministic.
shared_cols = [name for name in train_cate_str.columns if name in train_quant.columns]
pa_id = shared_cols[0]
train_demo = train_cate_str.merge(train_quant, on=pa_id)

numerical_columns = train_demo.select_dtypes(include=['number']).columns.tolist()
categorical_columns = train_demo.select_dtypes(include=['object', 'category']).columns.tolist()
print(len(train_demo.columns))
print(len(numerical_columns) + len(categorical_columns))  # should equal the count above (every column classified)

# The indicators are the test targets, not predictors, so drop them from the column lists.
numerical_columns.remove("MRI_track_age_missing")
categorical_columns.remove("child_eth_missing")
print(numerical_columns)
print(categorical_columns)

30
30
['Basic_Demos_Enroll_Year', 'EHQ_EHQ_Total', 'ColorVision_CV_Score', 'APQ_P_APQ_P_CP', 'APQ_P_APQ_P_ID', 'APQ_P_APQ_P_INV', 'APQ_P_APQ_P_OPD', 'APQ_P_APQ_P_PM', 'APQ_P_APQ_P_PP', 'SDQ_SDQ_Conduct_Problems', 'SDQ_SDQ_Difficulties_Total', 'SDQ_SDQ_Emotional_Problems', 'SDQ_SDQ_Externalizing', 'SDQ_SDQ_Generating_Impact', 'SDQ_SDQ_Hyperactivity', 'SDQ_SDQ_Internalizing', 'SDQ_SDQ_Peer_Problems', 'SDQ_SDQ_Prosocial', 'MRI_Track_Age_at_Scan']
['participant_id', 'Basic_Demos_Study_Site', 'PreInt_Demos_Fam_Child_Ethnicity', 'PreInt_Demos_Fam_Child_Race', 'MRI_Track_Scan_Location', 'Barratt_Barratt_P1_Edu', 'Barratt_Barratt_P1_Occ', 'Barratt_Barratt_P2_Edu', 'Barratt_Barratt_P2_Occ']


In [9]:
# Chi-square tests: every categorical column vs. the MRI_Track_Age_at_Scan missingness indicator.
ut_stats.chi2_with_one_columns(categorical_columns, "MRI_track_age_missing", train_demo)
# Nothing correlated: every p-value shown in the output below is above 0.2.
# NOTE(review): participant_id is part of categorical_columns; presumably it is a
# per-row identifier, in which case its test result carries no information -- confirm.

Chi-Square Test between participant_id and MRI_track_age_missing:
Chi-Square Stat: 1213.00, P-value: 0.48650
Chi-Square Test between Basic_Demos_Study_Site and MRI_track_age_missing:
Chi-Square Stat: 1.25, P-value: 0.74114
Chi-Square Test between PreInt_Demos_Fam_Child_Ethnicity and MRI_track_age_missing:
Chi-Square Stat: 2.24, P-value: 0.69161
Chi-Square Test between PreInt_Demos_Fam_Child_Race and MRI_track_age_missing:
Chi-Square Stat: 4.36, P-value: 0.88652
Chi-Square Test between MRI_Track_Scan_Location and MRI_track_age_missing:
Chi-Square Stat: 2.12, P-value: 0.71382
Chi-Square Test between Barratt_Barratt_P1_Edu and MRI_track_age_missing:
Chi-Square Stat: 3.32, P-value: 0.85348
Chi-Square Test between Barratt_Barratt_P1_Occ and MRI_track_age_missing:
Chi-Square Stat: 11.92, P-value: 0.21772
Chi-Square Test between Barratt_Barratt_P2_Edu and MRI_track_age_missing:
Chi-Square Stat: 2.96, P-value: 0.88875
Chi-Square Test between Barratt_Barratt_P2_Occ and MRI_track_age_missing:
Ch

In [10]:
# Chi-square tests: every categorical column vs. the PreInt_Demos_Fam_Child_Ethnicity missingness indicator.
ut_stats.chi2_with_one_columns(categorical_columns, "child_eth_missing", train_demo)
# The missingness of PreInt_Demos_Fam_Child_Ethnicity is significantly associated with
# MRI_Track_Scan_Location (p ~ 0.006 in the output below).
# The p = 0 result for PreInt_Demos_Fam_Child_Ethnicity itself is expected, because the
# indicator is derived from that very column.

Chi-Square Test between participant_id and child_eth_missing:
Chi-Square Stat: 1213.00, P-value: 0.48650
Chi-Square Test between Basic_Demos_Study_Site and child_eth_missing:
Chi-Square Stat: 0.52, P-value: 0.91469
Chi-Square Test between PreInt_Demos_Fam_Child_Ethnicity and child_eth_missing:
Chi-Square Stat: 1213.00, P-value: 0.00000
Chi-Square Test between PreInt_Demos_Fam_Child_Race and child_eth_missing:
Chi-Square Stat: 11.75, P-value: 0.22753
Chi-Square Test between MRI_Track_Scan_Location and child_eth_missing:
Chi-Square Stat: 14.28, P-value: 0.00645
Chi-Square Test between Barratt_Barratt_P1_Edu and child_eth_missing:
Chi-Square Stat: 9.74, P-value: 0.20392
Chi-Square Test between Barratt_Barratt_P1_Occ and child_eth_missing:
Chi-Square Stat: 12.18, P-value: 0.20342
Chi-Square Test between Barratt_Barratt_P2_Edu and child_eth_missing:
Chi-Square Stat: 8.34, P-value: 0.30337
Chi-Square Test between Barratt_Barratt_P2_Occ and child_eth_missing:
Chi-Square Stat: 12.74, P-value: 

In [11]:
# Two-group tests (the output below reports U-tests): every numerical column
# vs. the MRI_Track_Age_at_Scan missingness indicator.
ut_stats.ttest_utest_with_one_column(numerical_columns, "MRI_track_age_missing", train_demo)
# It seems that the missingness is marginally correlated with SDQ_SDQ_Conduct_Problems.
# NOTE(review): numerical_columns still contains MRI_Track_Age_at_Scan itself -- confirm
# how the helper handles testing a column against its own missingness indicator.

p-value for u-test between Basic_Demos_Enroll_Year and MRI_track_age_missing is 0.5715224370356592 with u-stats 150498.5
p-value for u-test between EHQ_EHQ_Total and MRI_track_age_missing is 0.8766271009404033 with u-stats 152678.5
p-value for u-test between ColorVision_CV_Score and MRI_track_age_missing is 0.3070002041542754 with u-stats 157520.5
p-value for u-test between APQ_P_APQ_P_CP and MRI_track_age_missing is 0.4150781857209046 with u-stats 149559.5
p-value for u-test between APQ_P_APQ_P_ID and MRI_track_age_missing is 0.10157876729291905 with u-stats 144444.5
p-value for u-test between APQ_P_APQ_P_INV and MRI_track_age_missing is 0.707169017803704 with u-stats 155630.5
p-value for u-test between APQ_P_APQ_P_OPD and MRI_track_age_missing is 0.5537932544724726 with u-stats 156827.0
p-value for u-test between APQ_P_APQ_P_PM and MRI_track_age_missing is 0.21686517380602588 with u-stats 146675.5
p-value for u-test between APQ_P_APQ_P_PP and MRI_track_age_missing is 0.34703374747506

In [12]:
# Two-group tests (the output below reports U-tests): every numerical column
# vs. the PreInt_Demos_Fam_Child_Ethnicity missingness indicator.
ut_stats.ttest_utest_with_one_column(numerical_columns, "child_eth_missing", train_demo)
# The missingness of PreInt_Demos_Fam_Child_Ethnicity is correlated with
# 1) Basic_Demos_Enroll_Year
# 2) APQ_P_APQ_P_CP (marginal)
# 3) APQ_P_APQ_P_OPD
# 4) SDQ_SDQ_Peer_Problems (marginal)

p-value for u-test between Basic_Demos_Enroll_Year and child_eth_missing is 0.013841320862761559 with u-stats 3865.5
p-value for u-test between EHQ_EHQ_Total and child_eth_missing is 0.24426404664347467 with u-stats 5270.5
p-value for u-test between ColorVision_CV_Score and child_eth_missing is 0.6899575506518704 with u-stats 6288.0
p-value for u-test between APQ_P_APQ_P_CP and child_eth_missing is 0.0607257134741712 with u-stats 8512.0
p-value for u-test between APQ_P_APQ_P_ID and child_eth_missing is 0.7793125774892082 with u-stats 6287.5
p-value for u-test between APQ_P_APQ_P_INV and child_eth_missing is 0.7899605516412364 with u-stats 6919.0
p-value for u-test between APQ_P_APQ_P_OPD and child_eth_missing is 0.012251899256974544 with u-stats 9496.5
p-value for u-test between APQ_P_APQ_P_PM and child_eth_missing is 0.6023075475342772 with u-stats 6009.5
p-value for u-test between APQ_P_APQ_P_PP and child_eth_missing is 0.2724521671633804 with u-stats 7874.5
p-value for u-test betwee

# Conclusion: 
Two columns in the training data have missing values: PreInt_Demos_Fam_Child_Ethnicity (~1% missing) and MRI_Track_Age_at_Scan (~30% missing).

Notes: 

1) Missingness in MRI_Track_Age_at_Scan does not appear to be strongly correlated with any quantitative or categorical variable, even though a large share of its values (~30%) is missing.
    
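2) Missing values in PreInt_Demos_Fam_Child_Ethnicity account for only ~1% of the rows, but their missingness is significantly associated with several quantitative and categorical variables (e.g. MRI_Track_Scan_Location, Basic_Demos_Enroll_Year, APQ_P_APQ_P_OPD).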
    
Therefore, *imputation of missing data is important*. 

For PreInt_Demos_Fam_Child_Ethnicity --> use KNN to impute. 

For MRI_Track_Age_at_Scan --> Not sure. Could use some cool approaches but let's proceed with KNN first to see the performance. 

Also, the test dataset has a lot more columns with missing values. Need to think of this later. 
