In [1]:
import pandas as pd 
import os 
from pathlib import Path  
import utility as ut # see scripts/utilities.py 
import matplotlib.pyplot as plt
import numpy as np
import openpyxl # use read_excel from pd
from scipy.stats import shapiro

In [2]:
# rootfolder = Path.cwd() 
rootfolder = "/Users/bingli/Documents/GitHub/fMRI-AHDH"
traindatafolder = os.path.join(rootfolder, "data","TRAIN")
train_data_dic = ut.load_train_data(traindatafolder)
testdatafolder = os.path.join(rootfolder, "data","TEST")
test_data_dic = ut.load_test_data(testdatafolder)

train_quant = train_data_dic["train_quant"]
train_outcome = train_data_dic["train_outcome"]
train_cate = train_data_dic["train_cate"]
train_fmri = train_data_dic["train_fmri"]

test_quant = test_data_dic["test_quant"]
test_cate = test_data_dic["test_cate"]
test_fmri = test_data_dic["test_fmri"]

loading /Users/bingli/Documents/GitHub/fMRI-AHDH/data/TRAIN/TRAIN_QUANTITATIVE_METADATA.xlsx
loading /Users/bingli/Documents/GitHub/fMRI-AHDH/data/TRAIN/TRAINING_SOLUTIONS.xlsx
loading /Users/bingli/Documents/GitHub/fMRI-AHDH/data/TRAIN/TRAIN_CATEGORICAL_METADATA.xlsx
loading /Users/bingli/Documents/GitHub/fMRI-AHDH/data/TRAIN/TRAIN_FUNCTIONAL_CONNECTOME_MATRICES.csv
loading /Users/bingli/Documents/GitHub/fMRI-AHDH/data/TEST/TEST_CATEGORICAL.xlsx
loading /Users/bingli/Documents/GitHub/fMRI-AHDH/data/TEST/TEST_FUNCTIONAL_CONNECTOME_MATRICES.csv
loading /Users/bingli/Documents/GitHub/fMRI-AHDH/data/TEST/TEST_QUANTITATIVE_METADATA.xlsx


In [3]:
# Check missing values for train dataset 
train_missing_cols = [] 
for name, df in train_data_dic.items():
    missing_lst = ut.return_missing_list(df, name)  
    train_missing_cols.append(missing_lst)
print("Missing columns in training dataset: " + str(train_missing_cols))


col MRI_Track_Age_at_Scan has 29.67848309975268 % missing.
col PreInt_Demos_Fam_Child_Ethnicity has 0.9068425391591096 % missing.
Missing columns in training dataset: [[None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 'MRI_Track_Age_at_Scan'], [None, None, None], [None, None, None, 'PreInt_Demos_Fam_Child_Ethnicity', None, None, None, None, None, None], [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, No

In [None]:
# Check missing values --> Test dataset 
# It seems that the test dataset has a lot more missing :-(  
test_missing_cols = [] 
for name, df in test_data_dic.items():
    missing_lst = ut.return_missing_list(df, name)   
    test_missing_cols.append(missing_lst)
print("Missing columns in testing dataset" + str(test_missing_cols))
    

In [None]:
# MCAR test for MRI_Track_Age_at_Scan column 
# Goal: 1) if data is missing completely at random (MCAR); 
# 2) Missing at random (MAR); 
# 3) Missing NOT at random (MNAR) 

# see if the missing / presence of columns with missing are correlated with any columns in the dataset
# Create an indicator variable for missing values
train_quant["MRI_track_age_missing"] = np.where(train_quant["MRI_Track_Age_at_Scan"].isnull(), 1, 0)
train_cate["child_eth_missing"] = np.where(train_cate["PreInt_Demos_Fam_Child_Ethnicity"].isnull(), 1, 0)

# Here, enroll year is categorical but it could be better recognized as a numerical column
for col in train_cate.columns[2:]: # Other columns are converted to string for chi2-test for now
    train_cate[col] = train_cate[col].astype(str)
type(train_cate["Basic_Demos_Study_Site"])

pa_id = list(set(train_cate.columns) & set(train_quant.columns))[0]
train_demo = train_cate.merge(train_quant, on=pa_id) 

numerical_columns = train_demo.select_dtypes(include=['number']).columns.tolist()
categorical_columns = train_demo.select_dtypes(include=['object', 'category']).columns.tolist()
print(len(train_demo.columns))
print(len(numerical_columns) + len(categorical_columns)) # check if all included 
numerical_columns.remove("MRI_track_age_missing")
categorical_columns.remove("child_eth_missing")
print(numerical_columns)
print(categorical_columns)

In [None]:
ut.chi2_with_one_columns(categorical_columns, "MRI_track_age_missing", train_demo)
# Nothing correlated 

In [None]:
ut.chi2_with_one_columns(categorical_columns, "child_eth_missing", train_demo)
# The missing values of PreInt_Demos_Fam_Child_Race is strongly correlated with MRI_Track_Scan_Location 

In [None]:
ut.ttest_utest_with_one_column(numerical_columns, "MRI_track_age_missing", train_demo)
# It seems that this is marginally correlated with SDQ_SDQ_Conduct_Problems 

In [None]:
ut.ttest_utest_with_one_column(numerical_columns, "child_eth_missing", train_demo)
# PreInt_Demos_Fam_Child_Race is correlated with
# 1) Basic_Demos_Enroll_Year
# 2) APQ_P_APQ_P_CP (marginal)
# 3) APQ_P_APQ_P_OPD
# 4) SDQ_SDQ_Peer_Problems (marginal)

# Conclusion: 
Two columns with missing values (PreInt_Demos_Fam_Child_Ethnicity - 1% missing & MRI_Track_Age_at_Scan - 30 % missing). 

Notes: 

1) Missing values in MRI_Track_Age_at_Scan doesn't seem to correlate with any quantatitive and categorical variables stronglu, while there is a huge percentage missing.
    
2) Missing values in PreInt_Demos_Fam_Child_Ethnicity only accounts for 1% but it has high correlation with several quantatitive and categorical variables. 
    
Therefore, *imputation of missing data is important*. 

For PreInt_Demos_Fam_Child_Ethnicity --> use KNN to impute. 

For MRI_Track_Age_at_Scan --> Not sure. Could use some cool approaches but let's proceed with KNN first to see the performance. 

Also, the test dataset has a lot more columns with missing values. Need to think of this later. 
