In [1]:
import os 
import sys
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Locate the top of the git repo. This assumes the notebook is run from
# $root/notebook/; change the line below if you are somewhere else.
# `os.getcwd()` is used because `Path` is never imported in this notebook,
# so `Path.cwd()` would raise NameError on a fresh kernel.
rootfolder = os.path.abspath(os.path.join(os.getcwd(), ".."))

# If you are already in $root, uncomment below:
# rootfolder = os.getcwd()

# Make the repo root importable. Guarded so that re-running this cell does
# not keep appending the same entry to sys.path.
if rootfolder not in sys.path:
    sys.path.append(rootfolder)

In [2]:

# Project-local modules. These imports presumably rely on the repo root
# having been appended to sys.path in the first cell, which is why they
# live here rather than in the top import cell.
from src.data.data_loader import load_data
import src.utility.ut_general as ut_general
import src.utility.ut_stats as ut_stats 
# Folder passed to load_data() in the cells below: <rootfolder>/data.
datafolder = os.path.join(rootfolder, "data")

In [3]:
# Read the test split, then pull each table out of the returned mapping.
test_data_dic = load_data(datafolder, filetype="test")
test_quant, test_cate, test_fmri = (
    test_data_dic[table_key]
    for table_key in ("test_quant", "test_cate", "test_fmri")
)

In [4]:
# Read the training split, then pull each table (including the outcome
# table) out of the returned mapping.
train_data_dic = load_data(datafolder, filetype="train")
train_quant, train_outcome, train_cate, train_fmri = (
    train_data_dic[table_key]
    for table_key in ("train_quant", "train_outcome", "train_cate", "train_fmri")
)

In [5]:
# Column labels of the categorical and quantitative training tables,
# printed one Index per line.
cate_cols, quant_col = train_cate.columns, train_quant.columns
print(cate_cols, quant_col, sep="\n")

Index(['participant_id', 'Basic_Demos_Enroll_Year', 'Basic_Demos_Study_Site',
       'PreInt_Demos_Fam_Child_Ethnicity', 'PreInt_Demos_Fam_Child_Race',
       'MRI_Track_Scan_Location', 'Barratt_Barratt_P1_Edu',
       'Barratt_Barratt_P1_Occ', 'Barratt_Barratt_P2_Edu',
       'Barratt_Barratt_P2_Occ'],
      dtype='object')
Index(['participant_id', 'EHQ_EHQ_Total', 'ColorVision_CV_Score',
       'APQ_P_APQ_P_CP', 'APQ_P_APQ_P_ID', 'APQ_P_APQ_P_INV',
       'APQ_P_APQ_P_OPD', 'APQ_P_APQ_P_PM', 'APQ_P_APQ_P_PP',
       'SDQ_SDQ_Conduct_Problems', 'SDQ_SDQ_Difficulties_Total',
       'SDQ_SDQ_Emotional_Problems', 'SDQ_SDQ_Externalizing',
       'SDQ_SDQ_Generating_Impact', 'SDQ_SDQ_Hyperactivity',
       'SDQ_SDQ_Internalizing', 'SDQ_SDQ_Peer_Problems', 'SDQ_SDQ_Prosocial',
       'MRI_Track_Age_at_Scan'],
      dtype='object')


In [6]:
# Distinct enrollment years in the training data (2015 through 2020).
# Open question: should this be treated as quantitative or ordinal?
{year for year in train_cate['Basic_Demos_Enroll_Year']}

{2015, 2016, 2017, 2018, 2019, 2020}

In [7]:
# Columns treated as ordinal: parental education/occupation scores and the
# enrollment year.
ordinal_var = [
    'Barratt_Barratt_P1_Edu',
    'Barratt_Barratt_P1_Occ',
    'Barratt_Barratt_P2_Edu',
    'Barratt_Barratt_P2_Occ',
    'Basic_Demos_Enroll_Year',
]
# Columns treated as nominal (unordered) categories.
cate_var = [
    'Basic_Demos_Study_Site',
    'PreInt_Demos_Fam_Child_Ethnicity',
    'PreInt_Demos_Fam_Child_Race',
    'MRI_Track_Scan_Location',
]
# Every quantitative column except the identifier. Dropping by label rather
# than slicing with [1:] keeps this correct even if 'participant_id' is not
# the first column (and raises KeyError if the identifier is missing).
quant_var = quant_col.drop('participant_id')
print(quant_var)

Index(['EHQ_EHQ_Total', 'ColorVision_CV_Score', 'APQ_P_APQ_P_CP',
       'APQ_P_APQ_P_ID', 'APQ_P_APQ_P_INV', 'APQ_P_APQ_P_OPD',
       'APQ_P_APQ_P_PM', 'APQ_P_APQ_P_PP', 'SDQ_SDQ_Conduct_Problems',
       'SDQ_SDQ_Difficulties_Total', 'SDQ_SDQ_Emotional_Problems',
       'SDQ_SDQ_Externalizing', 'SDQ_SDQ_Generating_Impact',
       'SDQ_SDQ_Hyperactivity', 'SDQ_SDQ_Internalizing',
       'SDQ_SDQ_Peer_Problems', 'SDQ_SDQ_Prosocial', 'MRI_Track_Age_at_Scan'],
      dtype='object')


In [60]:
# Columns that exist in only one of the train/test tables. The symmetric
# difference (^) checks both directions; a one-sided `train - test` would
# miss columns that appear only in the test tables, so an empty set here
# really does mean the two tables have identical column names.
missing_columns_cate = set(train_cate.columns) ^ set(test_cate.columns)
print(missing_columns_cate)
missing_columns_quant = set(train_quant.columns) ^ set(test_quant.columns)
print(missing_columns_quant)

set()
set()


The test and train datasets have the same columns in the demographic data. 

In [61]:
# List the distinct levels of each ordinal column in the training data.
# Judging by the output below, the helper prints the levels per column and
# the cell displays the per-column level counts.
ut_general.count_levels_for_columns(ordinal_var, train_cate)

# Use ordinal encoding: 
# 1) Education level and occupation both have an ordinal ordering.
# 2) Each variable has many levels, so one-hot encoding would introduce high dimensionality.

There are 8 levels in Barratt_Barratt_P1_Edu, including:
{0, 3, 6, 9, 12, 15, 18, 21}
There are 10 levels in Barratt_Barratt_P1_Occ, including:
{0, 35, 5, 40, 10, 45, 15, 20, 25, 30}
There are 8 levels in Barratt_Barratt_P2_Edu, including:
{0, 3, 6, 9, 12, 15, 18, 21}
There are 10 levels in Barratt_Barratt_P2_Occ, including:
{0, 35, 5, 40, 10, 45, 15, 20, 25, 30}
There are 6 levels in Basic_Demos_Enroll_Year, including:
{2016, 2017, 2018, 2019, 2020, 2015}


[8, 10, 8, 10, 6]

All ordinal columns have more than 5 levels. If I use a one-hot encoder, this might increase the dimensionality too much. 

In [62]:
# Same level listing for the nominal columns.
# NOTE(review): the meaning of the third positional argument (True) is defined
# in ut_general and is not visible here — confirm against the helper.
ut_general.count_levels_for_columns(cate_var, train_cate, True)
    
# Here, most of the variables report >= 5 levels.
# Using one-hot encoding alone might lead to high dimensionality.
# Handle the different encoding types in the code and perhaps use cross-validation to test them later?

There are 4 levels in Basic_Demos_Study_Site, including:
{1, 2, 3, 4}
There are 15 levels in PreInt_Demos_Fam_Child_Ethnicity, including:
{0.0, 1.0, 2.0, 3.0, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan}
There are 10 levels in PreInt_Demos_Fam_Child_Race, including:
{0, 1, 2, 3, 4, 7, 8, 9, 10, 11}
There are 5 levels in MRI_Track_Scan_Location, including:
{0, 1, 2, 3, 4}


[4, 15, 10, 5]

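It seems that some nominal variables also have ≥5 levels. Note that the count of 15 reported for `PreInt_Demos_Fam_Child_Ethnicity` is inflated: each NaN is counted as a separate level, so the variable actually has 4 observed levels (0–3) plus missing values. 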

I think the ordinal variables should use ordinal encoding and the remaining categorical variables could use one-hot encoding. However, there is another issue, shown below: 

In [10]:
# Report, per categorical column, which values occur in only one of train/test.
# NOTE(review): in the output below NaN is listed as differing on both sides
# for the same column; since NaN never compares equal to itself, those lines
# presumably reflect missing values rather than new categories — confirm in the helper.
ut_general.check_columns_set(train_cate, test_cate)

For Basic_Demos_Enroll_Year, below are in test but not in train:

{2021, 2022, 2023}
For Basic_Demos_Enroll_Year, below are in train but not in test:

{2016, 2017, 2018, 2015}
For Basic_Demos_Study_Site, below are in test but not in train:

{5}
For Basic_Demos_Study_Site, below are in train but not in test:

{1, 2, 3}
For PreInt_Demos_Fam_Child_Ethnicity, below are in test but not in train:

{nan, nan, nan}
For PreInt_Demos_Fam_Child_Ethnicity, below are in train but not in test:

{nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan}
For PreInt_Demos_Fam_Child_Race, below are in test but not in train:

{nan, nan, nan, nan, nan, nan}
For PreInt_Demos_Fam_Child_Race, below are in train but not in test:

{10}
For MRI_Track_Scan_Location, below are in train but not in test:

{0, 1, 2}
For Barratt_Barratt_P1_Edu, below are in test but not in train:

{nan}
For Barratt_Barratt_P1_Edu, below are in train but not in test:

{0}
For Barratt_Barratt_P1_Occ, below are in test but not in train:

{

Although the columns match between the train and test data, many columns in the categorical dataset contain different sets of values in train and in test. 

This makes ordinal encoding a bit tricky: with ordinal encoding, `KNNImputer` cannot handle a category that is seen in the test set but not in the train set. 

For now, let's encode every categorical variable with one-hot encoding. If the ordinal nature of the ordinal variables turns out to be truly important, we can change this later. 