In [40]:
import numpy as np
import pandas as pd
import os
import random
from sklearn.base import clone
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import VotingRegressor, RandomForestClassifier
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, LabelEncoder, StandardScaler
from scipy.optimize import minimize, minimize_scalar
from scipy import stats
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from colorama import Fore, Style
from IPython.display import clear_output
from lightgbm import LGBMRegressor, LGBMClassifier
from xgboost import XGBRegressor, XGBClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [2]:
# Create output folders
output_folder = 'output'
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Create separate analysis output folders
analysis_output_folder = 'analysis_output'
os.makedirs(analysis_output_folder, exist_ok=True)

physical_analysis_output_folder = 'analysis_output/physical'
os.makedirs(physical_analysis_output_folder, exist_ok=True)

fitness_analysis_output_folder = 'analysis_output/fitness'
os.makedirs(fitness_analysis_output_folder, exist_ok=True)

bia_analysis_output_folder = 'analysis_output/bia'
os.makedirs(bia_analysis_output_folder, exist_ok=True)

child_info_analysis_output_folder = 'analysis_output/child_info'
os.makedirs(child_info_analysis_output_folder, exist_ok=True)

actigraphy_analysis_output_folder = 'analysis_output/actigraphy'
os.makedirs(actigraphy_analysis_output_folder, exist_ok=True)

# Load data functions
def process_file(filename, dirname):
    df = pd.read_parquet(os.path.join(dirname, filename, 'part-0.parquet'))
    df.drop('step', axis=1, inplace=True)
    return df.describe().values.reshape(-1), filename.split('=')[1]

def load_time_series(dirname) -> pd.DataFrame:
    ids = os.listdir(dirname)
    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))
    stats, indexes = zip(*results)
    df = pd.DataFrame(stats, columns=[f"stat_{i}" for i in range(len(stats[0]))])
    df['id'] = indexes
    return df

# Set display all columns in dataframes property
pd.options.display.max_columns = None

In [3]:
# Load data
train_data = pd.read_csv('input/train.csv')
test_data = pd.read_csv('input/test.csv')
sample_data = pd.read_csv('input/sample_submission.csv')

train_ts_data = load_time_series("input/series_train.parquet")
test_ts_data = load_time_series("input/series_test.parquet")

100%|██████████| 996/996 [00:21<00:00, 45.81it/s]
100%|██████████| 2/2 [00:00<00:00, 27.15it/s]


In [4]:
# Remove id column from time series data
time_series_columns = train_ts_data.columns.tolist()
time_series_columns.remove("id")

# Merge data
train_data = pd.merge(train_data, train_ts_data, how="left", on='id')
test_data = pd.merge(test_data, test_ts_data, how="left", on='id')
train_data = train_data.drop('id', axis=1)
test_data = test_data.drop('id', axis=1)

train_data.head()

Unnamed: 0,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Season,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-Season,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-Season,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-Season,PAQ_A-PAQ_A_Total,PAQ_C-Season,PAQ_C-PAQ_C_Total,PCIAT-Season,PCIAT-PCIAT_01,PCIAT-PCIAT_02,PCIAT-PCIAT_03,PCIAT-PCIAT_04,PCIAT-PCIAT_05,PCIAT-PCIAT_06,PCIAT-PCIAT_07,PCIAT-PCIAT_08,PCIAT-PCIAT_09,PCIAT-PCIAT_10,PCIAT-PCIAT_11,PCIAT-PCIAT_12,PCIAT-PCIAT_13,PCIAT-PCIAT_14,PCIAT-PCIAT_15,PCIAT-PCIAT_16,PCIAT-PCIAT_17,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii,stat_0,stat_1,stat_2,stat_3,stat_4,stat_5,stat_6,stat_7,stat_8,stat_9,stat_10,stat_11,stat_12,stat_13,stat_14,stat_15,stat_16,stat_17,stat_18,stat_19,stat_20,stat_21,stat_22,stat_23,stat_24,stat_25,stat_26,stat_27,stat_28,stat_29,stat_30,stat_31,stat_32,stat_33,stat_34,stat_35,stat_36,stat_37,stat_38,stat_39,stat_40,stat_41,stat_42,stat_43,stat_44,stat_45,stat_46,stat_47,stat_48,stat_49,stat_50,stat_51,stat_52,stat_53,stat_54,stat_55,stat_56,stat_57,stat_58,stat_59,stat_60,stat_61,stat_62,stat_63,stat_64,stat_65,stat_66,stat_67,stat_68,stat_69,stat_70,stat_71,stat_72,stat_73,stat_74,stat_75,stat_76,stat_77,stat_78,stat_79,stat_80,stat_81,stat_82,stat_83,stat_84,stat_85,stat_86,stat_87,stat_88,stat_89,stat_90,stat_91,stat_92,stat_93,stat_94,stat_95
0,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,,,,,,,,,Fall,0.0,0.0,,,,,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,Fall,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909,,,,,Fall,5.0,4.0,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,4.0,4.0,4.0,4.0,4.0,4.0,2.0,4.0,55.0,,,,Fall,3.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,Summer,9,0,,,Fall,14.03559,48.0,46.0,22.0,75.0,70.0,122.0,,,,,Fall,3.0,0.0,,,,,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,Winter,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1.0,21.0352,14.974,39.4497,15.4107,27.0552,,,Fall,2.34,Fall,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Fall,46.0,64.0,Summer,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,,65.0,94.0,117.0,Fall,5.0,7.0,33.0,Fall,20.0,1.0,10.2,1.0,14.7,2.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,,,,,,,,,,,,,,,,,,,,Summer,2.17,Fall,5.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,2.0,1.0,1.0,28.0,Fall,38.0,54.0,Summer,2.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,,60.0,97.0,117.0,Summer,6.0,9.0,37.0,Summer,18.0,1.0,,,,,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,Summer,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,18.8243,2.0,30.4041,16.779,58.9338,26.4798,45.9966,,,Winter,2.451,Summer,4.0,2.0,4.0,0.0,5.0,1.0,0.0,3.0,2.0,2.0,3.0,0.0,3.0,0.0,0.0,3.0,4.0,3.0,4.0,1.0,44.0,Summer,31.0,45.0,Winter,0.0,1.0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,-0.316384,0.016009,-0.16789,0.047388,-10.580416,0.0,42.29631,4053.579102,50462150000000.0,4.470182,3.0,53.201683,0.453665,0.502702,0.58571,0.106351,42.94717,0.0,208.168976,112.404045,19428420000000.0,1.931421,0.0,14.244914,-1.746094,-2.905339,-1.048372,0.0,-89.833092,0.0,0.0,3824.0,55000000000.0,1.0,3.0,41.0,-0.68418,-0.309863,-0.649974,0.006432,-41.541863,0.0,2.392969,4028.666748,36890000000000.0,3.0,3.0,42.0,-0.366849,0.024974,-0.245378,0.023637,-15.086617,0.0,6.926828,4070.0,53477500000000.0,5.0,3.0,50.0,-0.010677,0.400677,0.204727,0.04142,12.220764,0.0,15.0,4147.0,66408750000000.0,6.0,3.0,53.0,1.507865,1.666354,1.546979,4.004276,89.751656,0.0,2633.25,4188.5,86110000000000.0,7.0,3.0,85.0
4,Spring,18,1,Summer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Summer,1.04,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [5]:
# Skew removal for some BIA columns
skewed_columns = [
    'BIA-BIA_BMC', 'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_Fat',
    'BIA-BIA_FFM', 'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 
    'BIA-BIA_TBW', 'CGAS-CGAS_Score', 'stat_23', 'stat_35', 'stat_38', 'stat_40', 'stat_47',
    'stat_54', 'stat_66', 'stat_78', 'stat_80', 'stat_88', 'stat_90'
]
lambda_params = {}

# Define the box-cox function to remove skew
def box_cox_transform(df, column, lambda_param=None):
    # Create a copy of the dataframe
    df_copy = df.copy()
    
    # Drop NaN values for the specific column
    df_copy = df_copy.dropna(subset=[column])
    
    # Ensure all values are positive
    min_value = df_copy[column].min()
    if min_value <= 0:
        df_copy[column] = df_copy[column] - min_value + 1  # Add 1 to ensure all values are positive
    
    # Perform Box-Cox transformation
    if lambda_param is None:
        df_copy[f'{column}_boxcox'], lambda_param = stats.boxcox(df_copy[column])
        print(f"Transforming column: {column}")
        print(f"Optimal lambda for Box-Cox transformation: {lambda_param}")
    else:
        df_copy[f'{column}_boxcox'] = stats.boxcox(df_copy[column], lmbda=lambda_param)
        print(f"Applying transformation to column: {column} with lambda: {lambda_param}")
    
    print(f"Number of rows before transformation: {len(df)}")
    print(f"Number of rows after removing NaN values: {len(df_copy)}")
    
    return df_copy, lambda_param

# Apply Box-Cox transformation to train data and store lambda values
for column in skewed_columns:
    transformed_train_data, lambda_params[column] = box_cox_transform(train_data, column)
    # Update only the new transformed column in the original dataframe
    train_data[f'{column}_boxcox'] = transformed_train_data[f'{column}_boxcox']

# Apply the same transformation to test data using stored lambda values
for column in skewed_columns:
    transformed_test_data, _ = box_cox_transform(test_data, column, lambda_param=lambda_params[column])
    # Update only the new transformed column in the original dataframe
    test_data[f'{column}_boxcox'] = transformed_test_data[f'{column}_boxcox']

# Function to handle infinite values
def replace_inf_with_max(df):
    for column in df.columns:
        if df[column].dtype == 'float64':
            max_value = df[column][~np.isinf(df[column])].max()
            df[column] = df[column].replace([np.inf, -np.inf], max_value)
    return df

# Replace infinite values with the maximum non-infinite value in each column
train_data = replace_inf_with_max(train_data)
test_data = replace_inf_with_max(test_data)

Transforming column: BIA-BIA_BMC
Optimal lambda for Box-Cox transformation: -0.26544288750244394
Number of rows before transformation: 3960
Number of rows after removing NaN values: 1991
Transforming column: BIA-BIA_BMR
Optimal lambda for Box-Cox transformation: -2.024016452566404
Number of rows before transformation: 3960
Number of rows after removing NaN values: 1991
Transforming column: BIA-BIA_DEE
Optimal lambda for Box-Cox transformation: -0.9862196352522961
Number of rows before transformation: 3960
Number of rows after removing NaN values: 1991
Transforming column: BIA-BIA_ECW
Optimal lambda for Box-Cox transformation: -0.11312798067663181
Number of rows before transformation: 3960
Number of rows after removing NaN values: 1991
Transforming column: BIA-BIA_Fat
Optimal lambda for Box-Cox transformation: 27.718481796974547
Number of rows before transformation: 3960
Number of rows after removing NaN values: 1991
Transforming column: BIA-BIA_FFM
Optimal lambda for Box-Cox transforma

In [6]:
test_data.head()

Unnamed: 0,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Season,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-Season,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-Season,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-Season,PAQ_A-PAQ_A_Total,PAQ_C-Season,PAQ_C-PAQ_C_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,stat_0,stat_1,stat_2,stat_3,stat_4,stat_5,stat_6,stat_7,stat_8,stat_9,stat_10,stat_11,stat_12,stat_13,stat_14,stat_15,stat_16,stat_17,stat_18,stat_19,stat_20,stat_21,stat_22,stat_23,stat_24,stat_25,stat_26,stat_27,stat_28,stat_29,stat_30,stat_31,stat_32,stat_33,stat_34,stat_35,stat_36,stat_37,stat_38,stat_39,stat_40,stat_41,stat_42,stat_43,stat_44,stat_45,stat_46,stat_47,stat_48,stat_49,stat_50,stat_51,stat_52,stat_53,stat_54,stat_55,stat_56,stat_57,stat_58,stat_59,stat_60,stat_61,stat_62,stat_63,stat_64,stat_65,stat_66,stat_67,stat_68,stat_69,stat_70,stat_71,stat_72,stat_73,stat_74,stat_75,stat_76,stat_77,stat_78,stat_79,stat_80,stat_81,stat_82,stat_83,stat_84,stat_85,stat_86,stat_87,stat_88,stat_89,stat_90,stat_91,stat_92,stat_93,stat_94,stat_95,BIA-BIA_BMC_boxcox,BIA-BIA_BMR_boxcox,BIA-BIA_DEE_boxcox,BIA-BIA_ECW_boxcox,BIA-BIA_Fat_boxcox,BIA-BIA_FFM_boxcox,BIA-BIA_FFMI_boxcox,BIA-BIA_FMI_boxcox,BIA-BIA_ICW_boxcox,BIA-BIA_LDM_boxcox,BIA-BIA_LST_boxcox,BIA-BIA_TBW_boxcox,CGAS-CGAS_Score_boxcox,stat_23_boxcox,stat_35_boxcox,stat_38_boxcox,stat_40_boxcox,stat_47_boxcox,stat_54_boxcox,stat_66_boxcox,stat_78_boxcox,stat_80_boxcox,stat_88_boxcox,stat_90_boxcox
0,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,,,,,,,,,Fall,0.0,0.0,,,,,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,Fall,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909,,,,,,,,Fall,3.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.864087,0.494067,1.013221,1.877806,1.949711e+25,1.210694,0.378516,143.784676,0.836481,1.443716,1.461227,1.296982,2.887303,,,,,,,,,,,
1,Summer,9,0,,,Fall,14.03559,48.0,46.0,22.0,75.0,70.0,122.0,,,,,Fall,3.0,0.0,,,,,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,Winter,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1.0,21.0352,14.974,39.4497,15.4107,27.0552,,,Fall,2.34,Fall,46.0,64.0,Summer,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.837811,0.494067,1.013225,1.624546,1436718000000000.0,1.211268,0.378436,0.36296,0.832549,1.635205,1.462669,1.279664,,,,,,,,,,,,
2,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,,65.0,94.0,117.0,Fall,5.0,7.0,33.0,Fall,20.0,1.0,10.2,1.0,14.7,2.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,,,,,,,,,,,,,,,,,,,,Summer,2.17,Fall,38.0,54.0,Summer,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.054867,,,,,,,,,,,
3,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,,60.0,97.0,117.0,Summer,6.0,9.0,37.0,Summer,18.0,1.0,,,,,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,Summer,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,18.8243,2.0,30.4041,16.779,58.9338,26.4798,45.9966,,,Winter,2.451,Summer,31.0,45.0,Winter,0.0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,-0.316384,0.016009,-0.16789,0.047388,-10.580416,0.0,42.29631,4053.579102,50462150000000.0,4.470182,3.0,53.201683,0.453665,0.502702,0.58571,0.106351,42.94717,0.0,208.168976,112.404045,19428420000000.0,1.931421,0.0,14.244914,-1.746094,-2.905339,-1.048372,0.0,-89.833092,0.0,0.0,3824.0,55000000000.0,1.0,3.0,41.0,-0.68418,-0.309863,-0.649974,0.006432,-41.541863,0.0,2.392969,4028.666748,36890000000000.0,3.0,3.0,42.0,-0.366849,0.024974,-0.245378,0.023637,-15.086617,0.0,6.926828,4070.0,53477500000000.0,5.0,3.0,50.0,-0.010677,0.400677,0.204727,0.04142,12.220764,0.0,15.0,4147.0,66408750000000.0,6.0,3.0,53.0,1.507865,1.666354,1.546979,4.004276,89.751656,0.0,2633.25,4188.5,86110000000000.0,7.0,3.0,85.0,1.131777,0.494067,1.013388,2.360996,7.771438e+33,1.229847,0.378534,1003.888757,0.841114,1.671884,1.5003,1.322946,3.054867,5.607984,4.046808,0.0,0.0,5.063352,0.747576,2.035364,3.067454,5.434934e+31,2.016626e+102,786.87414
4,Spring,18,1,Summer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Summer,1.04,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [7]:
# Feature engineering
def engineer_features(df, is_train=True):
    # Create a copy of the dataframe to avoid modifying the original
    df = df.copy()
    
    # Combine all grip strength
    df['FGC-FGC_GS'] = df['FGC-FGC_GSD_Zone'] + df['FGC-FGC_GSND_Zone']

    # Combine all sit and reach
    df['FGC-FGC_SR'] = df['FGC-FGC_SRL_Zone'] + df['FGC-FGC_SRR_Zone']

    # Create a fitness score by adding the zone fitness data
    df['fitness_score'] = df['FGC-FGC_GS'] + df['FGC-FGC_SR'] + df['FGC-FGC_CU_Zone'] + df['FGC-FGC_PU_Zone'] + df['FGC-FGC_TL_Zone']

    # Combine PAQ_A-PAQ_A_Total and PAQ_C-PAQ_C_Total into one column
    df['PAQ_Total'] = df['PAQ_A-PAQ_A_Total'].combine_first(df['PAQ_C-PAQ_C_Total'])

    # Create interaction features
    interaction_features = [
        ('BIA-BIA_Fat', 'PCIAT-PCIAT_Total', 'BIA_Fat_X_PCIAT_Total'),
        ('Physical-Weight', 'BIA-BIA_BMI', 'Physical_Weight_X_BIA_BMI'),
        ('CGAS-CGAS_Score', 'PCIAT-PCIAT_Total', 'CGAS_Score_X_PCIAT_Total'),
        ('BIA-BIA_Fat', 'Physical-Weight', 'BIA_Fat_X_Physical_Weight'),
        ('BIA-BIA_BMI', 'PCIAT-PCIAT_Total', 'BIA_BMI_X_PCIAT_Total'),
        ('Physical-Weight', 'PCIAT-PCIAT_Total', 'Physical_Weight_X_PCIAT_Total')
    ]

    for feature1, feature2, new_feature in interaction_features:
        if is_train or (feature2 != 'PCIAT-PCIAT_Total' and feature1 != 'PCIAT-PCIAT_Total'):
            df[new_feature] = df[feature1] * df[feature2]
        else:
            # For test set, create a placeholder column filled with NaN
            df[new_feature] = np.nan
            print(f"Warning: {new_feature} could not be created for test set due to missing 'PCIAT-PCIAT_Total'")

    # Combine identical actigraphy stats
    columns_to_combine = ['stat_0', 'stat_1', 'stat_2', 'stat_3', 'stat_4', 'stat_5', 
                          'stat_6', 'stat_7', 'stat_8', 'stat_9', 'stat_10', 'stat_11']
    
    # Check if columns are identical
    is_identical = df[columns_to_combine].nunique().eq(1).all()
    
    if is_identical:
        # If they are identical, we can just take the first column and rename it
        df['combined_actigraphy_stat'] = df[columns_to_combine[0]]
        print(f"Columns {columns_to_combine} have been combined into combined_actigraphy_stat")
    else:
        # If not exactly identical, take the mean
        print("Warning: The actigraphy stat columns are not identical. Taking the average instead.")
        df['combined_actigraphy_stat'] = df[columns_to_combine].mean(axis=1)
    
    # Drop the original columns
    df = df.drop(columns=columns_to_combine)

    return df

# Apply feature engineering to train data
train_data = engineer_features(train_data, is_train=True)

# Apply feature engineering to test data
test_data = engineer_features(test_data, is_train=False)

print("Feature engineering completed for both train and test data.")

Feature engineering completed for both train and test data.


In [8]:
train_data.head()

Unnamed: 0,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Season,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-Season,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-Season,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-Season,PAQ_A-PAQ_A_Total,PAQ_C-Season,PAQ_C-PAQ_C_Total,PCIAT-Season,PCIAT-PCIAT_01,PCIAT-PCIAT_02,PCIAT-PCIAT_03,PCIAT-PCIAT_04,PCIAT-PCIAT_05,PCIAT-PCIAT_06,PCIAT-PCIAT_07,PCIAT-PCIAT_08,PCIAT-PCIAT_09,PCIAT-PCIAT_10,PCIAT-PCIAT_11,PCIAT-PCIAT_12,PCIAT-PCIAT_13,PCIAT-PCIAT_14,PCIAT-PCIAT_15,PCIAT-PCIAT_16,PCIAT-PCIAT_17,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii,stat_12,stat_13,stat_14,stat_15,stat_16,stat_17,stat_18,stat_19,stat_20,stat_21,stat_22,stat_23,stat_24,stat_25,stat_26,stat_27,stat_28,stat_29,stat_30,stat_31,stat_32,stat_33,stat_34,stat_35,stat_36,stat_37,stat_38,stat_39,stat_40,stat_41,stat_42,stat_43,stat_44,stat_45,stat_46,stat_47,stat_48,stat_49,stat_50,stat_51,stat_52,stat_53,stat_54,stat_55,stat_56,stat_57,stat_58,stat_59,stat_60,stat_61,stat_62,stat_63,stat_64,stat_65,stat_66,stat_67,stat_68,stat_69,stat_70,stat_71,stat_72,stat_73,stat_74,stat_75,stat_76,stat_77,stat_78,stat_79,stat_80,stat_81,stat_82,stat_83,stat_84,stat_85,stat_86,stat_87,stat_88,stat_89,stat_90,stat_91,stat_92,stat_93,stat_94,stat_95,BIA-BIA_BMC_boxcox,BIA-BIA_BMR_boxcox,BIA-BIA_DEE_boxcox,BIA-BIA_ECW_boxcox,BIA-BIA_Fat_boxcox,BIA-BIA_FFM_boxcox,BIA-BIA_FFMI_boxcox,BIA-BIA_FMI_boxcox,BIA-BIA_ICW_boxcox,BIA-BIA_LDM_boxcox,BIA-BIA_LST_boxcox,BIA-BIA_TBW_boxcox,CGAS-CGAS_Score_boxcox,stat_23_boxcox,stat_35_boxcox,stat_38_boxcox,stat_40_boxcox,stat_47_boxcox,stat_54_boxcox,stat_66_boxcox,stat_78_boxcox,stat_80_boxcox,stat_88_boxcox,stat_90_boxcox,FGC-FGC_GS,FGC-FGC_SR,fitness_score,PAQ_Total,BIA_Fat_X_PCIAT_Total,Physical_Weight_X_BIA_BMI,CGAS_Score_X_PCIAT_Total,BIA_Fat_X_Physical_Weight,BIA_BMI_X_PCIAT_Total,Physical_Weight_X_PCIAT_Total,combined_actigraphy_stat
0,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,,,,,,,,,Fall,0.0,0.0,,,,,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,Fall,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909,,,,,Fall,5.0,4.0,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,4.0,4.0,4.0,4.0,4.0,4.0,2.0,4.0,55.0,,,,Fall,3.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.795346,0.494067,1.013221,1.877806,6.776256e+107,1.210694,0.378516,13083250000000.0,0.836481,1.443716,1.461227,1.296982,2.887303,,,,,,,,,,,,,0.0,,,506.75735,857.46336,2805.0,468.059516,928.356,2794.0,
1,Summer,9,0,,,Fall,14.03559,48.0,46.0,22.0,75.0,70.0,122.0,,,,,Fall,3.0,0.0,,,,,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,Winter,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1.0,21.0352,14.974,39.4497,15.4107,27.0552,,,Fall,2.34,Fall,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Fall,46.0,64.0,Summer,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.791257,0.494067,1.013225,1.624546,6.664675e+107,1.211268,0.378436,12361790000000.0,0.832549,1.635205,1.462669,1.279664,,,,,,,,,,,,,,2.0,,2.34,0.0,645.7066,,182.6591,0.0,0.0,
2,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,,65.0,94.0,117.0,Fall,5.0,7.0,33.0,Fall,20.0,1.0,10.2,1.0,14.7,2.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,,,,,,,,,,,,,,,,,,,,Summer,2.17,Fall,5.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,2.0,1.0,1.0,28.0,Fall,38.0,54.0,Summer,2.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.054867,,,,,,,,,,,,3.0,2.0,7.0,2.17,,,1988.0,,,2116.8,
3,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,,60.0,97.0,117.0,Summer,6.0,9.0,37.0,Summer,18.0,1.0,,,,,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,Summer,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,18.8243,2.0,30.4041,16.779,58.9338,26.4798,45.9966,,,Winter,2.451,Summer,4.0,2.0,4.0,0.0,5.0,1.0,0.0,3.0,2.0,2.0,3.0,0.0,3.0,0.0,0.0,3.0,4.0,3.0,4.0,1.0,44.0,Summer,31.0,45.0,Winter,0.0,1.0,-0.316384,0.016009,-0.16789,0.047388,-10.580416,0.0,42.29631,4053.579102,50462150000000.0,4.470182,3.0,53.201683,0.453665,0.502702,0.58571,0.106351,42.94717,0.0,208.168976,112.404045,19428420000000.0,1.931421,0.0,14.244914,-1.746094,-2.905339,-1.048372,0.0,-89.833092,0.0,0.0,3824.0,55000000000.0,1.0,3.0,41.0,-0.68418,-0.309863,-0.649974,0.006432,-41.541863,0.0,2.392969,4028.666748,36890000000000.0,3.0,3.0,42.0,-0.366849,0.024974,-0.245378,0.023637,-15.086617,0.0,6.926828,4070.0,53477500000000.0,5.0,3.0,50.0,-0.010677,0.400677,0.204727,0.04142,12.220764,0.0,15.0,4147.0,66408750000000.0,6.0,3.0,53.0,1.507865,1.666354,1.546979,4.004276,89.751656,0.0,2633.25,4188.5,86110000000000.0,7.0,3.0,85.0,1.845723,0.494067,1.013388,2.360996,6.985483e+107,1.229847,0.378534,13552920000000.0,0.841114,1.671884,1.5003,1.322946,3.054867,8.272084,4.197842,8.16162e+59,0.126963,8.138407,0.986268,2.184901,3.15009,5.434934e+31,2.016626e+102,786.87414,,0.0,,2.451,828.2692,1492.81488,3124.0,1536.06288,804.9492,3590.4,43330.0
4,Spring,18,1,Summer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Summer,1.04,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.04,,,,,,,


In [9]:
# Isolate the physical attribute columns and some contextual columns for analysis
physical_columns = [
    'Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex', 'Physical_Weight_X_PCIAT_Total',
    'CGAS-Season', 'CGAS-CGAS_Score_boxcox', 'Physical-Season', 'Physical-BMI',
    'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference', 'BIA_Fat_X_Physical_Weight',
    'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP', 'BIA_BMI_X_PCIAT_Total',
    'fitness_score', 'BIA-BIA_Frame_num', 'BIA-BIA_BMI', 'PreInt_EduHx-computerinternet_hoursday',
    'PCIAT-PCIAT_Total', 'BIA_Fat_X_PCIAT_Total', 'Physical_Weight_X_BIA_BMI', 'CGAS_Score_X_PCIAT_Total',
    'sii'
]

# Isolate the fitness attributes
# Removed columns: 'FGC-FGC_CU' 'FGC-FGC_PU', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR', 'FGC-FGC_SRR_Zone', 'FGC-FGC_SRL' 
# 'FGC-FGC_GSND_Zone' 'FGC-FGC_GSND' 'FGC-FGC_GSD' 'FGC-FGC_GSD_Zone' 'Fitness_Endurance-Time_Sec'
fitness_columns = [
    'Fitness_Endurance-Season', 'Fitness_Endurance-Max_Stage', 'Fitness_Endurance-Time_Mins', 'Physical_Weight_X_PCIAT_Total',
    'FGC-Season', 'FGC-FGC_CU_Zone', 'FGC-FGC_SR', 'FGC-FGC_PU_Zone', 'FGC-FGC_TL', 'BIA_Fat_X_Physical_Weight',
    'FGC-FGC_TL_Zone', 'FGC-FGC_GS', 'fitness_score', 'BIA-BIA_BMI', 'Physical-BMI', 'BIA_BMI_X_PCIAT_Total',
    'BIA-BIA_Frame_num', 'PreInt_EduHx-computerinternet_hoursday', 'PCIAT-PCIAT_Total', 'BIA_Fat_X_PCIAT_Total',
    'Physical_Weight_X_BIA_BMI', 'CGAS_Score_X_PCIAT_Total', 'sii'
]

# Isolate the BIA attributes
bia_columns = [
    'BIA-Season', 'BIA-BIA_Activity_Level_num', 'BIA-BIA_Frame_num', 'BIA-BIA_BMC_boxcox', 'BIA_Fat_X_Physical_Weight',
    'BIA-BIA_BMI', 'BIA-BIA_BMR_boxcox', 'BIA-BIA_DEE_boxcox', 'BIA-BIA_ECW_boxcox', 'BIA-BIA_FFM_boxcox', 'BIA-BIA_FFMI_boxcox',
    'BIA-BIA_FMI_boxcox', 'BIA-BIA_Fat_boxcox', 'BIA-BIA_ICW_boxcox', 'BIA-BIA_LDM_boxcox', 'BIA-BIA_LST_boxcox', 'BIA-BIA_TBW_boxcox',
    'fitness_score', 'Physical-BMI', 'PreInt_EduHx-computerinternet_hoursday', 'PCIAT-PCIAT_Total', 'BIA_Fat_X_PCIAT_Total',
    'Physical_Weight_X_BIA_BMI', 'CGAS_Score_X_PCIAT_Total', 'BIA_BMI_X_PCIAT_Total', 'Physical_Weight_X_PCIAT_Total',
    'sii'
]

# Isolate the PAQ, PCIAT, and SDS
child_info_columns = [
    'PreInt_EduHx-computerinternet_hoursday', 'PAQ_A-Season', 'PAQ_A-PAQ_A_Total', 'PAQ_C-PAQ_C_Total',
    'PAQ_Total', 'PCIAT-Season', 'PCIAT-PCIAT_01', 'PCIAT-PCIAT_02', 'PCIAT-PCIAT_03', 'PCIAT-PCIAT_04',
    'PCIAT-PCIAT_05', 'PCIAT-PCIAT_06', 'PCIAT-PCIAT_07', 'PCIAT-PCIAT_08', 'PCIAT-PCIAT_09', 'BIA_Fat_X_Physical_Weight',
    'PCIAT-PCIAT_10', 'PCIAT-PCIAT_11', 'PCIAT-PCIAT_12', 'PCIAT-PCIAT_13', 'PCIAT-PCIAT_14', 'BIA_BMI_X_PCIAT_Total',
    'PCIAT-PCIAT_15', 'PCIAT-PCIAT_16', 'PCIAT-PCIAT_17', 'PCIAT-PCIAT_18', 'PCIAT-PCIAT_19', 'Physical_Weight_X_PCIAT_Total',
    'PCIAT-PCIAT_20', 'PCIAT-PCIAT_Total', 'SDS-Season', 'SDS-SDS_Total_Raw', 'SDS-SDS_Total_T',
    'PreInt_EduHx-Season', 'PreInt_EduHx-computerinternet_hoursday', 'BIA-BIA_BMI', 'fitness_score', 'BIA_Fat_X_PCIAT_Total',
    'Physical-BMI', 'Physical_Weight_X_BIA_BMI', 'CGAS_Score_X_PCIAT_Total', 'sii'
]

# Isolate the Actigraphy data
# removed columns: 'stat_41', 'stat_42', 'stat_39','stat_92_boxcox' 'stat_0', 'stat_1', 'stat_2', 'stat_3', 'stat_4', 'stat_5', 'stat_6', 'stat_7', 
# 'stat_8', 'stat_9', 'stat_10','stat_11'
actigraphy_columns = [
    'combined_actigraphy_stat', 'stat_12', 'stat_13', 'stat_14', 'stat_15', 'stat_16', 'stat_17', 'stat_18', 'stat_19', 'stat_20',
    'stat_21', 'stat_22', 'stat_23_boxcox', 'stat_24', 'stat_25', 'stat_26', 'stat_27', 'stat_28', 'stat_29', 'stat_30',
    'stat_31', 'stat_32', 'stat_33', 'stat_34', 'stat_35_boxcox', 'stat_36', 'stat_37', 'stat_38_boxcox', 'stat_40_boxcox',
    'stat_43', 'stat_44', 'stat_45', 'stat_46', 'stat_47_boxcox', 'stat_48', 'stat_49', 'stat_50', 'BIA_Fat_X_Physical_Weight',
    'stat_51', 'stat_52', 'stat_53', 'stat_54_boxcox', 'stat_55', 'stat_56', 'stat_57', 'stat_58', 'stat_59', 'stat_60',
    'stat_61', 'stat_62', 'stat_63', 'stat_64', 'stat_65', 'stat_66_boxcox', 'stat_67', 'stat_68', 'stat_69', 'stat_70',
    'stat_71', 'stat_72', 'stat_73', 'stat_74', 'stat_75', 'stat_76', 'stat_77', 'stat_78_boxcox', 'stat_79', 'stat_80_boxcox',
    'stat_81', 'stat_82', 'stat_83', 'stat_84', 'stat_85', 'stat_86', 'stat_87', 'stat_88_boxcox', 'stat_89', 'stat_90_boxcox',
    'stat_91', 'stat_93', 'stat_94', 'stat_95', 'PreInt_EduHx-computerinternet_hoursday', 'CGAS_Score_X_PCIAT_Total', 'BIA_BMI_X_PCIAT_Total',
    'BIA-BIA_Frame_num', 'SDS-SDS_Total_T', 'BIA-BIA_BMI', 'Physical-BMI', 'BIA_Fat_X_PCIAT_Total', 'Physical_Weight_X_PCIAT_Total', 'sii'
]

In [10]:
# Function to analyze columns
def analyze_column(column):
    total_count = len(train_data)
    missing_count = train_data[column].isnull().sum()
    missing_percentage = (missing_count / total_count) * 100
    unique_values = train_data[column].nunique()
    
    if pd.api.types.is_numeric_dtype(train_data[column]):
        mean_value = train_data[column].mean()
        median_value = train_data[column].median()
        std_dev = train_data[column].std()
        min_value = train_data[column].min()
        max_value = train_data[column].max()
        return {
            "Column": column,
            "Total Count": total_count,
            "Missing Count": missing_count,
            "Missing Percentage": f"{missing_percentage:.2f}%",
            "Unique Values": unique_values,
            "Data Type": train_data[column].dtype,
            "Mean": mean_value,
            "Median": median_value,
            "Standard Deviation": std_dev,
            "Minimum": min_value,
            "Maximum": max_value
        }
    else:
        top_values = train_data[column].value_counts().head(3).to_dict()
        return {
            "Column": column,
            "Total Count": total_count,
            "Missing Count": missing_count,
            "Missing Percentage": f"{missing_percentage:.2f}%",
            "Unique Values": unique_values,
            "Data Type": train_data[column].dtype,
            "Top 3 Values": top_values
        }

# Physical column profiles        
physical_column_profiles = [analyze_column(col) for col in physical_columns]
physical_column_profiles_df = pd.DataFrame(physical_column_profiles)

# Save column profiles to CSV
physical_column_profiles_df.to_csv(os.path.join(physical_analysis_output_folder, 'physical_column_profiles.csv'), index=False)
print(f"Column profiles saved to {os.path.join(physical_analysis_output_folder, 'physical_column_profiles.csv')}")

# Fitness column profiles
fitness_column_profiles = [analyze_column(col) for col in fitness_columns]
fitness_column_profiles_df = pd.DataFrame(fitness_column_profiles)

# Save column profiles to CSV
fitness_column_profiles_df.to_csv(os.path.join(analysis_output_folder, 'fitness_column_profiles.csv'), index=False)
print(f"Column profiles saved to {os.path.join(analysis_output_folder, 'fitness_column_profiles.csv')}")

# BIA column profiles
bia_column_profiles = [analyze_column(col) for col in bia_columns]
bia_column_profiles_df = pd.DataFrame(bia_column_profiles)

# Save column profiles to CSV
bia_column_profiles_df.to_csv(os.path.join(analysis_output_folder, 'bia_column_profiles.csv'), index=False)
print(f"Column profiles saved to {os.path.join(analysis_output_folder, 'bia_column_profiles.csv')}")

# Child info column profiles
child_info_column_profiles = [analyze_column(col) for col in child_info_columns]
child_info_column_profiles_df = pd.DataFrame(child_info_column_profiles)

# Save column profiles to CSV
child_info_column_profiles_df.to_csv(os.path.join(analysis_output_folder, 'child_info_column_profiles.csv'), index=False)
print(f"Column profiles saved to {os.path.join(analysis_output_folder, 'child_info_column_profiles.csv')}")

# Actigraphy info column profiles
actigraphy_column_profiles = [analyze_column(col) for col in actigraphy_columns]
actigraphy_column_profiles_df = pd.DataFrame(actigraphy_column_profiles)

# Save column profiles to CSV
actigraphy_column_profiles_df.to_csv(os.path.join(analysis_output_folder, 'actigraphy_column_profiles.csv'), index=False)
print(f"Column profiles saved to {os.path.join(analysis_output_folder, 'actigraphy_column_profiles.csv')}")

Column profiles saved to analysis_output/physical\physical_column_profiles.csv
Column profiles saved to analysis_output\fitness_column_profiles.csv
Column profiles saved to analysis_output\bia_column_profiles.csv
Column profiles saved to analysis_output\child_info_column_profiles.csv
Column profiles saved to analysis_output\actigraphy_column_profiles.csv


In [11]:
# Visualize missing data
# Physical columns
plt.figure(figsize=(12, 6))
sns.heatmap(train_data[physical_columns].isnull(), cbar=False, yticklabels=False, cmap='viridis')
plt.title('Missing Data in Physical Attribute Columns')
plt.xlabel('Columns')
plt.ylabel('Rows')
plt.tight_layout()
plt.savefig(os.path.join(physical_analysis_output_folder, 'physical_missing_data_heatmap.png'))
plt.close()
print(f"Missing data heatmap saved to {os.path.join(physical_analysis_output_folder, 'physical_missing_data_heatmap.png')}")

# Fitness columns
plt.figure(figsize=(12, 6))
sns.heatmap(train_data[fitness_columns].isnull(), cbar=False, yticklabels=False, cmap='viridis')
plt.title('Missing Data in Fitness Attribute Columns')
plt.xlabel('Columns')
plt.ylabel('Rows')
plt.tight_layout()
plt.savefig(os.path.join(fitness_analysis_output_folder, 'fitness_missing_data_heatmap.png'))
plt.close()
print(f"Missing data heatmap saved to {os.path.join(fitness_analysis_output_folder, 'fitness_missing_data_heatmap.png')}")

# BIA columns
plt.figure(figsize=(12, 6))
sns.heatmap(train_data[bia_columns].isnull(), cbar=False, yticklabels=False, cmap='viridis')
plt.title('Missing Data in BIA Attribute Columns')
plt.xlabel('Columns')
plt.ylabel('Rows')
plt.tight_layout()
plt.savefig(os.path.join(bia_analysis_output_folder, 'bia_missing_data_heatmap.png'))
plt.close()
print(f"Missing data heatmap saved to {os.path.join(bia_analysis_output_folder, 'bia_missing_data_heatmap.png')}")

# Child info columns
plt.figure(figsize=(12, 6))
sns.heatmap(train_data[child_info_columns].isnull(), cbar=False, yticklabels=False, cmap='viridis')
plt.title('Missing Data in Child info Attribute Columns')
plt.xlabel('Columns')
plt.ylabel('Rows')
plt.tight_layout()
plt.savefig(os.path.join(child_info_analysis_output_folder, 'child_info_missing_data_heatmap.png'))
plt.close()
print(f"Missing data heatmap saved to {os.path.join(child_info_analysis_output_folder, 'child_info_missing_data_heatmap.png')}")

# Actigraphy info columns
plt.figure(figsize=(40, 6))
sns.heatmap(train_data[actigraphy_columns].isnull(), cbar=False, yticklabels=False, cmap='viridis')
plt.title('Missing Data in Actigraphy info Attribute Columns')
plt.xlabel('Columns')
plt.ylabel('Rows')
plt.tight_layout()
plt.savefig(os.path.join(actigraphy_analysis_output_folder, 'actigraphy_missing_data_heatmap.png'))
plt.close()
print(f"Missing data heatmap saved to {os.path.join(actigraphy_analysis_output_folder, 'actigraphy_missing_data_heatmap.png')}")

Missing data heatmap saved to analysis_output/physical\physical_missing_data_heatmap.png
Missing data heatmap saved to analysis_output/fitness\fitness_missing_data_heatmap.png
Missing data heatmap saved to analysis_output/bia\bia_missing_data_heatmap.png
Missing data heatmap saved to analysis_output/child_info\child_info_missing_data_heatmap.png
Missing data heatmap saved to analysis_output/actigraphy\actigraphy_missing_data_heatmap.png


In [12]:
# Correlation matrix for physical numeric columns
physical_numeric_columns = train_data[physical_columns].select_dtypes(include=[np.number]).columns
physical_correlation_matrix = train_data[physical_numeric_columns].corr()

plt.figure(figsize=(20, 18))
sns.heatmap(physical_correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Matrix of Numeric Physical Attributes')
plt.tight_layout()
plt.savefig(os.path.join(physical_analysis_output_folder, 'physical_correlation_matrix.png'))
plt.close()
print(f"Physical correlation matrix saved to {os.path.join(physical_analysis_output_folder, 'physical_correlation_matrix.png')}")

# Correlation matrix for fitness numeric columns
fitness_numeric_columns = train_data[fitness_columns].select_dtypes(include=[np.number]).columns
fitness_correlation_matrix = train_data[fitness_numeric_columns].corr()

plt.figure(figsize=(20, 18))
sns.heatmap(fitness_correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Matrix of Numeric Fitness Attributes')
plt.tight_layout()
plt.savefig(os.path.join(fitness_analysis_output_folder, 'fitness_correlation_matrix.png'))
plt.close()
print(f"Fitness correlation matrix saved to {os.path.join(fitness_analysis_output_folder, 'fitness_correlation_matrix.png')}")

# Correlation matrix for bia numeric columns
bia_numeric_columns = train_data[bia_columns].select_dtypes(include=[np.number]).columns
bia_correlation_matrix = train_data[bia_numeric_columns].corr()

plt.figure(figsize=(20, 18))
sns.heatmap(bia_correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Matrix of Numeric BIA Attributes')
plt.tight_layout()
plt.savefig(os.path.join(bia_analysis_output_folder, 'bia_correlation_matrix.png'))
plt.close()
print(f"BIA correlation matrix saved to {os.path.join(bia_analysis_output_folder, 'BIA_correlation_matrix.png')}")

# Correlation matrix for child info numeric columns
child_info_numeric_columns = train_data[child_info_columns].select_dtypes(include=[np.number]).columns
child_info_correlation_matrix = train_data[child_info_numeric_columns].corr()

plt.figure(figsize=(24, 22))
sns.heatmap(child_info_correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Matrix of Numeric Child info Attributes')
plt.tight_layout()
plt.savefig(os.path.join(child_info_analysis_output_folder, 'child_info_correlation_matrix.png'))
plt.close()
print(f"BIA correlation matrix saved to {os.path.join(child_info_analysis_output_folder, 'child_info_correlation_matrix.png')}")

# Correlation matrix for actigraphy numeric columns
actigraphy_numeric_columns = train_data[actigraphy_columns].select_dtypes(include=[np.number]).columns
actigraphy_correlation_matrix = train_data[actigraphy_numeric_columns].corr()

plt.figure(figsize=(80, 78))
sns.heatmap(actigraphy_correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Matrix of Numeric Actigraphy Attributes')
plt.tight_layout()
plt.savefig(os.path.join(actigraphy_analysis_output_folder, 'actigraphy_correlation_matrix.png'))
plt.close()
print(f"BIA correlation matrix saved to {os.path.join(actigraphy_analysis_output_folder, 'actigraphy_correlation_matrix.png')}")

Physical correlation matrix saved to analysis_output/physical\physical_correlation_matrix.png
Fitness correlation matrix saved to analysis_output/fitness\fitness_correlation_matrix.png
BIA correlation matrix saved to analysis_output/bia\BIA_correlation_matrix.png
BIA correlation matrix saved to analysis_output/child_info\child_info_correlation_matrix.png
BIA correlation matrix saved to analysis_output/actigraphy\actigraphy_correlation_matrix.png


In [13]:
# Combine all columns into a single list
all_columns = []
all_columns.extend(physical_numeric_columns)
all_columns.extend(fitness_numeric_columns)
all_columns.extend(bia_numeric_columns)
all_columns.extend(child_info_numeric_columns)
all_columns.extend(actigraphy_numeric_columns)

# Create a tqdm progress bar
with tqdm(total=len(all_columns), desc="Creating distribution plots") as pbar:
    # Distribution plots for physical numeric columns
    for column in physical_numeric_columns:
        plt.figure(figsize=(10, 6))
        sns.histplot(train_data[column].dropna(), kde=True)
        plt.title(f'Distribution of physical {column}')
        plt.xlabel(column)
        plt.ylabel('Count')
        plt.tight_layout()
        plt.savefig(os.path.join(physical_analysis_output_folder, f'physical_{column}_distribution.png'))
        plt.close()
        pbar.update(1)

    # Distribution plots for fitness numeric columns
    for column in fitness_numeric_columns:
        plt.figure(figsize=(10, 6))
        sns.histplot(train_data[column].dropna(), kde=True)
        plt.title(f'Distribution of fitness {column}')
        plt.xlabel(column)
        plt.ylabel('Count')
        plt.tight_layout()
        plt.savefig(os.path.join(fitness_analysis_output_folder, f'fitness_{column}_distribution.png'))
        plt.close()
        pbar.update(1)

    # Distribution plots for BIA numeric columns
    for column in bia_numeric_columns:
        plt.figure(figsize=(10, 6))
        sns.histplot(train_data[column].dropna(), kde=True)
        plt.title(f'Distribution of bia {column}')
        plt.xlabel(column)
        plt.ylabel('Count')
        plt.tight_layout()
        plt.savefig(os.path.join(bia_analysis_output_folder, f'bia_{column}_distribution.png'))
        plt.close()
        pbar.update(1)
        
    # Distribution plots for Child info numeric columns
    for column in child_info_numeric_columns:
        plt.figure(figsize=(10, 6))
        sns.histplot(train_data[column].dropna(), kde=True)
        plt.title(f'Distribution of child info {column}')
        plt.xlabel(column)
        plt.ylabel('Count')
        plt.tight_layout()
        plt.savefig(os.path.join(child_info_analysis_output_folder, f'child_info_{column}_distribution.png'))
        plt.close()
        pbar.update(1)
        
    # Distribution plots for actigraphy numeric columns
    for column in actigraphy_numeric_columns:
        plt.figure(figsize=(10, 6))
        sns.histplot(train_data[column].dropna(), kde=True)
        plt.title(f'Distribution of actigraphy {column}')
        plt.xlabel(column)
        plt.ylabel('Count')
        plt.tight_layout()
        plt.savefig(os.path.join(actigraphy_analysis_output_folder, f'actigraphy_{column}_distribution.png'))
        plt.close()
        pbar.update(1)

print("All analyses completed and saved to the 'analysis_output' folder.")

Creating distribution plots: 100%|██████████| 199/199 [00:24<00:00,  8.02it/s]

All analyses completed and saved to the 'analysis_output' folder.





In [14]:
# Supplement missing data with data from WHO
# Load who data and group
def load_who_bmi_data(file_path):
    who_data = pd.read_csv(file_path)
    who_data = who_data.groupby(['age', 'sex']).agg({
        'L': 'mean', 'mean_bmi': 'mean', 'S': 'mean'
    }).reset_index()
    who_data = who_data.set_index(['sex', 'age'])
    return who_data

def load_who_height_data(file_path):
    who_data = pd.read_csv(file_path)
    who_data = who_data.groupby(['age', 'sex']).agg({
        'mean_height': 'mean'
    }).reset_index()
    who_data = who_data.set_index(['sex', 'age'])
    return who_data

who_bmi_data = load_who_bmi_data('supplemental_data/bmi_for_age_5_to_19.csv')
who_height_data = load_who_height_data('supplemental_data/height_for_age_5_to_19.csv')

In [15]:
# Defining functions to Impute with data from WHO
def get_who_stats(age, sex, data_type='bmi'):
    try:
        if data_type == 'bmi':
            stats = who_bmi_data.loc[(sex, age), ['mean_bmi', 'S']]
            return stats['mean_bmi'], stats['S']
        elif data_type == 'height':
            stats = who_height_data.loc[(sex, age), 'mean_height']
            return stats
    except KeyError:
        return None, None if data_type == 'bmi' else None
    
def impute_bmi(age, sex):
    mean_bmi, sd = get_who_stats(age, sex, 'bmi')
    if mean_bmi is not None and sd is not None:
        imputed_bmi = np.random.normal(mean_bmi, sd)
        return round(imputed_bmi, 2)
    else:
        return None

def impute_height(age, sex):
    mean_height_cm = get_who_stats(age, sex, 'height')
    if mean_height_cm is not None:
        mean_height_inches = mean_height_cm / 2.54  # Convert cm to inches
        return round(mean_height_inches, 2)
    else:
        return None
    
def impute_weight(bmi, height_inches):
    if bmi is not None and height_inches is not None:
        height_meters = height_inches * 0.0254  # Convert inches to meters
        weight_kg = bmi * (height_meters ** 2)
        weight_lbs = weight_kg * 2.20462  # Convert kg to lbs
        return round(weight_lbs, 2)
    else:
        return None
    
def apply_imputation(df):
    def impute_if_missing(row):
        age = row['Basic_Demos-Age']
        sex = row['Basic_Demos-Sex']
        
        if pd.isna(row['Physical-BMI']) and 5 <= age <= 19:
            row['Physical-BMI'] = impute_bmi(age, sex)
        
        if pd.isna(row['Physical-Height']) and 5 <= age <= 19:
            row['Physical-Height'] = impute_height(age, sex)
        
        if pd.isna(row['Physical-Weight']) and row['Physical-BMI'] is not None and row['Physical-Height'] is not None:
            row['Physical-Weight'] = impute_weight(row['Physical-BMI'], row['Physical-Height'])
        
        return row
    
    return df.apply(impute_if_missing, axis=1)

In [16]:
train_data.head()

Unnamed: 0,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Season,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-Season,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-Season,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-Season,PAQ_A-PAQ_A_Total,PAQ_C-Season,PAQ_C-PAQ_C_Total,PCIAT-Season,PCIAT-PCIAT_01,PCIAT-PCIAT_02,PCIAT-PCIAT_03,PCIAT-PCIAT_04,PCIAT-PCIAT_05,PCIAT-PCIAT_06,PCIAT-PCIAT_07,PCIAT-PCIAT_08,PCIAT-PCIAT_09,PCIAT-PCIAT_10,PCIAT-PCIAT_11,PCIAT-PCIAT_12,PCIAT-PCIAT_13,PCIAT-PCIAT_14,PCIAT-PCIAT_15,PCIAT-PCIAT_16,PCIAT-PCIAT_17,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii,stat_12,stat_13,stat_14,stat_15,stat_16,stat_17,stat_18,stat_19,stat_20,stat_21,stat_22,stat_23,stat_24,stat_25,stat_26,stat_27,stat_28,stat_29,stat_30,stat_31,stat_32,stat_33,stat_34,stat_35,stat_36,stat_37,stat_38,stat_39,stat_40,stat_41,stat_42,stat_43,stat_44,stat_45,stat_46,stat_47,stat_48,stat_49,stat_50,stat_51,stat_52,stat_53,stat_54,stat_55,stat_56,stat_57,stat_58,stat_59,stat_60,stat_61,stat_62,stat_63,stat_64,stat_65,stat_66,stat_67,stat_68,stat_69,stat_70,stat_71,stat_72,stat_73,stat_74,stat_75,stat_76,stat_77,stat_78,stat_79,stat_80,stat_81,stat_82,stat_83,stat_84,stat_85,stat_86,stat_87,stat_88,stat_89,stat_90,stat_91,stat_92,stat_93,stat_94,stat_95,BIA-BIA_BMC_boxcox,BIA-BIA_BMR_boxcox,BIA-BIA_DEE_boxcox,BIA-BIA_ECW_boxcox,BIA-BIA_Fat_boxcox,BIA-BIA_FFM_boxcox,BIA-BIA_FFMI_boxcox,BIA-BIA_FMI_boxcox,BIA-BIA_ICW_boxcox,BIA-BIA_LDM_boxcox,BIA-BIA_LST_boxcox,BIA-BIA_TBW_boxcox,CGAS-CGAS_Score_boxcox,stat_23_boxcox,stat_35_boxcox,stat_38_boxcox,stat_40_boxcox,stat_47_boxcox,stat_54_boxcox,stat_66_boxcox,stat_78_boxcox,stat_80_boxcox,stat_88_boxcox,stat_90_boxcox,FGC-FGC_GS,FGC-FGC_SR,fitness_score,PAQ_Total,BIA_Fat_X_PCIAT_Total,Physical_Weight_X_BIA_BMI,CGAS_Score_X_PCIAT_Total,BIA_Fat_X_Physical_Weight,BIA_BMI_X_PCIAT_Total,Physical_Weight_X_PCIAT_Total,combined_actigraphy_stat
0,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,,,,,,,,,Fall,0.0,0.0,,,,,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,Fall,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909,,,,,Fall,5.0,4.0,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,4.0,4.0,4.0,4.0,4.0,4.0,2.0,4.0,55.0,,,,Fall,3.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.795346,0.494067,1.013221,1.877806,6.776256e+107,1.210694,0.378516,13083250000000.0,0.836481,1.443716,1.461227,1.296982,2.887303,,,,,,,,,,,,,0.0,,,506.75735,857.46336,2805.0,468.059516,928.356,2794.0,
1,Summer,9,0,,,Fall,14.03559,48.0,46.0,22.0,75.0,70.0,122.0,,,,,Fall,3.0,0.0,,,,,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,Winter,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1.0,21.0352,14.974,39.4497,15.4107,27.0552,,,Fall,2.34,Fall,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Fall,46.0,64.0,Summer,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.791257,0.494067,1.013225,1.624546,6.664675e+107,1.211268,0.378436,12361790000000.0,0.832549,1.635205,1.462669,1.279664,,,,,,,,,,,,,,2.0,,2.34,0.0,645.7066,,182.6591,0.0,0.0,
2,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,,65.0,94.0,117.0,Fall,5.0,7.0,33.0,Fall,20.0,1.0,10.2,1.0,14.7,2.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,,,,,,,,,,,,,,,,,,,,Summer,2.17,Fall,5.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,2.0,1.0,1.0,28.0,Fall,38.0,54.0,Summer,2.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.054867,,,,,,,,,,,,3.0,2.0,7.0,2.17,,,1988.0,,,2116.8,
3,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,,60.0,97.0,117.0,Summer,6.0,9.0,37.0,Summer,18.0,1.0,,,,,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,Summer,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,18.8243,2.0,30.4041,16.779,58.9338,26.4798,45.9966,,,Winter,2.451,Summer,4.0,2.0,4.0,0.0,5.0,1.0,0.0,3.0,2.0,2.0,3.0,0.0,3.0,0.0,0.0,3.0,4.0,3.0,4.0,1.0,44.0,Summer,31.0,45.0,Winter,0.0,1.0,-0.316384,0.016009,-0.16789,0.047388,-10.580416,0.0,42.29631,4053.579102,50462150000000.0,4.470182,3.0,53.201683,0.453665,0.502702,0.58571,0.106351,42.94717,0.0,208.168976,112.404045,19428420000000.0,1.931421,0.0,14.244914,-1.746094,-2.905339,-1.048372,0.0,-89.833092,0.0,0.0,3824.0,55000000000.0,1.0,3.0,41.0,-0.68418,-0.309863,-0.649974,0.006432,-41.541863,0.0,2.392969,4028.666748,36890000000000.0,3.0,3.0,42.0,-0.366849,0.024974,-0.245378,0.023637,-15.086617,0.0,6.926828,4070.0,53477500000000.0,5.0,3.0,50.0,-0.010677,0.400677,0.204727,0.04142,12.220764,0.0,15.0,4147.0,66408750000000.0,6.0,3.0,53.0,1.507865,1.666354,1.546979,4.004276,89.751656,0.0,2633.25,4188.5,86110000000000.0,7.0,3.0,85.0,1.845723,0.494067,1.013388,2.360996,6.985483e+107,1.229847,0.378534,13552920000000.0,0.841114,1.671884,1.5003,1.322946,3.054867,8.272084,4.197842,8.16162e+59,0.126963,8.138407,0.986268,2.184901,3.15009,5.434934e+31,2.016626e+102,786.87414,,0.0,,2.451,828.2692,1492.81488,3124.0,1536.06288,804.9492,3590.4,43330.0
4,Spring,18,1,Summer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Summer,1.04,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.04,,,,,,,


In [17]:
# Apply to datasets
train_data = apply_imputation(train_data)
test_data = apply_imputation(test_data)

# Check the results
print("Number of missing values after imputation:")
print("BMI:", train_data['Physical-BMI'].isna().sum())
print("Height:", train_data['Physical-Height'].isna().sum())
print("Weight:", train_data['Physical-Weight'].isna().sum())

print("\nSample of imputed BMI, Height, and Weight values:")
imputed_sample = train_data[
    (train_data['Physical-BMI'].notnull() | train_data['Physical-Height'].notnull() | train_data['Physical-Weight'].notnull()) &
    ((train_data['Physical-BMI'].notnull() != train_data['Physical-BMI'].notnull().shift()) |
     (train_data['Physical-Height'].notnull() != train_data['Physical-Height'].notnull().shift()) |
     (train_data['Physical-Weight'].notnull() != train_data['Physical-Weight'].notnull().shift()))
].sample(5)[['Basic_Demos-Age', 'Basic_Demos-Sex', 'Physical-BMI', 'Physical-Height', 'Physical-Weight']]
print(imputed_sample)

Number of missing values after imputation:
BMI: 21
Height: 21
Weight: 20

Sample of imputed BMI, Height, and Weight values:
      Basic_Demos-Age  Basic_Demos-Sex  Physical-BMI  Physical-Height  \
998                 7                0     15.610000            48.93   
641                13                1     25.279931            63.50   
3011               10                1     16.610000            55.72   
854                12                0     17.950000            59.92   
2153               13                0     20.649384            64.50   

      Physical-Weight  
998             53.16  
641            145.00  
3011            73.35  
854             91.67  
2153           122.20  


In [18]:
# Additional analysis of imputation results
print("\nImputation summary by age:")
age_summary = train_data.groupby('Basic_Demos-Age').agg({
    'Physical-BMI': ['count', 'mean', 'std', 'min', 'max'],
    'Physical-Height': ['count', 'mean', 'std', 'min', 'max'],
    'Physical-Weight': ['count', 'mean', 'std', 'min', 'max'],
    'Basic_Demos-Sex': 'count'
})
print(age_summary)


Imputation summary by age:
                Physical-BMI                                             \
                       count       mean       std        min        max   
Basic_Demos-Age                                                           
5                        112  16.644545  2.713656  12.853139  26.512004   
6                        369  16.233201  2.604812   0.000000  34.055363   
7                        436  16.486701  2.854347   9.693766  43.468833   
8                        490  17.303096  3.606365   0.000000  59.132048   
9                        467  17.507577  3.138702  11.925153  33.729946   
10                       420  18.392221  4.043273   0.000000  44.554097   
11                       334  19.045918  3.976996  11.915254  37.484035   
12                       291  20.112896  4.432479  14.223937  40.399621   
13                       236  20.472202  4.766772   0.000000  39.494148   
14                       200  21.746431  5.146248  13.740613  44.835548 

In [19]:
# Plot of imputed vs. original data
plt.figure(figsize=(12, 6))
sns.scatterplot(data=train_data, x='Basic_Demos-Age', y='Physical-BMI', hue='Basic_Demos-Sex', alpha=0.5)
plt.title('BMI vs Age (After Imputation)')
plt.savefig('analysis_output/bmi_vs_age_imputed.png')
plt.close()

plt.figure(figsize=(12, 6))
sns.scatterplot(data=train_data, x='Basic_Demos-Age', y='Physical-Height', hue='Basic_Demos-Sex', alpha=0.5)
plt.title('Height vs Age (After Imputation)')
plt.savefig('analysis_output/height_vs_age_imputed.png')
plt.close()

plt.figure(figsize=(12, 6))
sns.scatterplot(data=train_data, x='Basic_Demos-Age', y='Physical-Weight', hue='Basic_Demos-Sex', alpha=0.5)
plt.title('Weight vs Age (After Imputation)')
plt.savefig('analysis_output/weight_vs_age_imputed.png')
plt.close()

print("Imputation analysis plots saved in the 'analysis_output' folder.")

Imputation analysis plots saved in the 'analysis_output' folder.


In [20]:
# Export train_data to CSV
train_output_path = os.path.join(output_folder, 'train_data_imputed.csv')
train_data.to_csv(train_output_path, index=False)
print(f"Imputed train data exported to: {train_output_path}")

# Export test_data to CSV
test_output_path = os.path.join(output_folder, 'test_data_imputed.csv')
test_data.to_csv(test_output_path, index=False)
print(f"Imputed test data exported to: {test_output_path}")

print("Data export completed.")

Imputed train data exported to: output\train_data_imputed.csv
Imputed test data exported to: output\test_data_imputed.csv
Data export completed.


In [21]:
# Define categorical columns
category_columns =[
    'Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season', 'Fitness_Endurance-Season',
    'FGC-Season', 'BIA-Season', 'PAQ_A-Season', 'PAQ_C-Season', 'SDS-Season',
    'PreInt_EduHx-Season'
]

# Enumerate categorical columns
def create_mapping(column, dataset):
    unique_values = dataset[column].unique()
    return {value: idx for idx, value in enumerate(unique_values)}

for col in category_columns:
    mapping = create_mapping(col, train_data)
    train_data[col] = train_data[col].replace(mapping).infer_objects(copy=False).astype(int)
    test_data[col] = test_data[col].replace(create_mapping(col, test_data)).infer_objects(copy=False).astype(int)

  train_data[col] = train_data[col].replace(mapping).infer_objects(copy=False).astype(int)
  test_data[col] = test_data[col].replace(create_mapping(col, test_data)).infer_objects(copy=False).astype(int)
  train_data[col] = train_data[col].replace(mapping).infer_objects(copy=False).astype(int)
  test_data[col] = test_data[col].replace(create_mapping(col, test_data)).infer_objects(copy=False).astype(int)
  train_data[col] = train_data[col].replace(mapping).infer_objects(copy=False).astype(int)
  test_data[col] = test_data[col].replace(create_mapping(col, test_data)).infer_objects(copy=False).astype(int)
  train_data[col] = train_data[col].replace(mapping).infer_objects(copy=False).astype(int)
  test_data[col] = test_data[col].replace(create_mapping(col, test_data)).infer_objects(copy=False).astype(int)
  train_data[col] = train_data[col].replace(mapping).infer_objects(copy=False).astype(int)
  test_data[col] = test_data[col].replace(create_mapping(col, test_data)).infer_objects(copy=Fals

In [22]:
train_data.head()

Unnamed: 0,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Season,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-Season,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-Season,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-Season,PAQ_A-PAQ_A_Total,PAQ_C-Season,PAQ_C-PAQ_C_Total,PCIAT-Season,PCIAT-PCIAT_01,PCIAT-PCIAT_02,PCIAT-PCIAT_03,PCIAT-PCIAT_04,PCIAT-PCIAT_05,PCIAT-PCIAT_06,PCIAT-PCIAT_07,PCIAT-PCIAT_08,PCIAT-PCIAT_09,PCIAT-PCIAT_10,PCIAT-PCIAT_11,PCIAT-PCIAT_12,PCIAT-PCIAT_13,PCIAT-PCIAT_14,PCIAT-PCIAT_15,PCIAT-PCIAT_16,PCIAT-PCIAT_17,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii,stat_12,stat_13,stat_14,stat_15,stat_16,stat_17,stat_18,stat_19,stat_20,stat_21,stat_22,stat_23,stat_24,stat_25,stat_26,stat_27,stat_28,stat_29,stat_30,stat_31,stat_32,stat_33,stat_34,stat_35,stat_36,stat_37,stat_38,stat_39,stat_40,stat_41,stat_42,stat_43,stat_44,stat_45,stat_46,stat_47,stat_48,stat_49,stat_50,stat_51,stat_52,stat_53,stat_54,stat_55,stat_56,stat_57,stat_58,stat_59,stat_60,stat_61,stat_62,stat_63,stat_64,stat_65,stat_66,stat_67,stat_68,stat_69,stat_70,stat_71,stat_72,stat_73,stat_74,stat_75,stat_76,stat_77,stat_78,stat_79,stat_80,stat_81,stat_82,stat_83,stat_84,stat_85,stat_86,stat_87,stat_88,stat_89,stat_90,stat_91,stat_92,stat_93,stat_94,stat_95,BIA-BIA_BMC_boxcox,BIA-BIA_BMR_boxcox,BIA-BIA_DEE_boxcox,BIA-BIA_ECW_boxcox,BIA-BIA_Fat_boxcox,BIA-BIA_FFM_boxcox,BIA-BIA_FFMI_boxcox,BIA-BIA_FMI_boxcox,BIA-BIA_ICW_boxcox,BIA-BIA_LDM_boxcox,BIA-BIA_LST_boxcox,BIA-BIA_TBW_boxcox,CGAS-CGAS_Score_boxcox,stat_23_boxcox,stat_35_boxcox,stat_38_boxcox,stat_40_boxcox,stat_47_boxcox,stat_54_boxcox,stat_66_boxcox,stat_78_boxcox,stat_80_boxcox,stat_88_boxcox,stat_90_boxcox,FGC-FGC_GS,FGC-FGC_SR,fitness_score,PAQ_Total,BIA_Fat_X_PCIAT_Total,Physical_Weight_X_BIA_BMI,CGAS_Score_X_PCIAT_Total,BIA_Fat_X_Physical_Weight,BIA_BMI_X_PCIAT_Total,Physical_Weight_X_PCIAT_Total,combined_actigraphy_stat
0,0,5,0,0,51.0,0,16.877316,46.0,50.8,,,,,0,,,,0,0.0,0.0,,,,,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,0,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909,0,,0,,Fall,5.0,4.0,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,4.0,4.0,4.0,4.0,4.0,4.0,2.0,4.0,55.0,0,,,0,3.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.795346,0.494067,1.013221,1.877806,6.776256e+107,1.210694,0.378516,13083250000000.0,0.836481,1.443716,1.461227,1.296982,2.887303,,,,,,,,,,,,,0.0,,,506.75735,857.46336,2805.0,468.059516,928.356,2794.0,
1,1,9,0,1,,0,14.03559,48.0,46.0,22.0,75.0,70.0,122.0,0,,,,0,3.0,0.0,,,,,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,1,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1.0,21.0352,14.974,39.4497,15.4107,27.0552,0,,1,2.34,Fall,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,46.0,64.0,1,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.791257,0.494067,1.013225,1.624546,6.664675e+107,1.211268,0.378436,12361790000000.0,0.832549,1.635205,1.462669,1.279664,,,,,,,,,,,,,,2.0,,2.34,0.0,645.7066,,182.6591,0.0,0.0,
2,1,10,1,2,71.0,0,16.648696,56.5,75.6,,65.0,94.0,117.0,1,5.0,7.0,33.0,0,20.0,1.0,10.2,1.0,14.7,2.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,2,,,,,,,,,,,,,,,,,0,,2,2.17,Fall,5.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,2.0,1.0,1.0,28.0,1,38.0,54.0,1,2.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.054867,,,,,,,,,,,,3.0,2.0,7.0,2.17,,,1988.0,,,2116.8,
3,2,9,0,2,71.0,1,18.292347,56.0,81.6,,60.0,97.0,117.0,2,6.0,9.0,37.0,1,18.0,1.0,,,,,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,3,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,18.8243,2.0,30.4041,16.779,58.9338,26.4798,45.9966,0,,3,2.451,Summer,4.0,2.0,4.0,0.0,5.0,1.0,0.0,3.0,2.0,2.0,3.0,0.0,3.0,0.0,0.0,3.0,4.0,3.0,4.0,1.0,44.0,2,31.0,45.0,2,0.0,1.0,-0.316384,0.016009,-0.16789,0.047388,-10.580416,0.0,42.29631,4053.579102,50462150000000.0,4.470182,3.0,53.201683,0.453665,0.502702,0.58571,0.106351,42.94717,0.0,208.168976,112.404045,19428420000000.0,1.931421,0.0,14.244914,-1.746094,-2.905339,-1.048372,0.0,-89.833092,0.0,0.0,3824.0,55000000000.0,1.0,3.0,41.0,-0.68418,-0.309863,-0.649974,0.006432,-41.541863,0.0,2.392969,4028.666748,36890000000000.0,3.0,3.0,42.0,-0.366849,0.024974,-0.245378,0.023637,-15.086617,0.0,6.926828,4070.0,53477500000000.0,5.0,3.0,50.0,-0.010677,0.400677,0.204727,0.04142,12.220764,0.0,15.0,4147.0,66408750000000.0,6.0,3.0,53.0,1.507865,1.666354,1.546979,4.004276,89.751656,0.0,2633.25,4188.5,86110000000000.0,7.0,3.0,85.0,1.845723,0.494067,1.013388,2.360996,6.985483e+107,1.229847,0.378534,13552920000000.0,0.841114,1.671884,1.5003,1.322946,3.054867,8.272084,4.197842,8.16162e+59,0.126963,8.138407,0.986268,2.184901,3.15009,5.434934e+31,2.016626e+102,786.87414,,0.0,,2.451,828.2692,1492.81488,3124.0,1536.06288,804.9492,3590.4,43330.0
4,3,18,1,3,,2,21.28,64.22,124.83,,,,,0,,,,2,,,,,,,,,,,,,,,2,,,,,,,,,,,,,,,,,1,1.04,0,,,,,,,,,,,,,,,,,,,,,,,,0,,,3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.04,,,,,,,


In [23]:
# High-risk vs. Low-risk analysis
# Suppress RuntimeWarnings
warnings.filterwarnings("ignore", category=RuntimeWarning)

# List of columns to drop
columns_to_drop = [
    'FGC-FGC_CU', 'FGC-FGC_PU', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR', 'FGC-FGC_SRR_Zone', 
    'FGC-FGC_SRL', 'FGC-FGC_GSND_Zone', 'FGC-FGC_GSND', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 
    'Fitness_Endurance-Time_Sec', 'stat_41', 'stat_42', 'stat_39', 'stat_92_boxcox', 'stat_92',
    'stat_44', 'PCIAT-Season', 'PCIAT-PCIAT_Total'
]

train_data = train_data.drop(columns=columns_to_drop, errors='ignore')

# Separate high-risk and low-risk groups
high_risk = train_data[train_data['sii'] == 3]
low_risk = train_data[train_data['sii'] != 3]

# Function to calculate summary statistics
def get_summary_stats(group):
    return pd.DataFrame({
        'mean': group.mean(),
        'median': group.median(),
        'std': group.std(),
        'min': group.min(),
        'max': group.max()
    })

# List of columns to analyze (excluding 'sii' and non-numeric columns)
columns_to_analyze = train_data.select_dtypes(include=[np.number]).columns.drop('sii')

# Calculate summary statistics for both groups
high_risk_stats = get_summary_stats(high_risk[columns_to_analyze])
low_risk_stats = get_summary_stats(low_risk[columns_to_analyze])

# Calculate the difference in means
mean_diff = high_risk_stats['mean'] - low_risk_stats['mean']

# Perform statistical tests to check for significant differences
def perform_statistical_test(col):
    try:
        # Try t-test first
        t_stat, p_val = stats.ttest_ind(high_risk[col].dropna(), low_risk[col].dropna(), equal_var=False)
        test_type = 't-test'
    except Exception:
        # If t-test fails, use Mann-Whitney U test
        try:
            u_stat, p_val = stats.mannwhitneyu(high_risk[col].dropna(), low_risk[col].dropna(), alternative='two-sided')
            test_type = 'Mann-Whitney U'
        except Exception:
            # If both tests fail, return NaN values
            return pd.Series({'statistic': np.nan, 'p_value': np.nan, 'test_type': 'Failed'})
    
    return pd.Series({'statistic': t_stat if test_type == 't-test' else u_stat, 
                      'p_value': p_val, 
                      'test_type': test_type})

test_results = pd.DataFrame({col: perform_statistical_test(col) for col in columns_to_analyze}).T

# Combine results
comparison_results = pd.concat([
    high_risk_stats.add_prefix('high_risk_'),
    low_risk_stats.add_prefix('low_risk_'),
    mean_diff.rename('mean_difference'),
    test_results
], axis=1)

# Sort by absolute mean difference
comparison_results = comparison_results.sort_values('mean_difference', key=abs, ascending=False)

# Display the top 20 most different features
print(comparison_results.head(20))

                               high_risk_mean  high_risk_median  \
BIA-BIA_Fat_boxcox              7.889412e+107     7.714071e+107   
stat_88_boxcox                  1.439637e+102     1.487563e+102   
stat_38_boxcox                   9.403505e+59      1.029457e+60   
stat_80_boxcox                   5.244082e+31      5.190343e+31   
BIA-BIA_FMI_boxcox               1.583919e+13      1.533522e+13   
stat_68                          4.725150e+13      4.343500e+13   
stat_32                          2.349939e+13      2.494051e+13   
stat_56                          2.757488e+13      2.172500e+13   
stat_80                          6.538125e+13      6.510500e+13   
stat_20                          4.599412e+13      4.330197e+13   
combined_actigraphy_stat         2.921386e+05      3.774300e+05   
Physical_Weight_X_PCIAT_Total    1.389549e+04      1.318680e+04   
BIA_Fat_X_Physical_Weight        1.078569e+04      8.391939e+03   
BIA_Fat_X_PCIAT_Total            4.885348e+03      4.069035e+0

In [24]:
# Visualize distributions for top features
def plot_distribution(feature):
    plt.figure(figsize=(10, 6))
    sns.histplot(data=train_data, x=feature, hue='sii', kde=True, common_norm=False)
    plt.title(f'Distribution of {feature} by Risk Group')
    plt.savefig(f'analysis_output/distribution_{feature}.png')
    plt.close()

for feature in comparison_results.head(20).index:
    plot_distribution(feature)

In [25]:
train_data.head()

Unnamed: 0,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Season,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,FGC-Season,FGC-FGC_CU_Zone,FGC-FGC_PU_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-Season,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-Season,PAQ_A-PAQ_A_Total,PAQ_C-Season,PAQ_C-PAQ_C_Total,PCIAT-PCIAT_01,PCIAT-PCIAT_02,PCIAT-PCIAT_03,PCIAT-PCIAT_04,PCIAT-PCIAT_05,PCIAT-PCIAT_06,PCIAT-PCIAT_07,PCIAT-PCIAT_08,PCIAT-PCIAT_09,PCIAT-PCIAT_10,PCIAT-PCIAT_11,PCIAT-PCIAT_12,PCIAT-PCIAT_13,PCIAT-PCIAT_14,PCIAT-PCIAT_15,PCIAT-PCIAT_16,PCIAT-PCIAT_17,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii,stat_12,stat_13,stat_14,stat_15,stat_16,stat_17,stat_18,stat_19,stat_20,stat_21,stat_22,stat_23,stat_24,stat_25,stat_26,stat_27,stat_28,stat_29,stat_30,stat_31,stat_32,stat_33,stat_34,stat_35,stat_36,stat_37,stat_38,stat_40,stat_43,stat_45,stat_46,stat_47,stat_48,stat_49,stat_50,stat_51,stat_52,stat_53,stat_54,stat_55,stat_56,stat_57,stat_58,stat_59,stat_60,stat_61,stat_62,stat_63,stat_64,stat_65,stat_66,stat_67,stat_68,stat_69,stat_70,stat_71,stat_72,stat_73,stat_74,stat_75,stat_76,stat_77,stat_78,stat_79,stat_80,stat_81,stat_82,stat_83,stat_84,stat_85,stat_86,stat_87,stat_88,stat_89,stat_90,stat_91,stat_93,stat_94,stat_95,BIA-BIA_BMC_boxcox,BIA-BIA_BMR_boxcox,BIA-BIA_DEE_boxcox,BIA-BIA_ECW_boxcox,BIA-BIA_Fat_boxcox,BIA-BIA_FFM_boxcox,BIA-BIA_FFMI_boxcox,BIA-BIA_FMI_boxcox,BIA-BIA_ICW_boxcox,BIA-BIA_LDM_boxcox,BIA-BIA_LST_boxcox,BIA-BIA_TBW_boxcox,CGAS-CGAS_Score_boxcox,stat_23_boxcox,stat_35_boxcox,stat_38_boxcox,stat_40_boxcox,stat_47_boxcox,stat_54_boxcox,stat_66_boxcox,stat_78_boxcox,stat_80_boxcox,stat_88_boxcox,stat_90_boxcox,FGC-FGC_GS,FGC-FGC_SR,fitness_score,PAQ_Total,BIA_Fat_X_PCIAT_Total,Physical_Weight_X_BIA_BMI,CGAS_Score_X_PCIAT_Total,BIA_Fat_X_Physical_Weight,BIA_BMI_X_PCIAT_Total,Physical_Weight_X_PCIAT_Total,combined_actigraphy_stat
0,0,5,0,0,51.0,0,16.877316,46.0,50.8,,,,,0,,,0,0.0,0.0,6.0,1.0,0,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909,0,,0,,5.0,4.0,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,4.0,4.0,4.0,4.0,4.0,4.0,2.0,4.0,0,,,0,3.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.795346,0.494067,1.013221,1.877806,6.776256e+107,1.210694,0.378516,13083250000000.0,0.836481,1.443716,1.461227,1.296982,2.887303,,,,,,,,,,,,,0.0,,,506.75735,857.46336,2805.0,468.059516,928.356,2794.0,
1,1,9,0,1,,0,14.03559,48.0,46.0,22.0,75.0,70.0,122.0,0,,,0,0.0,0.0,3.0,0.0,1,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1.0,21.0352,14.974,39.4497,15.4107,27.0552,0,,1,2.34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,46.0,64.0,1,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.791257,0.494067,1.013225,1.624546,6.664675e+107,1.211268,0.378436,12361790000000.0,0.832549,1.635205,1.462669,1.279664,,,,,,,,,,,,,,2.0,,2.34,0.0,645.7066,,182.6591,0.0,0.0,
2,1,10,1,2,71.0,0,16.648696,56.5,75.6,,65.0,94.0,117.0,1,5.0,7.0,0,1.0,1.0,5.0,0.0,2,,,,,,,,,,,,,,,,,0,,2,2.17,5.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,2.0,1.0,1.0,1,38.0,54.0,1,2.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3.054867,,,,,,,,,,,,3.0,2.0,7.0,2.17,,,1988.0,,,2116.8,
3,2,9,0,2,71.0,1,18.292347,56.0,81.6,,60.0,97.0,117.0,2,6.0,9.0,1,1.0,0.0,7.0,1.0,3,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,18.8243,2.0,30.4041,16.779,58.9338,26.4798,45.9966,0,,3,2.451,4.0,2.0,4.0,0.0,5.0,1.0,0.0,3.0,2.0,2.0,3.0,0.0,3.0,0.0,0.0,3.0,4.0,3.0,4.0,1.0,2,31.0,45.0,2,0.0,1.0,-0.316384,0.016009,-0.16789,0.047388,-10.580416,0.0,42.29631,4053.579102,50462150000000.0,4.470182,3.0,53.201683,0.453665,0.502702,0.58571,0.106351,42.94717,0.0,208.168976,112.404045,19428420000000.0,1.931421,0.0,14.244914,-1.746094,-2.905339,-1.048372,-89.833092,3824.0,1.0,3.0,41.0,-0.68418,-0.309863,-0.649974,0.006432,-41.541863,0.0,2.392969,4028.666748,36890000000000.0,3.0,3.0,42.0,-0.366849,0.024974,-0.245378,0.023637,-15.086617,0.0,6.926828,4070.0,53477500000000.0,5.0,3.0,50.0,-0.010677,0.400677,0.204727,0.04142,12.220764,0.0,15.0,4147.0,66408750000000.0,6.0,3.0,53.0,1.507865,1.666354,1.546979,4.004276,89.751656,0.0,2633.25,4188.5,7.0,3.0,85.0,1.845723,0.494067,1.013388,2.360996,6.985483e+107,1.229847,0.378534,13552920000000.0,0.841114,1.671884,1.5003,1.322946,3.054867,8.272084,4.197842,8.16162e+59,0.126963,8.138407,0.986268,2.184901,3.15009,5.434934e+31,2.016626e+102,786.87414,,0.0,,2.451,828.2692,1492.81488,3124.0,1536.06288,804.9492,3590.4,43330.0
4,3,18,1,3,,2,21.28,64.22,124.83,,,,,0,,,2,,,,,2,,,,,,,,,,,,,,,,,1,1.04,0,,,,,,,,,,,,,,,,,,,,,,0,,,3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.04,,,,,,,


In [26]:
# Correlation analysis
def get_correlations(group):
    return group.corr().iloc[:, 0].sort_values(key=abs, ascending=False)

high_risk_corr = get_correlations(high_risk[columns_to_analyze])
low_risk_corr = get_correlations(low_risk[columns_to_analyze])

print("\nTop correlations in high-risk group:")
print(high_risk_corr.head(20))
print("\nTop correlations in low-risk group:")
print(low_risk_corr.head(20))

# Calculate correlation differences
corr_diff = high_risk_corr - low_risk_corr
corr_diff = corr_diff.sort_values(key=abs, ascending=False)

print("\nTop correlation differences (high-risk minus low-risk):")
print(corr_diff.head(20))

# Save correlation results to CSV
pd.DataFrame({
    'high_risk_corr': high_risk_corr,
    'low_risk_corr': low_risk_corr,
    'correlation_difference': corr_diff
}).to_csv('analysis_output/correlation_comparison.csv')

# Save results to CSV
comparison_results.to_csv('analysis_output/high_low_risk_comparison.csv')

print("\nAnalysis complete. Results saved to 'analysis_output/high_low_risk_comparison.csv' and 'analysis_output/correlation_comparison.csv'")


Top correlations in high-risk group:
Basic_Demos-Enroll_Season       1.000000
Fitness_Endurance-Time_Mins    -1.000000
Fitness_Endurance-Max_Stage    -1.000000
PreInt_EduHx-Season             0.984765
Physical-Waist_Circumference   -0.712173
stat_73                        -0.543203
stat_78_boxcox                  0.540967
stat_54_boxcox                  0.536142
stat_35_boxcox                 -0.524103
stat_54                         0.523392
stat_69                        -0.514496
stat_35                        -0.512088
stat_78                         0.500698
stat_95                        -0.493043
stat_26                         0.492735
PAQ_A-PAQ_A_Total               0.487407
stat_13                        -0.486240
stat_28                         0.484036
BIA-BIA_SMM                    -0.481714
stat_66                         0.465325
Name: Basic_Demos-Enroll_Season, dtype: float64

Top correlations in low-risk group:
Basic_Demos-Enroll_Season    1.000000
PreInt_EduHx-Season

In [27]:
# Visualize top correlation differences
plt.figure(figsize=(16, 12))
corr_diff.head(20).plot(kind='bar')
plt.title('Top 20 Correlation Differences (High-risk minus Low-risk)')
plt.xlabel('Features')
plt.ylabel('Correlation Difference')
plt.tight_layout()
plt.savefig('analysis_output/top_correlation_differences.png')
plt.close()

print("Correlation difference plot saved to 'analysis_output/top_correlation_differences.png'")

Correlation difference plot saved to 'analysis_output/top_correlation_differences.png'


In [28]:
# Identify common columns and train-only columns
common_columns = list(set(train_data.columns) & set(test_data.columns))
train_only_columns = list(set(train_data.columns) - set(test_data.columns))

# Remove 'sii' from feature columns if present
if 'sii' in common_columns:
    common_columns.remove('sii')
if 'sii' in train_only_columns:
    train_only_columns.remove('sii')

print(f"Number of common columns: {len(common_columns)}")
print(f"Number of train-only columns: {len(train_only_columns)}")

Number of common columns: 161
Number of train-only columns: 20


In [29]:
# Separate features and target
X = train_data[common_columns]
y = train_data['sii']
X_test = test_data[common_columns]

y = y.fillna(y.mode()[0])

# KNN Imputation
imputer = KNNImputer(n_neighbors=5)
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
X_test_imputed = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

In [30]:
# Calculate correlation differences
high_risk = X_imputed[y == 3]
low_risk = X_imputed[y != 3]

high_risk_corr = high_risk.corr().abs()
low_risk_corr = low_risk.corr().abs()

corr_diff = (high_risk_corr - low_risk_corr).abs()
mean_corr_diff = corr_diff.mean().sort_values(ascending=False)
top_corr_diff_features = mean_corr_diff.head(20).index.tolist() # Select the top 20 features

# Main feature selection process
correlation_threshold = 0.3
corr_with_target = X_imputed.corrwith(y).abs()
corr_selected_features = corr_with_target[corr_with_target > correlation_threshold].index.tolist()

# T test
def calculate_t_test(feature):
    high_risk_values = high_risk[feature]
    low_risk_values = low_risk[feature]
    _, p_value = stats.ttest_ind(high_risk_values, low_risk_values)
    return p_value

significant_difference_threshold = 0.05
diff_selected_features = [feature for feature in X_imputed.columns if calculate_t_test(feature) < significant_difference_threshold]

# Statistical feature selection
selector = SelectKBest(score_func=f_classif, k=50)
selector.fit(X_imputed, y)
stat_selected_features = X_imputed.columns[selector.get_support()].tolist()

all_selected_features = list(set(corr_selected_features + 
                                 diff_selected_features + 
                                 top_corr_diff_features + 
                                 stat_selected_features))

domain_specific_features = ['PreInt_EduHx-computerinternet_hoursday', 'Basic_Demos-Age']
all_selected_features += domain_specific_features
all_selected_features = list(set(all_selected_features))

print(f"Total number of selected features: {len(all_selected_features)}")
print("Selected features:", all_selected_features)

Total number of selected features: 86
Selected features: ['BIA-BIA_FFMI_boxcox', 'stat_19', 'stat_49', 'Physical-Weight', 'stat_63', 'BIA-BIA_DEE_boxcox', 'CGAS-Season', 'stat_67', 'BIA-BIA_ICW_boxcox', 'BIA-BIA_LST_boxcox', 'BIA-BIA_FFM_boxcox', 'stat_69', 'PreInt_EduHx-computerinternet_hoursday', 'BIA-BIA_FFM', 'stat_90_boxcox', 'BIA-BIA_FMI', 'CGAS_Score_X_PCIAT_Total', 'FGC-FGC_SR', 'stat_80_boxcox', 'BIA-BIA_DEE', 'stat_64', 'stat_90', 'stat_76', 'stat_51', 'Fitness_Endurance-Season', 'PAQ_A-Season', 'Basic_Demos-Age', 'stat_21', 'stat_85', 'BIA-BIA_LDM_boxcox', 'FGC-FGC_GS', 'stat_27', 'stat_66', 'Physical-Waist_Circumference', 'stat_65', 'Physical-Height', 'stat_26', 'SDS-Season', 'stat_28', 'stat_16', 'Physical-Systolic_BP', 'BIA-BIA_ECW_boxcox', 'BIA-BIA_ICW', 'BIA-BIA_BMI', 'stat_78_boxcox', 'stat_79', 'BIA-BIA_ECW', 'BIA_BMI_X_PCIAT_Total', 'BIA_Fat_X_PCIAT_Total', 'Physical-BMI', 'BIA_Fat_X_Physical_Weight', 'stat_40', 'PAQ_A-PAQ_A_Total', 'Physical_Weight_X_BIA_BMI', 'stat

In [None]:
# Prepare datasets
X_selected = X_imputed[all_selected_features]
X_test_selected = X_test_imputed[all_selected_features]
