In [1]:
import numpy as np
import pandas as pd
import os
import random
import warnings
from sklearn.base import clone
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import VotingRegressor
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from scipy.optimize import minimize
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from colorama import Fore, Style
from IPython.display import clear_output
from lightgbm import LGBMRegressor, LGBMClassifier
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns

In [15]:
# Create analysis_output folder if it doesn't exist
output_folder = 'analysis_output'
os.makedirs(output_folder, exist_ok=True)

# Create 'output' folder if it doesn't exist
output_folder = 'output'
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Load data functions
def process_file(filename, dirname):
    df = pd.read_parquet(os.path.join(dirname, filename, 'part-0.parquet'))
    df.drop('step', axis=1, inplace=True)
    return df.describe().values.reshape(-1), filename.split('=')[1]

def load_time_series(dirname) -> pd.DataFrame:
    ids = os.listdir(dirname)
    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))
    stats, indexes = zip(*results)
    df = pd.DataFrame(stats, columns=[f"stat_{i}" for i in range(len(stats[0]))])
    df['id'] = indexes
    return df

# Set display all columns in dataframes property
pd.options.display.max_columns = None

In [3]:
# Load data
train_data = pd.read_csv('input/train.csv')
test_data = pd.read_csv('input/test.csv')
sample_data = pd.read_csv('input/sample_submission.csv')

# Load time series data
train_ts_data = load_time_series("input/series_train.parquet")
test_ts_data = load_time_series("input/series_test.parquet")

100%|██████████| 996/996 [01:03<00:00, 15.73it/s]
100%|██████████| 2/2 [00:00<00:00,  9.98it/s]


In [4]:
# Remove id column from time series data
time_series_columns = train_ts_data.columns.tolist()
time_series_columns.remove("id")

# Merge data
train_data = pd.merge(train_data, train_ts_data, how="left", on='id')
test_data = pd.merge(test_data, test_ts_data, how="left", on='id')
train_data = train_data.drop('id', axis=1)
test_data = test_data.drop('id', axis=1)

train_data.head()

Unnamed: 0,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Season,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-Season,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-Season,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-Season,PAQ_A-PAQ_A_Total,PAQ_C-Season,PAQ_C-PAQ_C_Total,PCIAT-Season,PCIAT-PCIAT_01,PCIAT-PCIAT_02,PCIAT-PCIAT_03,PCIAT-PCIAT_04,PCIAT-PCIAT_05,PCIAT-PCIAT_06,PCIAT-PCIAT_07,PCIAT-PCIAT_08,PCIAT-PCIAT_09,PCIAT-PCIAT_10,PCIAT-PCIAT_11,PCIAT-PCIAT_12,PCIAT-PCIAT_13,PCIAT-PCIAT_14,PCIAT-PCIAT_15,PCIAT-PCIAT_16,PCIAT-PCIAT_17,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii,stat_0,stat_1,stat_2,stat_3,stat_4,stat_5,stat_6,stat_7,stat_8,stat_9,stat_10,stat_11,stat_12,stat_13,stat_14,stat_15,stat_16,stat_17,stat_18,stat_19,stat_20,stat_21,stat_22,stat_23,stat_24,stat_25,stat_26,stat_27,stat_28,stat_29,stat_30,stat_31,stat_32,stat_33,stat_34,stat_35,stat_36,stat_37,stat_38,stat_39,stat_40,stat_41,stat_42,stat_43,stat_44,stat_45,stat_46,stat_47,stat_48,stat_49,stat_50,stat_51,stat_52,stat_53,stat_54,stat_55,stat_56,stat_57,stat_58,stat_59,stat_60,stat_61,stat_62,stat_63,stat_64,stat_65,stat_66,stat_67,stat_68,stat_69,stat_70,stat_71,stat_72,stat_73,stat_74,stat_75,stat_76,stat_77,stat_78,stat_79,stat_80,stat_81,stat_82,stat_83,stat_84,stat_85,stat_86,stat_87,stat_88,stat_89,stat_90,stat_91,stat_92,stat_93,stat_94,stat_95
0,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,,,,,,,,,Fall,0.0,0.0,,,,,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,Fall,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909,,,,,Fall,5.0,4.0,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,4.0,4.0,4.0,4.0,4.0,4.0,2.0,4.0,55.0,,,,Fall,3.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,Summer,9,0,,,Fall,14.03559,48.0,46.0,22.0,75.0,70.0,122.0,,,,,Fall,3.0,0.0,,,,,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,Winter,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1.0,21.0352,14.974,39.4497,15.4107,27.0552,,,Fall,2.34,Fall,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Fall,46.0,64.0,Summer,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,,65.0,94.0,117.0,Fall,5.0,7.0,33.0,Fall,20.0,1.0,10.2,1.0,14.7,2.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,,,,,,,,,,,,,,,,,,,,Summer,2.17,Fall,5.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,2.0,1.0,1.0,28.0,Fall,38.0,54.0,Summer,2.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,,60.0,97.0,117.0,Summer,6.0,9.0,37.0,Summer,18.0,1.0,,,,,5.0,0.0,7.0,0.0,7.0,0.0,7.0,1.0,Summer,3.0,3.84191,18.2943,1131.43,1923.44,15.5925,62.7757,14.074,4.22033,18.8243,2.0,30.4041,16.779,58.9338,26.4798,45.9966,,,Winter,2.451,Summer,4.0,2.0,4.0,0.0,5.0,1.0,0.0,3.0,2.0,2.0,3.0,0.0,3.0,0.0,0.0,3.0,4.0,3.0,4.0,1.0,44.0,Summer,31.0,45.0,Winter,0.0,1.0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,-0.316384,0.016009,-0.16789,0.047388,-10.580416,0.0,42.29631,4053.579102,50462150000000.0,4.470182,3.0,53.201683,0.453665,0.502702,0.58571,0.106351,42.94717,0.0,208.168976,112.404045,19428420000000.0,1.931421,0.0,14.244914,-1.746094,-2.905339,-1.048372,0.0,-89.833092,0.0,0.0,3824.0,55000000000.0,1.0,3.0,41.0,-0.68418,-0.309863,-0.649974,0.006432,-41.541863,0.0,2.392969,4028.666748,36890000000000.0,3.0,3.0,42.0,-0.366849,0.024974,-0.245378,0.023637,-15.086617,0.0,6.926828,4070.0,53477500000000.0,5.0,3.0,50.0,-0.010677,0.400677,0.204727,0.04142,12.220764,0.0,15.0,4147.0,66408750000000.0,6.0,3.0,53.0,1.507865,1.666354,1.546979,4.004276,89.751656,0.0,2633.25,4188.5,86110000000000.0,7.0,3.0,85.0
4,Spring,18,1,Summer,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Summer,1.04,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [5]:
# Isolate the physical attribute columns for analysis
physical_columns = [
    'Basic_Demos-Enroll_Season', 'Basic_Demos-Age', 'Basic_Demos-Sex',
    'CGAS-Season', 'CGAS-CGAS_Score', 'Physical-Season', 'Physical-BMI',
    'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
    'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP'
]

In [6]:
# Function to analyze columns
def analyze_column(column):
    total_count = len(train_data)
    missing_count = train_data[column].isnull().sum()
    missing_percentage = (missing_count / total_count) * 100
    unique_values = train_data[column].nunique()
    
    if pd.api.types.is_numeric_dtype(train_data[column]):
        mean_value = train_data[column].mean()
        median_value = train_data[column].median()
        std_dev = train_data[column].std()
        return {
            "Column": column,
            "Total Count": total_count,
            "Missing Count": missing_count,
            "Missing Percentage": f"{missing_percentage:.2f}%",
            "Unique Values": unique_values,
            "Data Type": train_data[column].dtype,
            "Mean": mean_value,
            "Median": median_value,
            "Standard Deviation": std_dev
        }
    else:
        top_values = train_data[column].value_counts().head(3).to_dict()
        return {
            "Column": column,
            "Total Count": total_count,
            "Missing Count": missing_count,
            "Missing Percentage": f"{missing_percentage:.2f}%",
            "Unique Values": unique_values,
            "Data Type": train_data[column].dtype,
            "Top 3 Values": top_values
        }
        
column_profiles = [analyze_column(col) for col in physical_columns]
column_profiles_df = pd.DataFrame(column_profiles)

# Save column profiles to CSV
column_profiles_df.to_csv(os.path.join(output_folder, 'column_profiles.csv'), index=False)
print(f"Column profiles saved to {os.path.join(output_folder, 'column_profiles.csv')}")

Column profiles saved to analysis_output\column_profiles.csv


In [7]:
# Visualize missing data
plt.figure(figsize=(12, 6))
sns.heatmap(train_data[physical_columns].isnull(), cbar=False, yticklabels=False, cmap='viridis')
plt.title('Missing Data in Physical Attribute Columns')
plt.xlabel('Columns')
plt.ylabel('Rows')
plt.tight_layout()
plt.savefig(os.path.join(output_folder, 'missing_data_heatmap.png'))
plt.close()
print(f"Missing data heatmap saved to {os.path.join(output_folder, 'missing_data_heatmap.png')}")

Missing data heatmap saved to analysis_output\missing_data_heatmap.png


In [8]:
# Correlation matrix for numeric columns
numeric_columns = train_data[physical_columns].select_dtypes(include=[np.number]).columns
correlation_matrix = train_data[numeric_columns].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Matrix of Numeric Physical Attributes')
plt.tight_layout()
plt.savefig(os.path.join(output_folder, 'correlation_matrix.png'))
plt.close()
print(f"Correlation matrix saved to {os.path.join(output_folder, 'correlation_matrix.png')}")

Correlation matrix saved to analysis_output\correlation_matrix.png


In [9]:
# Additional analysis: Distribution plots for numeric columns
for column in numeric_columns:
    plt.figure(figsize=(10, 6))
    sns.histplot(train_data[column].dropna(), kde=True)
    plt.title(f'Distribution of {column}')
    plt.xlabel(column)
    plt.ylabel('Count')
    plt.tight_layout()
    plt.savefig(os.path.join(output_folder, f'{column}_distribution.png'))
    plt.close()
    print(f"Distribution plot for {column} saved to {os.path.join(output_folder, f'{column}_distribution.png')}")

print("All analyses completed and saved to the 'analysis_output' folder.")

Distribution plot for Basic_Demos-Age saved to analysis_output\Basic_Demos-Age_distribution.png
Distribution plot for Basic_Demos-Sex saved to analysis_output\Basic_Demos-Sex_distribution.png
Distribution plot for CGAS-CGAS_Score saved to analysis_output\CGAS-CGAS_Score_distribution.png
Distribution plot for Physical-BMI saved to analysis_output\Physical-BMI_distribution.png
Distribution plot for Physical-Height saved to analysis_output\Physical-Height_distribution.png
Distribution plot for Physical-Weight saved to analysis_output\Physical-Weight_distribution.png
Distribution plot for Physical-Waist_Circumference saved to analysis_output\Physical-Waist_Circumference_distribution.png
Distribution plot for Physical-Diastolic_BP saved to analysis_output\Physical-Diastolic_BP_distribution.png
Distribution plot for Physical-HeartRate saved to analysis_output\Physical-HeartRate_distribution.png
Distribution plot for Physical-Systolic_BP saved to analysis_output\Physical-Systolic_BP_distribut

In [10]:
# Supplement missing data with data from WHO
# Load who data and group
def load_who_bmi_data(file_path):
    who_data = pd.read_csv(file_path)
    who_data = who_data.groupby(['age', 'sex']).agg({
        'L': 'mean', 'mean_bmi': 'mean', 'S': 'mean'
    }).reset_index()
    who_data = who_data.set_index(['sex', 'age'])
    return who_data

def load_who_height_data(file_path):
    who_data = pd.read_csv(file_path)
    who_data = who_data.groupby(['age', 'sex']).agg({
        'mean_height': 'mean'
    }).reset_index()
    who_data = who_data.set_index(['sex', 'age'])
    return who_data

who_bmi_data = load_who_bmi_data('supplemental_data/bmi_for_age_5_to_19.csv')
who_height_data = load_who_height_data('supplemental_data/height_for_age_5_to_19.csv')

In [11]:
# Defining functions to Impute with data from WHO
def get_who_stats(age, sex, data_type='bmi'):
    try:
        if data_type == 'bmi':
            stats = who_bmi_data.loc[(sex, age), ['mean_bmi', 'S']]
            return stats['mean_bmi'], stats['S']
        elif data_type == 'height':
            stats = who_height_data.loc[(sex, age), 'mean_height']
            return stats
    except KeyError:
        return None, None if data_type == 'bmi' else None
    
def impute_bmi(age, sex):
    mean_bmi, sd = get_who_stats(age, sex, 'bmi')
    if mean_bmi is not None and sd is not None:
        imputed_bmi = np.random.normal(mean_bmi, sd)
        return round(imputed_bmi, 2)
    else:
        return None

def impute_height(age, sex):
    mean_height_cm = get_who_stats(age, sex, 'height')
    if mean_height_cm is not None:
        mean_height_inches = mean_height_cm / 2.54  # Convert cm to inches
        return round(mean_height_inches, 2)
    else:
        return None
    
def impute_weight(bmi, height_inches):
    if bmi is not None and height_inches is not None:
        height_meters = height_inches * 0.0254  # Convert inches to meters
        weight_kg = bmi * (height_meters ** 2)
        weight_lbs = weight_kg * 2.20462  # Convert kg to lbs
        return round(weight_lbs, 2)
    else:
        return None
    
def apply_imputation(df):
    def impute_if_missing(row):
        age = row['Basic_Demos-Age']
        sex = row['Basic_Demos-Sex']
        
        if pd.isna(row['Physical-BMI']) and 5 <= age <= 19:
            row['Physical-BMI'] = impute_bmi(age, sex)
        
        if pd.isna(row['Physical-Height']) and 5 <= age <= 19:
            row['Physical-Height'] = impute_height(age, sex)
        
        if pd.isna(row['Physical-Weight']) and row['Physical-BMI'] is not None and row['Physical-Height'] is not None:
            row['Physical-Weight'] = impute_weight(row['Physical-BMI'], row['Physical-Height'])
        
        return row
    
    return df.apply(impute_if_missing, axis=1)

In [12]:
# Apply to datasets
train_data = apply_imputation(train_data)
test_data = apply_imputation(test_data)

# Check the results
print("Number of missing values after imputation:")
print("BMI:", train_data['Physical-BMI'].isna().sum())
print("Height:", train_data['Physical-Height'].isna().sum())
print("Weight:", train_data['Physical-Weight'].isna().sum())

print("\nSample of imputed BMI, Height, and Weight values:")
imputed_sample = train_data[
    (train_data['Physical-BMI'].notnull() | train_data['Physical-Height'].notnull() | train_data['Physical-Weight'].notnull()) &
    ((train_data['Physical-BMI'].notnull() != train_data['Physical-BMI'].notnull().shift()) |
     (train_data['Physical-Height'].notnull() != train_data['Physical-Height'].notnull().shift()) |
     (train_data['Physical-Weight'].notnull() != train_data['Physical-Weight'].notnull().shift()))
].sample(5)[['Basic_Demos-Age', 'Basic_Demos-Sex', 'Physical-BMI', 'Physical-Height', 'Physical-Weight']]
print(imputed_sample)

Number of missing values after imputation:
BMI: 21
Height: 21
Weight: 20

Sample of imputed BMI, Height, and Weight values:
      Basic_Demos-Age  Basic_Demos-Sex  Physical-BMI  Physical-Height  \
1046               11                0     30.000301             60.5   
2153               13                0     20.649384             64.5   
1283                5                1     15.904649             44.0   
1888                7                1     15.780022             49.5   
2023                9                0     11.925153             63.5   

      Physical-Weight  
1046            156.2  
2153            122.2  
1283             43.8  
1888             55.0  
2023             68.4  


In [13]:
# Additional analysis of imputation results
print("\nImputation summary by age:")
age_summary = train_data.groupby('Basic_Demos-Age').agg({
    'Physical-BMI': ['count', 'mean', 'std', 'min', 'max'],
    'Physical-Height': ['count', 'mean', 'std', 'min', 'max'],
    'Physical-Weight': ['count', 'mean', 'std', 'min', 'max'],
    'Basic_Demos-Sex': 'count'
})
print(age_summary)


Imputation summary by age:
                Physical-BMI                                             \
                       count       mean       std        min        max   
Basic_Demos-Age                                                           
5                        112  16.651598  2.710215  12.853139  26.512004   
6                        369  16.232875  2.604860   0.000000  34.055363   
7                        436  16.492136  2.852366   9.693766  43.468833   
8                        490  17.300994  3.607325   0.000000  59.132048   
9                        467  17.512781  3.136865  11.925153  33.729946   
10                       420  18.391149  4.043673   0.000000  44.554097   
11                       334  19.050080  3.975463  11.915254  37.484035   
12                       291  20.107088  4.435314  14.223937  40.399621   
13                       236  20.474744  4.766221   0.000000  39.494148   
14                       200  21.752981  5.143929  13.740613  44.835548 

In [14]:
# Plot of imputed vs. original data
plt.figure(figsize=(12, 6))
sns.scatterplot(data=train_data, x='Basic_Demos-Age', y='Physical-BMI', hue='Basic_Demos-Sex', alpha=0.5)
plt.title('BMI vs Age (After Imputation)')
plt.savefig('analysis_output/bmi_vs_age_imputed.png')
plt.close()

plt.figure(figsize=(12, 6))
sns.scatterplot(data=train_data, x='Basic_Demos-Age', y='Physical-Height', hue='Basic_Demos-Sex', alpha=0.5)
plt.title('Height vs Age (After Imputation)')
plt.savefig('analysis_output/height_vs_age_imputed.png')
plt.close()

plt.figure(figsize=(12, 6))
sns.scatterplot(data=train_data, x='Basic_Demos-Age', y='Physical-Weight', hue='Basic_Demos-Sex', alpha=0.5)
plt.title('Weight vs Age (After Imputation)')
plt.savefig('analysis_output/weight_vs_age_imputed.png')
plt.close()

print("Imputation analysis plots saved in the 'analysis_output' folder.")

Imputation analysis plots saved in the 'analysis_output' folder.


In [16]:
# Export train_data to CSV
train_output_path = os.path.join(output_folder, 'train_data_imputed.csv')
train_data.to_csv(train_output_path, index=False)
print(f"Imputed train data exported to: {train_output_path}")

# Export test_data to CSV
test_output_path = os.path.join(output_folder, 'test_data_imputed.csv')
test_data.to_csv(test_output_path, index=False)
print(f"Imputed test data exported to: {test_output_path}")

print("Data export completed.")

Imputed train data exported to: output\train_data_imputed.csv
Imputed test data exported to: output\test_data_imputed.csv
Data export completed.
