In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.base import clone
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import VotingRegressor
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from scipy.optimize import minimize
from scipy import stats
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from colorama import Fore, Style
from IPython.display import clear_output
from lightgbm import LGBMRegressor, LGBMClassifier
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Create output folders
# exist_ok=True makes both calls idempotent, so the cell can be re-run safely and
# there is no gap between an os.path.exists() check and the os.makedirs() call.
analysis_output_folder = 'analysis_output'
os.makedirs(analysis_output_folder, exist_ok=True)

output_folder = 'output'
os.makedirs(output_folder, exist_ok=True)

# Load data functions
def process_file(filename, dirname):
    """Summarise one time-series parquet file as a flat vector of statistics.

    Reads ``<dirname>/<filename>/part-0.parquet``, discards the ``step`` column
    and flattens the ``DataFrame.describe()`` table of the remaining columns.

    Args:
        filename: Entry name inside ``dirname``; it must contain an ``=`` because
            the second ``=``-separated field is returned as the identifier.
        dirname: Directory that contains the entry.

    Returns:
        tuple: ``(1-D array of describe() values, identifier string)``.
    """
    parquet_path = os.path.join(dirname, filename, 'part-0.parquet')
    series_frame = pd.read_parquet(parquet_path).drop(columns='step')
    summary_table = series_frame.describe()
    identifier = filename.split('=')[1]
    return summary_table.values.reshape(-1), identifier

def load_time_series(dirname) -> pd.DataFrame:
    """Build one row of summary statistics per entry found in ``dirname``.

    Every entry returned by ``os.listdir(dirname)`` is passed to
    ``process_file`` on a thread pool (with a tqdm progress bar). The flattened
    statistics become columns ``stat_0 .. stat_{k-1}``, where ``k`` is the
    length of the first result, and the identifiers go into an ``id`` column.

    Args:
        dirname: Directory whose entries are summarised.

    Returns:
        pd.DataFrame: one row per entry, ``stat_*`` columns followed by ``id``.
    """
    entries = os.listdir(dirname)

    def summarise(entry):
        return process_file(entry, dirname)

    with ThreadPoolExecutor() as pool:
        summaries = list(tqdm(pool.map(summarise, entries), total=len(entries)))

    feature_rows, id_values = zip(*summaries)
    stat_names = [f"stat_{i}" for i in range(len(feature_rows[0]))]
    frame = pd.DataFrame(feature_rows, columns=stat_names)
    frame['id'] = id_values
    return frame

# Show every column when a DataFrame is displayed (None removes the column limit)
pd.set_option('display.max_columns', None)

In [None]:
# Load data
# Tabular inputs: train / test tables and the sample submission.
train_data, test_data, sample_data = (
    pd.read_csv('input/train.csv'),
    pd.read_csv('input/test.csv'),
    pd.read_csv('input/sample_submission.csv'),
)

# Per-entry summary statistics of the time-series parquet directories.
train_ts_data, test_ts_data = (
    load_time_series("input/series_train.parquet"),
    load_time_series("input/series_test.parquet"),
)

In [None]:
# Names of the time-series feature columns: every train_ts_data column except the id key
time_series_columns = list(train_ts_data.columns)
time_series_columns.remove("id")

# Merge data: left-join the time-series statistics onto each table by id,
# then drop id because it is only needed as the join key.
train_data = train_data.merge(train_ts_data, how="left", on='id').drop(columns='id')
test_data = test_data.merge(test_ts_data, how="left", on='id').drop(columns='id')

train_data.head()

In [5]:
# Feature engineering
# NOTE(review): the derived columns below are added to train_data only; this
# cell does not create them on test_data.
train_data = train_data.copy()

# Combine all grip strength: sum of the two grip-strength zone columns
train_data['FGC-FGC_GS'] = train_data['FGC-FGC_GSD_Zone'].add(train_data['FGC-FGC_GSND_Zone'])

# Combine all sit and reach: sum of the two sit-and-reach zone columns
train_data['FGC-FGC_SR'] = train_data['FGC-FGC_SRL_Zone'].add(train_data['FGC-FGC_SRR_Zone'])

# Create a fitness score by adding the zone fitness data.
# A missing value in any component makes the score missing for that row.
train_data['fitness_score'] = sum(
    (
        train_data[zone_column]
        for zone_column in ('FGC-FGC_SR', 'FGC-FGC_CU_Zone', 'FGC-FGC_PU_Zone', 'FGC-FGC_TL_Zone')
    ),
    train_data['FGC-FGC_GS'],
)

In [6]:
# Column groups used by the analysis cells. Every group also carries the target
# ('sii') and the engineered 'fitness_score' so they appear in each profile/plot.

# Physical attributes plus some contextual (demographic / CGAS) columns
physical_columns = [
    # demographics
    'Basic_Demos-Enroll_Season',
    'Basic_Demos-Age',
    'Basic_Demos-Sex',
    # CGAS
    'CGAS-Season',
    'CGAS-CGAS_Score',
    # physical measurements
    'Physical-Season',
    'Physical-BMI',
    'Physical-Height',
    'Physical-Weight',
    'Physical-Waist_Circumference',
    'Physical-Diastolic_BP',
    'Physical-HeartRate',
    'Physical-Systolic_BP',
    # engineered / shared
    'fitness_score',
    'BIA-BIA_Frame_num',
    'sii',
]

# Fitness attributes
# Removed columns: 'FGC-FGC_CU', 'FGC-FGC_PU', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
# 'FGC-FGC_SRR_Zone', 'FGC-FGC_SRL', 'FGC-FGC_GSND_Zone', 'FGC-FGC_GSND',
# 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'Fitness_Endurance-Time_Sec'
fitness_columns = [
    # endurance
    'Fitness_Endurance-Season',
    'Fitness_Endurance-Max_Stage',
    'Fitness_Endurance-Time_Mins',
    # FGC
    'FGC-Season',
    'FGC-FGC_CU_Zone',
    'FGC-FGC_SR',
    'FGC-FGC_PU_Zone',
    'FGC-FGC_TL',
    'FGC-FGC_TL_Zone',
    'FGC-FGC_GS',
    # engineered / shared
    'fitness_score',
    'BIA-BIA_Frame_num',
    'sii',
]

# BIA attributes
bia_columns = [
    'BIA-Season',
    'BIA-BIA_Activity_Level_num',
    'BIA-BIA_Frame_num',
    'BIA-BIA_BMC',
    'BIA-BIA_BMI',
    'BIA-BIA_BMR',
    'BIA-BIA_DEE',
    'BIA-BIA_ECW',
    'BIA-BIA_FFM',
    'BIA-BIA_FFMI',
    'BIA-BIA_FMI',
    'BIA-BIA_Fat',
    'BIA-BIA_ICW',
    'BIA-BIA_LDM',
    'BIA-BIA_LST',
    'BIA-BIA_TBW',
    # engineered / shared
    'fitness_score',
    'sii',
]

In [None]:
# Function to analyze columns
def analyze_column(column, df=None):
    """Build a small profile (counts, missingness, basic statistics) of one column.

    Args:
        column: Label of the column to profile.
        df: DataFrame that holds the column. When omitted, the notebook-level
            ``train_data`` is used, which keeps existing one-argument calls working.

    Returns:
        dict: Always contains "Column", "Total Count", "Missing Count",
        "Missing Percentage" (string with two decimals and a trailing ``%``),
        "Unique Values" and "Data Type". Numeric columns additionally get
        "Mean", "Median" and "Standard Deviation"; all other columns get
        "Top 3 Values" (the three most frequent values mapped to their counts).
    """
    if df is None:
        # Backward-compatible default: profile the global training table.
        df = train_data

    values = df[column]
    total_count = len(df)
    missing_count = values.isnull().sum()
    missing_percentage = (missing_count / total_count) * 100

    # Fields shared by numeric and non-numeric columns. Key order matters because
    # it determines the column order of the DataFrame built from these dicts.
    profile = {
        "Column": column,
        "Total Count": total_count,
        "Missing Count": missing_count,
        "Missing Percentage": f"{missing_percentage:.2f}%",
        "Unique Values": values.nunique(),
        "Data Type": values.dtype,
    }

    if pd.api.types.is_numeric_dtype(values):
        profile["Mean"] = values.mean()
        profile["Median"] = values.median()
        profile["Standard Deviation"] = values.std()
    else:
        profile["Top 3 Values"] = values.value_counts().head(3).to_dict()

    return profile

def _export_column_profiles(columns, csv_name):
    """Profile ``columns`` with analyze_column and save the result as a CSV.

    The CSV is written into ``analysis_output_folder`` without the index and a
    confirmation line is printed.

    Returns:
        tuple: ``(list of profile dicts, DataFrame built from that list)``.
    """
    profiles = [analyze_column(col) for col in columns]
    profiles_df = pd.DataFrame(profiles)
    csv_path = os.path.join(analysis_output_folder, csv_name)
    profiles_df.to_csv(csv_path, index=False)
    print(f"Column profiles saved to {csv_path}")
    return profiles, profiles_df


# Physical column profiles
physical_column_profiles, physical_column_profiles_df = _export_column_profiles(
    physical_columns, 'physical_column_profiles.csv'
)

# Fitness column profiles
fitness_column_profiles, fitness_column_profiles_df = _export_column_profiles(
    fitness_columns, 'fitness_column_profiles.csv'
)

# BIA column profiles
bia_column_profiles, bia_column_profiles_df = _export_column_profiles(
    bia_columns, 'bia_column_profiles.csv'
)

In [None]:
# Visualize missing data
def _save_missing_data_heatmap(columns, group_label, file_prefix):
    """Save a heatmap of the null mask of ``train_data[columns]``.

    The figure is written to ``<analysis_output_folder>/<file_prefix>_missing_data_heatmap.png``,
    closed, and a confirmation line is printed.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.heatmap(train_data[columns].isnull(), cbar=False, yticklabels=False, cmap='viridis', ax=ax)
    ax.set_title(f'Missing Data in {group_label} Attribute Columns')
    ax.set_xlabel('Columns')
    ax.set_ylabel('Rows')
    fig.tight_layout()
    image_path = os.path.join(analysis_output_folder, f'{file_prefix}_missing_data_heatmap.png')
    fig.savefig(image_path)
    plt.close(fig)
    print(f"Missing data heatmap saved to {image_path}")


# Physical columns
_save_missing_data_heatmap(physical_columns, 'Physical', 'physical')

# Fitness columns
_save_missing_data_heatmap(fitness_columns, 'Fitness', 'fitness')

# BIA columns
_save_missing_data_heatmap(bia_columns, 'BIA', 'bia')

In [None]:
def _save_correlation_matrix(columns, group_label, file_name):
    """Plot and save the correlation matrix of the numeric columns in a group.

    Args:
        columns: Column labels of ``train_data`` that make up the group.
        group_label: Text used in the plot title and in the printed message.
        file_name: PNG file name, created inside ``analysis_output_folder``.

    Returns:
        tuple: ``(Index of the numeric columns, their correlation DataFrame)``.
    """
    numeric_columns = train_data[columns].select_dtypes(include=[np.number]).columns
    correlation_matrix = train_data[numeric_columns].corr()

    fig, ax = plt.subplots(figsize=(16, 14))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)
    ax.set_title(f'Correlation Matrix of Numeric {group_label} Attributes')
    fig.tight_layout()

    # Build the path once so the printed location always matches the saved file
    # (the BIA message used to report 'BIA_correlation_matrix.png' while the
    # file was written as 'bia_correlation_matrix.png').
    image_path = os.path.join(analysis_output_folder, file_name)
    fig.savefig(image_path)
    plt.close(fig)
    print(f"{group_label} correlation matrix saved to {image_path}")
    return numeric_columns, correlation_matrix


# Correlation matrix for physical numeric columns
physical_numeric_columns, physical_correlation_matrix = _save_correlation_matrix(
    physical_columns, 'Physical', 'physical_correlation_matrix.png'
)

# Correlation matrix for fitness numeric columns
fitness_numeric_columns, fitness_correlation_matrix = _save_correlation_matrix(
    fitness_columns, 'Fitness', 'fitness_correlation_matrix.png'
)

# Correlation matrix for bia numeric columns
bia_numeric_columns, bia_correlation_matrix = _save_correlation_matrix(
    bia_columns, 'BIA', 'bia_correlation_matrix.png'
)

In [None]:
def _save_distribution_plot(column, group_label, file_prefix):
    """Save a histogram (with KDE) of the non-null values of one train_data column.

    Args:
        column: Column label in ``train_data``.
        group_label: Group name shown in the plot title.
        file_prefix: Prefix of the PNG file name; the file is written to
            ``<analysis_output_folder>/<file_prefix>_<column>_distribution.png``.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(train_data[column].dropna(), kde=True, ax=ax)
    ax.set_title(f'Distribution of {group_label} {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Count')
    fig.tight_layout()
    fig.savefig(os.path.join(analysis_output_folder, f'{file_prefix}_{column}_distribution.png'))
    # Close each figure explicitly; many figures are created in these loops.
    plt.close(fig)


# Distribution plots for physical numeric columns
for column in physical_numeric_columns:
    _save_distribution_plot(column, 'physical', 'physical')

# Distribution plots for fitness numeric columns
for column in fitness_numeric_columns:
    _save_distribution_plot(column, 'fitness', 'fitness')

# Distribution plots for BIA numeric columns
# (these plots were previously titled "Distribution of fitness ..." by mistake)
for column in bia_numeric_columns:
    _save_distribution_plot(column, 'BIA', 'bia')

print("All analyses completed and saved to the 'analysis_output' folder.")

In [11]:
# Supplement missing data with data from WHO
# Load who data and group
def load_who_bmi_data(file_path):
    """Load the BMI-for-age reference CSV and average it per (age, sex).

    The CSV must provide the columns ``age``, ``sex``, ``L``, ``mean_bmi`` and ``S``.

    Args:
        file_path: Path of the CSV file.

    Returns:
        pd.DataFrame: mean ``L``, ``mean_bmi`` and ``S`` per group, indexed by a
        ``(sex, age)`` MultiIndex (rows stay in age-then-sex order).
    """
    return (
        pd.read_csv(file_path)
        .groupby(['age', 'sex'])
        .agg({'L': 'mean', 'mean_bmi': 'mean', 'S': 'mean'})
        .reset_index()
        .set_index(['sex', 'age'])
    )

def load_who_height_data(file_path):
    """Load the height-for-age reference CSV and average it per (age, sex).

    The CSV must provide the columns ``age``, ``sex`` and ``mean_height``.

    Args:
        file_path: Path of the CSV file.

    Returns:
        pd.DataFrame: mean ``mean_height`` per group, indexed by a
        ``(sex, age)`` MultiIndex (rows stay in age-then-sex order).
    """
    return (
        pd.read_csv(file_path)
        .groupby(['age', 'sex'])
        .agg({'mean_height': 'mean'})
        .reset_index()
        .set_index(['sex', 'age'])
    )

# Reference tables used by the imputation helpers; both are indexed by (sex, age).
who_bmi_data, who_height_data = (
    load_who_bmi_data('supplemental_data/bmi_for_age_5_to_19.csv'),
    load_who_height_data('supplemental_data/height_for_age_5_to_19.csv'),
)

In [12]:
# Defining functions to Impute with data from WHO
def get_who_stats(age, sex, data_type='bmi'):
    """Look up reference statistics for one (sex, age) pair.

    Args:
        age: Age key, matched against the ``age`` level of the reference index.
        sex: Sex key, matched against the ``sex`` level of the reference index.
        data_type: ``'bmi'`` reads ``who_bmi_data``; ``'height'`` reads
            ``who_height_data``.

    Returns:
        For ``'bmi'``: a ``(mean_bmi, S)`` tuple, or ``(None, None)`` when the
        pair is not in the table.
        For ``'height'``: the ``mean_height`` value, or ``None`` when the pair is
        not in the table.
        Any other ``data_type`` yields ``None``.
    """
    try:
        if data_type == 'bmi':
            bmi_row = who_bmi_data.loc[(sex, age), ['mean_bmi', 'S']]
            return bmi_row['mean_bmi'], bmi_row['S']
        elif data_type == 'height':
            return who_height_data.loc[(sex, age), 'mean_height']
    except KeyError:
        # The parentheses are required: without them the expression parses as
        # `None, (None if ... else None)` and always returns a 2-tuple, which
        # height callers would mistake for a valid (non-None) value.
        return (None, None) if data_type == 'bmi' else None
    return None
    
def impute_bmi(age, sex):
    """Draw a random BMI for the given age and sex from the reference table.

    The value is sampled from a normal distribution centred on the table's
    ``mean_bmi`` with the table's ``S`` as standard deviation, then rounded to
    two decimals. Returns ``None`` when either statistic is unavailable.

    NOTE(review): ``S`` is passed to ``np.random.normal`` as the standard
    deviation. If ``S`` is the LMS coefficient of variation rather than an SD in
    BMI units, the spread is far too small -- confirm against the data source.
    NOTE(review): the draw uses NumPy's global RNG, so results are only
    reproducible if the caller seeds it.
    """
    mean_bmi, sd = get_who_stats(age, sex, 'bmi')
    if mean_bmi is None or sd is None:
        return None
    return round(np.random.normal(mean_bmi, sd), 2)

def impute_height(age, sex):
    """Return the reference mean height for the given age and sex, in inches.

    The reference value is treated as centimetres, converted to inches and
    rounded to two decimals. Returns ``None`` when no reference value is found.
    """
    mean_height_cm = get_who_stats(age, sex, 'height')
    if mean_height_cm is None:
        return None
    # Convert cm to inches
    return round(mean_height_cm / 2.54, 2)
    
def impute_weight(bmi, height_inches):
    """Derive a weight in pounds from a BMI and a height in inches.

    Uses ``weight_kg = bmi * height_m ** 2`` and converts the result to pounds,
    rounded to two decimals. Returns ``None`` if either argument is ``None``.
    """
    if bmi is None or height_inches is None:
        return None
    metres = height_inches * 0.0254  # inches -> metres
    kilograms = bmi * (metres ** 2)
    return round(kilograms * 2.20462, 2)  # kg -> lbs
    
def apply_imputation(df):
    """Fill missing BMI, height and weight values row by row.

    For rows whose ``Basic_Demos-Age`` lies in [5, 19]:
      * a missing ``Physical-BMI`` is replaced by ``impute_bmi(age, sex)``;
      * a missing ``Physical-Height`` is replaced by ``impute_height(age, sex)``.
    For any row, a missing ``Physical-Weight`` is replaced by
    ``impute_weight(bmi, height)`` when both BMI and height are available
    (measured or just imputed).

    Args:
        df: DataFrame with the columns named above plus ``Basic_Demos-Sex``.

    Returns:
        pd.DataFrame: the result of the row-wise ``apply``; ``df`` itself is
        left untouched.
    """
    def impute_if_missing(row):
        # Never mutate the object handed over by DataFrame.apply: pandas
        # documents that as unsupported and it can write through to `df`.
        row = row.copy()
        age = row['Basic_Demos-Age']
        sex = row['Basic_Demos-Sex']

        if pd.isna(row['Physical-BMI']) and 5 <= age <= 19:
            row['Physical-BMI'] = impute_bmi(age, sex)

        if pd.isna(row['Physical-Height']) and 5 <= age <= 19:
            row['Physical-Height'] = impute_height(age, sex)

        # Missing values are NaN (or None after a failed lookup), so test with
        # pd.notna; an `is not None` check lets NaN through and computes a
        # meaningless NaN weight.
        if (
            pd.isna(row['Physical-Weight'])
            and pd.notna(row['Physical-BMI'])
            and pd.notna(row['Physical-Height'])
        ):
            row['Physical-Weight'] = impute_weight(row['Physical-BMI'], row['Physical-Height'])

        return row

    return df.apply(impute_if_missing, axis=1)

In [None]:
# Apply to datasets
# impute_bmi draws from np.random.normal and DataFrame.sample falls back to
# NumPy's global RNG when no random_state is given, so seed it once here to keep
# the imputed values and the printed sample reproducible across runs.
np.random.seed(42)

# Record which values were missing before imputation; once the gaps are filled,
# this mask is the only way to tell imputed values from measured ones.
imputed_columns = ['Physical-BMI', 'Physical-Height', 'Physical-Weight']
missing_before_imputation = train_data[imputed_columns].isna()

train_data = apply_imputation(train_data)
test_data = apply_imputation(test_data)

# Check the results
print("Number of missing values after imputation:")
print("BMI:", train_data['Physical-BMI'].isna().sum())
print("Height:", train_data['Physical-Height'].isna().sum())
print("Weight:", train_data['Physical-Weight'].isna().sum())

print("\nSample of imputed BMI, Height, and Weight values:")
# Rows where at least one of the three values was missing before and is filled now.
was_imputed = (missing_before_imputation & train_data[imputed_columns].notna()).any(axis=1)
imputed_rows = train_data.loc[
    was_imputed,
    ['Basic_Demos-Age', 'Basic_Demos-Sex', 'Physical-BMI', 'Physical-Height', 'Physical-Weight'],
]
# Cap the sample size so the cell does not fail when fewer than 5 rows were imputed.
imputed_sample = imputed_rows.sample(min(5, len(imputed_rows)))
print(imputed_sample)

In [None]:
# Additional analysis of imputation results
# Per-age count / mean / std / min / max of the three imputed measures, plus the
# number of non-null sex values per age.
print("\nImputation summary by age:")
age_summary = train_data.groupby('Basic_Demos-Age').agg({
    **{
        measure: ['count', 'mean', 'std', 'min', 'max']
        for measure in ('Physical-BMI', 'Physical-Height', 'Physical-Weight')
    },
    'Basic_Demos-Sex': 'count',
})
print(age_summary)

In [None]:
# Scatter plots of each measure against age after imputation, coloured by sex
def _save_age_scatter(measure):
    """Save a scatter plot of ``Physical-<measure>`` against age, coloured by sex.

    The figure is written to ``analysis_output/<measure lower-cased>_vs_age_imputed.png``
    and then closed.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.scatterplot(
        data=train_data,
        x='Basic_Demos-Age',
        y=f'Physical-{measure}',
        hue='Basic_Demos-Sex',
        alpha=0.5,
        ax=ax,
    )
    ax.set_title(f'{measure} vs Age (After Imputation)')
    fig.savefig(f'analysis_output/{measure.lower()}_vs_age_imputed.png')
    plt.close(fig)


_save_age_scatter('BMI')
_save_age_scatter('Height')
_save_age_scatter('Weight')

print("Imputation analysis plots saved in the 'analysis_output' folder.")

In [None]:
# Destination files for the imputed tables, both inside output_folder
train_output_path, test_output_path = (
    os.path.join(output_folder, 'train_data_imputed.csv'),
    os.path.join(output_folder, 'test_data_imputed.csv'),
)

# Export train_data to CSV (row index is not written)
train_data.to_csv(train_output_path, index=False)
print(f"Imputed train data exported to: {train_output_path}")

# Export test_data to CSV (row index is not written)
test_data.to_csv(test_output_path, index=False)
print(f"Imputed test data exported to: {test_output_path}")

print("Data export completed.")

In [17]:
# Isolate the rows with sii=3 (high problematic internet usage) and examine what drives that outcome