### DM Final Project Progress Report #2 2.0
score - 0.392

In [None]:
import os

import warnings



import numpy as np

import pandas as pd

import matplotlib.pyplot as plt

import seaborn as sns

from tqdm import tqdm

from concurrent.futures import ThreadPoolExecutor



from sklearn.experimental import enable_iterative_imputer

from sklearn.impute import KNNImputer, IterativeImputer

from sklearn.model_selection import train_test_split, StratifiedKFold

from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import (RandomForestClassifier, RandomForestRegressor,

                              VotingClassifier, VotingRegressor, GradientBoostingClassifier,

                              GradientBoostingRegressor, AdaBoostClassifier)

from sklearn.metrics import (classification_report, confusion_matrix, mean_squared_error,

                             accuracy_score, cohen_kappa_score, r2_score, make_scorer)

from sklearn.decomposition import PCA

from sklearn.tree import DecisionTreeRegressor 

from scipy.optimize import minimize



from keras.models import Model

from keras.layers import Input, Dense



from xgboost import XGBRegressor

from catboost import CatBoostRegressor



import torch

import torch.nn as nn

import torch.optim as optim

In [None]:
# Feature Engineering

import pandas as pd



def feature_engineering(df):
    """Append pairwise product and ratio features to a copy of ``df``.

    Each derived column combines exactly two existing columns, either by
    multiplication or by division. No guard against zero or missing
    denominators is applied here, so the caller has to deal with any
    non-finite results.

    Args:
        df: DataFrame holding the ``Physical-*``, ``BIA-*``,
            ``Basic_Demos-Age`` and
            ``PreInt_EduHx-computerinternet_hoursday`` columns listed below.

    Returns:
        A new DataFrame made of the original columns followed by the derived
        ones, in the order they are declared in the table. ``df`` itself is
        left untouched.
    """
    # (new column, left operand, operator, right operand)
    recipes = [
        ('BMI_Age', 'Physical-BMI', '*', 'Basic_Demos-Age'),
        ('Internet_Hours_Age', 'PreInt_EduHx-computerinternet_hoursday', '*', 'Basic_Demos-Age'),
        ('BMI_Internet_Hours', 'Physical-BMI', '*', 'PreInt_EduHx-computerinternet_hoursday'),
        ('BFP_BMI', 'BIA-BIA_Fat', '/', 'BIA-BIA_BMI'),
        ('FFMI_BFP', 'BIA-BIA_FFMI', '/', 'BIA-BIA_Fat'),
        ('FMI_BFP', 'BIA-BIA_FMI', '/', 'BIA-BIA_Fat'),
        ('LST_TBW', 'BIA-BIA_LST', '/', 'BIA-BIA_TBW'),
        ('BFP_BMR', 'BIA-BIA_Fat', '*', 'BIA-BIA_BMR'),
        ('BFP_DEE', 'BIA-BIA_Fat', '*', 'BIA-BIA_DEE'),
        ('BMR_Weight', 'BIA-BIA_BMR', '/', 'Physical-Weight'),
        ('DEE_Weight', 'BIA-BIA_DEE', '/', 'Physical-Weight'),
        ('SMM_Height', 'BIA-BIA_SMM', '/', 'Physical-Height'),
        ('Muscle_to_Fat', 'BIA-BIA_SMM', '/', 'BIA-BIA_FMI'),
        ('Hydration_Status', 'BIA-BIA_TBW', '/', 'Physical-Weight'),
        ('ICW_TBW', 'BIA-BIA_ICW', '/', 'BIA-BIA_TBW'),
    ]

    derived = {}
    for target, left, operator, right in recipes:
        if operator == '*':
            derived[target] = df[left] * df[right]
        else:
            derived[target] = df[left] / df[right]

    # Column-wise concatenation keeps the original columns first.
    return pd.concat([df, pd.DataFrame(derived)], axis=1)



def _summarize_series_file(directory, file):
    """Read one partition's ``part-0.parquet`` and flatten its describe() table."""
    data = pd.read_parquet(os.path.join(directory, file, 'part-0.parquet'))
    if 'step' in data.columns:
        data = data.drop('step', axis=1)
    return data.describe().values.reshape(-1)


def load_and_process_data(directory):
    """Build one row of summary statistics per partition found in ``directory``.

    Every sub-directory is expected to be named ``<key>=<id>`` and to contain a
    ``part-0.parquet`` file. Each file is read in a worker thread, its ``step``
    column (if any) is dropped, and the ``DataFrame.describe()`` table is
    flattened into a single vector.

    The summary is computed inside the worker so that only the small
    statistics vector is retained per file; the raw series can be released as
    soon as it has been summarised instead of being held until every file has
    been loaded.

    Args:
        directory: Path of the folder holding the partition sub-directories.

    Returns:
        DataFrame with columns ``stat_0 .. stat_{k-1}`` plus ``id`` (the text
        after ``=`` in the partition name), one row per partition, ordered by
        partition name. When no partition is found, an empty DataFrame with
        only the ``id`` column is returned.
    """
    # Sorted for a reproducible row order (os.listdir order is arbitrary);
    # entries without '=' cannot provide an id and are skipped.
    files = sorted(name for name in os.listdir(directory) if '=' in name)
    if not files:
        return pd.DataFrame({'id': pd.Series(dtype=object)})

    with ThreadPoolExecutor() as executor:
        # executor.map yields results in the same order as ``files``.
        all_stats = list(tqdm(
            executor.map(lambda file: _summarize_series_file(directory, file), files),
            total=len(files),
        ))

    # Create a DataFrame for summary statistics
    stat_columns = [f"stat_{i}" for i in range(len(all_stats[0]))]
    summary_df = pd.DataFrame(all_stats, columns=stat_columns)
    summary_df['id'] = [file.split('=')[1] for file in files]  # Extract 'id' from filenames

    return summary_df



class SimpleAutoEncoder(nn.Module):
    """Small fully connected autoencoder.

    Encoder: ``input_dim -> 2 * encoding_dim -> encoding_dim`` with a ReLU
    after each linear layer, so codes are non-negative.
    Decoder: ``encoding_dim -> 2 * encoding_dim -> input_dim`` with a ReLU in
    the middle and a Sigmoid at the end, so reconstructions lie in [0, 1].

    NOTE(review): because of the final Sigmoid, inputs with values outside
    [0, 1] (e.g. standardized features) cannot be reconstructed exactly.

    Args:
        input_dim: Number of features per sample.
        encoding_dim: Width of the bottleneck (size of the learned code).
    """

    def __init__(self, input_dim, encoding_dim):
        super().__init__()
        hidden_dim = encoding_dim * 2

        encoder_layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, encoding_dim),
            nn.ReLU(),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        decoder_layers = [
            nn.Linear(encoding_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),
            nn.Sigmoid(),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Encode ``x`` to the bottleneck and decode it back to input width."""
        code = self.encoder(x)
        reconstruction = self.decoder(code)
        return reconstruction



def train_autoencoder(data, encoding_dim=10, epochs=20, batch_size=16):
    """Train a ``SimpleAutoEncoder`` on ``data`` and return the learned codes.

    The features are standardized with a fresh ``StandardScaler``, the network
    is trained to reconstruct them with MSE loss and Adam (default settings),
    walking through the rows in their given order in consecutive mini-batches.
    Every fifth epoch the loss of the last mini-batch is printed.

    Args:
        data: 2-D table of numeric features (rows are samples).
        encoding_dim: Size of the bottleneck, i.e. number of output columns.
        epochs: Number of passes over the data.
        batch_size: Number of consecutive rows per optimisation step.

    Returns:
        DataFrame of encoder outputs for every input row, with columns
        ``Enc_1 .. Enc_{encoding_dim}`` and a default integer index.
    """
    standardized = StandardScaler().fit_transform(data)
    inputs = torch.FloatTensor(standardized)

    model = SimpleAutoEncoder(input_dim=inputs.shape[1], encoding_dim=encoding_dim)
    reconstruction_loss = nn.MSELoss()
    adam = optim.Adam(model.parameters())

    batch_starts = range(0, len(inputs), batch_size)
    for epoch in range(epochs):
        for start in batch_starts:
            minibatch = inputs[start:start + batch_size]
            adam.zero_grad()
            loss = reconstruction_loss(model(minibatch), minibatch)
            loss.backward()
            adam.step()

        report_now = (epoch + 1) % 5 == 0
        if report_now:
            print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss.item():.4f}')

    # Encoding only: no gradients are needed once training is finished.
    with torch.no_grad():
        codes = model.encoder(inputs).numpy()

    code_columns = [f'Enc_{i+1}' for i in range(codes.shape[1])]
    return pd.DataFrame(codes, columns=code_columns)

    

def impute_missing_values(data, season_columns, season_mapping):
    """Encode season columns and fill missing numeric values with KNN imputation.

    Steps:
      1. Replace season labels by their code from ``season_mapping`` (values
         not found in the mapping, such as NaN, are left as they are).
      2. Standardize every ``float64`` / ``float32`` / ``int64`` column,
         impute missing entries with ``KNNImputer(n_neighbors=5)`` in that
         standardized space, and transform back to the original scale.
      3. If a ``sii`` column is present it is imputed like any other numeric
         column, then clipped to [0, 3] and rounded to an integer.
      4. Season columns are rounded and cast to ``int``.

    The caller's DataFrame is not modified; all work happens on a copy.

    Args:
        data: Input DataFrame.
        season_columns: Names of the columns holding season labels.
        season_mapping: Dict mapping a season label to its integer code.

    Returns:
        A new DataFrame with the same columns as ``data`` in which the numeric
        columns are imputed and the season columns are integers.
    """
    # Work on a private copy so the caller's frame keeps its original labels.
    imputed_data = data.copy()

    # Encode Seasons
    imputed_data[season_columns] = imputed_data[season_columns].map(
        lambda x: season_mapping.get(x, x)
    )

    # Identify numeric columns (encoded season columns are included here)
    numeric_cols = imputed_data.select_dtypes(include=['float64', 'float32', 'int64']).columns

    # Scale numeric features so that KNN distances are not dominated by
    # columns with large magnitudes
    scaler = StandardScaler()
    scaled_numeric = pd.DataFrame(
        scaler.fit_transform(imputed_data[numeric_cols]),
        columns=numeric_cols,
    )

    # Impute in the standardized space
    imputer = KNNImputer(n_neighbors=5)
    imputed_scaled_df = pd.DataFrame(
        imputer.fit_transform(scaled_numeric),
        columns=numeric_cols,
    )

    # Invert scaling to original scale; non-numeric columns are not touched
    imputed_data[numeric_cols] = scaler.inverse_transform(imputed_scaled_df)

    # Clip and convert 'sii' to integers
    if 'sii' in imputed_data.columns:
        imputed_data['sii'] = imputed_data['sii'].clip(lower=0, upper=3).round().astype(int)

    # Round before casting: the scale / inverse-scale round trip can turn an
    # exact code such as 3 into 2.999..., which a bare int cast would truncate
    # to 2, and imputed codes are neighbour averages rather than integers.
    imputed_data[season_columns] = imputed_data[season_columns].round().astype(int)

    return imputed_data



# Quadratic weighted kappa (QWK) metric
def quadratic_weighted_kappa(y_true, y_pred):
    """Return Cohen's kappa between ``y_true`` and ``y_pred`` with quadratic weights."""
    kappa = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return kappa



# Turn continuous predictions into the classes 0..3 using three cut points
def threshold_rounder(predictions, thresholds):
    """Map continuous predictions to the labels 0, 1, 2 or 3.

    The thresholds are tested in order and the first one a prediction falls
    strictly below decides its label (0 for ``thresholds[0]``, 1 for
    ``thresholds[1]``, 2 for ``thresholds[2]``). Predictions below none of
    them get label 3.

    Args:
        predictions: NumPy array (or array-like supporting ``<``) of scores.
        thresholds: Sequence with at least three cut points.

    Returns:
        Integer array with the same shape as ``predictions``.
    """
    below_first = predictions < thresholds[0]
    below_second = predictions < thresholds[1]
    below_third = predictions < thresholds[2]

    # Build from the innermost decision outwards so earlier thresholds win.
    upper_labels = np.where(below_third, 2, 3)
    middle_labels = np.where(below_second, 1, upper_labels)
    return np.where(below_first, 0, middle_labels)



# Search for the cut points that maximise QWK
def optimize_qwk_thresholds(predictions, y_true):
    """Tune the three rounding thresholds to maximise quadratic weighted kappa.

    A Nelder-Mead search starts from ``[0.5, 1.5, 2.5]`` and minimises the
    negated QWK obtained after rounding ``predictions`` with the candidate
    thresholds.

    Args:
        predictions: Continuous predictions to be rounded.
        y_true: True labels the rounded predictions are scored against.

    Returns:
        The optimiser's solution vector when it reports success, otherwise
        the starting thresholds ``[0.5, 1.5, 2.5]``.
    """
    def negative_qwk(candidate):
        labels = threshold_rounder(predictions, candidate)
        return -quadratic_weighted_kappa(y_true, labels)

    search = minimize(negative_qwk, x0=[0.5, 1.5, 2.5], method='Nelder-Mead')
    if not search.success:
        return [0.5, 1.5, 2.5]
    return search.x



from sklearn.preprocessing import StandardScaler



# Model
def train_and_evaluate(train, test, sample_submission, n_splits=5, random_state=42, model=None):
    """Cross-validate a regressor on ``sii`` and build the submission table.

    For every stratified fold the features are standardized (scaler fitted on
    the fold's training part only), the model is refitted, and predictions are
    stored for the held-out rows and for the test set. The out-of-fold
    predictions are then used to tune the rounding thresholds, the resulting
    QWK is printed, and the fold-averaged test predictions are rounded with
    the same thresholds.

    Args:
        train: DataFrame with the feature columns and the target column ``sii``.
        test: DataFrame with the same feature columns as ``train`` minus ``sii``.
        sample_submission: DataFrame whose ``id`` column is copied to the output.
        n_splits: Number of stratified folds.
        random_state: Seed for the fold shuffling.
        model: Estimator with ``fit`` / ``predict``. When omitted, the
            module-level ``voting_regressor`` is used, as before.

    Returns:
        DataFrame with the columns ``id`` and ``sii`` (integer class labels).
    """
    if model is None:
        # Backward-compatible default: the ensemble defined at module level.
        model = voting_regressor

    X = train.drop(['sii'], axis=1)
    y = train['sii']

    # Arrays to store out-of-fold predictions and per-fold test predictions
    oof_preds = np.zeros(len(y))
    test_preds = np.zeros((len(test), n_splits))

    # Set up cross-validation
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f"Training fold {fold + 1}/{n_splits}...")

        # Split the data into training and validation sets
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train = y.iloc[train_idx]

        # Fit the scaler on the training part only, then apply it everywhere
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_val_scaled = scaler.transform(X_val)
        test_scaled = scaler.transform(test)

        # Refit the model on this fold's scaled training data
        model.fit(X_train_scaled, y_train)

        # Generate predictions for the validation and test sets
        oof_preds[val_idx] = model.predict(X_val_scaled)
        test_preds[:, fold] = model.predict(test_scaled)

    # Optimize thresholds based on out-of-fold predictions
    optimal_thresholds = optimize_qwk_thresholds(oof_preds, y)
    oof_preds_rounded = threshold_rounder(oof_preds, optimal_thresholds)

    # Calculate and print optimized QWK score
    qwk_score = quadratic_weighted_kappa(y, oof_preds_rounded)
    print(f"Optimized QWK: {qwk_score:.4f}")

    # Average test predictions and apply optimized thresholds for final submission
    final_test_preds = threshold_rounder(test_preds.mean(axis=1), optimal_thresholds)

    # Prepare submission DataFrame
    submission = pd.DataFrame({
        'id': sample_submission['id'],  # Use 'id' from sample submission
        'sii': final_test_preds.astype(int)  # Integer labels for submission
    })

    return submission

In [None]:
# Load Data

train_og = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
test_og = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
sample_submission = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv')

# Load actigraphy time series data (one row of summary statistics per id)
train_ts = load_and_process_data("/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet")
test_ts = load_and_process_data("/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet")

df_train = train_ts.drop('id', axis=1)
df_test = test_ts.drop('id', axis=1)

# Autoencode Data
# Fit ONE autoencoder on the stacked train + test statistics and split the
# codes afterwards. Training a separate network per split gives each split its
# own randomly initialised latent space, so column Enc_k of the train set would
# be unrelated to column Enc_k of the test set and a model fitted on one could
# not be applied meaningfully to the other.
n_train_series = len(df_train)
ts_encoded = train_autoencoder(
    pd.concat([df_train, df_test], ignore_index=True),
    encoding_dim=60, epochs=100, batch_size=32,
)
train_ts_encoded = ts_encoded.iloc[:n_train_series].reset_index(drop=True)
test_ts_encoded = ts_encoded.iloc[n_train_series:].reset_index(drop=True)

time_series_cols = train_ts_encoded.columns.tolist()

# Add 'id' back to the encoded DataFrames (positional, rows are in the same order)
train_ts_encoded["id"] = train_ts["id"].to_numpy()
test_ts_encoded['id'] = test_ts["id"].to_numpy()

# Merge Data
train = pd.merge(train_og, train_ts_encoded, how="left", on='id')
test = pd.merge(test_og, test_ts_encoded, how="left", on='id')

# Impute Missing Data
# Define season mapping
season_mapping = {'Spring': 1, 'Summer': 2, 'Fall': 3, 'Winter': 4}

# For the train set
season_columns_train = [col for col in train.columns if 'Season' in col]
train_imputed = impute_missing_values(train, season_columns_train, season_mapping)

# For the test set
season_columns_test = [col for col in test.columns if 'Season' in col]
test_imputed = impute_missing_values(test, season_columns_test, season_mapping)

# Perform feature engineering
train_imputed = feature_engineering(train_imputed)
train_imputed.dropna(thresh=1, axis=0, inplace=True)
train_imputed.replace([np.inf, -np.inf], 0, inplace=True)

test_imputed = feature_engineering(test_imputed)
# Treat the test set exactly like the train set: ratios with a zero
# denominator would otherwise leave infinities behind.
test_imputed.replace([np.inf, -np.inf], 0, inplace=True)

# Get the columns from both DataFrames
train_cols = set(train_og.columns)
test_cols = set(test_og.columns)

# Find common columns
common_cols = train_cols.intersection(test_cols)

# Walk the columns in train.csv order instead of iterating the set, so the
# feature order (and therefore every order-sensitive step downstream) is the
# same on every run.
featuresCols = [col for col in train_og.columns if col in common_cols and col != 'id']
featuresCols += time_series_cols
# NOTE(review): the columns added by feature_engineering are not part of
# featuresCols, so they never reach the model — confirm whether that is intended.

test_imputed = test_imputed[featuresCols]
featuresCols.append('sii')
train_imputed = train_imputed[featuresCols]

# Split the dataset into features and target variable
X = train_imputed.drop('sii', axis=1)
y = train_imputed['sii']  # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
test_scaled = scaler.transform(test_imputed)

# Apply PCA
pca = PCA(n_components=0.95)  # Preserve 95% of variance
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)
test_pca = pca.transform(test_scaled)

In [None]:
# Hyperparameters for the four base regressors
xgb_params = dict(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=6,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0,
    reg_alpha=1,
    reg_lambda=5,
    random_state=42,
)
cat_params = dict(
    iterations=200,
    learning_rate=0.05,
    depth=6,
    l2_leaf_reg=10,
    subsample=0.8,
    rsm=0.8,
    border_count=32,
    random_state=42,
    silent=True,
)
rf_params = dict(
    n_estimators=200,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=True,
    random_state=42,
)
gb_params = dict(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=3,
    min_samples_split=2,
    min_samples_leaf=1,
    subsample=1.0,
    random_state=42,
)

# Base models
rf_model = RandomForestRegressor(**rf_params)
xgb_model = XGBRegressor(**xgb_params)
cat_model = CatBoostRegressor(**cat_params)
gb_model = GradientBoostingRegressor(**gb_params)

# Ensemble that combines the four regressors' predictions
ensemble_members = [
    ('rf', rf_model),
    ('xgboost', xgb_model),
    ('catboost', cat_model),
    ('gb', gb_model),
]
voting_regressor = VotingRegressor(estimators=ensemble_members)

# Cross-validate the ensemble (features are scaled inside each fold) and
# build the submission table
submission = train_and_evaluate(train_imputed, test_imputed, sample_submission)

# Display the submission DataFrame
submission

In [None]:
# Write the submission table to the Kaggle working directory, without the
# index column, then show it in the notebook output
submission.to_csv(path_or_buf='/kaggle/working/submission.csv', index=False)
print("Submission file created: submission.csv")

submission