In [1]:
#same preprocessing as Hack new dataset and Hack_new 2 (using XGBoost)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the three competition files: labelled training rows, unlabelled test
# rows and the sample submission.
df_train = pd.read_csv('Train_Data2.csv')
df_test = pd.read_csv('Test_Data2.csv')
df_samp = pd.read_csv('Sample_Submission2.csv')

# Rows with a missing target cannot be used for supervised training.
df_train = df_train.dropna(subset=['age_group'])

# Encode the target labels as integers.
dic = {"Adult": 0, "Senior": 1}
df_train['age_group'] = df_train['age_group'].map(dic)

# Modelling frame: an independent copy of df_train without the SEQN column.
train = df_train.drop(columns=['SEQN'])

numeric_cols = ['BMXBMI', 'LBXGLU', 'LBXGLT', 'LBXIN']
cat_cols = ['RIAGENDR', 'PAQ605', 'DIQ010']

# Fill values are derived from the training frame only (median for numeric
# columns, most frequent value for categorical ones) and then reused for the
# test frame so both are imputed with the same statistics.
fill_values = {name: train[name].median() for name in numeric_cols}
fill_values.update({name: train[name].mode()[0] for name in cat_cols})

train = train.fillna(value=fill_values)
df_test = df_test.fillna(value=fill_values)

df_test = df_test.drop(columns=['SEQN'])

#Winsorization
def get_outlier_bounds(series):
    """Return the ``(lower, upper)`` IQR-based outlier limits of ``series``.

    The lower limit is 1.5 interquartile ranges below the 25th percentile and
    the upper limit is 1.5 interquartile ranges above the 75th percentile.
    """
    first_quartile, third_quartile = series.quantile([0.25, 0.75])
    spread = third_quartile - first_quartile
    return first_quartile - 1.5 * spread, third_quartile + 1.5 * spread

def cap_with_bounds(df, column, lower, upper):
    """Clip ``df[column]`` to the range ``[lower, upper]`` and store it back in ``df``.

    The frame passed in is modified; nothing is returned.
    """
    clipped = df[column].clip(lower=lower, upper=upper)
    df[column] = clipped

numeric_cols = ['BMXBMI', 'LBXGLU', 'LBXGLT', 'LBXIN']

# Clip each numeric feature to IQR-based bounds. The bounds are computed on
# the training frame and the same bounds are applied to the test frame.
for col in numeric_cols:
    lower, upper = get_outlier_bounds(train[col])
    cap_with_bounds(train, col, lower, upper)
    cap_with_bounds(df_test, col, lower, upper)

categorical_cols = ['RIAGENDR', 'PAQ605', 'DIQ010']
train = pd.get_dummies(train, columns=categorical_cols, drop_first=True)
# Encode ALL levels of the test frame (no drop_first). With drop_first=True the
# dropped reference level is whichever level happens to be lowest in *that*
# frame, so a test set missing train's lowest level would drop a different
# column and the reindex below would silently zero-fill the wrong dummy.
# Reindexing to train's columns removes the reference-level column instead.
df_test = pd.get_dummies(df_test, columns=categorical_cols)

# Ensure same columns in test: keep exactly train's feature columns, in train's
# order; levels never seen in test become all-zero columns, and levels present
# only in test are discarded.
df_test = df_test.reindex(columns=train.columns.drop('age_group'), fill_value=0)

# Ratio feature; the +1 in the denominator avoids dividing by zero when
# LBXIN is 0.
train['GLU_INS_RATIO'] = train['LBXGLU'] / (train['LBXIN'] + 1)
df_test['GLU_INS_RATIO'] = df_test['LBXGLU'] / (df_test['LBXIN'] + 1)

def bmi_category(bmi):
    """Return the weight-class label for a BMI value.

    Below 18.5 is 'Underweight', below 25 is 'Healthy', below 30 is
    'Overweight'; anything else (including values that compare False against
    every threshold, such as NaN) is 'Obese'.
    """
    upper_limits = (
        (18.5, 'Underweight'),
        (25, 'Healthy'),
        (30, 'Overweight'),
    )
    for limit, label in upper_limits:
        if bmi < limit:
            return label
    return 'Obese'

# Derive a categorical BMI band for every row of both frames.
train['BMI_Category'] = train['BMXBMI'].apply(bmi_category)
df_test['BMI_Category'] = df_test['BMXBMI'].apply(bmi_category)

# One-hot encode
train = pd.get_dummies(train, columns=['BMI_Category'], drop_first=True)
# Encode ALL bands for the test frame (no drop_first): the reference band that
# drop_first removes depends on which bands occur in the frame being encoded,
# so encoding test independently could drop a different band than train did.
# The reindex below keeps exactly train's dummy columns instead.
df_test = pd.get_dummies(df_test, columns=['BMI_Category'])

# Align columns: same feature columns and order as train; bands absent from
# test become all-zero columns.
df_test = df_test.reindex(columns=train.columns.drop('age_group'), fill_value=0)

# Interaction between the PAQ605 == 1.0 indicator and BMI.
# NOTE(review): the PAQ605 dummies were created with drop_first=True, which
# removes the lowest level. If 1.0 is the lowest PAQ605 level, 'PAQ605_1.0'
# never exists and this feature is silently skipped — confirm against
# train.columns.
if 'PAQ605_1.0' in train.columns:
    train['Activity_x_BMI'] = train['PAQ605_1.0'] * train['BMXBMI']
    df_test['Activity_x_BMI'] = df_test['PAQ605_1.0'] * df_test['BMXBMI']

In [2]:
# --- Imports ---
import pandas as pd
import numpy as np
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, confusion_matrix

# Single seed shared by the split and every estimator below.
RANDOM_STATE = 42

# --- Data: target vector and feature matrix ---
y = train['age_group']
X = train.drop(columns=['age_group'])

# --- Hold out 20% of the rows (stratified on the target) for evaluation ---
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

# --- Base learners (all tree-based, so no feature scaling is applied) ---
base_models = [
    ('dt', DecisionTreeClassifier(max_depth=5, random_state=RANDOM_STATE)),
    (
        'rf',
        RandomForestClassifier(
            n_estimators=100,
            class_weight='balanced',
            random_state=RANDOM_STATE,
        ),
    ),
    (
        'gb',
        GradientBoostingClassifier(
            n_estimators=100,
            learning_rate=0.1,
            random_state=RANDOM_STATE,
        ),
    ),
]

# --- Meta learner that combines the base learners' outputs ---
meta_model = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    random_state=RANDOM_STATE,
)

# --- Stacked ensemble: 5-fold internal CV, all CPU cores ---
stack_clf = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,
    n_jobs=-1,
)

# --- Fit on the training split, then score the held-out split ---
stack_clf.fit(X_train, y_train)
y_val_pred = stack_clf.predict(X_val)

print("F1 Score:", f1_score(y_val, y_val_pred))
print("Classification Report:\n", classification_report(y_val, y_val_pred))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))

# --- Predict the test rows and write the submission file ---
# NOTE(review): the model used here is the one fitted on the 80% training
# split; it is not refitted on all labelled rows before predicting.
test_preds = stack_clf.predict(df_test)

submission = pd.DataFrame({'age_group': test_preds})
submission.to_csv("Stacking_Without_Scaling_Submission.csv", index=False)


F1 Score: 0.32142857142857145
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.76      0.81       328
           1       0.26      0.43      0.32        63

    accuracy                           0.71       391
   macro avg       0.57      0.60      0.57       391
weighted avg       0.77      0.71      0.73       391

Confusion Matrix:
 [[250  78]
 [ 36  27]]


<b>Public Score: 42.86</b>