In [8]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

In [2]:
df = pd.read_csv("train.csv")

In [5]:
ORDER = [
    'Insufficient_Weight',
    'Normal_Weight',
    'Overweight_Level_I',
    'Overweight_Level_II',
    'Obesity_Type_I',
    'Obesity_Type_II',
    'Obesity_Type_III'
]

In [6]:
def to_broad(cat: str) -> str:
    if cat == 'Insufficient_Weight':
        return 'Underweight'
    if cat == 'Normal_Weight':
        return 'Normal'
    if cat.startswith('Overweight'):
        return 'Overweight'
    return 'Obese'  # all Obesity_* go here

# --- BMI (meters). If Height is in cm, use (Height/100)**2 ---
df['BMI'] = df['Weight'] / (df['Height'] ** 2)
# If Height is CM, replace the above with:
# df['BMI'] = df['Weight'] / ((df['Height']/100.0) ** 2)

# --- Broad category for Stage 1 ---
df['BroadCategory'] = df['WeightCategory'].apply(to_broad)

# --- Split by gender ---
def gender_slice(df_, gender_value):
    return df_[df_['Gender'] == gender_value].copy()

male_df   = gender_slice(df, 'Male')
female_df = gender_slice(df, 'Female')

In [7]:
# -----------------------------
# 1) Preprocessing config
# -----------------------------
# Numeric and categorical columns used by the models
num_cols = ['Age', 'Height', 'Weight', 'BMI', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
cat_cols = ['family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']
FEATURES = cat_cols + num_cols

def make_preprocess():
    """Fresh ColumnTransformer per pipeline to avoid reuse side-effects."""
    return ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore', sparse=False), cat_cols),
            # Impute numerics to be safe (and pass through numeric values)
            ('num', Pipeline(steps=[('imp', SimpleImputer(strategy='median'))]), num_cols),
        ],
        remainder='drop'
    )

def make_broad_pipeline():
    return Pipeline(steps=[
        ('prep', make_preprocess()),
        ('clf', RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1))
    ])

def make_over_pipeline():
    return Pipeline(steps=[
        ('prep', make_preprocess()),
        ('clf', RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1, class_weight='balanced'))
    ])

def make_obese_pipeline():
    return Pipeline(steps=[
        ('prep', make_preprocess()),
        ('clf', RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1, class_weight='balanced'))
    ])


In [9]:
# -----------------------------
# 2) Train function per gender
# -----------------------------
def train_hierarchical_for_gender(gdf):
    # Fresh, independent pipelines for this gender
    broad_pipe = make_broad_pipeline()
    over_pipe  = make_over_pipeline()
    obese_pipe = make_obese_pipeline()

    # --- Stage 1: Broad ---
    X = gdf.drop(columns=['id', 'Gender', 'WeightCategory', 'BroadCategory'], errors='ignore')
    y_broad = gdf['BroadCategory']

    # Sanity: ensure features exist
    missing = (set(FEATURES) - set(X.columns))
    if missing:
        raise ValueError(f"Missing required feature columns: {missing}")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y_broad, test_size=0.2, stratify=y_broad, random_state=42
    )
    broad_pipe.fit(X_train, y_train)
    y_pred_broad = broad_pipe.predict(X_test)
    broad_acc = accuracy_score(y_test, y_pred_broad)
    print("\n=== Stage 1 (Broad) ===")
    print(classification_report(y_test, y_pred_broad))
    print("Broad Accuracy:", round(broad_acc, 4))

    # --- Stage 2: Overweight (Level I vs II) ---
    over_labels = ['Overweight_Level_I', 'Overweight_Level_II']
    over_df = gdf[(gdf['BroadCategory'] == 'Overweight') & (gdf['WeightCategory'].isin(over_labels))].copy()
    over_acc = None
    if len(over_df) > 1 and over_df['WeightCategory'].nunique() == 2:
        X_over = over_df.drop(columns=['id', 'Gender', 'WeightCategory', 'BroadCategory'], errors='ignore')
        y_over = over_df['WeightCategory']
        Xo_tr, Xo_te, yo_tr, yo_te = train_test_split(X_over, y_over, test_size=0.2, stratify=y_over, random_state=42)
        over_pipe.fit(Xo_tr, yo_tr)
        yo_pred = over_pipe.predict(Xo_te)
        over_acc = accuracy_score(yo_te, yo_pred)
        print("\n=== Stage 2 (Overweight: I vs II) ===")
        print(classification_report(yo_te, yo_pred, labels=over_labels, target_names=over_labels))
        print("Overweight Accuracy:", round(over_acc, 4))
    else:
        print("\n=== Stage 2 (Overweight) ===")
        print("Not enough variation to train (need both Level I and Level II).")

    # --- Stage 2: Obese (Type I / II / III) ---
    obese_labels = ['Obesity_Type_I', 'Obesity_Type_II', 'Obesity_Type_III']
    obese_df = gdf[(gdf['BroadCategory'] == 'Obese') & (gdf['WeightCategory'].isin(obese_labels))].copy()
    obese_acc = None
    if len(obese_df) > 1 and obese_df['WeightCategory'].nunique() >= 2:
        X_ob = obese_df.drop(columns=['id', 'Gender', 'WeightCategory', 'BroadCategory'], errors='ignore')
        y_ob = obese_df['WeightCategory']
        Xb_tr, Xb_te, yb_tr, yb_te = train_test_split(X_ob, y_ob, test_size=0.2, stratify=y_ob, random_state=42)
        obese_pipe.fit(Xb_tr, yb_tr)
        yb_pred = obese_pipe.predict(Xb_te)
        present = [c for c in obese_labels if c in yb_te.unique()]
        obese_acc = accuracy_score(yb_te, yb_pred)
        print("\n=== Stage 2 (Obese: I/II/III) ===")
        print(classification_report(yb_te, yb_pred, labels=present, target_names=present))
        print("Obese Accuracy:", round(obese_acc, 4))
    else:
        print("\n=== Stage 2 (Obese) ===")
        print("Not enough variation to train (need at least two Obesity types).")

    return broad_pipe, over_pipe, obese_pipe, broad_acc, over_acc, obese_acc


In [10]:
# -----------------------------
# 3) Train models per gender
# -----------------------------
print("------ MALE MODELS ------")
male_broad, male_over, male_obese, male_broad_acc, male_over_acc, male_obese_acc = train_hierarchical_for_gender(male_df)

print("\n\n------ FEMALE MODELS ------")
female_broad, female_over, female_obese, female_broad_acc, female_over_acc, female_obese_acc = train_hierarchical_for_gender(female_df)

print("\n=== Summary ===")
print(f"Male   -> Broad: {male_broad_acc:.3f}, Over: {male_over_acc}, Obese: {male_obese_acc}")
print(f"Female -> Broad: {female_broad_acc:.3f}, Over: {female_over_acc}, Obese: {female_obese_acc}")


------ MALE MODELS ------

=== Stage 1 (Broad) ===
              precision    recall  f1-score   support

      Normal       0.86      0.86      0.86       215
       Obese       0.98      0.97      0.97       731
  Overweight       0.92      0.92      0.92       475
 Underweight       0.89      0.92      0.91       136

    accuracy                           0.94      1557
   macro avg       0.91      0.92      0.91      1557
weighted avg       0.94      0.94      0.94      1557

Broad Accuracy: 0.9351

=== Stage 2 (Overweight: I vs II) ===
                     precision    recall  f1-score   support

 Overweight_Level_I       0.91      0.86      0.88       211
Overweight_Level_II       0.89      0.93      0.91       265

           accuracy                           0.90       476
          macro avg       0.90      0.89      0.90       476
       weighted avg       0.90      0.90      0.90       476

Overweight Accuracy: 0.8971

=== Stage 2 (Obese: I/II/III) ===
                  pr

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



=== Stage 1 (Broad) ===
              precision    recall  f1-score   support

      Normal       0.87      0.89      0.88       255
       Obese       0.97      0.98      0.98       788
  Overweight       0.89      0.87      0.88       270
 Underweight       0.94      0.93      0.94       237

    accuracy                           0.94      1550
   macro avg       0.92      0.92      0.92      1550
weighted avg       0.94      0.94      0.94      1550

Broad Accuracy: 0.9387

=== Stage 2 (Overweight: I vs II) ===
                     precision    recall  f1-score   support

 Overweight_Level_I       0.91      0.84      0.87       159
Overweight_Level_II       0.79      0.88      0.83       111

           accuracy                           0.86       270
          macro avg       0.85      0.86      0.85       270
       weighted avg       0.86      0.86      0.86       270

Overweight Accuracy: 0.8556

=== Stage 2 (Obese: I/II/III) ===
                  precision    recall  f1-scor

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
# -----------------------------
# 4) Inference helpers
# -----------------------------
def build_feature_frame(df_like: pd.DataFrame) -> pd.DataFrame:
    """Ensure exact training features (order + dtypes) for the pipelines."""
    X = df_like.reindex(columns=FEATURES, fill_value=np.nan).copy()
    for c in cat_cols:
        X[c] = X[c].astype('object')
    for c in num_cols:
        X[c] = pd.to_numeric(X[c], errors='coerce')
    return X

def predict_for_gender(test_part: pd.DataFrame, broad_model, over_model, obese_model) -> pd.Series:
    """Batch prediction for a gender-specific subset with Stage-2 routing."""
    X = build_feature_frame(test_part)

    # Stage 1
    broad_pred = broad_model.predict(X)
    final_pred = pd.Series(broad_pred, index=test_part.index).copy()

    # Stage 2: Overweight -> Level I vs II
    over_idx = final_pred[final_pred == 'Overweight'].index
    if len(over_idx) > 0:
        X_over = X.loc[over_idx]
        try:
            final_pred.loc[over_idx] = over_model.predict(X_over)
        except Exception:
            final_pred.loc[over_idx] = 'Overweight_Level_I'

    # Stage 2: Obese -> Type I/II/III
    obese_idx = final_pred[final_pred == 'Obese'].index
    if len(obese_idx) > 0:
        X_ob = X.loc[obese_idx]
        try:
            final_pred.loc[obese_idx] = obese_model.predict(X_ob)
        except Exception:
            final_pred.loc[obese_idx] = 'Obesity_Type_I'

    # Map broad Underweight/Normal to final fine labels
    final_pred = final_pred.replace({
        'Underweight': 'Insufficient_Weight',
        'Normal': 'Normal_Weight'
    })
    return final_pred


In [12]:
# -----------------------------
# 5) Load test and produce Kaggle submission
# -----------------------------
test = pd.read_csv('test.csv').copy()

# Compute BMI for test the same way as train (meters here)
test['BMI'] = test['Weight'] / (test['Height'] ** 2)
# If Height is CM, use:
# test['BMI'] = test['Weight'] / ((test['Height']/100.0) ** 2)

# Fix common typo if present
if 'CH20' in test.columns and 'CH2O' not in test.columns:
    test['CH2O'] = test['CH20']

# Split by gender for routing
male_test   = test[test['Gender'] == 'Male']
female_test = test[test['Gender'] == 'Female']

# Predict
male_pred   = predict_for_gender(male_test,   male_broad,   male_over,   male_obese)
female_pred = predict_for_gender(female_test, female_broad, female_over, female_obese)

# Combine predictions in original order
all_pred = pd.concat([male_pred, female_pred]).sort_index()

# Build submission
submission = pd.DataFrame({
    'id': test['id'].values,
    'WeightCategory': all_pred.values
})

# Good hygiene: enforce label set/order
submission['WeightCategory'] = pd.Categorical(submission['WeightCategory'], categories=ORDER, ordered=True)

# Save
submission.to_csv('sample_submission.csv', index=False)
print(submission.head(), "\nSaved as sample_submission.csv")

      id       WeightCategory
0  15533     Obesity_Type_III
1  15534   Overweight_Level_I
2  15535  Overweight_Level_II
3  15536      Obesity_Type_II
4  15537        Normal_Weight 
Saved as sample_submission.csv


In [3]:
from sklearn.base import clone
from copy import deepcopy

# keep your cat_cols / num_cols lists
cat_cols = [
    'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE',
    'SCC', 'CALC', 'MTRANS'
]
num_cols = ['Age', 'Height', 'Weight', 'BMI', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']

def make_preprocess():
    return ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore', sparse=False), cat_cols),
            ('num', 'passthrough', num_cols),
        ],
        remainder='drop'
    )

def make_broad_pipeline():
    return Pipeline([
        ('prep', make_preprocess()),
        ('clf', RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42))
    ])

def make_over_pipeline():
    return Pipeline([
        ('prep', make_precess()),
        ('clf', RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42, class_weight='balanced'))
    ])

def make_obese_pipeline():
    return Pipeline([
        ('prep', make_preprocess()),
        ('clf', RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42, class_weight='balanced'))
    ])


In [4]:
def train_hierarchical_for_gender(gdf, label_order=ORDER):
    # fresh, independent pipelines
    broad_pipe = make_broad_pipeline()
    over_pipe  = make_over_pipeline()
    obese_pipe = make_obese_pipeline()

    # --- Stage 1: Broad ---
    X = gdf.drop(columns=['id','Gender','WeightCategory','BroadCategory'], errors='ignore')
    y_broad = gdf['BroadCategory']

    for c in (set(cat_cols + num_cols) - set(X.columns)):
        raise ValueError(f"Missing required feature column: {c}")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y_broad, test_size=0.2, stratify=y_broad, random_state=42
    )

    broad_pipe.fit(X_train, y_train)
    y_pred_broad = broad_pipe.predict(X_test)
    broad_acc = accuracy_score(y_test, y_pred_broad)
    print("\n=== Stage 1 (Broad) ===")
    print(classification_report(y_test, y_pred_broad))
    print("Broad Accuracy:", broad_acc)

    # --- Stage 2: Overweight (I vs II) ---
    over_labels = ['Overweight_Level_I','Overweight_Level_II']
    over_df = gdf[(gdf['BroadCategory']=='Overweight') & (gdf['WeightCategory'].isin(over_labels))].copy()

    over_acc = None
    if len(over_df) > 1 and over_df['WeightCategory'].nunique()==2:
        X_over = over_df.drop(columns=['id','Gender','WeightCategory','BroadCategory'], errors='ignore')
        y_over = over_df['WeightCategory']
        Xo_tr, Xo_te, yo_tr, yo_te = train_test_split(X_over, y_over, test_size=0.2, stratify=y_over, random_state=42)
        over_pipe.fit(Xo_tr, yo_tr)
        yo_pred = over_pipe.predict(Xo_te)
        over_acc = accuracy_score(yo_te, yo_pred)
        print("\n=== Stage 2 (Overweight: I vs II) ===")
        print(classification_report(yo_te, yo_pred, labels=over_labels, target_names=over_labels))
        print("Overweight Accuracy:", over_acc)
    else:
        print("\n=== Stage 2 (Overweight) ===")
        print("Not enough variation to train (need both Level I and Level II).")

    # --- Stage 2: Obese (I/II/III) ---
    obese_labels = ['Obesity_Type_I','Obesity_Type_II','Obesity_Type_III']
    obese_df = gdf[(gdf['BroadCategory']=='Obese') & (gdf['WeightCategory'].isin(obese_labels))].copy()

    obese_acc = None
    if len(obese_df) > 1 and obese_df['WeightCategory'].nunique()>=2:
        X_ob = obese_df.drop(columns=['id','Gender','WeightCategory','BroadCategory'], errors='ignore')
        y_ob = obese_df['WeightCategory']
        Xb_tr, Xb_te, yb_tr, yb_te = train_test_split(X_ob, y_ob, test_size=0.2, stratify=y_ob, random_state=42)
        obese_pipe.fit(Xb_tr, yb_tr)
        yb_pred = obese_pipe.predict(Xb_te)
        present = [c for c in obese_labels if c in yb_te.unique()]
        obese_acc = accuracy_score(yb_te, yb_pred)
        print("\n=== Stage 2 (Obese: I/II/III) ===")
        print(classification_report(yb_te, yb_pred, labels=present, target_names=present))
        print("Obese Accuracy:", obese_acc)
    else:
        print("\n=== Stage 2 (Obese) ===")
        print("Not enough variation to train (need at least two Obesity types).")

    return broad_pipe, over_pipe, obese_pipe, broad_acc, over_acc, obese_acc


NameError: name 'ORDER' is not defined