In [19]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [36]:
df = pd.read_csv("train.csv")

In [37]:
order = [
    'Insufficient_Weight',
    'Normal_Weight',
    'Overweight_Level_I',
    'Overweight_Level_II',
    'Obesity_Type_I',
    'Obesity_Type_II',
    'Obesity_Type_III'
]

In [38]:
def broad_category(cat):
    if cat == 'Insufficient_Weight':
        return 'Underweight'
    elif cat == 'Normal_Weight':
        return 'Normal'
    elif cat.startswith('Overweight'):
        return 'Overweight'
    else:
        return 'Obese'

df['BroadCategory'] = df['WeightCategory'].apply(broad_category)


In [39]:
df['BMI'] = df['Weight'] / (df['Height'] ** 2)

In [32]:
male_df = df[df['Gender'].str.lower() == 'male'].reset_index(drop=True)
female_df = df[df['Gender'].str.lower() == 'female'].reset_index(drop=True)

In [33]:
male_df.drop(['id', 'Gender'], axis=1, inplace=True)
female_df.drop(['id', 'Gender'], axis=1, inplace=True)
df.drop(['id', 'Gender'], axis=1, inplace=True)

In [34]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

X = df.drop(['WeightCategory','BroadCategory'], axis=1)
y = df['BroadCategory']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

broad_model = RandomForestClassifier(random_state=42)
broad_model.fit(X_train, y_train)


ValueError: could not convert string to float: 'no'

In [None]:
# Overweight subset
over_df = df[df['BroadCategory'] == 'Overweight']
X_over = over_df.drop(['WeightCategory','BroadCategory'], axis=1)
y_over = over_df['WeightCategory']
over_model = RandomForestClassifier(random_state=42).fit(X_over, y_over)

# Obese subset
obese_df = df[df['BroadCategory'] == 'Obese']
X_ob = obese_df.drop(['WeightCategory','BroadCategory'], axis=1)
y_ob = obese_df['WeightCategory']
ob_model = RandomForestClassifier(random_state=42).fit(X_ob, y_ob)


In [8]:
from sklearn.preprocessing import LabelEncoder

cat_cols = ['family_history_with_overweight', 'FAVC', 'CAEC', 
            'SMOKE', 'SCC', 'CALC', 'MTRANS']

le = LabelEncoder()
for col in cat_cols:
    df[col] = le.fit_transform(df[col])


In [9]:
from pandas.api.types import CategoricalDtype

weight_order = CategoricalDtype(categories=order, ordered=True)
df['WeightCategory'] = df['WeightCategory'].astype(weight_order)
df['WeightCategory'] = df['WeightCategory'].cat.codes


In [8]:
# Ensure the WeightCategory column is categorical with the specified order
df_male['WeightCategory'] = pd.Categorical(df_male['WeightCategory'], categories=order, ordered=True)

# Get counts according to the specified order
counts = df_male['WeightCategory'].value_counts().reindex(order)

print(counts)


WeightCategory
Insufficient_Weight     682
Normal_Weight          1072
Overweight_Level_I     1051
Overweight_Level_II    1325
Obesity_Type_I         1252
Obesity_Type_II        2397
Obesity_Type_III          4
Name: count, dtype: int64


In [41]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier

# -----------------------------
# 0) Assumptions / Setup
# -----------------------------
# df = ...  # your dataframe
# Columns you shared:
# ['id','Gender','Age','Height','Weight','family_history_with_overweight','FAVC','FCVC','NCP','CAEC','SMOKE','CH2O','SCC','FAF','TUE','CALC','MTRANS','WeightCategory']

ORDER = [
    'Insufficient_Weight',
    'Normal_Weight',
    'Overweight_Level_I',
    'Overweight_Level_II',
    'Obesity_Type_I',
    'Obesity_Type_II',
    'Obesity_Type_III'
]

def to_broad(cat: str) -> str:
    if cat == 'Insufficient_Weight':
        return 'Underweight'
    if cat == 'Normal_Weight':
        return 'Normal'
    if cat.startswith('Overweight'):
        return 'Overweight'
    return 'Obese'  # all Obesity_* go here

# -----------------------------
# 1) Basic feature engineering
# -----------------------------
df = df.copy()

# Create BMI safely
# df['BMI'] = df['Weight'] / ((df['Height'] / 100.0)**2)

# Broad category for Stage 1
df['BroadCategory'] = df['WeightCategory'].apply(to_broad)

# -----------------------------
# 2) Split by gender (two separate model families)
# -----------------------------
def prepare_gender_df(df, gender_value):
    gdf = df[df['Gender'] == gender_value].copy()
    # Drop id and Gender from features
    # Keep BMI; you may keep Height/Weight too if you like—here we keep them
    # Target columns remain for later splits
    return gdf

male_df = prepare_gender_df(df, 'Male')
female_df = prepare_gender_df(df, 'Female')

# -----------------------------
# 3) Common preprocessing: encode categoricals via ColumnTransformer
# -----------------------------
# Numerical (already numeric):
num_cols = ['Age', 'Height', 'Weight', 'BMI', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']

# Categorical (strings/yes-no/etc):
cat_cols = [
    'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE',
    'SCC', 'CALC', 'MTRANS'
]
# Do NOT include 'Gender' here since we dropped it from each gender subset.

preprocess = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse=False), cat_cols),
        ('num', 'passthrough', num_cols),
    ],
    remainder='drop'
)

# -----------------------------
# 4) Build pipelines
# -----------------------------
# Stage 1: Broad (Underweight/Normal/Overweight/Obese)
broad_clf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    n_jobs=-1,
    random_state=42
)

broad_pipe = Pipeline(steps=[
    ('prep', preprocess),
    ('clf', broad_clf)
])

# Stage 2 models:
over_clf = RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42)
obese_clf = RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42)

over_pipe = Pipeline(steps=[
    ('prep', preprocess),
    ('clf', over_clf)
])

obese_pipe = Pipeline(steps=[
    ('prep', preprocess),
    ('clf', obese_clf)
])

# -----------------------------
# 5) Train / evaluate per gender
# -----------------------------
def train_hierarchical_for_gender(gdf, label_order=ORDER):
    # TRAIN/TEST for Stage 1 (BroadCategory)
    X = gdf.drop(columns=['id','Gender','WeightCategory','BroadCategory'], errors='ignore')
    y_broad = gdf['BroadCategory']

    # In case id/Gender were already removed above, errors='ignore' keeps it safe.
    # Also ensure all required columns exist in X:
    for c in (set(cat_cols + num_cols) - set(X.columns)):
        raise ValueError(f"Missing required feature column: {c}")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y_broad, test_size=0.2, stratify=y_broad, random_state=42
    )

    broad_pipe.fit(X_train, y_train)
    y_pred_broad = broad_pipe.predict(X_test)
    print("\n=== Stage 1 (Broad) ===")
    print(classification_report(y_test, y_pred_broad))
    print("Broad Accuracy:", accuracy_score(y_test, y_pred_broad))

    # TRAIN Stage 2: Overweight submodel (Level I vs II)
    over_df = gdf[gdf['BroadCategory'] == 'Overweight'].copy()
    X_over = over_df.drop(columns=['id','Gender','WeightCategory','BroadCategory'], errors='ignore')
    y_over = over_df['WeightCategory']
    if len(over_df) > 0 and y_over.nunique() > 1:
        over_pipe.fit(X_over, y_over)
        y_pred_over = over_pipe.predict(X_over)
        print("\n=== Stage 2 (Overweight: Level I vs II) ===")
        print(classification_report(y_over, y_pred_over, labels=['Overweight_Level_I','Overweight_Level_II']))
    else:
        print("\n=== Stage 2 (Overweight) ===")
        print("Not enough variation to train (need both Level I and Level II).")

    # TRAIN Stage 2: Obese submodel (Type I/II/III)
    obese_df = gdf[gdf['BroadCategory'] == 'Obese'].copy()
    X_ob = obese_df.drop(columns=['id','Gender','WeightCategory','BroadCategory'], errors='ignore')
    y_ob = obese_df['WeightCategory']
    if len(obese_df) > 0 and y_ob.nunique() > 1:
        obese_pipe.fit(X_ob, y_ob)
        y_pred_ob = obese_pipe.predict(X_ob)
        print("\n=== Stage 2 (Obese: Type I/II/III) ===")
        # Reduce report to relevant classes present
        present = [c for c in ['Obesity_Type_I','Obesity_Type_II','Obesity_Type_III'] if c in y_ob.unique()]
        print(classification_report(y_ob, y_pred_ob, labels=present))
    else:
        print("\n=== Stage 2 (Obese) ===")
        print("Not enough variation to train (need at least two Obesity types).")

    return broad_pipe, over_pipe, obese_pipe

print("------ MALE MODELS ------")
male_broad, male_over, male_obese = train_hierarchical_for_gender(male_df)

print("\n\n------ FEMALE MODELS ------")
female_broad, female_over, female_obese = train_hierarchical_for_gender(female_df)

# -----------------------------
# 6) Unified predict function
# -----------------------------
def predict_full(sample_df, broad_model, over_model, obese_model):
    """
    sample_df: dataframe with SAME columns as training X (no id/Gender/targets),
               and with raw categorical strings; the pipeline handles encoding.
    """
    bpred = broad_model.predict(sample_df)[0]
    if bpred == 'Overweight':
        # If overweight sub-model wasn't trained, fall back to broad
        try:
            return over_model.predict(sample_df)[0]
        except Exception:
            return 'Overweight_Level_I'  # simple fallback
    elif bpred == 'Obese':
        try:
            return obese_model.predict(sample_df)[0]
        except Exception:
            return 'Obesity_Type_I'  # simple fallback
    else:
        # Underweight or Normal are final
        return 'Insufficient_Weight' if bpred == 'Underweight' else 'Normal_Weight'


------ MALE MODELS ------

=== Stage 1 (Broad) ===
              precision    recall  f1-score   support

      Normal       0.86      0.86      0.86       215
       Obese       0.98      0.97      0.97       731
  Overweight       0.92      0.92      0.92       475
 Underweight       0.89      0.92      0.91       136

    accuracy                           0.94      1557
   macro avg       0.91      0.92      0.91      1557
weighted avg       0.94      0.94      0.94      1557

Broad Accuracy: 0.9351316634553629

=== Stage 2 (Overweight: Level I vs II) ===
                     precision    recall  f1-score   support

 Overweight_Level_I       1.00      1.00      1.00      1051
Overweight_Level_II       1.00      1.00      1.00      1325

           accuracy                           1.00      2376
          macro avg       1.00      1.00      1.00      2376
       weighted avg       1.00      1.00      1.00      2376


=== Stage 2 (Obese: Type I/II/III) ===
                  precisi

In [45]:
def train_hierarchical_for_gender(gdf, label_order=ORDER):
    # --- Stage 1: Broad Category ---
    X = gdf.drop(columns=['id', 'Gender', 'WeightCategory', 'BroadCategory'], errors='ignore')
    y_broad = gdf['BroadCategory']

    for c in (set(cat_cols + num_cols) - set(X.columns)):
        raise ValueError(f"Missing required feature column: {c}")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y_broad, test_size=0.2, stratify=y_broad, random_state=42
    )

    broad_pipe.fit(X_train, y_train)
    y_pred_broad = broad_pipe.predict(X_test)
    broad_acc = accuracy_score(y_test, y_pred_broad)
    print("\n=== Stage 1 (Broad) ===")
    print(classification_report(y_test, y_pred_broad))
    print("Broad Accuracy:", broad_acc)

    # --- Stage 2: Overweight (Level I vs II) ---
    over_df = gdf[gdf['BroadCategory'] == 'Overweight'].copy()
    X_over = over_df.drop(columns=['id', 'Gender', 'WeightCategory', 'BroadCategory'], errors='ignore')
    y_over = over_df['WeightCategory']
    over_acc = None
    if len(over_df) > 0 and y_over.nunique() > 1:
        over_pipe.fit(X_over, y_over)
        y_pred_over = over_pipe.predict(X_over)
        over_acc = accuracy_score(y_over, y_pred_over)
        print("\n=== Stage 2 (Overweight: Level I vs II) ===")
        print(classification_report(y_over, y_pred_over, labels=['Overweight_Level_I', 'Overweight_Level_II']))
        print("Overweight Accuracy:", over_acc)
    else:
        print("\n=== Stage 2 (Overweight) ===")
        print("Not enough variation to train (need both Level I and Level II).")

    # --- Stage 2: Obese (Type I / II / III) ---
    obese_df = gdf[gdf['BroadCategory'] == 'Obese'].copy()
    X_ob = obese_df.drop(columns=['id', 'Gender', 'WeightCategory', 'BroadCategory'], errors='ignore')
    y_ob = obese_df['WeightCategory']
    obese_acc = None
    if len(obese_df) > 0 and y_ob.nunique() > 1:
        obese_pipe.fit(X_ob, y_ob)
        y_pred_ob = obese_pipe.predict(X_ob)
        obese_acc = accuracy_score(y_ob, y_pred_ob)
        present = [c for c in ['Obesity_Type_I', 'Obesity_Type_II', 'Obesity_Type_III'] if c in y_ob.unique()]
        print("\n=== Stage 2 (Obese: Type I/II/III) ===")
        print(classification_report(y_ob, y_pred_ob, labels=present))
        print("Obese Accuracy:", obese_acc)
    else:
        print("\n=== Stage 2 (Obese) ===")
        print("Not enough variation to train (need at least two Obesity types).")

    # --- Return all models and metrics ---
    return broad_pipe, over_pipe, obese_pipe, broad_acc, over_acc, obese_acc


In [None]:
print("------ MALE MODELS ------")
male_broad, male_over, male_obese, male_broad_acc, male_over_acc, male_obese_acc = \
    train_hierarchical_for_gender(male_df)

print("\n\n------ FEMALE MODELS ------")
female_broad, female_over, female_obese, female_broad_acc, female_over_acc, female_obese_acc = \
    train_hierarchical_for_gender(female_df)

print("\n=== Summary ===")
print(f"Male   -> Broad: {male_broad_acc:.3f}, Over: {male_over_acc}, Obese: {male_obese_acc}")
print(f"Female -> Broad: {female_broad_acc:.3f}, Over: {female_over_acc}, Obese: {female_obese_acc}")


------ MALE MODELS ------

=== Stage 1 (Broad) ===
              precision    recall  f1-score   support

      Normal       0.86      0.86      0.86       215
       Obese       0.98      0.97      0.97       731
  Overweight       0.92      0.92      0.92       475
 Underweight       0.89      0.92      0.91       136

    accuracy                           0.94      1557
   macro avg       0.91      0.92      0.91      1557
weighted avg       0.94      0.94      0.94      1557

Broad Accuracy: 0.9351316634553629

=== Stage 2 (Overweight: Level I vs II) ===
                     precision    recall  f1-score   support

 Overweight_Level_I       1.00      1.00      1.00      1051
Overweight_Level_II       1.00      1.00      1.00      1325

           accuracy                           1.00      2376
          macro avg       1.00      1.00      1.00      2376
       weighted avg       1.00      1.00      1.00      2376

Overweight Accuracy: 1.0

=== Stage 2 (Obese: Type I/II/III) ===


In [48]:
over_clf  = RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42, class_weight='balanced')
obese_clf = RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42, class_weight='balanced')


In [49]:
from sklearn.metrics import classification_report, accuracy_score

def train_hierarchical_for_gender(gdf, label_order=ORDER):
    # ---------- Stage 1: Broad ----------
    X = gdf.drop(columns=['id','Gender','WeightCategory','BroadCategory'], errors='ignore')
    y_broad = gdf['BroadCategory']

    for c in (set(cat_cols + num_cols) - set(X.columns)):
        raise ValueError(f"Missing required feature column: {c}")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y_broad, test_size=0.2, stratify=y_broad, random_state=42
    )

    broad_pipe.fit(X_train, y_train)
    y_pred_broad = broad_pipe.predict(X_test)
    broad_acc = accuracy_score(y_test, y_pred_broad)
    print("\n=== Stage 1 (Broad) ===")
    print(classification_report(y_test, y_pred_broad))
    print("Broad Accuracy:", broad_acc)

    # ---------- Stage 2: Overweight (Level I vs II) ----------
    over_labels = ['Overweight_Level_I','Overweight_Level_II']
    over_df = gdf[gdf['BroadCategory'] == 'Overweight'].copy()
    over_df = over_df[over_df['WeightCategory'].isin(over_labels)]

    over_acc = None
    if len(over_df) > 0 and over_df['WeightCategory'].nunique() == 2:
        X_over = over_df.drop(columns=['id','Gender','WeightCategory','BroadCategory'], errors='ignore')
        y_over = over_df['WeightCategory']

        Xo_tr, Xo_te, yo_tr, yo_te = train_test_split(
            X_over, y_over, test_size=0.2, stratify=y_over, random_state=42
        )
        over_pipe.fit(Xo_tr, yo_tr)
        yo_pred = over_pipe.predict(Xo_te)
        over_acc = accuracy_score(yo_te, yo_pred)
        print("\n=== Stage 2 (Overweight: I vs II) ===")
        print(classification_report(yo_te, yo_pred, labels=over_labels, target_names=over_labels))
        print("Overweight Accuracy:", over_acc)
    else:
        print("\n=== Stage 2 (Overweight) ===")
        print("Not enough variation to train (need both Level I and Level II).")

    # ---------- Stage 2: Obese (Type I / II / III) ----------
    obese_labels = ['Obesity_Type_I','Obesity_Type_II','Obesity_Type_III']
    obese_df = gdf[gdf['BroadCategory'] == 'Obese'].copy()
    obese_df = obese_df[obese_df['WeightCategory'].isin(obese_labels)]

    obese_acc = None
    if len(obese_df) > 0 and obese_df['WeightCategory'].nunique() >= 2:
        X_ob = obese_df.drop(columns=['id','Gender','WeightCategory','BroadCategory'], errors='ignore')
        y_ob = obese_df['WeightCategory']

        Xb_tr, Xb_te, yb_tr, yb_te = train_test_split(
            X_ob, y_ob, test_size=0.2, stratify=y_ob, random_state=42
        )
        obese_pipe.fit(Xb_tr, yb_tr)
        yb_pred = obese_pipe.predict(Xb_te)
        # use only the labels present in yb_te to avoid warnings on tiny classes
        present = [c for c in obese_labels if c in yb_te.unique()]
        obese_acc = accuracy_score(yb_te, yb_pred)
        print("\n=== Stage 2 (Obese: Type I/II/III) ===")
        print(classification_report(yb_te, yb_pred, labels=present, target_names=present))
        print("Obese Accuracy:", obese_acc)
    else:
        print("\n=== Stage 2 (Obese) ===")
        print("Not enough variation to train (need at least two Obesity types).")

    return broad_pipe, over_pipe, obese_pipe, broad_acc, over_acc, obese_acc


In [50]:
print("------ MALE MODELS ------")
male_broad, male_over, male_obese, male_broad_acc, male_over_acc, male_obese_acc = \
    train_hierarchical_for_gender(male_df)

print("\n\n------ FEMALE MODELS ------")
female_broad, female_over, female_obese, female_broad_acc, female_over_acc, female_obese_acc = \
    train_hierarchical_for_gender(female_df)

# -----------------------------
# SUMMARY COMPARISON
# -----------------------------
print("\n=== Summary ===")
print(f"{'Model Type':<10} | {'Broad Acc':<10} | {'Overweight Acc':<15} | {'Obesity Acc'}")
print("-" * 60)
print(f"{'Male':<10} | {male_broad_acc:.3f}{'':<5} | {str(round(male_over_acc,3)) if male_over_acc else 'N/A':<15} | {str(round(male_obese_acc,3)) if male_obese_acc else 'N/A'}")
print(f"{'Female':<10} | {female_broad_acc:.3f}{'':<5} | {str(round(female_over_acc,3)) if female_over_acc else 'N/A':<15} | {str(round(female_obese_acc,3)) if female_obese_acc else 'N/A'}")


------ MALE MODELS ------

=== Stage 1 (Broad) ===
              precision    recall  f1-score   support

      Normal       0.86      0.86      0.86       215
       Obese       0.98      0.97      0.97       731
  Overweight       0.92      0.92      0.92       475
 Underweight       0.89      0.92      0.91       136

    accuracy                           0.94      1557
   macro avg       0.91      0.92      0.91      1557
weighted avg       0.94      0.94      0.94      1557

Broad Accuracy: 0.9351316634553629

=== Stage 2 (Overweight: I vs II) ===
                     precision    recall  f1-score   support

 Overweight_Level_I       0.91      0.86      0.88       211
Overweight_Level_II       0.89      0.93      0.91       265

           accuracy                           0.90       476
          macro avg       0.90      0.90      0.90       476
       weighted avg       0.90      0.90      0.90       476

Overweight Accuracy: 0.8991596638655462

=== Stage 2 (Obese: Type I/II/

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



=== Stage 1 (Broad) ===
              precision    recall  f1-score   support

      Normal       0.87      0.89      0.88       255
       Obese       0.97      0.98      0.98       788
  Overweight       0.89      0.87      0.88       270
 Underweight       0.94      0.93      0.94       237

    accuracy                           0.94      1550
   macro avg       0.92      0.92      0.92      1550
weighted avg       0.94      0.94      0.94      1550

Broad Accuracy: 0.9387096774193548

=== Stage 2 (Overweight: I vs II) ===
                     precision    recall  f1-score   support

 Overweight_Level_I       0.92      0.84      0.88       159
Overweight_Level_II       0.79      0.89      0.84       111

           accuracy                           0.86       270
          macro avg       0.85      0.86      0.86       270
       weighted avg       0.87      0.86      0.86       270

Overweight Accuracy: 0.8592592592592593

=== Stage 2 (Obese: Type I/II/III) ===
                 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [56]:
df.columns

Index(['id', 'Gender', 'Age', 'Height', 'Weight',
       'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP', 'CAEC',
       'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS',
       'WeightCategory', 'BroadCategory', 'BMI'],
      dtype='object')

In [53]:
test = pd.read_csv('test.csv')

In [57]:
test.columns

Index(['id', 'Gender', 'Age', 'Height', 'Weight',
       'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP', 'CAEC',
       'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS', 'BMI'],
      dtype='object')

In [54]:
test['BMI'] = test['Weight'] / (test['Height'] ** 2)

In [58]:
FEATURES = [
    # categorical
    'family_history_with_overweight','FAVC','CAEC','SMOKE','SCC','CALC','MTRANS',
    # numeric
    'Age','Height','Weight','BMI','FCVC','NCP','CH2O','FAF','TUE'
]

def build_inference_sample(row):
    s = row.to_frame().T.copy()

    # If your Height is in cm, compute BMI as Weight / ( (Height_cm/100)^2 )
    # You currently do Weight/(Height**2). Keep it only if Height is already in meters.
    # s['BMI'] = s['Weight'] / ((s['Height']/100.0)**2)  # uncomment if Height is cm

    # Ensure CH2O is present (fix common CH20 typo)
    if 'CH20' in s.columns and 'CH2O' not in s.columns:
        s['CH2O'] = s['CH20']

    # Keep ONLY model features in the exact order
    s = s.reindex(columns=FEATURES)
    return s

In [59]:
predictions = []

for _, row in test.iterrows():
    sample = build_inference_sample(row)

    # Route based on gender (use the correct model set)
    if row['Gender'] == 'Male':
        pred = predict_full(sample, male_broad, male_over, male_obese)
    else:
        pred = predict_full(sample, female_broad, female_over, female_obese)

    predictions.append((row['id'], pred))

ValueError: X has 27 features, but RandomForestClassifier is expecting 29 features as input.

In [None]:
sub_df = pd.DataFrame(predictions, columns=['id', 'WeightCategory'])
