In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [9]:
df = pd.read_csv("train.csv")

In [10]:
order = [
    'Insufficient_Weight',
    'Normal_Weight',
    'Overweight_Level_I',
    'Overweight_Level_II',
    'Obesity_Type_I',
    'Obesity_Type_II',
    'Obesity_Type_III'
]

In [11]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier

# -----------------------------
# 0) Assumptions / Setup
# -----------------------------
# df = ...  # your dataframe
# Columns you shared:
# ['id','Gender','Age','Height','Weight','family_history_with_overweight','FAVC','FCVC','NCP','CAEC','SMOKE','CH2O','SCC','FAF','TUE','CALC','MTRANS','WeightCategory']

ORDER = [
    'Insufficient_Weight',
    'Normal_Weight',
    'Overweight_Level_I',
    'Overweight_Level_II',
    'Obesity_Type_I',
    'Obesity_Type_II',
    'Obesity_Type_III'
]

def to_broad(cat: str) -> str:
    if cat == 'Insufficient_Weight':
        return 'Underweight'
    if cat == 'Normal_Weight':
        return 'Normal'
    if cat.startswith('Overweight'):
        return 'Overweight'
    return 'Obese'  # all Obesity_* go here

# -----------------------------
# 1) Basic feature engineering
# -----------------------------
df = df.copy()

# Create BMI safely
# df['BMI'] = df['Weight'] / ((df['Height'] / 100.0)**2)
df['BMI'] = df['Weight'] / (df['Height'] ** 2)

# Broad category for Stage 1
df['BroadCategory'] = df['WeightCategory'].apply(to_broad)

# -----------------------------
# 2) Split by gender (two separate model families)
# -----------------------------
def prepare_gender_df(df, gender_value):
    gdf = df[df['Gender'] == gender_value].copy()
    # Drop id and Gender from features
    # Keep BMI; you may keep Height/Weight too if you like—here we keep them
    # Target columns remain for later splits
    return gdf

male_df = prepare_gender_df(df, 'Male')
female_df = prepare_gender_df(df, 'Female')

# -----------------------------
# 3) Common preprocessing: encode categoricals via ColumnTransformer
# -----------------------------
# Numerical (already numeric):
num_cols = ['Age', 'Height', 'Weight', 'BMI', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']

# Categorical (strings/yes-no/etc):
cat_cols = [
    'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE',
    'SCC', 'CALC', 'MTRANS'
]
# Do NOT include 'Gender' here since we dropped it from each gender subset.

preprocess = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse=False), cat_cols),
        ('num', 'passthrough', num_cols),
    ],
    remainder='drop'
)

# -----------------------------
# 4) Build pipelines
# -----------------------------
# Stage 1: Broad (Underweight/Normal/Overweight/Obese)
broad_clf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    n_jobs=-1,
    random_state=42
)

broad_pipe = Pipeline(steps=[
    ('prep', preprocess),
    ('clf', broad_clf)
])

# Stage 2 models:
over_clf = RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42)
obese_clf = RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42)

over_pipe = Pipeline(steps=[
    ('prep', preprocess),
    ('clf', over_clf)
])

obese_pipe = Pipeline(steps=[
    ('prep', preprocess),
    ('clf', obese_clf)
])

In [12]:
def train_hierarchical_for_gender(gdf, label_order=ORDER):
    # --- Stage 1: Broad Category ---
    X = gdf.drop(columns=['id', 'Gender', 'WeightCategory', 'BroadCategory'], errors='ignore')
    y_broad = gdf['BroadCategory']

    for c in (set(cat_cols + num_cols) - set(X.columns)):
        raise ValueError(f"Missing required feature column: {c}")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y_broad, test_size=0.2, stratify=y_broad, random_state=42
    )

    broad_pipe.fit(X_train, y_train)
    y_pred_broad = broad_pipe.predict(X_test)
    broad_acc = accuracy_score(y_test, y_pred_broad)
    print("\n=== Stage 1 (Broad) ===")
    print(classification_report(y_test, y_pred_broad))
    print("Broad Accuracy:", broad_acc)

    # --- Stage 2: Overweight (Level I vs II) ---
    over_df = gdf[gdf['BroadCategory'] == 'Overweight'].copy()
    X_over = over_df.drop(columns=['id', 'Gender', 'WeightCategory', 'BroadCategory'], errors='ignore')
    y_over = over_df['WeightCategory']
    over_acc = None
    if len(over_df) > 0 and y_over.nunique() > 1:
        over_pipe.fit(X_over, y_over)
        y_pred_over = over_pipe.predict(X_over)
        over_acc = accuracy_score(y_over, y_pred_over)
        print("\n=== Stage 2 (Overweight: Level I vs II) ===")
        print(classification_report(y_over, y_pred_over, labels=['Overweight_Level_I', 'Overweight_Level_II']))
        print("Overweight Accuracy:", over_acc)
    else:
        print("\n=== Stage 2 (Overweight) ===")
        print("Not enough variation to train (need both Level I and Level II).")

    # --- Stage 2: Obese (Type I / II / III) ---
    obese_df = gdf[gdf['BroadCategory'] == 'Obese'].copy()
    X_ob = obese_df.drop(columns=['id', 'Gender', 'WeightCategory', 'BroadCategory'], errors='ignore')
    y_ob = obese_df['WeightCategory']
    obese_acc = None
    if len(obese_df) > 0 and y_ob.nunique() > 1:
        obese_pipe.fit(X_ob, y_ob)
        y_pred_ob = obese_pipe.predict(X_ob)
        obese_acc = accuracy_score(y_ob, y_pred_ob)
        present = [c for c in ['Obesity_Type_I', 'Obesity_Type_II', 'Obesity_Type_III'] if c in y_ob.unique()]
        print("\n=== Stage 2 (Obese: Type I/II/III) ===")
        print(classification_report(y_ob, y_pred_ob, labels=present))
        print("Obese Accuracy:", obese_acc)
    else:
        print("\n=== Stage 2 (Obese) ===")
        print("Not enough variation to train (need at least two Obesity types).")

    # --- Return all models and metrics ---
    return broad_pipe, over_pipe, obese_pipe, broad_acc, over_acc, obese_acc


In [13]:
print("------ MALE MODELS ------")
male_broad, male_over, male_obese, male_broad_acc, male_over_acc, male_obese_acc = \
    train_hierarchical_for_gender(male_df)

print("\n\n------ FEMALE MODELS ------")
female_broad, female_over, female_obese, female_broad_acc, female_over_acc, female_obese_acc = \
    train_hierarchical_for_gender(female_df)

print("\n=== Summary ===")
print(f"Male   -> Broad: {male_broad_acc:.3f}, Over: {male_over_acc}, Obese: {male_obese_acc}")
print(f"Female -> Broad: {female_broad_acc:.3f}, Over: {female_over_acc}, Obese: {female_obese_acc}")


------ MALE MODELS ------

=== Stage 1 (Broad) ===
              precision    recall  f1-score   support

      Normal       0.86      0.86      0.86       215
       Obese       0.98      0.97      0.97       731
  Overweight       0.92      0.92      0.92       475
 Underweight       0.89      0.92      0.91       136

    accuracy                           0.94      1557
   macro avg       0.91      0.92      0.91      1557
weighted avg       0.94      0.94      0.94      1557

Broad Accuracy: 0.9351316634553629

=== Stage 2 (Overweight: Level I vs II) ===
                     precision    recall  f1-score   support

 Overweight_Level_I       1.00      1.00      1.00      1051
Overweight_Level_II       1.00      1.00      1.00      1325

           accuracy                           1.00      2376
          macro avg       1.00      1.00      1.00      2376
       weighted avg       1.00      1.00      1.00      2376

Overweight Accuracy: 1.0

=== Stage 2 (Obese: Type I/II/III) ===


In [14]:
# ========================
# 🔹 MAKE PREDICTIONS ON TEST
# ========================

# Load test data
test = pd.read_csv('test.csv')

# Compute BMI (same formula you used in training)
# ⚠️ If your Height column is in cm, use (Height / 100)**2 instead.
test['BMI'] = test['Weight'] / (test['Height'] ** 2)

# Ensure same features
cat_cols = [
    'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE',
    'SCC', 'CALC', 'MTRANS'
]
num_cols = ['Age', 'Height', 'Weight', 'BMI', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
FEATURES = cat_cols + num_cols

def build_feature_frame(df_like):
    """Ensure consistent feature columns and datatypes."""
    X = df_like.reindex(columns=FEATURES, fill_value=np.nan).copy()
    for c in cat_cols:
        X[c] = X[c].astype('object')
    for c in num_cols:
        X[c] = pd.to_numeric(X[c], errors='coerce')
    return X

def predict_for_gender(test_part, broad_model, over_model, obese_model):
    """Predict for a gender-specific test subset."""
    X = build_feature_frame(test_part)
    broad_pred = broad_model.predict(X)

    final_pred = pd.Series(broad_pred, index=test_part.index)

    # Overweight subset → Level I vs II
    over_idx = final_pred[final_pred == 'Overweight'].index
    if len(over_idx) > 0:
        try:
            final_pred.loc[over_idx] = over_model.predict(X.loc[over_idx])
        except Exception:
            final_pred.loc[over_idx] = 'Overweight_Level_I'

    # Obese subset → Type I/II/III
    obese_idx = final_pred[final_pred == 'Obese'].index
    if len(obese_idx) > 0:
        try:
            final_pred.loc[obese_idx] = obese_model.predict(X.loc[obese_idx])
        except Exception:
            final_pred.loc[obese_idx] = 'Obesity_Type_I'

    # Map Underweight/Normal
    final_pred = final_pred.replace({
        'Underweight': 'Insufficient_Weight',
        'Normal': 'Normal_Weight'
    })

    return final_pred

# Split test set by gender
male_test = test[test['Gender'] == 'Male']
female_test = test[test['Gender'] == 'Female']

# Predict for both genders
male_pred = predict_for_gender(male_test, male_broad, male_over, male_obese)
female_pred = predict_for_gender(female_test, female_broad, female_over, female_obese)

# Combine results
all_pred = pd.concat([male_pred, female_pred]).sort_index()

# Build submission DataFrame
submission = pd.DataFrame({
    'id': test['id'],
    'WeightCategory': all_pred.values
})

# Save submission file
submission.to_csv('sample_submission.csv', index=False)
print(submission.head(), '\nSaved as sample_submission.csv')


ValueError: X has 27 features, but RandomForestClassifier is expecting 29 features as input.