<a href="https://colab.research.google.com/github/boeyjw/kaggle-store/blob/main/2024/S04E02_Obesity/03_FeatEng.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Init

In [None]:
from pathlib import Path

import numpy as np
import scipy.stats as ss
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from prettytable import PrettyTable

In [None]:
# Notebook-wide constants: seed, target column name, and the data directory on the mounted Drive.
SEED = 1234567890
LABEL = "nobeyesdad"
data_p = Path("/content/drive/MyDrive/Colab Notebooks/2024/S04E02_Obesity")

# Both splits are read the same way: column headers are stripped of whitespace and
# lower-cased, then the `id` column becomes the index.
train, test = (
    pd.read_csv(data_p / filename)
    .rename(columns=lambda name: name.strip().lower())
    .set_index("id")
    for filename in ("train.csv.gz", "test.csv.gz")
)

# Stratified hold-out split, currently disabled:
# X_train, X_val, y_train, y_val = train_test_split(train.drop(columns=[LABEL]), train[LABEL], test_size=0.2, stratify=train[LABEL], random_state=SEED)
# X_train.shape, X_val.shape

# Feature Engineering

## Univariate

In [None]:
def uni_feateng(df):
    """Add single-column (univariate) engineered features to ``df`` in place.

    New columns, in creation order:

    * ``_binned_age`` .. ``_binned_tue`` -- ``pd.cut`` buckets of the matching
      raw column, labelled with the numeric codes listed in ``bin_specs``.
    * ``_calc`` -- copy of ``calc`` with ``"Always"`` folded into ``"Frequently"``.
    * ``_exercise`` -- ``"No"`` for Automobile/Motorbike, ``"Public_Transportation"``
      for public transport, ``"Yes"`` for every other ``mtrans`` value.

    The frame is mutated; nothing is returned.
    """
    # (target column, source column, bin edges, right-closed?, labels, output dtype)
    bin_specs = [
        ("_binned_age", "age", [0, 18, 30, 40, 9999], False, [0, 1, 2, 3], int),
        # df["_height"] = df["height"].clip(lower=1.5, upper=1.9)
        # df["_weight"] = df["weight"].clip(lower=40)
        ("_binned_fcvc", "fcvc", [0, 1.0, 1.5, 2.0, 2.5, 3.0], True, [1.0, 1.5, 2.0, 2.5, 3.0], float),
        ("_binned_ncp", "ncp", [0, 1.0, 3.0, 4.0], True, [1, 3, 4], int),
        ("_binned_ch2o", "ch2o", [0, 1.0, 2.0, 3.0], True, [1, 2, 3], int),
        ("_binned_faf", "faf", [-1, 0, 1.0, 2.0, 3.0], True, [0, 1, 2, 3], int),
        ("_binned_tue", "tue", [-1, 0, 1.0, 2.0], True, [0, 1, 2], int),
    ]
    for target, source, edges, right_closed, codes, out_dtype in bin_specs:
        buckets = pd.cut(df[source], bins=edges, right=right_closed, labels=codes)
        # The cast fails if a value falls outside the edges (pd.cut yields NaN there).
        df[target] = buckets.astype(out_dtype)

    # Merge the "Always" level into "Frequently"; the original `calc` column is left untouched.
    df["_calc"] = df["calc"].mask(df["calc"] == "Always", "Frequently")

    transport = df["mtrans"]
    motorised = transport.isin(["Automobile", "Motorbike"])
    df["_exercise"] = np.where(
        transport == "Public_Transportation",
        "Public_Transportation",
        np.where(motorised, "No", "Yes"),
    )

In [None]:
# Add the univariate features to both splits; uni_feateng mutates the frame it is given.
uni_feateng(train)
uni_feateng(test)
# Column / dtype summary of the training frame after the new columns were added.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20758 entries, 0 to 20757
Data columns (total 25 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   gender                          20758 non-null  object 
 1   age                             20758 non-null  float64
 2   height                          20758 non-null  float64
 3   weight                          20758 non-null  float64
 4   family_history_with_overweight  20758 non-null  object 
 5   favc                            20758 non-null  object 
 6   fcvc                            20758 non-null  float64
 7   ncp                             20758 non-null  float64
 8   caec                            20758 non-null  object 
 9   smoke                           20758 non-null  object 
 10  ch2o                            20758 non-null  float64
 11  scc                             20758 non-null  object 
 12  faf                             

In [None]:
# Column / dtype summary of the test frame at this point in the notebook.
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13840 entries, 20758 to 34597
Data columns (total 24 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   gender                          13840 non-null  object 
 1   age                             13840 non-null  float64
 2   height                          13840 non-null  float64
 3   weight                          13840 non-null  float64
 4   family_history_with_overweight  13840 non-null  object 
 5   favc                            13840 non-null  object 
 6   fcvc                            13840 non-null  float64
 7   ncp                             13840 non-null  float64
 8   caec                            13840 non-null  object 
 9   smoke                           13840 non-null  object 
 10  ch2o                            13840 non-null  float64
 11  scc                             13840 non-null  object 
 12  faf                         

## Multivariate

In [None]:
def multi_feateng(df):
    """Add cross-column (multivariate) engineered features to ``df`` in place.

    Reads ``weight``, ``height``, ``gender``, ``family_history_with_overweight``,
    ``favc``, ``calc``, ``caec``, ``ncp``, ``faf`` and the ``_exercise`` column
    that ``uni_feateng`` creates, so ``uni_feateng(df)`` must have run first.

    New columns:

    * ``_bmi`` -- ``weight / height**2``.
    * ``_binned_bmi`` -- ``"Underweight"`` (< 18.5), ``"Healthy"`` (18.5 to < 25),
      ``"Overweight"`` (25 to < 30), ``"Obese"`` (>= 30), stored as ``str``.
    * ``_habitually_unhealthy`` -- ``"yes"`` when the row is Overweight/Obese, has
      a family history, ``_exercise == "No"``, ``favc == "yes"`` and ``calc != "no"``.
    * ``_eating_habits`` -- ``"Under"`` / ``"Over"`` / ``"Normal"`` from ``ncp``,
      ``faf`` and ``caec``.
    * ``_health_conscious`` -- ``"yes"`` / ``"no"`` from ``ncp``, ``caec``,
      ``_exercise`` and ``faf``.
    * ``_devine_ideal`` -- ``"Over"`` when ``weight`` reaches the gender-specific
      threshold ``base + 0.9 * (height_cm - 152)``, else ``"Ideal"``.

    Returns:
        None. ``df`` is mutated.

    Raises:
        KeyError: if ``_exercise`` is missing (raised before any column is added).
    """
    # Fail before touching the frame instead of after `_bmi` / `_binned_bmi` were written.
    if "_exercise" not in df.columns:
        raise KeyError("'_exercise' column is missing; run uni_feateng(df) before multi_feateng(df)")

    df["_bmi"] = df["weight"] / (df["height"]**2)
    # right=False makes each bin lower-inclusive, so a BMI of exactly 18.5 / 25 / 30
    # falls in the upper category (the default right=True put it in the lower one).
    df["_binned_bmi"] = pd.cut(
        df["_bmi"],
        bins=[0, 18.5, 25, 30, 9999],
        right=False,
        labels=["Underweight", "Healthy", "Overweight", "Obese"],
    ).astype(str)

    # Positive membership test: a missing BMI becomes the string "nan" after astype(str)
    # and must not count as overweight.
    df["_habitually_unhealthy"] = np.where(
        (df["_binned_bmi"].isin(["Overweight", "Obese"]))
        & (df["family_history_with_overweight"] == "yes")
        & (df["_exercise"] == "No")
        & (df["favc"] == "yes")
        & (df["calc"] != "no")
    , "yes", "no")

    # The two rules cannot both match a row: one needs faf >= 1, the other faf < 1.
    df["_eating_habits"] = "Normal"
    df.loc[(df["ncp"] <= 2) & (df["faf"] >= 1), "_eating_habits"] = "Under"
    df.loc[(df["ncp"] >= 2) & (df["faf"] < 1) & (df["caec"] != "no"), "_eating_habits"] = "Over"

    df["_health_conscious"] = np.where(
        (df["ncp"] >= 2)
        & (df["caec"].isin(["no", "Sometimes"]))
        & (df["_exercise"].isin(["Public_Transportation", "Yes"]))
        & (df["faf"] >= 1)
    , "yes", "no")

    # Base weight is 45.5 kg for "Female" and 50 kg for any other gender value;
    # `height` is multiplied by 100 before the 152 offset is subtracted.
    # NOTE(review): presumably a metric rounding of the Devine ideal-body-weight
    # formula (per the column name) -- confirm the 0.9 / 152 constants.
    base_weight = np.where(df["gender"] == "Female", 45.5, 50)
    ideal_threshold = base_weight + 0.9 * (df["height"] * 100 - 152)
    df["_devine_ideal"] = np.where(df["weight"] >= ideal_threshold, "Over", "Ideal")

In [None]:
# Add the multivariate features to both splits (in place). multi_feateng reads the
# `_exercise` column, so this cell must run after the uni_feateng cell.
multi_feateng(train)
multi_feateng(test)
# Column / dtype summary of the training frame after the new columns were added.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20758 entries, 0 to 20757
Data columns (total 31 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   gender                          20758 non-null  object 
 1   age                             20758 non-null  float64
 2   height                          20758 non-null  float64
 3   weight                          20758 non-null  float64
 4   family_history_with_overweight  20758 non-null  object 
 5   favc                            20758 non-null  object 
 6   fcvc                            20758 non-null  float64
 7   ncp                             20758 non-null  float64
 8   caec                            20758 non-null  object 
 9   smoke                           20758 non-null  object 
 10  ch2o                            20758 non-null  float64
 11  scc                             20758 non-null  object 
 12  faf                             

In [None]:
# Column / dtype summary of the test frame at this point in the notebook.
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13840 entries, 20758 to 34597
Data columns (total 30 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   gender                          13840 non-null  object 
 1   age                             13840 non-null  float64
 2   height                          13840 non-null  float64
 3   weight                          13840 non-null  float64
 4   family_history_with_overweight  13840 non-null  object 
 5   favc                            13840 non-null  object 
 6   fcvc                            13840 non-null  float64
 7   ncp                             13840 non-null  float64
 8   caec                            13840 non-null  object 
 9   smoke                           13840 non-null  object 
 10  ch2o                            13840 non-null  float64
 11  scc                             13840 non-null  object 
 12  faf                         

In [None]:
# Mutual information (MI) between each engineered feature and the label.
# The raw numeric columns, `_bmi` and the original `calc` are dropped so that only
# their binned / recoded counterparts are scored.
dropcols = [LABEL, "age", "height", "weight", "fcvc", "ncp", "ch2o", "faf", "tue", "_bmi", "calc"]
df_mi = train.drop(columns=dropcols)

# One-hot encode the object columns (binary ones collapse to a single indicator);
# the already-numeric binned columns pass through unchanged.
col_transformer_mi = ColumnTransformer([
    ("ohe", OneHotEncoder(drop="if_binary", sparse_output=False).set_output(transform="pandas"), df_mi.select_dtypes(include=object).columns)
], verbose_feature_names_out=False, remainder="passthrough").set_output(transform="pandas")
df_mi = col_transformer_mi.fit_transform(df_mi)

# Every remaining column is a one-hot indicator or a bin code, i.e. discrete.
# With the default discrete_features="auto" a dense matrix is treated as continuous
# (k-NN estimator with random noise), so the features are declared discrete explicitly;
# `n_neighbors` is therefore no longer passed because the k-NN estimator is not used.
# random_state is kept so the call stays reproducible should a continuous feature be added.
# NOTE(review): `_binned_fcvc` holds half-step floats (1.5, 2.5); confirm scikit-learn
# scores it without warnings, or cast it to integer codes first.
mi = mutual_info_classif(df_mi, train[LABEL], discrete_features=True, random_state=SEED)

mi_tab = PrettyTable(field_names=["Variable", "MI"], sortby="MI", reversesort=True)
mi_tab.add_rows(zip(col_transformer_mi.get_feature_names_out(), mi.round(4)))

print(mi_tab)

+------------------------------------+--------+
|              Variable              |   MI   |
+------------------------------------+--------+
|         _binned_bmi_Obese          | 0.5783 |
|         _devine_ideal_Over         | 0.3502 |
|       _binned_bmi_Overweight       | 0.3374 |
|        _binned_bmi_Healthy         | 0.2833 |
|            _binned_fcvc            | 0.2765 |
|      _binned_bmi_Underweight       | 0.2686 |
|            gender_Male             | 0.2558 |
| family_history_with_overweight_yes | 0.1685 |
|            _binned_tue             | 0.1512 |
|            _binned_age             | 0.1314 |
|            _binned_ch2o            | 0.1286 |
|           caec_Sometimes           | 0.1235 |
|            _binned_ncp             | 0.1095 |
|          caec_Frequently           | 0.1081 |
|          _calc_Sometimes           | 0.1065 |
|              _calc_no              | 0.0961 |
|            _binned_faf             | 0.0838 |
|     _habitually_unhealthy_yes      | 0

In [None]:
# Mean `weight` for each target class, split by the `_devine_ideal` flag.
pd.pivot_table(train, index="nobeyesdad", columns="_devine_ideal", values="weight", aggfunc="mean")

_devine_ideal,Ideal,Over
nobeyesdad,Unnamed: 1_level_1,Unnamed: 2_level_1
Insufficient_Weight,49.696288,56.080828
Normal_Weight,59.42574,63.167368
Obesity_Type_I,46.734258,92.418122
Obesity_Type_II,,115.995914
Obesity_Type_III,42.0,117.716166
Overweight_Level_I,65.736826,74.484162
Overweight_Level_II,64.25,82.113845


In [None]:
# Persist the engineered splits next to the raw data; index=True writes the `id` index
# as the first column. Two statements instead of a tuple expression, so the cell no
# longer displays a meaningless `(None, None)`.
train.to_csv(data_p / "train_feateng.1.csv.gz", index=True)
test.to_csv(data_p / "test_feateng.1.csv.gz", index=True)

(None, None)

## Overweight derived

In [None]:
def over_feateng(df):
    """Placeholder for the "Overweight derived" features; currently a no-op that returns None."""
    pass