In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
!pip install ucimlrepo



# Pre-Processing

Based on the paper associated with the Dataset, 

$$\text{Mass Body Index} = \frac{\text{Weight}}{(\text{Height}^2)}$$

And the following guidelines based on the mass body index were used to classify individuals' obesity level:

 Underweight Less than 18.5 
 Normal 18.5 to 24.9
 Overweight 25.0 to 29.9
 Obesity I 30.0 to 34.9
 Obesity II 35.0 to 39.9
 Obesity III Higher than 40

For this reason, introducing both weight and height as covariates trivializes the problem. For the duration of our project, we assume weight is known but height is not, following the setup in this paper: https://www.frontiersin.org/journals/big-data/articles/10.3389/fdata.2024.1469981/full

In [54]:
from ucimlrepo import fetch_ucirepo 
  
estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition = fetch_ucirepo(id=544) 

X = estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition.data.features 

# Since Height and Weight directly define the Mass Body Index -> Obesity Classification, we remove Height.
X = X[[col for col in X.columns if col not in ["Height"]]]

y = estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition.data.targets

In [55]:
numerical_cols = ["Age", "FCVC", "NCP", "CH2O", "FAF", "TUE", "Weight"]
categorical_cols = [col for col in X.columns if col not in numerical_cols]

Below we encode the response variable in two ways: 

- Binary label: 1 if a unit is considered obese at any level, 0 otherwise

- Multinomial label: the original categories

In [61]:
from sklearn.model_selection import train_test_split
# Train-test split - note: conduct this first before all pre-processing because all pre processing should be based on the train set ONLY (avoid data leakage)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Binary classification

y_train_bin = y_train.isin(["Obesity_Type_I", "Obesity_Type_III", "Obesity_Type_II"]).rename(columns={"NObeyesdad": "is_obese"})["is_obese"]
y_test_bin = y_test.isin(["Obesity_Type_I", "Obesity_Type_III", "Obesity_Type_II"]).rename(columns={"NObeyesdad": "is_obese"})["is_obese"]

# Multiclass classification

y_train_multi = y_train["NObeyesdad"].rename("obesity_category")
y_test_multi = y_test["NObeyesdad"].rename("obesity_category")

In [63]:
# Save to CSV
y_train_bin.to_csv("y_train_bin.csv", index=False)

In [65]:
# Save to CSV
y_test_bin.to_csv("y_test_bin.csv", index=False)

In [67]:
# Save to CSV
y_train_multi.to_csv("y_train_multi.csv", index=False)

In [69]:
# Save to CSV
y_test_multi.to_csv("y_test_multi.csv", index=False)

In [71]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [73]:
numerical_cols = ["Age", "FCVC", "NCP", "CH2O", "FAF", "TUE", "Weight"]
categorical_cols = [col for col in X.columns if col not in numerical_cols]

For each numerical feature:

- Add log of feature

- Add exponent of feature

- Add all possible degree 2 combinations of the features including non-interaction features (ex. Age * FCVC, or Age^2)

The idea is that we can generate a reasonable set of possible features and use methods like LASSO regression or Recursive Feature Elimination to narrow them down. In practice, we use LASSO for speed purposes (RFE mainly used for non-linear models). 

In [80]:
# ============================= LOG + EXP TRANSFORMATIONS ==================================================

# Create transformations for log, exp columns
log_transformer = FunctionTransformer(np.log1p)
exp_transformer = FunctionTransformer(np.exp)

# Create column transformation pipeline (apply exp, log)
column_transformer = ColumnTransformer([
    ('log_age', log_transformer, numerical_cols),
    ('exp_age', exp_transformer, numerical_cols),
], remainder='passthrough')

# Transform training and test sets (apply exp, log)
X_logexp_train_arr = column_transformer.fit_transform(X_train)
X_logexp_test_arr = column_transformer.transform(X_test)

# Generate exp, log, column names
log_names = [f"log1p_{col}" for col in numerical_cols]
exp_names = [f"exp_{col}" for col in numerical_cols]
passthrough_cols = [col for col in X_train.columns if col not in numerical_cols]
transform_cols = log_names + exp_names + passthrough_cols

# Create exp, log dataframes for both train and test
X_logexp_train_df = pd.DataFrame(X_logexp_train_arr, columns=transform_cols, index=X_train.index)
X_logexp_test_df = pd.DataFrame(X_logexp_test_arr, columns=transform_cols, index=X_test.index)

# ============================= POLYNOMIAL TRANSFORMATIONS ==================================================

# Create sklearn polynomial features object (automatically will create poly features based on arguments)
poly = PolynomialFeatures(degree=(2, 2), include_bias=False)

# Transform training and test sets (apply poly features)
X_poly_train_arr = poly.fit_transform(X_train[numerical_cols])
X_poly_test_arr = poly.transform(X_test[numerical_cols])

# Generate poly feature names
poly_feature_names = poly.get_feature_names_out(numerical_cols)

# Create poly dataframes for both train and test
X_poly_train_df = pd.DataFrame(X_poly_train_arr, columns=poly_feature_names, index=X_train.index)
X_poly_test_df = pd.DataFrame(X_poly_test_arr, columns=poly_feature_names, index=X_test.index)

# ============================= COMBINE ALL FEATURES ==================================================

# Combine all features - X_train has original, X_poly has polynomials, X_logexp has log and exp
X_train_long = pd.concat([X_train, X_poly_train_df, X_logexp_train_df], axis=1)
X_test_long = pd.concat([X_test, X_poly_test_df, X_logexp_test_df], axis=1)

In [86]:
# One-hot encoding for categorical features, and standardization of numerical features
one_hot_encoder = OneHotEncoder(sparse_output=False, drop='first')
scaler = StandardScaler()
# Numerical columns, including generated features, for standardization
extended_numerical_cols = [col for col in X_train_long.columns if col not in categorical_cols]

# One-hot encode categorical features and standardize numerical features for training set
X_train_dummies = pd.DataFrame(one_hot_encoder.fit_transform(X_train[categorical_cols]), columns=one_hot_encoder.get_feature_names_out())
X_train_std_numerical = pd.DataFrame(scaler.fit_transform(X_train_long[extended_numerical_cols]), columns=extended_numerical_cols)
X_train_full = pd.concat([X_train_std_numerical, X_train_dummies], axis=1)

# One-hot encode categorical features and standardize numerical features for test set
X_test_dummies = pd.DataFrame(one_hot_encoder.transform(X_test[categorical_cols]), columns=one_hot_encoder.get_feature_names_out())
X_test_std_numerical = pd.DataFrame(scaler.transform(X_test_long[extended_numerical_cols]), columns=extended_numerical_cols)
X_test_full = pd.concat([X_test_std_numerical, X_test_dummies], axis=1)

In [88]:
X_train_full.shape # 64 features to work with

(1688, 64)

In [90]:
# Save to CSV
X_train_full.to_csv("X_train_full.csv", index=False)

In [92]:
# Save to CSV
X_test_full.to_csv("X_test_full.csv", index=False)

## Feature Selection

Use LASSO regression to select features. 

Retrospectively, this may not be necessary, as we apply GridSearchCV later across L1, L2, and Elasticnet penalties, meaning the GridSearch would automatically feature select using L1 based on cross-validated accuracy.

Still, it can speed up computation time by initially getting rid of seemingly useless features.

In [98]:
from sklearn.linear_model import LogisticRegression

In [100]:
c_arr = [0.1, 1, 10]

features_dict = {}

# For each regularization strength, apply LASSO and retrieve non-zero features
for C in c_arr:
    lr = LogisticRegression(penalty="l1", solver="saga", C=C, multi_class="multinomial", max_iter=10000)
    lr.fit(X_train_full, y_train_multi)

    features_dict[C] = X_train_full.columns[lr.coef_[0] != 0]



In [101]:
# 0.1 = strongest regularization, most minimal set of features
# 1 = second strongest regularization, moderate set of features
# 10 = least regularization, most features

features_dict

{0.1: Index(['FCVC^2', 'FCVC NCP', 'FCVC CH2O', 'NCP FAF', 'NCP TUE', 'log1p_Age',
        'log1p_Weight', 'exp_NCP', 'Gender_Male', 'CAEC_Frequently'],
       dtype='object'),
 1: Index(['Age', 'Weight', 'Age^2', 'Age Weight', 'FCVC^2', 'FCVC CH2O',
        'FCVC FAF', 'FCVC TUE', 'FCVC Weight', 'NCP FAF', 'NCP TUE',
        'NCP Weight', 'CH2O FAF', 'CH2O TUE', 'TUE^2', 'log1p_FCVC',
        'log1p_CH2O', 'log1p_TUE', 'log1p_Weight', 'exp_FCVC', 'exp_CH2O',
        'exp_FAF', 'Gender_Male', 'FAVC_yes', 'CAEC_Frequently',
        'CAEC_Sometimes', 'SCC_yes', 'CALC_Frequently',
        'MTRANS_Public_Transportation'],
       dtype='object'),
 10: Index(['Age', 'Weight', 'FAF', 'TUE', 'Age^2', 'Age FCVC', 'Age NCP',
        'Age CH2O', 'Age FAF', 'Age Weight', 'FCVC^2', 'FCVC NCP', 'FCVC CH2O',
        'FCVC FAF', 'FCVC TUE', 'FCVC Weight', 'NCP^2', 'NCP CH2O', 'NCP FAF',
        'NCP TUE', 'NCP Weight', 'CH2O^2', 'CH2O FAF', 'CH2O TUE',
        'CH2O Weight', 'FAF^2', 'FAF TUE', 'FAF W