<a href="https://colab.research.google.com/github/boeyjw/kaggle-store/blob/main/2024/S04E02_Obesity/02_autogluon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%shell
# Refresh the packaging toolchain before installing the heavy dependencies.
pip install -U pip
pip install -U setuptools wheel

# CPU version of pytorch has smaller footprint - see installation instructions in
# pytorch documentation - https://pytorch.org/get-started/locally/
pip install torch==2.0.1+cpu torchvision==0.15.2+cpu --index-url https://download.pytorch.org/whl/cpu

# Pin AutoGluon so a fresh runtime installs the same release that the training
# log recorded in this notebook reports ("AutoGluon Version:  1.0.0"); an
# unpinned install may pull a newer release than the one the saved predictors
# were written with.
pip install autogluon==1.0.0

Collecting pip
  Downloading pip-24.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.1.2
    Uninstalling pip-23.1.2:
      Successfully uninstalled pip-23.1.2
Successfully installed pip-24.0
Collecting setuptools
  Downloading setuptools-69.1.0-py3-none-any.whl.metadata (6.1 kB)
Downloading setuptools-69.1.0-py3-none-any.whl (819 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m819.3/819.3 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: setuptools
  Attempting uninstall: setuptools
    Found existing installation: setuptools 67.7.2
    Uninstalling setuptools-67.7.2:
      Successfully uninstalled setuptools-67.7.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This b



# Init

In [None]:
from pathlib import Path

import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import ConfusionMatrixDisplay

from autogluon.tabular import TabularDataset, TabularPredictor

# Seed passed as random_state to the train_test_split calls below.
SEED = 1234567890
# Name of the target column (column names are stripped and lower-cased on load).
LABEL = "nobeyesdad"
# Root folder for input data, saved predictors and submission files.
# NOTE(review): this is a Google Drive path, but no cell in view mounts Drive —
# confirm it is mounted before running on a fresh runtime.
data_p = Path("/content/drive/MyDrive/Colab Notebooks/2024/S04E02_Obesity")

# AutoGluon

## From raw

In [None]:
# Read each competition file, index it by the raw "id" column, then normalise
# the remaining column names (trim whitespace, lower-case).
train = (
    pd.read_csv(data_p / "train.csv.gz")
    .set_index("id")
    .rename(columns=lambda col: col.strip().lower())
)
test = (
    pd.read_csv(data_p / "test.csv.gz")
    .set_index("id")
    .rename(columns=lambda col: col.strip().lower())
)

train.shape, test.shape

((20758, 17), (13840, 16))

In [None]:
# Stratified 80/20 split of the raw frame. Only the feature halves are kept;
# the label halves are discarded because the AutoGluon datasets are rebuilt
# from `train` using the split's row index.
X_train, X_val, _, _ = train_test_split(
    train.drop(columns=[LABEL]),
    train[LABEL],
    test_size=0.2,
    stratify=train[LABEL],
    random_state=SEED,
)
X_train.shape, X_val.shape

((16606, 16), (4152, 16))

In [None]:
# X_train / X_val carry the "id" labels of `train` as their index (set via
# set_index("id") on load), so rows must be selected by label with .loc.
# Positional .iloc only gives the same rows when ids happen to be 0..n-1 in
# file order.
ag_train = TabularDataset(train.loc[X_train.index])
ag_val = TabularDataset(train.loc[X_val.index])

In [None]:
save_p = data_p / "autogluon"

# Train only when no predictor directory exists yet; an existing directory is
# treated as a finished run and reused as-is.
if not save_p.exists():
    ag_predictor = TabularPredictor(
        label=LABEL,
        problem_type="multiclass",
        eval_metric="log_loss",
        path=save_p,
    ).fit(
        ag_train,
        time_limit=1800,
        presets="best_quality",
        num_bag_folds=5,
        num_bag_sets=4,
        num_stack_levels=3,
    )

# Both branches end by loading from disk, so later cells always work with the
# persisted predictor.
predictor = TabularPredictor.load(save_p)

In [None]:
# Score the raw-feature predictor on the validation split and display the metrics dict.
predictor.evaluate(ag_val)

{'log_loss': -0.25295145728494617,
 'accuracy': 0.9096820809248555,
 'balanced_accuracy': 0.8995160157501979,
 'mcc': 0.894072552385261}

## From pre1

In [None]:
# Preprocessed training frame plus two objects persisted with joblib.
train_pre1 = pd.read_parquet(data_p / "train_pre1.parquet")
# NOTE: joblib.load unpickles arbitrary Python objects — only load files from a
# trusted source.
target_encode = joblib.load(data_p / "target_encode.pkz")
col_transformer = joblib.load(data_p / "col_transformer.pkz")

X_train_p1, X_val_p1, y_train_p1, y_val_p1 = train_test_split(
    train_pre1.drop(columns=[LABEL]),
    train_pre1[LABEL],
    test_size=0.2,
    stratify=train_pre1[LABEL],
    random_state=SEED,
)

# Decode the targets through target_encode.inverse_transform. Wrapping the
# result in a new Series gives it a fresh 0..n-1 index.
y_train_p1 = pd.Series(target_encode.inverse_transform(y_train_p1))
y_val_p1 = pd.Series(target_encode.inverse_transform(y_val_p1))
X_train_p1.shape, X_val_p1.shape

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


((16606, 22), (4152, 22))

In [None]:
# The decoded targets have a fresh 0..n-1 index, so the feature frames are
# reset to the same positional index before the label column is appended.
ag_train_p1 = TabularDataset(
    pd.concat(
        [X_train_p1.reset_index(drop=True), y_train_p1.rename(LABEL)],
        axis=1,
    )
)
ag_val_p1 = TabularDataset(
    pd.concat(
        [X_val_p1.reset_index(drop=True), y_val_p1.rename(LABEL)],
        axis=1,
    )
)

In [None]:
save_p = data_p / "autogluon_p1"

# Train only when no predictor directory exists yet; an existing directory is
# treated as a finished run and reused as-is.
if not save_p.exists():
    ag_predictor = TabularPredictor(
        label=LABEL,
        problem_type="multiclass",
        eval_metric="log_loss",
        path=save_p,
    ).fit(
        ag_train_p1,
        time_limit=3600,
        presets="best_quality",
        num_bag_folds=5,
        num_bag_sets=4,
        num_stack_levels=3,
    )

# Both branches end by loading from disk, so later cells always work with the
# persisted predictor.
predictor_p1 = TabularPredictor.load(save_p)

In [None]:
# Evaluate on the pre1 validation split. detailed_report=True adds extra
# entries (e.g. "confusion_matrix", read in a later cell) to the returned dict.
met_p1 = predictor_p1.evaluate(ag_val_p1, detailed_report=True)

In [None]:
# Display the whole metrics dict (scalar scores plus the detailed-report
# entries). The dict has no '' key, so it is shown directly rather than indexed.
met_p1

{'log_loss': -0.24701275158499486,
 'accuracy': 0.9120905587668593,
 'balanced_accuracy': 0.9022330491667896,
 'mcc': 0.8968928449481263,
 'confusion_matrix':                      Insufficient_Weight  Normal_Weight  Obesity_Type_I  \
 Insufficient_Weight                  470             32               0   
 Normal_Weight                         26            551               1   
 Obesity_Type_I                         0              1             523   
 Obesity_Type_II                        0              0              13   
 Obesity_Type_III                       0              0               0   
 Overweight_Level_I                     2             35               9   
 Overweight_Level_II                    0             10              41   
 
                      Obesity_Type_II  Obesity_Type_III  Overweight_Level_I  \
 Insufficient_Weight                0                 0                   3   
 Normal_Weight                      0                 0                  3

In [None]:
# Confusion matrix from the detailed report, rendered as a table.
# NOTE(review): presumably rows are true classes and columns are predicted classes — confirm.
met_p1["confusion_matrix"]

Unnamed: 0,Insufficient_Weight,Normal_Weight,Obesity_Type_I,Obesity_Type_II,Obesity_Type_III,Overweight_Level_I,Overweight_Level_II
Insufficient_Weight,470,32,0,0,0,3,0
Normal_Weight,26,551,1,0,0,36,3
Obesity_Type_I,0,1,523,13,3,11,31
Obesity_Type_II,0,0,13,634,0,0,3
Obesity_Type_III,0,0,0,0,808,1,0
Overweight_Level_I,2,35,9,0,0,384,55
Overweight_Level_II,0,10,41,5,0,31,417


In [None]:
# Feature importance of the pre1 predictor, computed on the validation split
# (AutoGluon logs this as permutation shuffling; it can take a long time).
predictor_p1.feature_importance(ag_val_p1)

Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
weight,3.152383,0.049417,7.24405e-09,5,3.254133,3.050633
gender_Male,0.311362,0.006456,2.216807e-08,5,0.324655,0.298069
fcvc,0.285812,0.008343,8.702575e-08,5,0.302991,0.268634
height,0.228739,0.007566,1.433978e-07,5,0.244317,0.213162
age,0.073812,0.002625,1.916822e-07,5,0.079217,0.068407
ch2o,0.032611,0.001849,1.235984e-06,5,0.036419,0.028803
tue,0.025794,0.001091,3.836953e-07,5,0.028041,0.023547
ncp,0.019307,0.001251,2.106546e-06,5,0.021884,0.01673
faf,0.01816,0.001263,2.791246e-06,5,0.020761,0.015559
calc_no,0.014782,0.000526,1.920123e-07,5,0.015864,0.013699


In [None]:
# ColumnTransformer and OneHotEncoder are already imported in the Init cell.

# The encoder below is fitted on `train` with the default handle_unknown="error",
# so any test category it has not seen must be remapped before transform.
# NOTE(review): presumably "Always" never occurs in train["calc"] — confirm.
test.loc[test["calc"] == "Always", "calc"] = "Frequently"

# Categorical features are the object-dtype columns left once the target is
# dropped; this does not depend on where the target sits in the frame.
train_features = train.drop(columns=[LABEL])
catcols = train_features.select_dtypes(include="object").columns

# One-hot encode the categoricals (first level dropped) and pass the numeric
# columns through unchanged, returning a pandas frame with plain column names.
# NOTE(review): this rebinds `col_transformer`, replacing the object loaded from
# col_transformer.pkz — confirm the resulting columns match what predictor_p1
# was trained on.
col_transformer = ColumnTransformer([
    ("", OneHotEncoder(categories="auto", drop="first", sparse_output=False).set_output(transform="pandas"), catcols)
], remainder="passthrough", verbose_feature_names_out=False).set_output(transform="pandas").fit(train_features)
test_p1 = col_transformer.transform(test)

test_p1.head()

Unnamed: 0_level_0,gender_Male,family_history_with_overweight_yes,favc_yes,caec_Frequently,caec_Sometimes,caec_no,smoke_yes,scc_yes,calc_Sometimes,calc_no,...,mtrans_Public_Transportation,mtrans_Walking,age,height,weight,fcvc,ncp,ch2o,faf,tue
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20758,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,26.899886,1.848294,120.644178,2.938616,3.0,2.825629,0.8554,0.0
20759,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,21.0,1.6,66.0,2.0,1.0,3.0,1.0,0.0
20760,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,26.0,1.643355,111.600553,3.0,3.0,2.621877,0.0,0.250502
20761,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,20.979254,1.553127,103.669116,2.0,2.977909,2.786417,0.094851,0.0
20762,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,26.0,1.627396,104.835346,3.0,3.0,2.653531,0.0,0.741069


In [None]:
# Wrap the encoded test frame for AutoGluon and predict a class label per row.
ag_test_p1 = TabularDataset(test_p1)
y_pred_p1 = predictor_p1.predict(ag_test_p1)

In [None]:
# Name the prediction column as the competition expects, take the row labels
# from the encoded test frame, then turn that index into a regular column.
submission_p1 = (
    y_pred_p1.to_frame("NObeyesdad")
    .set_axis(test_p1.index, axis=0)
    .reset_index()
)
submission_p1.head()

Unnamed: 0,id,NObeyesdad
0,20758,Obesity_Type_II
1,20759,Overweight_Level_I
2,20760,Obesity_Type_III
3,20761,Obesity_Type_I
4,20762,Obesity_Type_III


In [None]:
# Write the submission file. index=False leaves out the positional index, so
# only the frame's own columns are written.
submission_p1.to_csv(data_p / "submission_p1.csv", index=False)

## From FeatEng.1

In [None]:
# Feature-engineered exports. Unlike the raw load, "id" stays a regular column
# here because no set_index is applied.
train_p2 = pd.read_csv(data_p / "train_feateng.1.csv.gz")
test_p2 = pd.read_csv(data_p / "test_feateng.1.csv.gz")

X_train_p2, X_val_p2, y_train_p2, y_val_p2 = train_test_split(
    train_p2.drop(columns=[LABEL]),
    train_p2[LABEL],
    test_size=0.2,
    stratify=train_p2[LABEL],
    random_state=SEED,
)
X_train_p2.shape, X_val_p2.shape

((16606, 31), (4152, 31))

In [None]:
# Fitted on the training split only: the first entry drops the listed columns,
# the second one-hot encodes the listed categoricals (binary ones collapse to a
# single column), and every other column — including "id" — passes through.
col_transformer_p2 = ColumnTransformer([
    ("", "drop", ["age", "fcvc", "ncp", "ch2o", "faf", "tue", "calc", "smoke", "_exercise"]),
    ("ohe", OneHotEncoder(drop="if_binary", sparse_output=False).set_output(transform="pandas"), [
        'gender', 'family_history_with_overweight', 'favc', 'caec',
        'scc', '_calc', '_binned_bmi', '_habitually_unhealthy',
        '_eating_habits', '_health_conscious', '_devine_ideal', 'mtrans'
    ])
], verbose_feature_names_out=False, remainder="passthrough").set_output(transform="pandas").fit(X_train_p2)

# Attach the label while features and target still share the row index that
# train_test_split produced, and only then make "id" the index. Re-indexing by
# "id" first would align features (id labels) against targets (row positions),
# which is only correct when id equals row position; join="inner" would drop
# any mismatch silently.
ag_train_p2 = TabularDataset(
    pd.concat(
        [col_transformer_p2.transform(X_train_p2), y_train_p2.to_frame(LABEL)],
        axis=1,
        join="inner",
    ).set_index("id")
)
ag_val_p2 = TabularDataset(
    pd.concat(
        [col_transformer_p2.transform(X_val_p2), y_val_p2.to_frame(LABEL)],
        axis=1,
        join="inner",
    ).set_index("id")
)

In [None]:
save_p = data_p / "autogluon_p2.1"

# Train only when no predictor directory exists yet; an existing directory is
# treated as a finished run and reused as-is.
if not save_p.exists():
    ag_predictor = TabularPredictor(
        label=LABEL,
        problem_type="multiclass",
        eval_metric="log_loss",
        path=save_p,
    ).fit(
        ag_train_p2,
        time_limit=3600,
        presets="best_quality",
        num_bag_folds=5,
        num_bag_sets=4,
        num_stack_levels=3,
    )

# Both branches end by loading from disk, so later cells always work with the
# persisted predictor.
predictor_p2 = TabularPredictor.load(save_p)

Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=3, num_bag_folds=5, num_bag_sets=4
Dynamic stacking is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
Detecting stacked overfitting by sub-fitting AutoGluon on the input data. That is, copies of AutoGluon will be sub-fit on subset(s) of the data. Then, the holdout validation data is used to detect stacked overfitting.
Sub-fit(s) time limit is: 3600 seconds.
Starting holdout-based sub-fit for dynamic stacking. Context path is: /content/drive/MyDrive/Colab Notebooks/2024/S04E02_Obesity/autogluon_p2.1/ds_sub_fit/sub_fit_ho.
Beginning AutoGluon training ... Time limit = 900s
AutoGluon will save models to "/content/drive/MyDrive/Colab Notebooks/2024/S04E02_Obesity/autogluon_p2.1/ds_sub_fit/sub_fit_ho"
AutoGluon Version:  1.0.0
Python Version:     3.10.12
Operating System:   Lin

### P2 Metrics

In [None]:
# Evaluate the FeatEng.1 predictor on its validation split. detailed_report=True
# adds extra entries such as "confusion_matrix" to the returned dict.
met_p2 = predictor_p2.evaluate(ag_val_p2, detailed_report=True)

In [None]:
# Display the whole metrics dict for the P2 run.
met_p2

{'log_loss': -0.2755885470900383,
 'accuracy': 0.9065510597302505,
 'balanced_accuracy': 0.8967035939303539,
 'mcc': 0.8904012736893311,
 'confusion_matrix':                      Insufficient_Weight  Normal_Weight  Obesity_Type_I  \
 Insufficient_Weight                  472             30               0   
 Normal_Weight                         31            545               3   
 Obesity_Type_I                         0              0             514   
 Obesity_Type_II                        0              0              17   
 Obesity_Type_III                       0              0               0   
 Overweight_Level_I                     1             40               8   
 Overweight_Level_II                    0             11              40   
 
                      Obesity_Type_II  Obesity_Type_III  Overweight_Level_I  \
 Insufficient_Weight                0                 0                   3   
 Normal_Weight                      0                 0                  36

In [None]:
# Confusion matrix from the detailed report, rendered as a table.
# NOTE(review): presumably rows are true classes and columns are predicted classes — confirm.
met_p2["confusion_matrix"]

Unnamed: 0,Insufficient_Weight,Normal_Weight,Obesity_Type_I,Obesity_Type_II,Obesity_Type_III,Overweight_Level_I,Overweight_Level_II
Insufficient_Weight,472,30,0,0,0,3,0
Normal_Weight,31,545,3,0,0,36,2
Obesity_Type_I,0,0,514,18,3,12,35
Obesity_Type_II,0,0,17,628,0,1,4
Obesity_Type_III,0,0,0,0,808,1,0
Overweight_Level_I,1,40,8,0,0,380,56
Overweight_Level_II,0,11,40,2,0,34,417


In [None]:
# Feature importance of the P2 predictor, computed on the validation split
# (AutoGluon logs this as permutation shuffling; it can take a long time).
predictor_p2.feature_importance(ag_val_p2)

Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
_weight,0.684518,0.016676,4.223304e-08,5,0.718854,0.650182
gender_Male,0.540934,0.012165,3.06716e-08,5,0.565981,0.515886
_bmi,0.414256,0.010408,4.777668e-08,5,0.435686,0.392826
_binned_fcvc,0.145714,0.003746,5.239534e-08,5,0.153428,0.137999
_binned_bmi_Overweight,0.033029,0.002715,5.430709e-06,5,0.03862,0.027439
_binned_bmi_Obese,0.027099,0.00216,4.800358e-06,5,0.031546,0.022652
_height,0.019614,0.002051,1.413687e-05,5,0.023837,0.015392
_binned_ch2o,0.0178,0.001901,1.536387e-05,5,0.021713,0.013886
family_history_with_overweight_yes,0.014886,0.002074,4.40435e-05,5,0.019156,0.010616
_binned_faf,0.014857,0.000686,5.430403e-07,5,0.016268,0.013445


### P3 Metrics

In [None]:
# Metrics for the "P3" run, stored under met_p3.
# NOTE(review): this scores `predictor_p2` on `ag_val_p2`, the same names used
# under "P2 Metrics", and no cell in view rebinds them to a P3 predictor or
# dataset. The recorded scores differ from P2, so the kernel state must have
# changed in between — confirm which saved predictor and features this cell is
# meant to evaluate.
met_p3 = predictor_p2.evaluate(ag_val_p2, detailed_report=True)
met_p3

{'log_loss': -0.27666351667994726,
 'accuracy': 0.9075144508670521,
 'balanced_accuracy': 0.8975910957771509,
 'mcc': 0.891527269701327,
 'confusion_matrix':                      Insufficient_Weight  Normal_Weight  Obesity_Type_I  \
 Insufficient_Weight                  472             30               0   
 Normal_Weight                         30            548               2   
 Obesity_Type_I                         0              0             515   
 Obesity_Type_II                        0              0              17   
 Obesity_Type_III                       0              0               0   
 Overweight_Level_I                     1             37              10   
 Overweight_Level_II                    0             11              41   
 
                      Obesity_Type_II  Obesity_Type_III  Overweight_Level_I  \
 Insufficient_Weight                0                 0                   3   
 Normal_Weight                      0                 0                  34

In [None]:
# Confusion matrix from the detailed report of the P3 evaluation.
# NOTE(review): presumably rows are true classes and columns are predicted classes — confirm.
met_p3["confusion_matrix"]

Unnamed: 0,Insufficient_Weight,Normal_Weight,Obesity_Type_I,Obesity_Type_II,Obesity_Type_III,Overweight_Level_I,Overweight_Level_II
Insufficient_Weight,472,30,0,0,0,3,0
Normal_Weight,30,548,2,0,0,34,3
Obesity_Type_I,0,0,515,18,3,12,34
Obesity_Type_II,0,0,17,629,0,1,3
Obesity_Type_III,0,0,0,0,808,1,0
Overweight_Level_I,1,37,10,0,0,381,56
Overweight_Level_II,0,11,41,2,0,35,415


In [None]:
# Permutation feature importance for the P3 run on the validation split.
# Slow: the recorded run logged about 2125s for 36 features and 5 shuffle sets.
# NOTE(review): reuses `predictor_p2` / `ag_val_p2` — confirm these refer to the P3 artifacts.
predictor_p2.feature_importance(ag_val_p2)

Computing feature importance via permutation shuffling for 36 features using 4152 rows with 5 shuffle sets...
	3700.35s	= Expected runtime (740.07s per shuffle set)
	2125.15s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
weight,0.657836,0.016705,4.985742e-08,5,0.692232,0.62344
gender_Male,0.528207,0.01176,2.946747e-08,5,0.552421,0.503992
_bmi,0.342908,0.008664,4.887282e-08,5,0.360748,0.325068
_binned_fcvc,0.14364,0.003253,3.154331e-08,5,0.150338,0.136942
_binned_bmi_Obese I,0.038006,0.001722,5.038866e-07,5,0.041551,0.034461
_binned_bmi_Overweight,0.027034,0.001655,1.676485e-06,5,0.030441,0.023626
_binned_bmi_Healthy,0.019882,0.0019,9.887979e-06,5,0.023794,0.01597
height,0.019556,0.001974,1.228761e-05,5,0.02362,0.015492
_binned_ch2o,0.018331,0.00189,1.337509e-05,5,0.022223,0.014439
_binned_faf,0.015798,0.000846,9.844473e-07,5,0.01754,0.014055
