In [9]:
import numpy as np
import pandas as pd

In [10]:
# Read the train and test CSV files, using the `id` column as the row index.
train, test = (
    pd.read_csv(csv_name, index_col='id')
    for csv_name in ("train.csv", "test.csv")
)
# `df` is an independent copy of `train`; later cells operate on `df`.
df = train.copy()

# Pre-Processing

## Categorical Features :
- Binary categorical features : 
    - Gender 
    - family_history_with_overweight (Boolean)
    - FAVC (Boolean)
    - SMOKE (Boolean)
    - SCC (Boolean)
- Features with classes that follow a hierarchy : 
    - CAEC (4 categories : no, Sometimes, Frequently, Always)
    - CALC (3 categories : no, Sometimes, Frequently)
- Other categorical features : 
    - MTRANS (5 categories : Public_Transportation, Automobile, Walking, Motorbike, Bike)
    - NObeyesdad (Target)
## Numerical Features : 
- Features that might need to be normalized : 
    - Age
    - Height
    - Weight
- Features that look like they are categories : 
    - FCVC
    - NCP
    - CH2O
    - FAF
    - TUE

In [11]:
from sklearn.preprocessing import (
    OrdinalEncoder, 
    OneHotEncoder, 
    StandardScaler, 
    MinMaxScaler,
    PowerTransformer, 
    FunctionTransformer
)
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import make_column_transformer, ColumnTransformer

In [12]:
# Names of every column of `df` whose dtype is int64 or float64.
numeric_features = list(
    df.select_dtypes(include=["int64", "float64"]).columns
)
numeric_features

['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']

In [30]:
# Numeric columns that look like categories (see the pre-processing notes).
map_features = ["FCVC", "NCP", "CH2O", "FAF", "TUE"]

# Every numeric column: Age/Height/Weight followed by the category-like ones.
norm_features = ["Age", "Height", "Weight", *map_features]

# All categorical input columns.
# NOTE: a dedicated `onehot_features = ["MTRANS"]` list is currently disabled;
# MTRANS is kept in `ordinal_features` instead.
ordinal_features = [
    "Gender",
    "family_history_with_overweight",
    "FAVC",
    "SMOKE",
    "SCC",
    "CAEC",
    "CALC",
    "MTRANS",
]

In [31]:
# Power transforms; standardize=False means the transformed output is not
# additionally rescaled to zero mean / unit variance.
yeo_johnson_transformer = PowerTransformer(
    method='yeo-johnson',
    standardize=False,
)
boxcox_transformer = PowerTransformer(
    method='box-cox',
    standardize=False,
)

# Standardisation for numeric columns, wrapped in a single-step pipeline.
numeric_transformer = make_pipeline(
    StandardScaler(),
)

# Categorical encoders, each wrapped in a single-step pipeline.
ordinal_transformer = make_pipeline(
    OrdinalEncoder(
        handle_unknown='use_encoded_value',
        # Code given to categories not seen during fit.
        # NOTE(review): 5 must stay outside the codes used by fitted
        # categories (the largest feature listed in the notes, MTRANS, has
        # 5 categories -> codes 0-4) -- confirm if features are added.
        unknown_value=5,
    ),
)
onehot_transformer = make_pipeline(
    # Categories unseen during fit are ignored rather than raising an error.
    OneHotEncoder(handle_unknown='ignore'),
)

In [32]:
def calculate_bmi(df):
    """Return a copy of ``df`` with an added ``bmi`` column.

    ``bmi`` is computed element-wise as ``Weight / Height ** 2``.
    The frame passed in is left unmodified, so the function is safe to
    re-run in a notebook cell or to use inside a ``FunctionTransformer``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``Weight`` and ``Height`` columns.
        NOTE(review): presumably kilograms and metres -- confirm against the
        dataset description.

    Returns
    -------
    pandas.DataFrame
        New frame with every original column plus ``bmi`` (an existing
        ``bmi`` column is overwritten in the returned copy).

    Raises
    ------
    KeyError
        If ``Weight`` or ``Height`` is missing from ``df``.
    """
    # assign() builds a new frame instead of writing into the caller's one.
    return df.assign(bmi=df['Weight'] / (df['Height'] ** 2))

In [33]:
def dietary_ratio(df):
    """Return a copy of ``df`` with an added ``dietary_ratio`` column.

    ``dietary_ratio`` is computed element-wise as ``FAF / (NCP + CH2O)``.
    The frame passed in is left unmodified, so the function is safe to
    re-run in a notebook cell or to use inside a ``FunctionTransformer``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``FAF``, ``NCP`` and ``CH2O`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame with every original column plus ``dietary_ratio`` (an
        existing ``dietary_ratio`` column is overwritten in the returned copy).

    Raises
    ------
    KeyError
        If ``FAF``, ``NCP`` or ``CH2O`` is missing from ``df``.
    """
    # NOTE(review): rows where NCP + CH2O == 0 would produce inf/NaN here;
    # verify the data never contains such rows.
    denominator = df['NCP'] + df['CH2O']
    # assign() builds a new frame instead of writing into the caller's one.
    return df.assign(dietary_ratio=df['FAF'] / denominator)

In [34]:
# Column-wise preprocessing:
#   * columns listed in `ordinal_features` are one-hot encoded (despite the
#     list's name, `onehot_transformer` is what is applied to them);
#   * columns listed in `norm_features` get a Yeo-Johnson power transform;
#   * any column in neither list is passed through unchanged.
# sparse_threshold=0 forces a dense result: OneHotEncoder produces sparse
# output by default, and the MinMaxScaler chained after this step does not
# accept sparse input. With the default threshold (0.3) the output type would
# depend on how sparse the stacked data happens to be.
transformer = make_column_transformer(
    (onehot_transformer, ordinal_features),
    (yeo_johnson_transformer, norm_features),
    remainder='passthrough',
    sparse_threshold=0,
)

In [35]:
# Full preprocessing chain: apply the column-wise `transformer`, then pass
# its output through a MinMaxScaler.
preprocessor = make_pipeline(
    transformer,
    MinMaxScaler(),
)