In [23]:
from IPython.display import display, Markdown

import pandas as pd
import numpy as np
import joblib
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import ShuffleSplit, GridSearchCV, KFold, cross_validate
from sklearn import model_selection
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

import warnings
warnings.filterwarnings('ignore')

# 1. Obtenção dos dados


Nessa fase do processo obtemos o conjunto de dados e dicionário de dados.

In [65]:
# Relative paths to the raw dataset and to its data dictionary.
raw_data_path = '../data/raw/obesity_dataset.csv'
dictionary_path = '../data/external/dictionary.csv'

df = pd.read_csv(raw_data_path)
df_dict = pd.read_csv(dictionary_path)

# Show the data dictionary as the cell output.
df_dict

Unnamed: 0,variavel,descricao,tipo,subtipo
0,Age,idade do indivíduo,quantitativa,contínua
1,Gender,gênero do indivíduo,qualitativa,nominal
2,Height,altura do indiíduo,quantitativa,contínua
3,Weight,peso do indivíduo,quantitativa,contínua
4,CALC,frequência do consumo de álcool pelo indivíduo,qualitativa,ordinal
5,FAVC,indica se o indivíduo consome comidas altament...,qualitativa,nominal
6,FCVC,indica o nível de consumo de vegetais nas refe...,quantitativa,discreta
7,NCP,quantas refeições principais o indivíduo faz d...,quantitativa,contínua
8,SCC,indica se o indivíduo monitora as calorias ing...,qualitativa,nominal
9,SMOKE,indica se o indivíduo fuma ou não,qualitativa,nominal


# 2. Preparação dos dados

Nessa fase realizamos as principais transformações no conjunto de dados bruto já visando a modelagem.

Para as variáveis nominais, utilizamos a técnica do [OneHotEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html) para fazer o encoding das variáveis categóricas. Para as variáveis ordinais usamos o [OrdinalEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html), e empregamos o [StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html#standardscaler) para a padronização (normalização z-score) das variáveis contínuas, ordinais e discretas.

Não aplicamos nenhum método para tratar dados ausentes, pois o conjunto de dados não possui valores faltantes.

In [70]:
target_column = 'NObeyesdad'

# Rows of the data dictionary that describe features. The target is removed
# here, once, so that none of the column lists below can contain it, whatever
# subtype the dictionary assigns to it. X (built below) does not have that
# column, so it must not be requested from X by any later step.
df_dict_features = df_dict.query("variavel != @target_column")

# Feature names grouped by the `subtipo` recorded in the data dictionary.
colunas_nominal = df_dict_features.query("subtipo == 'nominal'").variavel.to_list()
colunas_continua = df_dict_features.query("subtipo == 'contínua'").variavel.to_list()
colunas_ordinal = df_dict_features.query("subtipo == 'ordinal'").variavel.to_list()
colunas_discreta = df_dict_features.query("subtipo == 'discreta'").variavel.to_list()

# Independent variables: every column of the dataset except the target.
X = df.drop(columns=[target_column])
# Dependent variable.
y = df[target_column]

In [67]:
# One preprocessing pipeline per variable subtype. The step names
# ('encoding', 'normalization') are kept exactly as they were.
preprocessador_nominal = Pipeline(steps=[
    # One-hot encoding with dense output, dropping the first level of each variable.
    ('encoding', OneHotEncoder(sparse_output=False, drop='first')),
])
preprocessador_continua = Pipeline(steps=[('normalization', StandardScaler())])
preprocessador_ordinal = Pipeline(steps=[
    # NOTE(review): no explicit `categories` is passed, so the integer codes follow
    # the encoder's default category ordering -- confirm that this matches the
    # intended order of the ordinal variables.
    ('encoding', OrdinalEncoder()),
    ('normalization', StandardScaler()),
])
preprocessador_discreta = Pipeline(steps=[('normalization', StandardScaler())])

# Route each group of columns to the pipeline of its subtype.
column_pipelines = [
    ('nominal', preprocessador_nominal, colunas_nominal),
    ('continuous', preprocessador_continua, colunas_continua),
    ('ordinal', preprocessador_ordinal, colunas_ordinal),
    ('discreta', preprocessador_discreta, colunas_discreta),
]
preprocessor = ColumnTransformer(transformers=column_pipelines)


# 3. Metodologia

Iremos analisar quatro modelos, comparados por meio de validação cruzada com ShuffleSplit e com os hiperparâmetros de cada um ajustados por busca em grade com KFold, a saber:

- Logistic Regression
- K-Nearest-Neighbors
- Naive Bayes (Gaussiano)
- Decision Tree

Além disso, cada um desses algoritmos será testado com diferentes hiperparâmetros, para que possamos encontrar o melhor modelo e a melhor configuração possível para esse modelo.

Utilizaremos as seguintes métricas para análise:

- **Acurácia (accuracy)**: proporção entre os dados que foram corretamente previstos (como positivos ou negativos) com o total de dados observados;
- **Precisão (precision)**: proporção entre os dados corretamente previstos como positivos e o total de dados previstos como positivos.
- **Recall**: proporção entre os dados corretamente previstos como positivos e o total de observações realmente positivas.
- **F1-score**: média harmônica entre precision e recall, portanto levando em conta tanto falsos positivos quanto falsos negativos.

# 4. Configuração do experimento

In [19]:
# --- Evaluation protocol ---
n_splits_comparative_analysis = 10  # number of ShuffleSplit resamplings used to compare models
n_folds_grid_search = 5             # number of KFold folds used by each grid search
test_size = 0.2                     # fraction held out in each comparative split
random_state = 42
scoring = 'accuracy'                # criterion passed to the grid search
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']  # reported by cross_validate

# --- Candidate models: (display name, estimator, hyper-parameter grid) ---
max_iter = 6000
models = [
    (
        'K-Nearest Neighbors',
        KNeighborsClassifier(),
        {"n_neighbors": range(3, 20, 2), 'weights': ['uniform', 'distance']},
    ),
    (
        'Decision Tree',
        DecisionTreeClassifier(random_state=random_state),
        {'criterion': ['gini', 'entropy'], 'max_depth': [3, 6, 8]},
    ),
    (
        'Logistic Regression',
        LogisticRegression(max_iter=max_iter, solver='liblinear'),
        {'C': np.logspace(-4, 4, 20)},
    ),
    # Empty grid: the grid search has nothing to tune for this model.
    ('Naive Bayes', GaussianNB(), {}),
]

In [20]:
# Inner splitter (hyper-parameter search) and outer splitter (model comparison).
cross_validate_grid_search = KFold(n_splits=n_folds_grid_search)
cross_validate_comparative_analysis = ShuffleSplit(
    n_splits=n_splits_comparative_analysis,
    test_size=test_size,
    random_state=random_state,
)

# Per-model score tables are collected here and concatenated once after the
# loop, instead of re-concatenating a growing DataFrame on every iteration
# (which also meant concatenating with an initially empty frame).
scores_per_model = []
for model_name, model_object, model_parameters in models:
    print(f"running {model_name}...")

    # Hyper-parameter search for this model.
    model_grid_search = GridSearchCV(
        estimator=model_object,
        param_grid=model_parameters,
        scoring=scoring,
        n_jobs=-1,
        cv=cross_validate_grid_search,
    )

    # NOTE(review): the preprocessor is a pipeline step placed before the grid
    # search, so it is fitted on the data handed to the pipeline, not separately
    # for each fold of the grid search.
    approach = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model_grid_search),
    ])

    # Evaluate the whole approach (preprocessing + search) on the outer splits.
    scores = cross_validate(
        estimator=approach,
        X=X,
        y=y,
        cv=cross_validate_comparative_analysis,
        n_jobs=-1,
        scoring=metrics,
    )

    # Tag each of the outer-split rows with the model they belong to.
    scores['model_name'] = [model_name] * n_splits_comparative_analysis
    model_scores = pd.DataFrame(scores)

    # Mean and standard deviation of the numeric columns for this model.
    display(model_scores.select_dtypes(include=[float, int]).agg(['mean', 'std']))
    scores_per_model.append(model_scores)

# One row per (model, outer split); an empty frame if no model was evaluated.
results = (
    pd.concat(scores_per_model, ignore_index=True)
    if scores_per_model
    else pd.DataFrame({})
)

running K-Nearest Neighbors...


Unnamed: 0,fit_time,score_time,test_accuracy,test_precision_macro,test_recall_macro,test_f1_macro
mean,2.389555,0.086119,0.850118,0.844595,0.847497,0.837594
std,0.923504,0.04836,0.015125,0.018283,0.015549,0.015111


running Decision Tree...


Unnamed: 0,fit_time,score_time,test_accuracy,test_precision_macro,test_recall_macro,test_f1_macro
mean,0.767775,0.082733,0.947045,0.945898,0.945739,0.945329
std,0.141477,0.087966,0.0096,0.010764,0.010936,0.011123


running Logistic Regression...


Unnamed: 0,fit_time,score_time,test_accuracy,test_precision_macro,test_recall_macro,test_f1_macro
mean,8.34686,0.130523,0.773286,0.777509,0.774252,0.769798
std,2.51998,0.097398,0.016434,0.014955,0.012634,0.014826


running Naive Bayes...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,fit_time,score_time,test_accuracy,test_precision_macro,test_recall_macro,test_f1_macro
mean,0.161433,0.085349,0.5026,0.465833,0.503406,0.416058
std,0.016938,0.042493,0.025685,0.06051,0.022889,0.030135


In [25]:
def highlight_best(s, props=''):
    """Return per-cell CSS for one row of the summary table, marking the best value.

    Intended for ``Styler.apply(..., axis=1)``, where ``s`` is a row whose name
    is a ``(metric, statistic)`` pair.

    Parameters
    ----------
    s : pandas.Series
        One row of values; ``s.name[0]`` is the metric name and ``s.name[1]``
        the statistic name.
    props : str, default ''
        CSS declarations applied to the best cell(s) of the row.

    Returns
    -------
    numpy.ndarray
        One string per cell: ``props`` where the cell holds the best value of
        the row, ``''`` elsewhere. Rows whose statistic is ``'std'`` are never
        highlighted. For metrics whose name ends with ``'time'`` the best value
        is the minimum, otherwise the maximum; NaNs are ignored when looking
        for the best value.
    """
    metric, statistic = s.name[0], s.name[1]
    if statistic == 'std':
        # Always hand back an array of the row's length rather than falling
        # off the end of the function and returning None.
        return np.full(len(s), '')
    if metric.endswith('time'):
        best = np.nanmin(s.values)  # lower is better for timings
    else:
        best = np.nanmax(s.values)  # higher is better for scores
    return np.where(s == best, props, '')

display(Markdown("### 4.1 Resultados gerais"))

# Mean and standard deviation of every score column per model, transposed so
# that models become columns and (metric, statistic) pairs become rows.
summary_by_model = results.groupby('model_name').agg(['mean', 'std']).T

best_value_css = 'color:white;background-color:gray;font-weight: bold;'
centered_cells = [{'selector': 'td', 'props': 'text-align: center;'}]

# Styled table: best value of each row highlighted, cell text centered.
(
    summary_by_model.style
    .apply(highlight_best, props=best_value_css, axis=1)
    .set_table_styles(centered_cells)
)

### 4.1 Resultados gerais

Unnamed: 0,model_name,Decision Tree,K-Nearest Neighbors,Logistic Regression,Naive Bayes
fit_time,mean,0.767775,2.389555,8.34686,0.161433
fit_time,std,0.141477,0.923504,2.51998,0.016938
score_time,mean,0.082733,0.086119,0.130523,0.085349
score_time,std,0.087966,0.04836,0.097398,0.042493
test_accuracy,mean,0.947045,0.850118,0.773286,0.5026
test_accuracy,std,0.0096,0.015125,0.016434,0.025685
test_precision_macro,mean,0.945898,0.844595,0.777509,0.465833
test_precision_macro,std,0.010764,0.018283,0.014955,0.06051
test_recall_macro,mean,0.945739,0.847497,0.774252,0.503406
test_recall_macro,std,0.010936,0.015549,0.012634,0.022889


### 4.2 Persistência do modelo

In [22]:
# Take the Decision Tree entry (name, estimator, grid) from the candidate list.
decision_tree_entries = [entry for entry in models if entry[0] == "Decision Tree"]
model_name, model_object, model_parameters = decision_tree_entries[0]

# Grid search over the Decision Tree hyper-parameters.
model_grid_search = GridSearchCV(
    estimator=model_object,
    param_grid=model_parameters,
    scoring=scoring,
    n_jobs=-1,
    cv=cross_validate_grid_search,
)

# Preprocessing followed by the grid search, as a single estimator.
final_steps = [("preprocessor", preprocessor), ("model", model_grid_search)]
approach = Pipeline(steps=final_steps)

# Fit the whole approach on the full dataset.
approach.fit(X, y)

# Report the hyper-parameters selected by the grid search.
best_params = approach.named_steps["model"].best_params_
print(f"Hiper parâmetros do modelo: {best_params}")

Hiper parâmetros do modelo: {'criterion': 'entropy', 'max_depth': 8}


In [24]:
# Persist the fitted approach (preprocessor + grid-searched model) to disk.
joblib.dump(value=approach, filename='../models/model.joblib')

['../models/model.joblib']

# 5. Resultados e discussão

O modelo de classificação Árvore de Decisão obteve o melhor desempenho, pois nos nossos testes de:

- Accuracy
- Precision
- Recall
- f1-score

o modelo foi superior a todos os demais.