# Otimização de Hiperparâmetros

## Carregando os dados

In [2]:
import os

import numpy as np
import pandas as pd

In [3]:
# Location of the ABT CSV file. The original local path is kept as the default
# so existing runs behave exactly as before; set the ABT_CSV_PATH environment
# variable to point at the file when running on another machine.
abt_csv_path = os.environ.get(
    'ABT_CSV_PATH',
    'C:\\Users\\HP\\Documents\\GitHub\\Case ML\\propensao_revenda_abt.csv',
)
df_abt = pd.read_csv(abt_csv_path)
# Preview the first rows as the cell output.
df_abt.head()

Unnamed: 0,data_ref_safra,seller_id,uf,tot_orders_12m,tot_items_12m,tot_items_dist_12m,receita_12m,recencia,nao_revendeu_next_6m
0,2018-01-01,0015a82c2db000af6aaaf3ae2ecb0532,SP,3,3,1,2685.0,74,1
1,2018-01-01,001cca7ae9ae17fb1caed9dfb1094831,ES,171,207,9,21275.23,2,0
2,2018-01-01,002100f778ceb8431b7a1020ff7ab48f,SP,38,42,15,781.8,2,0
3,2018-01-01,003554e2dce176b5555353e4f3555ac8,GO,1,1,1,120.0,16,1
4,2018-01-01,004c9cd9d87a3c30c522c48c4fc07416,SP,130,141,75,16228.88,8,0


In [4]:
# Row count for each distinct value of the reference-date column.
df_abt.loc[:, 'data_ref_safra'].value_counts()

2018-06-01    2213
2018-05-01    2104
2018-04-01    1941
2018-03-01    1874
2018-02-01    1805
2018-01-01    1690
Name: data_ref_safra, dtype: int64

In [5]:
# Temporal split on the reference date: rows before 2018-03-01 are used for
# training, rows of exactly 2018-03-01 form the out-of-time (OOT) set.
# The column is compared against string literals, as in the original query.
df_abt_train = df_abt.loc[df_abt['data_ref_safra'] < "2018-03-01"]
df_abt_oot = df_abt.loc[df_abt['data_ref_safra'] == "2018-03-01"]

In [6]:
# Key columns of the table (not part of the feature list below)
key_vars = [
    'data_ref_safra',
    'seller_id',
]

# Numeric columns
num_vars = [
    'tot_orders_12m',
    'tot_items_12m',
    'tot_items_dist_12m',
    'receita_12m',
    'recencia',
]

# Categorical columns
cat_vars = ['uf']

# Response / target column
target = 'nao_revendeu_next_6m'

# Full feature list: categorical columns first, then the numeric ones
features = [*cat_vars, *num_vars]

# Training inputs: only the feature columns of the training frame
X_train = df_abt_train.loc[:, features]
# Training labels: the target column of the same rows
y_train = df_abt_train.loc[:, target]

In [None]:
!pip install feature-engine==1.0.2

In [7]:
from sklearn.pipeline import Pipeline
from feature_engine.imputation import ArbitraryNumberImputer
from feature_engine.imputation import CategoricalImputer
from feature_engine.encoding import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

# Steps run in order: impute the numeric columns with the sentinel -999,
# impute the categorical columns (imputer defaults), one-hot encode the
# categorical columns, then fit a decision tree with a fixed seed.
dt_pipe_steps = [
    ('numeric_imputer', ArbitraryNumberImputer(variables=num_vars, arbitrary_number=-999)),
    ('categoric_imputer', CategoricalImputer(variables=cat_vars)),
    ('one_hot_encoder', OneHotEncoder(variables=cat_vars)),
    ('model', DecisionTreeClassifier(random_state=42)),
]
dt_pipe = Pipeline(steps=dt_pipe_steps)

In [9]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold

# 5-fold stratified splitter, shuffled with a fixed seed so the folds are reproducible.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Baseline: cross-validated ROC AUC of the pipeline with default tree settings.
cv_results = cross_validate(
    estimator=dt_pipe,
    X=X_train,
    y=y_train,
    scoring='roc_auc',
    cv=skf,
    n_jobs=-1,
)
# One row per fold (fit time, score time, test score).
cv_results_df = pd.DataFrame(cv_results)
cv_results_df

Unnamed: 0,fit_time,score_time,test_score
0,0.086768,0.031914,0.738241
1,0.075798,0.032912,0.760813
2,0.076795,0.03491,0.794548
3,0.046873,0.016957,0.73907
4,0.074799,0.01895,0.750518


In [10]:
# Mean ROC AUC across the cross-validation folds
cv_results_df.loc[:, 'test_score'].mean()

0.7566381766381767

In [11]:
# Hyperparameters of the pipeline's final step (registered under the name 'model').
dt_pipe.named_steps['model'].get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': 'deprecated',
 'random_state': 42,
 'splitter': 'best'}

# GridSearch

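Iremos utilizar a classe `GridSearchCV` do scikit-learn, que avalia por validação cruzada todas as combinações de hiperparâmetros definidas no grid.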

**Parâmetros**
* `estimator`: modelo de machine learning, também chamado estimador
* `param_grid`: grid em forma de dicionário com os parâmetros em que a busca será realizada
* `scoring`: métrica a ser otimizada
* `cv`: estratégia de validação a ser utilizada
* `n_jobs`: quantidade de cores do processador a ser utilizada para realizar o processamento em paralelo. O valor `-1` significa que todos os cores serão utilizados
* `refit`: se `True`, retreina o melhor modelo encontrado usando toda a base de dados passada ao `fit`
* `verbose`: mostra as mensagens. Quanto maior o número inteiro, mais mensagens serão mostradas.

In [12]:
from sklearn.model_selection import GridSearchCV

# Candidate tree depths 2..7; the 'model__' prefix addresses the pipeline
# step named 'model'.
grid_parametros = {'model__max_depth': list(range(2, 8))}

# Exhaustive search scored by ROC AUC on the `skf` folds; refit=True retrains
# the best candidate after the search.
grid_search = GridSearchCV(
    estimator=dt_pipe,
    param_grid=grid_parametros,
    scoring='roc_auc',
    cv=skf,
    n_jobs=-1,
    refit=True,
    verbose=1,
)

In [13]:
# Run the search on the training data.
grid_search.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    0.6s finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('numeric_imputer',
                                        ArbitraryNumberImputer(arbitrary_number=-999,
                                                               variables=['tot_orders_12m',
                                                                          'tot_items_12m',
                                                                          'tot_items_dist_12m',
                                                                          'receita_12m',
                                                                          'recencia'])),
                                       ('categoric_imputer',
                                        CategoricalImputer(variables=['uf'])),
                                       ('one_hot_encoder',
                                        OneHotEncoder(variables=['uf'])),
                                       ('model',


In [14]:
# Best hyperparameter combination found by the grid search
best_params = grid_search.best_params_
best_params

{'model__max_depth': 4}

In [15]:
# Best score of the search — here the mean ROC AUC over the 5 folds of `skf`
best_score = grid_search.best_score_
best_score

0.8911939911939912

In [16]:
# Best pipeline from the grid (available because the search was built with refit=True)
best_dt_model = grid_search.best_estimator_

## Avaliando a performance na base out of time

In [17]:
from sklearn.metrics import roc_auc_score

# Out-of-time inputs and labels, using the same feature columns as training
X_oot = df_abt_oot.loc[:, features]
y_oot = df_abt_oot.loc[:, target]

# Second predict_proba column is scored against the OOT labels
y_proba_oot = best_dt_model.predict_proba(X_oot)[:, 1]
rocauc_oot = roc_auc_score(y_true=y_oot, y_score=y_proba_oot)
print(f"Decision Tree: ROCAUC OOT = {rocauc_oot}")

Decision Tree: ROCAUC OOT = 0.8955542777450486


# Grid Search com mais hiperparâmetros

In [18]:
from sklearn.model_selection import GridSearchCV

# Wider grid: 6 depths x 2 split criteria x 2 class-weight settings = 24 candidates.
# Each key addresses the pipeline step named 'model' via the 'model__' prefix.
grid_parametros = dict(
    model__max_depth=list(range(2, 8)),
    model__criterion=['gini', 'entropy'],
    model__class_weight=['balanced', None],
)

# Same search configuration as before: ROC AUC on the `skf` folds, best candidate refit.
grid_search = GridSearchCV(
    estimator=dt_pipe,
    param_grid=grid_parametros,
    scoring='roc_auc',
    cv=skf,
    n_jobs=-1,
    verbose=1,
    refit=True,
)

In [19]:
# Run the extended search on the training data.
grid_search.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:    2.7s finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('numeric_imputer',
                                        ArbitraryNumberImputer(arbitrary_number=-999,
                                                               variables=['tot_orders_12m',
                                                                          'tot_items_12m',
                                                                          'tot_items_dist_12m',
                                                                          'receita_12m',
                                                                          'recencia'])),
                                       ('categoric_imputer',
                                        CategoricalImputer(variables=['uf'])),
                                       ('one_hot_encoder',
                                        OneHotEncoder(variables=['uf'])),
                                       ('model',


In [20]:
# Best hyperparameter combination found by the search
grid_search.best_params_

{'model__class_weight': 'balanced',
 'model__criterion': 'gini',
 'model__max_depth': 4}

In [21]:
# Score obtained by the best hyperparameter combination
grid_search.best_score_

0.8911974445307779

## Avaliando a performance na base out of time

In [22]:
from sklearn.metrics import roc_auc_score

# Best pipeline from the most recent grid search
best_dt_model2 = grid_search.best_estimator_

# Score the second predict_proba column against the out-of-time labels
y_proba_oot = best_dt_model2.predict_proba(X_oot)[:, 1]
rocauc_oot = roc_auc_score(y_true=y_oot, y_score=y_proba_oot)
print(f"Decision Tree: ROCAUC OOT = {rocauc_oot}")

Decision Tree: ROCAUC OOT = 0.8980601190245211


# Exercício: Faça uma busca de hiperparâmetros (grid search) para a regressão logística