# Cross Validation

## Carregando os dados

In [1]:
import numpy as np
import pandas as pd

In [2]:
df_abt = pd.read_csv('C:\\Users\\HP\\Documents\\GitHub\\Case ML\\propensao_revenda_abt.csv')
df_abt.head()

Unnamed: 0,data_ref_safra,seller_id,uf,tot_orders_12m,tot_items_12m,tot_items_dist_12m,receita_12m,recencia,nao_revendeu_next_6m
0,2018-01-01,0015a82c2db000af6aaaf3ae2ecb0532,SP,3,3,1,2685.0,74,1
1,2018-01-01,001cca7ae9ae17fb1caed9dfb1094831,ES,171,207,9,21275.23,2,0
2,2018-01-01,002100f778ceb8431b7a1020ff7ab48f,SP,38,42,15,781.8,2,0
3,2018-01-01,003554e2dce176b5555353e4f3555ac8,GO,1,1,1,120.0,16,1
4,2018-01-01,004c9cd9d87a3c30c522c48c4fc07416,SP,130,141,75,16228.88,8,0


In [3]:
key_vars = ['data_ref_safra', 'seller_id']
num_vars = ['tot_orders_12m', 'tot_items_12m', 'tot_items_dist_12m', 'receita_12m', 'recencia']
cat_vars = ['uf']
target = 'nao_revendeu_next_6m'

features = cat_vars + num_vars

# filtra as colunas com as features
X = df_abt[features]
# filtra o target
y = df_abt[target]

# Train-Test Split / Hold-Out

In [4]:
from sklearn.model_selection import train_test_split

# Criando o conjunto de treino e teste -> Hold-Out
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

## Pipeline utilizado

Vamos utilizar o estimador DecisionTreeClassifier para testar todos os cenários

In [5]:
!pip install feature-engine==1.0.2



You should consider upgrading via the 'c:\users\hp\appdata\local\programs\python\python38\python.exe -m pip install --upgrade pip' command.


In [6]:
from sklearn.pipeline import Pipeline
from feature_engine.imputation import ArbitraryNumberImputer
from feature_engine.imputation import CategoricalImputer
from feature_engine.encoding import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

dt = Pipeline(steps=[
                ('numeric_imputer', ArbitraryNumberImputer(variables=num_vars, arbitrary_number=-999)),
                ('categoric_imputer', CategoricalImputer(variables=cat_vars, fill_value='missing')),
                ('one_hot_encoder', OneHotEncoder(variables=cat_vars)),
                ('algoritmo', DecisionTreeClassifier(random_state=42))
])

In [7]:
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)

from sklearn.metrics import accuracy_score

acc = accuracy_score(y_test, y_pred)
print(f'Acurácia = {acc:.3f}')

Acurácia = 0.760


# K-Fold Cross-Validation

In [8]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=42)

cv_results = cross_val_score(dt, X, y, scoring='accuracy', cv=kf, n_jobs=-1)

print(f'Acurácia = {cv_results.mean():.3f} ({cv_results.std():.3f})')

Acurácia = 0.749 (0.008)


In [9]:
# cross-validation com multiplas métricas
from sklearn.model_selection import cross_validate

cv_results = cross_validate(dt, X, y, scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'], cv=kf, n_jobs=-1)
cv_results_df = pd.DataFrame(cv_results)

In [10]:
cv_results_df

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1,test_roc_auc
0,0.161568,0.107357,0.757524,0.728346,0.719844,0.72407,0.753605
1,0.16456,0.099378,0.757954,0.737098,0.720952,0.728936,0.754567
2,0.165572,0.089387,0.738925,0.731755,0.688951,0.709708,0.735864
3,0.151597,0.081424,0.739785,0.6881,0.719157,0.703286,0.737214
4,0.082776,0.042887,0.753118,0.719661,0.73487,0.727186,0.751288


In [11]:
cv_results_df.mean()

fit_time          0.145214
score_time        0.084086
test_accuracy     0.749461
test_precision    0.720992
test_recall       0.716755
test_f1           0.718637
test_roc_auc      0.746508
dtype: float64

# Stratified K-Fold Cross-Validation

In [12]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cv_results = cross_val_score(dt, X, y, scoring='accuracy', cv=skf, n_jobs=-1)

print(f'Acurácia = {cv_results.mean():.3f} ({cv_results.std():.3f})')

Acurácia = 0.752 (0.007)


In [13]:
# stratified cross-validation com multiplas métricas
from sklearn.model_selection import cross_validate

cv_results = cross_validate(dt, X, y, scoring=['accuracy', 'precision', 'recall', 'f1', 'roc_auc'], cv=skf, n_jobs=-1)
cv_results_df = pd.DataFrame(cv_results)
cv_results_df

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1,test_roc_auc
0,0.156582,0.073803,0.751075,0.719885,0.724735,0.722302,0.74843
1,0.149599,0.073804,0.739037,0.705323,0.714148,0.709708,0.737004
2,0.155583,0.075796,0.75871,0.726755,0.737247,0.731964,0.756649
3,0.150597,0.07181,0.75957,0.738784,0.713873,0.726115,0.755149
4,0.088761,0.041889,0.753548,0.724204,0.723507,0.723855,0.750909


In [14]:
cv_results_df.mean()

fit_time          0.140224
score_time        0.067420
test_accuracy     0.752388
test_precision    0.722990
test_recall       0.722702
test_f1           0.722789
test_roc_auc      0.749628
dtype: float64

# Leave-One Out Cross-Validation

Esse método é parecido com o K-fold, porém vários modelos são gerados até que todas as entidades sejam utilizadas para testar o modelo. Não consegui rodar, pois demanda muito recurso computacional. O ideal seria rodar em nuvem, como google colab.

In [15]:
from sklearn.model_selection import LeaveOneOut

loot = LeaveOneOut()

cv_results = cross_val_score(dt, X, y, scoring='accuracy', cv=loot, n_jobs=-1)

print(f'Acurácia = {cv_results.mean():.3f} ({cv_results.std():.3f})')

KeyboardInterrupt: 