Adaptado por Bruno Menezes (https://github.com/brunoleomenezes) e será utilizado apenas para fins acadêmicos.

In [29]:
import numpy as np
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.model_selection import KFold
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GroupKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

Site de vendas de veículos. 0 para não vendido e 1 para vendido. 

In [2]:
path = "/content/veiculos.csv"
dados = pd.read_csv(path).drop(columns=["Unnamed: 0"], axis=1)
dados.head(10)

Unnamed: 0,preco,vendido,idade_modelo,km_ano
0,30941.02,1,18,35085.22134
1,40557.96,1,20,12622.05362
2,89627.5,0,12,11440.79806
3,95276.14,0,3,43167.32682
4,117384.68,1,4,12770.1129
5,55405.26,1,11,23594.53374
6,93415.61,1,16,16077.3066
7,65265.09,0,10,18666.73466
8,43917.53,1,19,12607.56956
9,107860.41,1,15,24000.08742


In [3]:
x = dados[["preco","idade_modelo","km_ano"]]
y = dados["vendido"]

SEED = 158020
np.random.seed(SEED)
treinox, teste_x, treinoy, teste_y = train_test_split(x, y, test_size = 0.25, stratify = y)
print("Treinamento: %d. Teste %d." % (len(treinox), len(teste_x)))

Treinamento: 7500. Teste 2500.


Linha de base - Dummy (https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html). 

In [4]:
dummy_stratified = DummyClassifier()
dummy_stratified.fit(treinox, treinoy)
acuracia = dummy_stratified.score(teste_x, teste_y) * 100

print("Acurácia - Dummy: %.2f%%." % acuracia)

Acurácia - Dummy: 58.00%.


Decision Tree Classifier - https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

In [5]:
SEED = 158020
np.random.seed(SEED)
model = DecisionTreeClassifier(max_depth=2)
model.fit(treinox, treinoy)
previsoes = model.predict(teste_x)

acuracia = accuracy_score(teste_y, previsoes) * 100
print("Acurácia: %.2f%%." % acuracia)

Acurácia: 71.92%.


In [6]:
x = dados[["preco","idade_modelo","km_ano"]]
y = dados["vendido"]

SEED = 158020
np.random.seed(SEED)
treinox, teste_x, treinoy, teste_y = train_test_split(x, y, test_size = 0.25, stratify = y)
print("Treinamento: %d. Teste %d." % (len(treinox), len(teste_x)))

model = DecisionTreeClassifier(max_depth=2)
model.fit(treinox, treinoy)
previsoes = model.predict(teste_x)

acuracia = accuracy_score(teste_y, previsoes) * 100
print("Acurácia: %.2f%%." % acuracia)


Treinamento: 7500. Teste 2500.
Acurácia: 71.92%.


Mudança do SEED = 5.

In [10]:
x = dados[["preco","idade_modelo","km_ano"]]
y = dados["vendido"]

SEED = 5
np.random.seed(SEED)
treinox, teste_x, treinoy, teste_y = train_test_split(x, y, test_size = 0.25, stratify = y)
print("Treinamento: %d. Teste %d." % (len(treinox), len(teste_x)))

model = DecisionTreeClassifier(max_depth=2)
model.fit(treinox, treinoy)
previsoes = model.predict(teste_x)

acuracia = accuracy_score(teste_y, previsoes) * 100
print("Acurácia: %.2f%%." % acuracia)

Treinamento: 7500. Teste 2500.
Acurácia: 76.84%.


Cross Validate - https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html

In [7]:
SEED = 301
np.random.seed(SEED)

model = DecisionTreeClassifier(max_depth=2)
results = cross_validate(model, x, y, cv = 3, return_train_score=False) # três pedaços
media = results['test_score'].mean()
desvio_padrao = results['test_score'].std()
print("Acurácia - cross validation, 3 = [%.2f, %.2f]" % ((media - 2 * desvio_padrao)*100, (media + 2 * desvio_padrao) * 100))

Acurácia - cross validation, 3 = [74.99, 76.57]


In [12]:
SEED = 301
np.random.seed(SEED)

model = DecisionTreeClassifier(max_depth=2)
results = cross_validate(model, x, y, cv = 3, return_train_score=True) # três pedaços, retornando score do treino
results

{'fit_time': array([0.0128839 , 0.00721049, 0.00740695]),
 'score_time': array([0.00275588, 0.00225711, 0.00262213]),
 'test_score': array([0.75704859, 0.7629763 , 0.75337534]),
 'train_score': array([0.75832583, 0.75536223, 0.76016199])}

In [8]:
SEED = 301
np.random.seed(SEED)

model = DecisionTreeClassifier(max_depth=2)
results = cross_validate(model, x, y, cv = 10, return_train_score=False) # dez pedaços
media = results['test_score'].mean()
desvio_padrao = results['test_score'].std()
print("Acurácia - cross validation, 10 = [%.2f, %.2f]" % ((media - 2 * desvio_padrao)*100, (media + 2 * desvio_padrao) * 100))

Acurácia - cross validation, 10 = [74.24, 77.32]


In [9]:
SEED = 301
np.random.seed(SEED)

model = DecisionTreeClassifier(max_depth=2)
results = cross_validate(model, x, y, cv = 5, return_train_score=False) # cinco pedaços
media = results['test_score'].mean()
desvio_padrao = results['test_score'].std()
print("Acurácia - cross validation, 5 = [%.2f, %.2f]" % ((media - 2 * desvio_padrao)*100, (media + 2 * desvio_padrao) * 100))

Acurácia - cross validation, 5 = [75.21, 76.35]


In [13]:
SEED = 5
np.random.seed(SEED)

model = DecisionTreeClassifier(max_depth=2)
results = cross_validate(model, x, y, cv = 5, return_train_score=False) # cinco pedaços
media = results['test_score'].mean()
desvio_padrao = results['test_score'].std()
print("Acurácia - cross validation, 5 = [%.2f, %.2f]" % ((media - 2 * desvio_padrao)*100, (media + 2 * desvio_padrao) * 100))

Acurácia - cross validation, 5 = [75.21, 76.35]


In [14]:
SEED = 5
np.random.seed(SEED)

model = DecisionTreeClassifier(max_depth=2)
results = cross_validate(model, x, y, cv = 3, return_train_score=False) # três pedaços
media = results['test_score'].mean()
desvio_padrao = results['test_score'].std()
print("Acurácia - cross validation, 5 = [%.2f, %.2f]" % ((media - 2 * desvio_padrao)*100, (media + 2 * desvio_padrao) * 100))

Acurácia - cross validation, 5 = [74.99, 76.57]


In [15]:
# Dados desbalanceados
dados_desbalanceados = dados.sort_values("vendido", ascending=True)
xdesbalanceados = dados_desbalanceados[["preco","idade_modelo","km_ano"]]
ydesbalanceados = dados_desbalanceados["vendido"]
dados_desbalanceados.head(10)

Unnamed: 0,preco,vendido,idade_modelo,km_ano
4999,74023.29,0,12,24812.80412
5322,84843.49,0,13,23095.63834
5319,83100.27,0,19,36240.72746
5316,87932.13,0,16,32249.56426
5315,77937.01,0,15,28414.50704
5310,62474.09,0,19,26142.11896
5309,78095.4,0,17,31834.35454
5308,50979.69,0,20,22115.55028
5307,93204.68,0,20,27217.15808
5323,91721.9,0,15,31217.97732


In [18]:
SEED = 301
np.random.seed(SEED)

cv = KFold(n_splits = 10)
model = DecisionTreeClassifier(max_depth=2)
results = cross_validate(model, xdesbalanceados, ydesbalanceados, cv = cv, return_train_score=False)
media = results['test_score'].mean()
desvio_padrao = results['test_score'].std()
print("Acurácia - cross validation, 5 = [%.2f, %.2f]" % ((media - 2 * desvio_padrao)*100, (media + 2 * desvio_padrao) * 100))

Acurácia - cross validation, 5 = [34.29, 81.39]


KFold - https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html

In [20]:
SEED = 301
np.random.seed(SEED)

cv = KFold(n_splits = 10, shuffle=True) # shuffle=True
model = DecisionTreeClassifier(max_depth=2)
results = cross_validate(model, xdesbalanceados, ydesbalanceados, cv = cv, return_train_score=False)
media = results['test_score'].mean()
desvio_padrao = results['test_score'].std()
print("Acurácia - cross validation, 5 = [%.2f, %.2f]" % ((media - 2 * desvio_padrao)*100, (media + 2 * desvio_padrao) * 100))

Acurácia - cross validation, 5 = [72.30, 79.26]


Stratified KFold - https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html

In [22]:
SEED = 301
np.random.seed(SEED)

cv = StratifiedKFold(n_splits = 10, shuffle=True) # shuffle=True
model = DecisionTreeClassifier(max_depth=2)
results = cross_validate(model, xdesbalanceados, ydesbalanceados, cv = cv, return_train_score=False)
media = results['test_score'].mean()
desvio_padrao = results['test_score'].std()
print("Acurácia - cross validation, 5 = [%.2f, %.2f]" % ((media - 2 * desvio_padrao)*100, (media + 2 * desvio_padrao) * 100))

Acurácia - cross validation, 5 = [73.55, 78.01]


In [28]:
# Criando uma coluna com modelos de veiculos
np.random.seed(SEED)
dados['modelo_veiculo'] = dados.idade_modelo + np.random.randint(-2, 3, size=10000)
dados.modelo_veiculoo = dados.modelo_veiculo + abs(dados.modelo_veiculo.min()) + 1
dados.head(10)

  after removing the cwd from sys.path.


Unnamed: 0,preco,vendido,idade_modelo,km_ano,modelo_veiculo
0,30941.02,1,18,35085.22134,16
1,40557.96,1,20,12622.05362,22
2,89627.5,0,12,11440.79806,12
3,95276.14,0,3,43167.32682,4
4,117384.68,1,4,12770.1129,3
5,55405.26,1,11,23594.53374,11
6,93415.61,1,16,16077.3066,18
7,65265.09,0,10,18666.73466,12
8,43917.53,1,19,12607.56956,17
9,107860.41,1,15,24000.08742,13


GroupKFold - https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GroupKFold.html

In [32]:
SEED = 301
np.random.seed(SEED)

cv = GroupKFold(n_splits = 10)
modelo = DecisionTreeClassifier(max_depth=2)
results = cross_validate(modelo, xdesbalanceados, ydesbalanceados, cv = cv, groups = dados.modelo_veiculo, return_train_score=False)
media = results['test_score'].mean()
desvio_padrao = results['test_score'].std()
print("Acurácia - cross validation, 5 = [%.2f, %.2f]" % ((media - 2 * desvio_padrao)*100, (media + 2 * desvio_padrao) * 100))

Acurácia - cross validation, 5 = [73.67, 77.90]
