In [1]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, StackingClassifier, RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

# Parte 1

## Cria o conjunto de dados

In [2]:
# Patient names are left out because they cannot explain the target variable.
# Encoding table:
#   yes   -> 1     no      -> 0
#   large -> 1     small   -> 0
#   sick  -> 1     healthy -> 0
colunas = ['febre', 'enjoo', 'manchas', 'dores', 'diagnostico']

# One row per training patient, values in the order given by `colunas`
treino = pd.DataFrame(
    [
        [1, 1, 0, 1, 1],
        [0, 0, 1, 0, 0],
        [1, 1, 0, 0, 0],
        [1, 0, 1, 1, 1],
        [1, 0, 0, 1, 0],
        [0, 0, 1, 1, 1],
    ],
    columns=colunas,
)

# Row 0 = Luis, row 1 = Laura.
# The `diagnostico` values of the test rows are assumed, not observed.
teste = pd.DataFrame(
    [
        [0, 0, 0, 1, 0],
        [1, 1, 1, 1, 1],
    ],
    columns=colunas,
)

## Define conjunto de treino e teste

In [3]:
# Split the training frame into the target vector and the predictor matrix
y_train = treino['diagnostico']
X_train = treino.drop('diagnostico', axis=1)

In [4]:
# Display the first six training rows (predictor columns only)
X_train.head(n=6)

Unnamed: 0,febre,enjoo,manchas,dores
0,1,1,0,1
1,0,0,1,0
2,1,1,0,0
3,1,0,1,1
4,1,0,0,1
5,0,0,1,1


In [5]:
# Split the test frame into the target vector and the predictor matrix
y_test = teste['diagnostico']
X_test = teste.drop('diagnostico', axis=1)

In [6]:
# Display the first two test rows (predictor columns only)
X_test.head(n=2)

Unnamed: 0,febre,enjoo,manchas,dores
0,0,0,0,1
1,1,1,1,1


## Treinamento dos modelos

In [7]:
# Base learners combined by the Stacking model.
stacking_estimators = [
    # The `normalize` argument of RidgeClassifier was deprecated in scikit-learn 1.0
    # and removed in 1.2, so scaling is done in a pipeline, like the SVM entry below.
    # NOTE(review): the effective regularisation is not identical to the old
    # `normalize=True`, so stacking predictions may differ from earlier recorded runs.
    ('rdg', make_pipeline(StandardScaler(), RidgeClassifier())),
    ('gnb', GaussianNB()),
    ('knn', KNeighborsClassifier(n_neighbors=2)),
    ('svm', make_pipeline(StandardScaler(), SVC(random_state=1)))
]

# In Boosting and Bagging the DecisionTree is the base classifier;
# in Stacking the DecisionTree is the combiner (final estimator).
# The base tree is passed positionally: its keyword was renamed from `base_estimator`
# to `estimator` in scikit-learn 1.2, and the positional form works with both names.
models = [
    ('xgb', XGBClassifier(objective='binary:logistic', max_depth=3, n_estimators=2, random_state=1)),
    ('boosting', AdaBoostClassifier(DecisionTreeClassifier(max_depth=3, random_state=1), n_estimators=2, random_state=1)),
    ('rf', RandomForestClassifier(max_depth=3, n_estimators=2, random_state=1)),
    ('bagging', BaggingClassifier(DecisionTreeClassifier(max_depth=3, random_state=1), n_estimators=2, random_state=1)),
    ('stacking', StackingClassifier(estimators=stacking_estimators, final_estimator=DecisionTreeClassifier(max_depth=3, random_state=1), cv=2))
]

# Fit every model on the training set and print its predictions for the two
# test patients, its accuracy against the assumed test labels, and its name.
for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score_fmt = "{:.2f}".format(accuracy_score(y_test, y_pred))
    print(y_pred, score_fmt, name)

[0 0] 0.50 xgb
[0 1] 1.00 boosting
[0 0] 0.50 rf
[0 0] 0.50 bagging
[1 0] 0.00 stacking


- `Boosting` reporta o maior score
- `RandomForest` + `Bagging` e `XGB` + `Boosting` deveriam reportar os mesmos resultados

# Parte 2

In [8]:
# Load the glass dataset; the path is relative to the current working directory
df = pd.read_csv(filepath_or_buffer='glass.csv')

In [9]:
# Display the first ten rows of the loaded data
df.head(n=10)

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1
5,1.51596,12.79,3.61,1.62,72.97,0.64,8.07,0.0,0.26,1
6,1.51743,13.3,3.6,1.14,73.09,0.58,8.17,0.0,0.0,1
7,1.51756,13.15,3.61,1.05,73.24,0.57,8.24,0.0,0.0,1
8,1.51918,14.04,3.58,1.37,72.08,0.56,8.3,0.0,0.0,1
9,1.51755,13.0,3.6,1.36,72.99,0.57,8.4,0.0,0.11,1


In [10]:
# `Type` is the class label; every other column is a predictor
y = df['Type']
X = df.drop('Type', axis=1)

In [11]:
# Stores, for each model name, the list of accuracy scores (one per fold)
models_scores = {}

# Re-encode the class labels as consecutive integers 0..n_classes-1, keeping their
# sorted order. Recent XGBoost releases reject label sets that do not start at 0 or
# that have gaps; the rows shown for `Type` start at 1 (and the column presumably
# skips some values — confirm against the data). The mapping is one-to-one and is
# applied to train and test labels alike, so accuracy is unaffected.
y_encoded = pd.Series(pd.factorize(y, sort=True)[0], index=y.index, name=y.name)

# Build the train/test partitions with shuffled 10-fold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=1)

for train_index, test_index in kf.split(X):
    # Train/test split for this fold
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y_encoded.iloc[train_index], y_encoded.iloc[test_index]

    # Base learners combined by the Stacking model.
    # The `normalize` argument of RidgeClassifier was deprecated in scikit-learn 1.0
    # and removed in 1.2, so scaling is done in a pipeline, like the SVM entry.
    # NOTE(review): the effective regularisation is not identical to the old
    # `normalize=True`, so the stacking score may differ from earlier recorded runs.
    stacking_estimators = [
        ('rdg', make_pipeline(StandardScaler(), RidgeClassifier())),
        ('gnb', GaussianNB()),
        ('knn', KNeighborsClassifier(n_neighbors=2)),
        ('svm', make_pipeline(StandardScaler(), SVC(random_state=1)))
    ]

    # In Boosting and Bagging the DecisionTree is the base classifier;
    # in Stacking the DecisionTree is the combiner (final estimator).
    # XGBoost uses 'multi:softmax' because this is a multiclass problem.
    # The base tree is passed positionally: its keyword was renamed from
    # `base_estimator` to `estimator` in scikit-learn 1.2, and the positional
    # form works with both names.
    models = [
        ('dt', DecisionTreeClassifier(max_depth=3, random_state=1)),
        ('xgb', XGBClassifier(objective='multi:softmax', max_depth=3, n_estimators=2, random_state=1)),
        ('boosting', AdaBoostClassifier(DecisionTreeClassifier(max_depth=3, random_state=1), n_estimators=2, random_state=1)),
        ('rf', RandomForestClassifier(max_depth=3, n_estimators=2, random_state=1)),
        ('bagging', BaggingClassifier(DecisionTreeClassifier(max_depth=3, random_state=1), n_estimators=2, random_state=1)),
        ('stacking', StackingClassifier(estimators=stacking_estimators, final_estimator=DecisionTreeClassifier(max_depth=3, random_state=1), cv=2))
    ]

    # Fit and evaluate every model on this fold, recording its accuracy
    for name, model in models:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        score = accuracy_score(y_test, y_pred)

        # setdefault creates the per-model list the first time a name is seen
        models_scores.setdefault(name, []).append(score)

In [12]:
# Average each model's accuracy over the folds and print it next to the model name.
# The fixed-point `.2f` spec is used (rather than `.2`, which means two significant
# digits and would print 0.7 or 0.052) so every mean shows exactly two decimals,
# matching the score format used in Part 1.
for name, scores in models_scores.items():
    mean_scores_fmt = "{:.2f}".format(np.mean(scores))
    print(mean_scores_fmt, name)

0.67 dt
0.65 xgb
0.63 boosting
0.63 rf
0.64 bagging
0.59 stacking


- Na média, a `DecisionTree` reporta o maior score