In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn import preprocessing
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler


In [2]:
digits = load_digits()

X = digits.data
Y = digits.target

X_train, X_test, y_train, y_test = train_test_split(X, Y, 
                                    test_size=0.3, 
                                    random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Precisão:", round(clf.score(X_test, y_test),3))

Precisão: 0.844


In [3]:
clf = DecisionTreeClassifier(random_state = 42, 
                             max_depth = 10, 
                             criterion = 'entropy',
                             min_samples_leaf= 2
                             )
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Precisão:", round(clf.score(X_test, y_test),3))

Precisão: 0.881


Exercício 2 - Avaliação dos ganhos com a utilização de modelos Ensemble

In [4]:
params_clf = {
              'max_depth' : [5, 6, 7, 8, 9,10,20,50,100,200],
              'criterion' :['gini', 'entropy'],
              'min_samples_leaf' : [1,2,3,4,5,10,20,30]    
}

clf = DecisionTreeClassifier(random_state= 42)
clf_gs = GridSearchCV(clf, params_clf, cv=5, verbose= 2)
clf_gs.fit(X_train, y_train)
#save best model
clf_best = clf_gs.best_estimator_
#check best n_estimators value
print(clf_gs.best_params_)

Fitting 5 folds for each of 160 candidates, totalling 800 fits
[CV] END ....criterion=gini, max_depth=5, min_samples_leaf=1; total time=   0.0s
[CV] END ....criterion=gini, max_depth=5, min_samples_leaf=1; total time=   0.0s
[CV] END ....criterion=gini, max_depth=5, min_samples_leaf=1; total time=   0.0s


[CV] END ....criterion=gini, max_depth=5, min_samples_leaf=1; total time=   0.1s
[CV] END ....criterion=gini, max_depth=5, min_samples_leaf=1; total time=   0.0s
[CV] END ....criterion=gini, max_depth=5, min_samples_leaf=2; total time=   0.0s
[CV] END ....criterion=gini, max_depth=5, min_samples_leaf=2; total time=   0.0s
[CV] END ....criterion=gini, max_depth=5, min_samples_leaf=2; total time=   0.0s
[CV] END ....criterion=gini, max_depth=5, min_samples_leaf=2; total time=   0.0s
[CV] END ....criterion=gini, max_depth=5, min_samples_leaf=2; total time=   0.0s
[CV] END ....criterion=gini, max_depth=5, min_samples_leaf=3; total time=   0.0s
[CV] END ....criterion=gini, max_depth=5, min_samples_leaf=3; total time=   0.0s
[CV] END ....criterion=gini, max_depth=5, min_samples_leaf=3; total time=   0.0s
[CV] END ....criterion=gini, max_depth=5, min_samples_leaf=3; total time=   0.0s
[CV] END ....criterion=gini, max_depth=5, min_samples_leaf=3; total time=   0.0s
[CV] END ....criterion=gini,

  _data = np.array(data, dtype=dtype, copy=copy,


In [5]:
rf = RandomForestClassifier()
params_rf = {    
    'n_estimators': [50, 100, 200],
    'max_features': ['auto', 'sqrt'],
    'max_depth' : [None, 10, 20],
    'criterion' :['gini', 'entropy']}
rf_gs = GridSearchCV(rf, params_rf, cv=5, n_jobs=-1, verbose = 2)
rf_gs.fit(X_train, y_train)
#save best model
rf_best = rf_gs.best_estimator_
#check best n_estimators value
print(rf_gs.best_params_)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[CV] END criterion=gini, max_depth=None, max_features=auto, n_estimators=50; total time=   0.0s
[CV] END criterion=gini, max_depth=None, max_features=auto, n_estimators=50; total time=   0.0s
[CV] END criterion=gini, max_depth=None, max_features=auto, n_estimators=50; total time=   0.0s
[CV] END criterion=gini, max_depth=None, max_features=auto, n_estimators=50; total time=   0.0s[CV] END criterion=gini, max_depth=None, max_features=auto, n_estimators=50; total time=   0.0s

[CV] END criterion=gini, max_depth=None, max_features=auto, n_estimators=100; total time=   0.0s
[CV] END criterion=gini, max_depth=None, max_features=auto, n_estimators=100; total time=   0.0s
[CV] END criterion=gini, max_depth=None, max_features=auto, n_estimators=100; total time=   0.0s
[CV] END criterion=gini, max_depth=None, max_features=auto, n_estimators=100; total time=   0.0s
[CV] END criterion=gini, max_depth=None, max_features=auto, n_estimators=100; total time=   0.0s
[CV] END criterion=gini, max_depth=

90 fits failed out of a total of 180.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
26 fits failed with the following error:
Traceback (most recent call last):
  File "/home/codespace/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/codespace/.local/lib/python3.10/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/home/codespace/.local/lib/python3.10/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/home/codespace/.local/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_c

{'criterion': 'entropy', 'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 100}


In [6]:
gb = GradientBoostingClassifier()
params_gb = {
    "max_depth":[3,5,8]
    }

gb_gs = GridSearchCV(gb, params_gb, cv=5, n_jobs=-1)
gb_gs.fit(X_train, y_train)
#save best model
gb_best = gb_gs.best_estimator_
#check best n_estimators value
print(gb_gs.best_params_)

{'max_depth': 3}


In [7]:
xgb_model = XGBClassifier(max_depth=10, n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)
xgb_acc = accuracy_score(y_test, xgb_model.predict(X_test))

In [8]:
print('rf: {}'.format(rf_best.score(X_test, y_test)))
print('clf: {}'.format(clf_gs.score(X_test, y_test)))
print('gb: {}'.format(gb_gs.score(X_test, y_test)))
print(f"Acurácia XGBoost: {xgb_acc}")

rf: 0.9796296296296296
clf: 0.8648148148148148
gb: 0.9666666666666667
Acurácia XGBoost: 0.9703703703703703


In [9]:
from sklearn.ensemble import VotingClassifier

estimators = [ ('rf', rf_best), ('clf', clf_best), ('gb', gb_best)]
#create our voting classifier, inputting our models
ensemble = VotingClassifier(estimators, voting='hard')
#fit model to training data
ensemble.fit(X_train, y_train)
#test our model on the test data
ensemble.score(X_test, y_test)

0.9722222222222222

Exercício 3 - Visualização da Árvore de Decisão e Medida de Impureza