# Performance Baselines

In [59]:
%reload_ext watermark
%watermark -p scikit-learn,mlxtend,xgboost

scikit-learn: 1.0
mlxtend     : 0.19.0
xgboost     : 1.5.0



```
Author: Adam Shedivy
```

## Dataset

Source: https://archive.ics.uci.edu/ml/datasets/Dry+Bean+Dataset

In [2]:
import pandas as pd


X_train = pd.read_csv('https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/main/hw02-starter/dataset/X_train.csv', header=None).values
y_train = pd.read_csv('https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/main/hw02-starter/dataset/y_train.csv', header=None).values.ravel().astype(int)

X_test = pd.read_csv('https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/main/hw02-starter/dataset/X_test.csv', header=None).values
y_test = pd.read_csv('https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/main/hw02-starter/dataset/y_test.csv', header=None).values.ravel().astype(int)

print('X_train.shape:', X_train.shape)
print('y_train.shape:', y_train.shape)
print('X_test.shape:', X_test.shape)
print('y_test.shape:', y_test.shape)

X_train.shape: (9119, 16)
y_train.shape: (9119,)
X_test.shape: (4492, 16)
y_test.shape: (4492,)


In [3]:
from sklearn.model_selection import train_test_split


X_train_sub, X_valid, y_train_sub, y_valid = \
    train_test_split(X_train, y_train, test_size=0.2, random_state=1, stratify=y_train)

print('Train/Valid/Test sizes:', y_train.shape[0], y_valid.shape[0], y_test.shape[0])

Train/Valid/Test sizes: 9119 1824 4492


## Baselines

Compare hyperparameter settings on validation set:

In [4]:
from sklearn.neighbors import KNeighborsClassifier


knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_sub, y_train_sub)
print(f"Train Accuracy: {knn.score(X_train_sub, y_train_sub)*100:0.3f}%")
print(f"Valid Accuracy: {knn.score(X_valid, y_valid)*100:0.3f}%")

Train Accuracy: 79.657%
Valid Accuracy: 71.162%


In [5]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_sub, y_train_sub)
print(f"Train Accuracy: {knn.score(X_train_sub, y_train_sub)*100:0.3f}%")
print(f"Valid Accuracy: {knn.score(X_valid, y_valid)*100:0.3f}%")

Train Accuracy: 84.003%
Valid Accuracy: 71.930%


In [6]:
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train_sub, y_train_sub)
print(f"Train Accuracy: {knn.score(X_train_sub, y_train_sub)*100:0.3f}%")
print(f"Valid Accuracy: {knn.score(X_valid, y_valid)*100:0.3f}%")

Train Accuracy: 77.478%
Valid Accuracy: 69.518%


Choose best model and train on whole training set:

In [7]:
model = KNeighborsClassifier(n_neighbors=3)
model.fit(X_train, y_train)
print(f"Train Accuracy: {model.score(X_train, y_train)*100:0.3f}%")
print(f"Test Accuracy: {model.score(X_test, y_test)*100:0.3f}%")

Train Accuracy: 84.965%
Test Accuracy: 71.305%


# Out of the box attempt

## Decision Trees

In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from mlxtend.plotting import plot_decision_regions


tree = DecisionTreeClassifier(criterion='entropy', 
                              max_depth=2, 
                              random_state=1)
tree.fit(X_train, y_train)
print(f"Train Accuracy: {tree.score(X_train, y_train)*100:0.3f}%")
print(f"Test Accuracy: {tree.score(X_test, y_test)*100:0.3f}%")

Train Accuracy: 62.682%
Test Accuracy: 61.932%


In [9]:
tree = DecisionTreeClassifier(criterion='entropy', 
                              max_depth=5, 
                              random_state=1)
tree.fit(X_train, y_train)
print(f"Train Accuracy: {tree.score(X_train, y_train)*100:0.3f}%")
print(f"Test Accuracy: {tree.score(X_test, y_test)*100:0.3f}%")

Train Accuracy: 90.361%
Test Accuracy: 88.980%


## majority Voting

In [4]:
from mlxtend.classifier import EnsembleVoteClassifier

clf1 = DecisionTreeClassifier(random_state=1, max_depth=None)
clf2 = DecisionTreeClassifier(random_state=1, max_depth=5)
clf3 = DecisionTreeClassifier(random_state=1, max_depth=6)
eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3], weights=[1, 1, 1])

labels = ['Classifier 1', 'Classifier 2', 'Classifier 3', 'Ensemble']
for clf, label in zip([clf1, clf2, clf3, eclf], labels):

    clf.fit(X_train, y_train)
    print("Validation Accuracy: %0.2f [%s]" % (clf.score(X_valid, y_valid), label))
    
print("Test Accuracy: %0.2f" % eclf.score(X_test, y_test))

Validation Accuracy: 1.00 [Classifier 1]
Validation Accuracy: 0.90 [Classifier 2]
Validation Accuracy: 0.91 [Classifier 3]
Validation Accuracy: 0.93 [Ensemble]
Test Accuracy: 0.90


## Bagging

In [5]:
from sklearn.ensemble import BaggingClassifier

tree = DecisionTreeClassifier(criterion='entropy',
                              random_state=1,
                              max_depth=None)


bag = BaggingClassifier(base_estimator=tree,
                        n_estimators=500,
                        oob_score=True,
                        bootstrap=True,
                        bootstrap_features=False,
                        n_jobs=1,
                        random_state=1)

bag.fit(X_train, y_train)

print(f"Train Accuracy: {bag.score(X_train, y_train)*100:0.3f}%")
print(f"Test Accuracy: {bag.score(X_test, y_test)*100:0.3f}%")

Train Accuracy: 100.000%
Test Accuracy: 91.986%


## 'Boosting' methods

In [4]:
from sklearn.ensemble import GradientBoostingClassifier


boost = GradientBoostingClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=8,
    random_state=1)

boost.fit(X_train, y_train)
    
    
print(f"Train Accuracy: {boost.score(X_train, y_train)*100:0.3f}%")
print(f"Test Accuracy: {boost.score(X_test, y_test)*100:0.3f}%")

Train Accuracy: 100.000%
Test Accuracy: 92.164%


In [5]:
from sklearn.ensemble import HistGradientBoostingClassifier


boost = HistGradientBoostingClassifier(
    learning_rate=0.1,
    #n_estimators=100,
    #max_depth=8,
    random_state=1)

boost.fit(X_train, y_train)
    
print(f"Train Accuracy: {boost.score(X_train, y_train)*100:0.3f}%")
print(f"Test Accuracy: {boost.score(X_test, y_test)*100:0.3f}%")

Train Accuracy: 100.000%
Test Accuracy: 92.097%


In [7]:
import lightgbm as lgb


boost = lgb.LGBMClassifier()

boost.fit(X_train, y_train)


print(f"Train Accuracy: {boost.score(X_train, y_train)*100:0.3f}%")
print(f"Test Accuracy: {boost.score(X_test, y_test)*100:0.3f}%")

Train Accuracy: 100.000%
Test Accuracy: 92.342%


In [45]:
from catboost import CatBoostClassifier


boost = CatBoostClassifier(verbose=0,
                           n_estimators=300,
                           max_depth=6)

boost.fit(X_train, y_train)

print(f"Train Accuracy: {boost.score(X_train, y_train)*100:0.3f}%")
print(f"Test Accuracy: {boost.score(X_test, y_test)*100:0.3f}%")

Train Accuracy: 97.938%
Test Accuracy: 92.631%


# hyperparameter tuning

In [40]:
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

boost = CatBoostClassifier(verbose=0)

params = {'max_depth': [5, 6, 7],'n_estimators':[200, 300]}

grid = GridSearchCV(estimator=boost,
                    param_grid=params,
                    cv=5,
                    n_jobs=1,
                    verbose=2)

grid.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


Custom logger is already specified. Specify more than one logger at same time is not thread safe.

[CV] END ......................max_depth=5, n_estimators=200; total time=   2.1s
[CV] END ......................max_depth=5, n_estimators=200; total time=   2.2s
[CV] END ......................max_depth=5, n_estimators=200; total time=   2.4s
[CV] END ......................max_depth=5, n_estimators=200; total time=   2.3s
[CV] END ......................max_depth=5, n_estimators=200; total time=   2.3s
[CV] END ......................max_depth=5, n_estimators=300; total time=   3.9s
[CV] END ......................max_depth=5, n_estimators=300; total time=   3.9s
[CV] END ......................max_depth=5, n_estimators=300; total time=   3.9s
[CV] END ......................max_depth=5, n_estimators=300; total time=   3.7s
[CV] END ......................max_depth=5, n_estimators=300; total time=   3.6s
[CV] END ......................max_depth=6, n_estimators=200; total time=   4.3s
[CV] END ......................max_depth=6, n_estimators=200; total time=   4.2s
[CV] END ...................

GridSearchCV(cv=5,
             estimator=<catboost.core.CatBoostClassifier object at 0x0000028A895463D0>,
             n_jobs=1,
             param_grid={'max_depth': [5, 6, 7], 'n_estimators': [200, 300]},
             verbose=2)

In [47]:
grid.best_params_

{'max_depth': 5, 'n_estimators': 300}

In [46]:
print(f"Train Accuracy: {grid.score(X_train, y_train)*100:0.3f}%")
print(f"Test Accuracy: {grid.score(X_test, y_test)*100:0.3f}%")

Train Accuracy: 97.127%
Test Accuracy: 92.654%
