In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn import datasets
from sklearn.metrics import accuracy_score
from pandas import read_csv
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Column names for the header-less data file: the label column ('lettr')
# followed by the 16 feature columns.
headers = [
    'lettr',
    'x-box', 'y-box', 'width', 'high',
    'onpix', 'x-bar', 'y-bar', 'x2bar',
    'y2bar', 'xybar', 'x2ybr', 'xy2br',
    'x-ege', 'xegvy', 'y-ege', 'yegvx',
]

# Load the file with the names above; cells equal to ' ?' are parsed as NaN.
df = pd.read_csv(
    'letter-recognition.data',
    header=None,
    names=headers,
    na_values=' ?',
)

In [3]:
# Discard every row that contains at least one missing (NaN) value.
df = df.dropna()

In [4]:
# Replace the letter labels with a binary target: the letters listed in
# `for_numeric` (A-M) become 1, every other label becomes 0.
for_numeric = ['A','B','C','D','E','F','G','H','I','J','K','L','M']

# Encode only while the column still holds letters. Once it is numeric, a
# second pass would find no value in `for_numeric` and silently turn the
# whole target into 0, so re-running this cell must be a no-op.
if not pd.api.types.is_numeric_dtype(df['lettr']):
    # Vectorized membership test instead of a Python-level loop over a list copy.
    df['lettr'] = df['lettr'].isin(for_numeric).astype(int)

In [5]:
# Separating features and labels: 'lettr' (the first column) is the target,
# all remaining columns are the features.
y_p = df['lettr']
X_p = df.drop(columns=['lettr'])

In [6]:
# Train-set accuracy per classifier; each trial appends one value to each list.
logreg_accuracy_train, randforest_accuracy_train, knn_accuracy_train = [], [], []

In [7]:
# Test-set accuracy per classifier; each trial appends one value to each list.
logreg_accuracy, randforest_accuracy, knn_accuracy = [], [], []

In [8]:
%%time
import warnings
# Install a blanket 'ignore' filter: no warning of any category is shown
# from here on.
# NOTE(review): this also hides warnings that may matter (e.g. deprecation
# notices) - consider restricting the filter to specific categories.
warnings.filterwarnings('ignore')

CPU times: user 21 µs, sys: 9 µs, total: 30 µs
Wall time: 35.3 µs


## Trial 1

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold

# Draw a stratified sample of 5000 rows for training / model selection;
# every remaining row is held out as this trial's test set.
X_train, X_test, y_train, y_test = train_test_split(X_p, y_p,
                                                    train_size=5000,
                                                    random_state=12345,
                                                    stratify=y_p)


# Initializing Classifiers
clf1 = KNeighborsClassifier()
# Seed the forest (same seed as this trial's split) so that re-running the
# trial reproduces the same scores.
clf2 = RandomForestClassifier(n_estimators=1024, random_state=12345)
# `multi_class` is not passed: 'auto' is already the default and the
# argument is deprecated in recent scikit-learn releases.
clf3 = LogisticRegression(solver='liblinear')

# Declaring parameters
K_list = np.arange(20, 501, 20)                         # n_neighbors: 20, 40, ..., 500
F_list = np.array([1, 2, 4, 6, 8, 12, 16])              # max_features candidates
C_list = np.array([10.0**k for k in range(-8, 5)])      # C: 1e-8 ... 1e4

# Building the pipelines: standardize the features, then classify.
# Scaling lives inside the pipeline so it is re-fit on each CV training fold.
pipe1 = Pipeline([('std', StandardScaler()),
                  ('classifier', clf1)])

pipe2 = Pipeline([('std', StandardScaler()),
                  ('classifier', clf2)])

pipe3 = Pipeline([('std', StandardScaler()),
                  ('classifier', clf3)])


# Setting up the parameter grids ('classifier__' targets the pipeline step)
param_grid1 = [{'classifier__weights': ['uniform', 'distance'],
                'classifier__n_neighbors': K_list}]

param_grid2 = [{'classifier__max_features': F_list}]

param_grid3 = [{'classifier__C': C_list}]


# Setting up multiple GridSearchCV objects, 1 for each algorithm
gridcvs = {}

for pgrid, est, name in zip((param_grid1, param_grid2, param_grid3),
                            (pipe1, pipe2, pipe3),
                            ('KNN', 'RandomForest', 'Logistic')):
    gcv = GridSearchCV(estimator=est,
                       param_grid=pgrid,
                       scoring='accuracy',
                       n_jobs=3,
                       cv=5,  # 5-fold inner loop for hyperparameter tuning
                       verbose=0,
                       return_train_score=True,
                       refit='accuracy')
    gridcvs[name] = gcv

In [10]:
%%time 
# ^^ this handy Jupyter magic times the execution of the cell for you

# Outer-fold accuracies, one list per algorithm.
cv_scores = {name: [] for name in gridcvs}

skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
n_outer = skfold.get_n_splits()  # reported in the progress line instead of a literal 5

# The outer loop for algorithm selection: each outer fold is held out once
# and only used to score the model tuned on the remaining folds.
for c, (outer_train_idx, outer_valid_idx) in enumerate(skfold.split(X_train, y_train), start=1):
    for name, gs_est in sorted(gridcvs.items()):
        # Column width 12 fits the longest name ('RandomForest').
        print('outer fold %d/%d | tuning %-12s' % (c, n_outer, name), end='')

        # The inner loop for hyperparameter tuning: the grid search only sees
        # the outer-training rows.
        gs_est.fit(X_train.iloc[outer_train_idx], y_train.iloc[outer_train_idx])
        y_pred = gs_est.predict(X_train.iloc[outer_valid_idx])
        acc = accuracy_score(y_true=y_train.iloc[outer_valid_idx], y_pred=y_pred)
        print(' | inner ACC %.2f%% | outer ACC %.2f%%' %
              (gs_est.best_score_ * 100, acc * 100))
        cv_scores[name].append(acc)

outer fold 1/5 | tuning KNN      | inner ACC 89.90% | outer ACC 91.00%
outer fold 1/5 | tuning Logistic | inner ACC 72.72% | outer ACC 69.70%
outer fold 1/5 | tuning RandomForest | inner ACC 92.60% | outer ACC 93.50%
outer fold 2/5 | tuning KNN      | inner ACC 90.18% | outer ACC 89.10%
outer fold 2/5 | tuning Logistic | inner ACC 72.40% | outer ACC 72.90%
outer fold 2/5 | tuning RandomForest | inner ACC 92.97% | outer ACC 92.20%
outer fold 3/5 | tuning KNN      | inner ACC 89.83% | outer ACC 91.20%
outer fold 3/5 | tuning Logistic | inner ACC 72.20% | outer ACC 74.20%
outer fold 3/5 | tuning RandomForest | inner ACC 92.97% | outer ACC 94.60%
outer fold 4/5 | tuning KNN      | inner ACC 89.03% | outer ACC 91.90%
outer fold 4/5 | tuning Logistic | inner ACC 72.17% | outer ACC 74.50%
outer fold 4/5 | tuning RandomForest | inner ACC 92.58% | outer ACC 93.90%
outer fold 5/5 | tuning KNN      | inner ACC 89.53% | outer ACC 89.00%
outer fold 5/5 | tuning Logistic | inner ACC 72.75% | outer A

In [11]:
# Looking at the results: mean and standard deviation of the outer-fold
# accuracies of each algorithm.
for name in cv_scores:
    # '+/-' replaces the former '+\-', whose backslash is an invalid escape
    # sequence; width 12 fits the longest name ('RandomForest').
    print('%-12s | outer CV acc. %.2f%% +/- %.3f' % (
          name, 100 * np.mean(cv_scores[name]), 100 * np.std(cv_scores[name])))
print()
# best_params_ belongs to each search's most recent fit, i.e. it reflects the
# last outer fold only, not a consensus over all folds.
for name in cv_scores:
    print('{} best parameters'.format(name), gridcvs[name].best_params_)

KNN      | outer CV acc. 90.44% +\- 1.174
RandomForest | outer CV acc. 93.42% +\- 0.823
Logistic | outer CV acc. 72.50% +\- 1.821

KNN best parameters {'classifier__n_neighbors': 20, 'classifier__weights': 'distance'}
RandomForest best parameters {'classifier__max_features': 4}
Logistic best parameters {'classifier__C': 1.0}


In [12]:
# Fitting a model to the whole training set with each tuned algorithm and
# scoring it on the held-out test set.
# Each entry: (key in gridcvs, test-accuracy list, train-accuracy list).
final_runs = [('KNN', knn_accuracy, knn_accuracy_train),
              ('RandomForest', randforest_accuracy, randforest_accuracy_train),
              ('Logistic', logreg_accuracy, logreg_accuracy_train)]

for position, (name, test_log, train_log) in enumerate(final_runs):
    best_algo = gridcvs[name]

    # Re-runs the grid search on all of X_train, then refits the best pipeline.
    best_algo.fit(X_train, y_train)
    train_acc = accuracy_score(y_true=y_train, y_pred=best_algo.predict(X_train))
    test_acc = accuracy_score(y_true=y_test, y_pred=best_algo.predict(X_test))

    # Blank line between classifiers, but not before the first one.
    lead = '' if position == 0 else '\n'
    print('%sAccuracy %.2f%% (average over CV test folds)' %
          (lead, 100 * best_algo.best_score_))
    print('Best Parameters: %s' % best_algo.best_params_)
    print('Training Accuracy: %.2f%%' % (100 * train_acc))
    print('Test Accuracy: %.2f%%' % (100 * test_acc))

    # NOTE: these lists grow on every execution; re-running this cell without
    # resetting them adds duplicate entries for the trial.
    test_log.append(test_acc)
    train_log.append(train_acc)

Accuracy 91.02% (average over CV test folds)
Best Parameters: {'classifier__n_neighbors': 20, 'classifier__weights': 'distance'}
Training Accuracy: 100.00%
Test Accuracy: 92.83%

Accuracy 93.74% (average over CV test folds)
Best Parameters: {'classifier__max_features': 8}
Training Accuracy: 100.00%
Test Accuracy: 94.92%

Accuracy 72.48% (average over CV test folds)
Best Parameters: {'classifier__C': 0.1}
Training Accuracy: 72.30%
Test Accuracy: 72.59%


In [13]:
# Test accuracy recorded at index 0 (trial 1): KNN, random forest, logistic regression.
for trial_scores in (knn_accuracy, randforest_accuracy, logreg_accuracy):
    print(trial_scores[0])

0.9283333333333333
0.9492
0.7258666666666667


## Trial 2

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold

# Draw a stratified sample of 5000 rows for training / model selection;
# every remaining row is held out as this trial's test set.
X_train, X_test, y_train, y_test = train_test_split(X_p, y_p,
                                                    train_size=5000,
                                                    random_state=5432,
                                                    stratify=y_p)


# Initializing Classifiers
clf1 = KNeighborsClassifier()
# Seed the forest (same seed as this trial's split) so that re-running the
# trial reproduces the same scores.
clf2 = RandomForestClassifier(n_estimators=1024, random_state=5432)
# `multi_class` is not passed: 'auto' is already the default and the
# argument is deprecated in recent scikit-learn releases.
clf3 = LogisticRegression(solver='liblinear')

# Declaring parameters
K_list = np.arange(20, 501, 20)                         # n_neighbors: 20, 40, ..., 500
F_list = np.array([1, 2, 4, 6, 8, 12, 16])              # max_features candidates
C_list = np.array([10.0**k for k in range(-8, 5)])      # C: 1e-8 ... 1e4

# Building the pipelines: standardize the features, then classify.
# Scaling lives inside the pipeline so it is re-fit on each CV training fold.
pipe1 = Pipeline([('std', StandardScaler()),
                  ('classifier', clf1)])

pipe2 = Pipeline([('std', StandardScaler()),
                  ('classifier', clf2)])

pipe3 = Pipeline([('std', StandardScaler()),
                  ('classifier', clf3)])


# Setting up the parameter grids ('classifier__' targets the pipeline step)
param_grid1 = [{'classifier__weights': ['uniform', 'distance'],
                'classifier__n_neighbors': K_list}]

param_grid2 = [{'classifier__max_features': F_list}]

param_grid3 = [{'classifier__C': C_list}]


# Setting up multiple GridSearchCV objects, 1 for each algorithm
gridcvs = {}

for pgrid, est, name in zip((param_grid1, param_grid2, param_grid3),
                            (pipe1, pipe2, pipe3),
                            ('KNN', 'RandomForest', 'Logistic')):
    gcv = GridSearchCV(estimator=est,
                       param_grid=pgrid,
                       scoring='accuracy',
                       n_jobs=3,
                       cv=5,  # 5-fold inner loop for hyperparameter tuning
                       verbose=0,
                       return_train_score=True,
                       refit='accuracy')
    gridcvs[name] = gcv

In [15]:
%%time 
# ^^ this handy Jupyter magic times the execution of the cell for you

# Outer-fold accuracies, one list per algorithm.
cv_scores = {name: [] for name in gridcvs}

skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
n_outer = skfold.get_n_splits()  # reported in the progress line instead of a literal 5

# The outer loop for algorithm selection: each outer fold is held out once
# and only used to score the model tuned on the remaining folds.
for c, (outer_train_idx, outer_valid_idx) in enumerate(skfold.split(X_train, y_train), start=1):
    for name, gs_est in sorted(gridcvs.items()):
        # Column width 12 fits the longest name ('RandomForest').
        print('outer fold %d/%d | tuning %-12s' % (c, n_outer, name), end='')

        # The inner loop for hyperparameter tuning: the grid search only sees
        # the outer-training rows.
        gs_est.fit(X_train.iloc[outer_train_idx], y_train.iloc[outer_train_idx])
        y_pred = gs_est.predict(X_train.iloc[outer_valid_idx])
        acc = accuracy_score(y_true=y_train.iloc[outer_valid_idx], y_pred=y_pred)
        print(' | inner ACC %.2f%% | outer ACC %.2f%%' %
              (gs_est.best_score_ * 100, acc * 100))
        cv_scores[name].append(acc)

outer fold 1/5 | tuning KNN      | inner ACC 90.12% | outer ACC 92.90%
outer fold 1/5 | tuning Logistic | inner ACC 73.10% | outer ACC 72.30%
outer fold 1/5 | tuning RandomForest | inner ACC 93.40% | outer ACC 95.70%
outer fold 2/5 | tuning KNN      | inner ACC 90.00% | outer ACC 93.00%
outer fold 2/5 | tuning Logistic | inner ACC 72.52% | outer ACC 74.70%
outer fold 2/5 | tuning RandomForest | inner ACC 93.33% | outer ACC 93.90%
outer fold 3/5 | tuning KNN      | inner ACC 90.70% | outer ACC 91.20%
outer fold 3/5 | tuning Logistic | inner ACC 72.85% | outer ACC 73.10%
outer fold 3/5 | tuning RandomForest | inner ACC 93.50% | outer ACC 93.60%
outer fold 4/5 | tuning KNN      | inner ACC 90.10% | outer ACC 90.20%
outer fold 4/5 | tuning Logistic | inner ACC 73.85% | outer ACC 70.10%
outer fold 4/5 | tuning RandomForest | inner ACC 93.50% | outer ACC 92.70%
outer fold 5/5 | tuning KNN      | inner ACC 90.60% | outer ACC 89.50%
outer fold 5/5 | tuning Logistic | inner ACC 72.58% | outer A

In [16]:
# Looking at the results: mean and standard deviation of the outer-fold
# accuracies of each algorithm.
for name in cv_scores:
    # '+/-' replaces the former '+\-', whose backslash is an invalid escape
    # sequence; width 12 fits the longest name ('RandomForest').
    print('%-12s | outer CV acc. %.2f%% +/- %.3f' % (
          name, 100 * np.mean(cv_scores[name]), 100 * np.std(cv_scores[name])))
print()
# best_params_ belongs to each search's most recent fit, i.e. it reflects the
# last outer fold only, not a consensus over all folds.
for name in cv_scores:
    print('{} best parameters'.format(name), gridcvs[name].best_params_)

KNN      | outer CV acc. 91.36% +\- 1.407
RandomForest | outer CV acc. 93.76% +\- 1.065
Logistic | outer CV acc. 73.02% +\- 1.755

KNN best parameters {'classifier__n_neighbors': 20, 'classifier__weights': 'distance'}
RandomForest best parameters {'classifier__max_features': 6}
Logistic best parameters {'classifier__C': 1.0}


In [17]:
# Fitting a model to the whole training set with each tuned algorithm and
# scoring it on the held-out test set.
# Each entry: (key in gridcvs, test-accuracy list, train-accuracy list).
final_runs = [('KNN', knn_accuracy, knn_accuracy_train),
              ('RandomForest', randforest_accuracy, randforest_accuracy_train),
              ('Logistic', logreg_accuracy, logreg_accuracy_train)]

for position, (name, test_log, train_log) in enumerate(final_runs):
    best_algo = gridcvs[name]

    # Re-runs the grid search on all of X_train, then refits the best pipeline.
    best_algo.fit(X_train, y_train)
    train_acc = accuracy_score(y_true=y_train, y_pred=best_algo.predict(X_train))
    test_acc = accuracy_score(y_true=y_test, y_pred=best_algo.predict(X_test))

    # Blank line between classifiers, but not before the first one.
    lead = '' if position == 0 else '\n'
    print('%sAccuracy %.2f%% (average over CV test folds)' %
          (lead, 100 * best_algo.best_score_))
    print('Best Parameters: %s' % best_algo.best_params_)
    print('Training Accuracy: %.2f%%' % (100 * train_acc))
    print('Test Accuracy: %.2f%%' % (100 * test_acc))

    # NOTE: these lists grow on every execution; re-running this cell without
    # resetting them adds duplicate entries for the trial.
    test_log.append(test_acc)
    train_log.append(train_acc)

Accuracy 91.56% (average over CV test folds)
Best Parameters: {'classifier__n_neighbors': 20, 'classifier__weights': 'distance'}
Training Accuracy: 100.00%
Test Accuracy: 93.19%

Accuracy 94.06% (average over CV test folds)
Best Parameters: {'classifier__max_features': 4}
Training Accuracy: 100.00%
Test Accuracy: 94.85%

Accuracy 72.84% (average over CV test folds)
Best Parameters: {'classifier__C': 10.0}
Training Accuracy: 73.06%
Test Accuracy: 72.37%


In [18]:
# Test accuracy recorded at index 1 (trial 2): KNN, random forest, logistic regression.
for trial_scores in (knn_accuracy, randforest_accuracy, logreg_accuracy):
    print(trial_scores[1])

0.9318666666666666
0.9485333333333333
0.7236666666666667


## Trial 3

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold

# Draw a stratified sample of 5000 rows for training / model selection;
# every remaining row is held out as this trial's test set.
X_train, X_test, y_train, y_test = train_test_split(X_p, y_p,
                                                    train_size=5000,
                                                    random_state=13245,
                                                    stratify=y_p)


# Initializing Classifiers
clf1 = KNeighborsClassifier()
# Seed the forest (same seed as this trial's split) so that re-running the
# trial reproduces the same scores.
clf2 = RandomForestClassifier(n_estimators=1024, random_state=13245)
# `multi_class` is not passed: 'auto' is already the default and the
# argument is deprecated in recent scikit-learn releases.
clf3 = LogisticRegression(solver='liblinear')

# Declaring parameters
K_list = np.arange(20, 501, 20)                         # n_neighbors: 20, 40, ..., 500
F_list = np.array([1, 2, 4, 6, 8, 12, 16])              # max_features candidates
C_list = np.array([10.0**k for k in range(-8, 5)])      # C: 1e-8 ... 1e4

# Building the pipelines: standardize the features, then classify.
# Scaling lives inside the pipeline so it is re-fit on each CV training fold.
pipe1 = Pipeline([('std', StandardScaler()),
                  ('classifier', clf1)])

pipe2 = Pipeline([('std', StandardScaler()),
                  ('classifier', clf2)])

pipe3 = Pipeline([('std', StandardScaler()),
                  ('classifier', clf3)])


# Setting up the parameter grids ('classifier__' targets the pipeline step)
param_grid1 = [{'classifier__weights': ['uniform', 'distance'],
                'classifier__n_neighbors': K_list}]

param_grid2 = [{'classifier__max_features': F_list}]

param_grid3 = [{'classifier__C': C_list}]


# Setting up multiple GridSearchCV objects, 1 for each algorithm
gridcvs = {}

for pgrid, est, name in zip((param_grid1, param_grid2, param_grid3),
                            (pipe1, pipe2, pipe3),
                            ('KNN', 'RandomForest', 'Logistic')):
    gcv = GridSearchCV(estimator=est,
                       param_grid=pgrid,
                       scoring='accuracy',
                       n_jobs=3,
                       cv=5,  # 5-fold inner loop for hyperparameter tuning
                       verbose=0,
                       return_train_score=True,
                       refit='accuracy')
    gridcvs[name] = gcv

In [20]:
%%time 
# ^^ this handy Jupyter magic times the execution of the cell for you

# Outer-fold accuracies, one list per algorithm.
cv_scores = {name: [] for name in gridcvs}

skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
n_outer = skfold.get_n_splits()  # reported in the progress line instead of a literal 5

# The outer loop for algorithm selection: each outer fold is held out once
# and only used to score the model tuned on the remaining folds.
for c, (outer_train_idx, outer_valid_idx) in enumerate(skfold.split(X_train, y_train), start=1):
    for name, gs_est in sorted(gridcvs.items()):
        # Column width 12 fits the longest name ('RandomForest').
        print('outer fold %d/%d | tuning %-12s' % (c, n_outer, name), end='')

        # The inner loop for hyperparameter tuning: the grid search only sees
        # the outer-training rows.
        gs_est.fit(X_train.iloc[outer_train_idx], y_train.iloc[outer_train_idx])
        y_pred = gs_est.predict(X_train.iloc[outer_valid_idx])
        acc = accuracy_score(y_true=y_train.iloc[outer_valid_idx], y_pred=y_pred)
        print(' | inner ACC %.2f%% | outer ACC %.2f%%' %
              (gs_est.best_score_ * 100, acc * 100))
        cv_scores[name].append(acc)

outer fold 1/5 | tuning KNN      | inner ACC 90.62% | outer ACC 91.20%
outer fold 1/5 | tuning Logistic | inner ACC 72.62% | outer ACC 69.30%
outer fold 1/5 | tuning RandomForest | inner ACC 93.77% | outer ACC 94.50%
outer fold 2/5 | tuning KNN      | inner ACC 90.53% | outer ACC 92.60%
outer fold 2/5 | tuning Logistic | inner ACC 71.75% | outer ACC 71.70%
outer fold 2/5 | tuning RandomForest | inner ACC 93.40% | outer ACC 94.70%
outer fold 3/5 | tuning KNN      | inner ACC 90.22% | outer ACC 91.10%
outer fold 3/5 | tuning Logistic | inner ACC 71.95% | outer ACC 73.20%
outer fold 3/5 | tuning RandomForest | inner ACC 93.42% | outer ACC 95.30%
outer fold 4/5 | tuning KNN      | inner ACC 90.50% | outer ACC 90.70%
outer fold 4/5 | tuning Logistic | inner ACC 72.28% | outer ACC 73.80%
outer fold 4/5 | tuning RandomForest | inner ACC 93.73% | outer ACC 92.70%
outer fold 5/5 | tuning KNN      | inner ACC 90.42% | outer ACC 90.30%
outer fold 5/5 | tuning Logistic | inner ACC 72.45% | outer A

In [21]:
# Looking at the results: mean and standard deviation of the outer-fold
# accuracies of each algorithm.
for name in cv_scores:
    # '+/-' replaces the former '+\-', whose backslash is an invalid escape
    # sequence; width 12 fits the longest name ('RandomForest').
    print('%-12s | outer CV acc. %.2f%% +/- %.3f' % (
          name, 100 * np.mean(cv_scores[name]), 100 * np.std(cv_scores[name])))
print()
# best_params_ belongs to each search's most recent fit, i.e. it reflects the
# last outer fold only, not a consensus over all folds.
for name in cv_scores:
    print('{} best parameters'.format(name), gridcvs[name].best_params_)

KNN      | outer CV acc. 91.18% +\- 0.778
RandomForest | outer CV acc. 93.96% +\- 1.102
Logistic | outer CV acc. 71.96% +\- 1.555

KNN best parameters {'classifier__n_neighbors': 20, 'classifier__weights': 'distance'}
RandomForest best parameters {'classifier__max_features': 4}
Logistic best parameters {'classifier__C': 100.0}


In [22]:
# Fitting a model to the whole training set with each tuned algorithm and
# scoring it on the held-out test set.
# Each entry: (key in gridcvs, test-accuracy list, train-accuracy list).
final_runs = [('KNN', knn_accuracy, knn_accuracy_train),
              ('RandomForest', randforest_accuracy, randforest_accuracy_train),
              ('Logistic', logreg_accuracy, logreg_accuracy_train)]

for position, (name, test_log, train_log) in enumerate(final_runs):
    best_algo = gridcvs[name]

    # Re-runs the grid search on all of X_train, then refits the best pipeline.
    best_algo.fit(X_train, y_train)
    train_acc = accuracy_score(y_true=y_train, y_pred=best_algo.predict(X_train))
    test_acc = accuracy_score(y_true=y_test, y_pred=best_algo.predict(X_test))

    # Blank line between classifiers, but not before the first one.
    lead = '' if position == 0 else '\n'
    print('%sAccuracy %.2f%% (average over CV test folds)' %
          (lead, 100 * best_algo.best_score_))
    print('Best Parameters: %s' % best_algo.best_params_)
    print('Training Accuracy: %.2f%%' % (100 * train_acc))
    print('Test Accuracy: %.2f%%' % (100 * test_acc))

    # NOTE: these lists grow on every execution; re-running this cell without
    # resetting them adds duplicate entries for the trial.
    test_log.append(test_acc)
    train_log.append(train_acc)

Accuracy 91.80% (average over CV test folds)
Best Parameters: {'classifier__n_neighbors': 20, 'classifier__weights': 'distance'}
Training Accuracy: 100.00%
Test Accuracy: 92.02%

Accuracy 94.24% (average over CV test folds)
Best Parameters: {'classifier__max_features': 4}
Training Accuracy: 100.00%
Test Accuracy: 94.66%

Accuracy 72.14% (average over CV test folds)
Best Parameters: {'classifier__C': 100.0}
Training Accuracy: 72.40%
Test Accuracy: 72.33%


In [23]:
# Test accuracy recorded at index 2 (trial 3): KNN, random forest, logistic regression.
for trial_scores in (knn_accuracy, randforest_accuracy, logreg_accuracy):
    print(trial_scores[2])

0.9202
0.9466
0.7233333333333334


In [24]:
# Report the average train accuracy per classifier (with best parameters).
# The labels say "Train": the values are means of the *_accuracy_train lists,
# which the previous "Test Accuracy" wording misreported.
print("Average KNN Train Accuracy: ", np.mean(knn_accuracy_train))
print("Average Random Forest Train Accuracy: ", np.mean(randforest_accuracy_train))
print("Average Logistic Regression Train Accuracy: ", np.mean(logreg_accuracy_train))

Average KNN Test Accuracy:  1.0
Average Random Forest Test Accuracy:  1.0
Average Logistic Regression Test Accuracy:  0.7258666666666667


In [25]:
# Report the average test accuracy per classifier (with best parameters).
for label, scores in (("Average KNN Test Accuracy: ", knn_accuracy),
                      ("Average Random Forest Test Accuracy: ", randforest_accuracy),
                      ("Average Logistic Regression Test Accuracy: ", logreg_accuracy)):
    print(label, np.mean(scores))

Average KNN Test Accuracy:  0.9268
Average Random Forest Test Accuracy:  0.9481111111111112
Average Logistic Regression Test Accuracy:  0.7242888888888889


## t-tests for the LETTER dataset

In [26]:
from scipy import stats
import numpy as np

In [27]:
# Independent two-sample t-tests on the per-trial test accuracies, one test
# for each pair of algorithms.
# NOTE(review): within a trial all three classifiers share the same
# train/test split, so the samples look paired - confirm whether
# stats.ttest_rel would be the more appropriate test.
knn_forest, forest_logistic, logistic_knn = (
    stats.ttest_ind(first, second)
    for first, second in ((knn_accuracy, randforest_accuracy),
                          (randforest_accuracy, logreg_accuracy),
                          (logreg_accuracy, knn_accuracy))
)

In [28]:
# Results of the t-tests: one line per algorithm pair, separated by blank lines.
for caption, outcome in (('KNN and RandomForest: ', knn_forest),
                         ('\nRandomForest and Logistic Regression: ', forest_logistic),
                         ('\nLogistic Regression and KNN: ', logistic_knn)):
    print(caption, outcome)

KNN and RandomForest:  Ttest_indResult(statistic=-6.018486029250711, pvalue=0.003838995866673888)

RandomForest and Logistic Regression:  Ttest_indResult(statistic=201.03832462525068, pvalue=3.6725201869652128e-09)

Logistic Regression and KNN:  Ttest_indResult(statistic=-57.1373229787318, pvalue=5.618046883959578e-07)
