In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn import datasets
from sklearn.metrics import accuracy_score
from pandas import read_csv
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Column names passed to read_csv below (the file is read with header=None):
# ten terrain measurements, the four Wilderness_Area columns, the forty
# Soil_Type columns, and finally the Cover_Type target.
headers = [
    'Elevation', 'Aspect', 'Slope',
    'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
    'Horizontal_Distance_To_Fire_Points',
]
headers += ['Wilderness_Area%d' % area_no for area_no in range(1, 5)]
headers += ['Soil_Type%d' % soil_no for soil_no in range(1, 41)]
headers.append('Cover_Type')
      
# Load the raw table. The file is read without a header row, so the column
# names come from `headers`; the literal string ' ?' is treated as missing.
df = pd.read_csv(
    'covtype.data',
    names=headers,
    header=None,
    na_values=' ?',
)

In [3]:
# Keep only complete rows: any row with at least one missing value is dropped.
df = df.dropna(axis=0, how='any')

In [4]:
# Finding the most common cover type
# Pull the target column out as a plain Python list; the label-replacement cell
# further down loops over this same list and overwrites it in place.
numeric = df['Cover_Type'].tolist()
def most_common(numeric):
    """Return the value that occurs most often in ``numeric``.

    Parameters
    ----------
    numeric : iterable of hashable values
        The observations to tally. Any iterable works (list, tuple,
        generator, ...); it is consumed exactly once.

    Returns
    -------
    The most frequent value. When several values share the highest count,
    the one encountered first in ``numeric`` is returned.

    Raises
    ------
    ValueError
        If ``numeric`` contains no values.
    """
    # Function-scope import keeps this cell runnable on its own.
    from collections import Counter

    # One pass over the data instead of one full list.count() scan per
    # distinct value.
    tally = Counter(numeric)
    if not tally:
        raise ValueError('most_common() needs at least one value')
    return tally.most_common(1)[0][0]

# Print the most frequent cover type; the next cell hard-codes this value (2)
# as the positive class.
print(most_common(numeric))

2


In [5]:
# Replacing the labels with 0's and 1's: cover type 2 becomes 1, every other
# cover type becomes 0.
# `numeric` is overwritten in place, so running this cell a second time without
# rebuilding `numeric` first would turn every label into 0.
numeric[:] = [1 if cover_type == 2 else 0 for cover_type in numeric]

df['Cover_Type'] = numeric

In [6]:
# Separating features and labels: the last column is the target, everything
# before it is a feature.
n_features = df.shape[1] - 1
X_p = df.iloc[:, :n_features]
y_p = df.iloc[:, n_features]

In [7]:
# Per-trial accuracy on the training split, one list per classifier; each trial
# appends one value.
logreg_accuracy_train, randforest_accuracy_train, knn_accuracy_train = [], [], []

In [8]:
# Per-trial accuracy on the held-out test split, one list per classifier; each
# trial appends one value.
logreg_accuracy, randforest_accuracy, knn_accuracy = [], [], []

## Trial 1

In [9]:
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Trial 1 setup: draw this trial's train/test split and build one tuned
# pipeline (scaler + classifier wrapped in GridSearchCV) per algorithm.
trial_seed = 12345

# Stratified split of the cover-type data: 5000 rows are used for training and
# tuning, every remaining row is held out as the test set.
X_train, X_test, y_train, y_test = train_test_split(X_p, y_p,
                                                    train_size=5000,
                                                    random_state=trial_seed,
                                                    stratify=y_p)


# Initializing Classifiers
clf1 = KNeighborsClassifier()
# The forest is randomized; seeding it makes the trial repeatable, in line with
# the seeded split above and the seeded outer CV folds.
clf2 = RandomForestClassifier(n_estimators=1024, random_state=trial_seed)
clf3 = LogisticRegression(solver='liblinear', multi_class='auto')

# Declaring parameters
K_list = np.arange(20, 501, 20)                                    # k = 20, 40, ..., 500
F_list = np.array([1, 2, 4, 6, 8, 12, 16, 20])                     # max_features candidates
C_list = np.array([10.0 ** exponent for exponent in range(-8, 5)]) # C = 1e-8, ..., 1e4

# Building the pipelines: every classifier sees standardized features
pipe1 = Pipeline([('std', StandardScaler()),
                  ('classifier', clf1)])

pipe2 = Pipeline([('std', StandardScaler()),
                  ('classifier', clf2)])

pipe3 = Pipeline([('std', StandardScaler()),
                  ('classifier', clf3)])


# Setting up the parameter grids ('classifier__' targets the pipeline step above)
param_grid1 = [{'classifier__weights': ['uniform', 'distance'],
                'classifier__n_neighbors': K_list}]

param_grid2 = [{'classifier__max_features': F_list}]

param_grid3 = [{'classifier__C': C_list}]


# Setting up multiple GridSearchCV objects, 1 for each algorithm
gridcvs = {}

for pgrid, est, name in zip((param_grid1, param_grid2, param_grid3),
                            (pipe1, pipe2, pipe3),
                            ('KNN', 'RandomForest', 'Logistic')):
    gcv = GridSearchCV(estimator=est,
                       param_grid=pgrid,
                       scoring='accuracy',
                       n_jobs=3,
                       cv=5,  # 5-fold inner loop used for hyperparameter tuning
                       verbose=0,
                       return_train_score=True,
                       refit='accuracy')
    gridcvs[name] = gcv

In [10]:
%%time 
# ^^ this handy Jupyter magic times the execution of the cell for you

cv_scores = {name: [] for name, gs_est in gridcvs.items()}

skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# The outer loop for algorithm selection
c = 1
for outer_train_idx, outer_valid_idx in skfold.split(X_train,y_train):
    for name, gs_est in sorted(gridcvs.items()):
        print('outer fold %d/5 | tuning %-8s' % (c, name), end='')

        # The inner loop for hyperparameter tuning
        gs_est.fit(X_train.iloc[outer_train_idx], y_train.iloc[outer_train_idx])
        y_pred = gs_est.predict(X_train.iloc[outer_valid_idx])
        acc = accuracy_score(y_true=y_train.iloc[outer_valid_idx], y_pred=y_pred)
        print(' | inner ACC %.2f%% | outer ACC %.2f%%' %
              (gs_est.best_score_ * 100, acc * 100))
        cv_scores[name].append(acc)

    c += 1

outer fold 1/5 | tuning KNN      | inner ACC 76.47% | outer ACC 78.62%
outer fold 1/5 | tuning Logistic | inner ACC 74.97% | outer ACC 73.73%
outer fold 1/5 | tuning RandomForest | inner ACC 81.70% | outer ACC 80.52%
outer fold 2/5 | tuning KNN      | inner ACC 75.24% | outer ACC 80.22%
outer fold 2/5 | tuning Logistic | inner ACC 73.84% | outer ACC 77.12%
outer fold 2/5 | tuning RandomForest | inner ACC 80.70% | outer ACC 83.52%
outer fold 3/5 | tuning KNN      | inner ACC 76.92% | outer ACC 76.10%
outer fold 3/5 | tuning Logistic | inner ACC 74.70% | outer ACC 74.60%
outer fold 3/5 | tuning RandomForest | inner ACC 81.45% | outer ACC 82.00%
outer fold 4/5 | tuning KNN      | inner ACC 77.41% | outer ACC 73.17%
outer fold 4/5 | tuning Logistic | inner ACC 74.83% | outer ACC 73.17%
outer fold 4/5 | tuning RandomForest | inner ACC 81.78% | outer ACC 79.28%
outer fold 5/5 | tuning KNN      | inner ACC 76.31% | outer ACC 76.28%
outer fold 5/5 | tuning Logistic | inner ACC 74.31% | outer A

In [11]:
# Looking at the results: mean and standard deviation of the outer-fold
# accuracies collected for each algorithm.
for name in cv_scores:
    # '+/-' replaces the former '+\-', which is an invalid escape sequence and
    # printed a stray backslash.
    print('%-8s | outer CV acc. %.2f%% +/- %.3f' % (
          name, 100 * np.mean(cv_scores[name]), 100 * np.std(cv_scores[name])))
print()
# NOTE: every GridSearchCV object was refitted on each outer fold in turn, so
# best_params_ here reflects the last outer fold only, not a consensus.
for name in cv_scores:
    print('{} best parameters'.format(name), gridcvs[name].best_params_)

KNN      | outer CV acc. 76.88% +\- 2.404
RandomForest | outer CV acc. 81.76% +\- 1.662
Logistic | outer CV acc. 74.68% +\- 1.354

KNN best parameters {'classifier__n_neighbors': 20, 'classifier__weights': 'distance'}
RandomForest best parameters {'classifier__max_features': 4}
Logistic best parameters {'classifier__C': 10.0}


In [12]:
# Fitting a model to the whole training set, once per algorithm: rerun the grid
# search on the full training split, then score the tuned model on both the
# training split and the held-out test split.
# Each run appends to the accuracy lists, so re-running this cell records the
# trial a second time.
final_runs = (
    # using the "best" KNN algorithm
    ('KNN', 'Accuracy %.2f%% (average over CV test folds)',
     knn_accuracy, knn_accuracy_train),
    # using the "best" RandomForest algorithm
    ('RandomForest', '\nAccuracy %.2f%% (average over CV test folds)',
     randforest_accuracy, randforest_accuracy_train),
    # using the "best" Logistic algorithm
    ('Logistic', '\nAccuracy %.2f%% (average over CV test folds)',
     logreg_accuracy, logreg_accuracy_train),
)

for algo_name, cv_line, test_history, train_history in final_runs:
    best_algo = gridcvs[algo_name]

    best_algo.fit(X_train, y_train)
    train_acc = accuracy_score(y_true=y_train, y_pred=best_algo.predict(X_train))
    test_acc = accuracy_score(y_true=y_test, y_pred=best_algo.predict(X_test))

    print(cv_line % (100 * best_algo.best_score_))
    print('Best Parameters: %s' % gridcvs[algo_name].best_params_)
    print('Training Accuracy: %.2f%%' % (100 * train_acc))
    print('Test Accuracy: %.2f%%' % (100 * test_acc))
    test_history.append(test_acc)
    train_history.append(train_acc)

Accuracy 77.50% (average over CV test folds)
Best Parameters: {'classifier__n_neighbors': 20, 'classifier__weights': 'distance'}
Training Accuracy: 100.00%
Test Accuracy: 76.98%

Accuracy 82.00% (average over CV test folds)
Best Parameters: {'classifier__max_features': 16}
Training Accuracy: 100.00%
Test Accuracy: 82.23%

Accuracy 74.36% (average over CV test folds)
Best Parameters: {'classifier__C': 0.1}
Training Accuracy: 75.02%
Test Accuracy: 75.05%


In [13]:
# Trial 1 test accuracy, in the order KNN, random forest, logistic regression.
for per_trial_scores in (knn_accuracy, randforest_accuracy, logreg_accuracy):
    print(per_trial_scores[0])

0.7697964625736964
0.8223422428699402
0.7504895731338931


## Trial 2

In [14]:
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Trial 2 setup: draw this trial's train/test split and build one tuned
# pipeline (scaler + classifier wrapped in GridSearchCV) per algorithm.
trial_seed = 54321

# Stratified split of the cover-type data: 5000 rows are used for training and
# tuning, every remaining row is held out as the test set.
X_train, X_test, y_train, y_test = train_test_split(X_p, y_p,
                                                    train_size=5000,
                                                    random_state=trial_seed,
                                                    stratify=y_p)


# Initializing Classifiers
clf1 = KNeighborsClassifier()
# The forest is randomized; seeding it makes the trial repeatable, in line with
# the seeded split above and the seeded outer CV folds.
clf2 = RandomForestClassifier(n_estimators=1024, random_state=trial_seed)
clf3 = LogisticRegression(solver='liblinear', multi_class='auto')

# Declaring parameters
K_list = np.arange(20, 501, 20)                                    # k = 20, 40, ..., 500
F_list = np.array([1, 2, 4, 6, 8, 12, 16, 20])                     # max_features candidates
C_list = np.array([10.0 ** exponent for exponent in range(-8, 5)]) # C = 1e-8, ..., 1e4

# Building the pipelines: every classifier sees standardized features
pipe1 = Pipeline([('std', StandardScaler()),
                  ('classifier', clf1)])

pipe2 = Pipeline([('std', StandardScaler()),
                  ('classifier', clf2)])

pipe3 = Pipeline([('std', StandardScaler()),
                  ('classifier', clf3)])


# Setting up the parameter grids ('classifier__' targets the pipeline step above)
param_grid1 = [{'classifier__weights': ['uniform', 'distance'],
                'classifier__n_neighbors': K_list}]

param_grid2 = [{'classifier__max_features': F_list}]

param_grid3 = [{'classifier__C': C_list}]


# Setting up multiple GridSearchCV objects, 1 for each algorithm
gridcvs = {}

for pgrid, est, name in zip((param_grid1, param_grid2, param_grid3),
                            (pipe1, pipe2, pipe3),
                            ('KNN', 'RandomForest', 'Logistic')):
    gcv = GridSearchCV(estimator=est,
                       param_grid=pgrid,
                       scoring='accuracy',
                       n_jobs=3,
                       cv=5,  # 5-fold inner loop used for hyperparameter tuning
                       verbose=0,
                       return_train_score=True,
                       refit='accuracy')
    gridcvs[name] = gcv

In [15]:
%%time 
# ^^ this handy Jupyter magic times the execution of the cell for you

cv_scores = {name: [] for name, gs_est in gridcvs.items()}

skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# The outer loop for algorithm selection
c = 1
for outer_train_idx, outer_valid_idx in skfold.split(X_train,y_train):
    for name, gs_est in sorted(gridcvs.items()):
        print('outer fold %d/5 | tuning %-8s' % (c, name), end='')

        # The inner loop for hyperparameter tuning
        gs_est.fit(X_train.iloc[outer_train_idx], y_train.iloc[outer_train_idx])
        y_pred = gs_est.predict(X_train.iloc[outer_valid_idx])
        acc = accuracy_score(y_true=y_train.iloc[outer_valid_idx], y_pred=y_pred)
        print(' | inner ACC %.2f%% | outer ACC %.2f%%' %
              (gs_est.best_score_ * 100, acc * 100))
        cv_scores[name].append(acc)

    c += 1

outer fold 1/5 | tuning KNN      | inner ACC 76.24% | outer ACC 74.13%
outer fold 1/5 | tuning Logistic | inner ACC 75.22% | outer ACC 74.13%
outer fold 1/5 | tuning RandomForest | inner ACC 81.87% | outer ACC 82.32%
outer fold 2/5 | tuning KNN      | inner ACC 76.09% | outer ACC 76.22%
outer fold 2/5 | tuning Logistic | inner ACC 75.42% | outer ACC 75.62%
outer fold 2/5 | tuning RandomForest | inner ACC 81.52% | outer ACC 82.72%
outer fold 3/5 | tuning KNN      | inner ACC 75.33% | outer ACC 77.80%
outer fold 3/5 | tuning Logistic | inner ACC 75.38% | outer ACC 76.30%
outer fold 3/5 | tuning RandomForest | inner ACC 81.00% | outer ACC 82.20%
outer fold 4/5 | tuning KNN      | inner ACC 76.58% | outer ACC 74.87%
outer fold 4/5 | tuning Logistic | inner ACC 76.11% | outer ACC 74.07%
outer fold 4/5 | tuning RandomForest | inner ACC 81.95% | outer ACC 80.38%
outer fold 5/5 | tuning KNN      | inner ACC 75.61% | outer ACC 78.68%
outer fold 5/5 | tuning Logistic | inner ACC 74.86% | outer A

In [16]:
# Looking at the results: mean and standard deviation of the outer-fold
# accuracies collected for each algorithm.
for name in cv_scores:
    # '+/-' replaces the former '+\-', which is an invalid escape sequence and
    # printed a stray backslash.
    print('%-8s | outer CV acc. %.2f%% +/- %.3f' % (
          name, 100 * np.mean(cv_scores[name]), 100 * np.std(cv_scores[name])))
print()
# NOTE: every GridSearchCV object was refitted on each outer fold in turn, so
# best_params_ here reflects the last outer fold only, not a consensus.
for name in cv_scores:
    print('{} best parameters'.format(name), gridcvs[name].best_params_)

KNN      | outer CV acc. 76.34% +\- 1.713
RandomForest | outer CV acc. 82.18% +\- 0.976
Logistic | outer CV acc. 75.28% +\- 0.994

KNN best parameters {'classifier__n_neighbors': 20, 'classifier__weights': 'distance'}
RandomForest best parameters {'classifier__max_features': 16}
Logistic best parameters {'classifier__C': 1.0}


In [17]:
# Fitting a model to the whole training set, once per algorithm: rerun the grid
# search on the full training split, then score the tuned model on both the
# training split and the held-out test split.
# Each run appends to the accuracy lists, so re-running this cell records the
# trial a second time.
final_runs = (
    # using the "best" KNN algorithm
    ('KNN', 'Accuracy %.2f%% (average over CV test folds)',
     knn_accuracy, knn_accuracy_train),
    # using the "best" RandomForest algorithm
    ('RandomForest', '\nAccuracy %.2f%% (average over CV test folds)',
     randforest_accuracy, randforest_accuracy_train),
    # using the "best" Logistic algorithm
    ('Logistic', '\nAccuracy %.2f%% (average over CV test folds)',
     logreg_accuracy, logreg_accuracy_train),
)

for algo_name, cv_line, test_history, train_history in final_runs:
    best_algo = gridcvs[algo_name]

    best_algo.fit(X_train, y_train)
    train_acc = accuracy_score(y_true=y_train, y_pred=best_algo.predict(X_train))
    test_acc = accuracy_score(y_true=y_test, y_pred=best_algo.predict(X_test))

    print(cv_line % (100 * best_algo.best_score_))
    print('Best Parameters: %s' % gridcvs[algo_name].best_params_)
    print('Training Accuracy: %.2f%%' % (100 * train_acc))
    print('Test Accuracy: %.2f%%' % (100 * test_acc))
    test_history.append(test_acc)
    train_history.append(train_acc)

Accuracy 76.16% (average over CV test folds)
Best Parameters: {'classifier__n_neighbors': 20, 'classifier__weights': 'distance'}
Training Accuracy: 100.00%
Test Accuracy: 77.13%

Accuracy 82.14% (average over CV test folds)
Best Parameters: {'classifier__max_features': 12}
Training Accuracy: 100.00%
Test Accuracy: 82.33%

Accuracy 75.36% (average over CV test folds)
Best Parameters: {'classifier__C': 10.0}
Training Accuracy: 76.14%
Test Accuracy: 74.97%


In [18]:
# Trial 2 test accuracy, in the order KNN, random forest, logistic regression.
for per_trial_scores in (knn_accuracy, randforest_accuracy, logreg_accuracy):
    print(per_trial_scores[1])

0.7713051116990618
0.8233127087630119
0.7497447969833961


## Trial 3

In [19]:
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Trial 3 setup: draw this trial's train/test split and build one tuned
# pipeline (scaler + classifier wrapped in GridSearchCV) per algorithm.
trial_seed = 13245

# Stratified split of the cover-type data: 5000 rows are used for training and
# tuning, every remaining row is held out as the test set.
X_train, X_test, y_train, y_test = train_test_split(X_p, y_p,
                                                    train_size=5000,
                                                    random_state=trial_seed,
                                                    stratify=y_p)


# Initializing Classifiers
clf1 = KNeighborsClassifier()
# The forest is randomized; seeding it makes the trial repeatable, in line with
# the seeded split above and the seeded outer CV folds.
clf2 = RandomForestClassifier(n_estimators=1024, random_state=trial_seed)
clf3 = LogisticRegression(solver='liblinear', multi_class='auto')

# Declaring parameters
K_list = np.arange(20, 501, 20)                                    # k = 20, 40, ..., 500
F_list = np.array([1, 2, 4, 6, 8, 12, 16, 20])                     # max_features candidates
C_list = np.array([10.0 ** exponent for exponent in range(-8, 5)]) # C = 1e-8, ..., 1e4

# Building the pipelines: every classifier sees standardized features
pipe1 = Pipeline([('std', StandardScaler()),
                  ('classifier', clf1)])

pipe2 = Pipeline([('std', StandardScaler()),
                  ('classifier', clf2)])

pipe3 = Pipeline([('std', StandardScaler()),
                  ('classifier', clf3)])


# Setting up the parameter grids ('classifier__' targets the pipeline step above)
param_grid1 = [{'classifier__weights': ['uniform', 'distance'],
                'classifier__n_neighbors': K_list}]

param_grid2 = [{'classifier__max_features': F_list}]

param_grid3 = [{'classifier__C': C_list}]


# Setting up multiple GridSearchCV objects, 1 for each algorithm
gridcvs = {}

for pgrid, est, name in zip((param_grid1, param_grid2, param_grid3),
                            (pipe1, pipe2, pipe3),
                            ('KNN', 'RandomForest', 'Logistic')):
    gcv = GridSearchCV(estimator=est,
                       param_grid=pgrid,
                       scoring='accuracy',
                       n_jobs=3,
                       cv=5,  # 5-fold inner loop used for hyperparameter tuning
                       verbose=0,
                       return_train_score=True,
                       refit='accuracy')
    gridcvs[name] = gcv

In [20]:
%%time 
# ^^ this handy Jupyter magic times the execution of the cell for you

cv_scores = {name: [] for name, gs_est in gridcvs.items()}

skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# The outer loop for algorithm selection
c = 1
for outer_train_idx, outer_valid_idx in skfold.split(X_train,y_train):
    for name, gs_est in sorted(gridcvs.items()):
        print('outer fold %d/5 | tuning %-8s' % (c, name), end='')

        # The inner loop for hyperparameter tuning
        gs_est.fit(X_train.iloc[outer_train_idx], y_train.iloc[outer_train_idx])
        y_pred = gs_est.predict(X_train.iloc[outer_valid_idx])
        acc = accuracy_score(y_true=y_train.iloc[outer_valid_idx], y_pred=y_pred)
        print(' | inner ACC %.2f%% | outer ACC %.2f%%' %
              (gs_est.best_score_ * 100, acc * 100))
        cv_scores[name].append(acc)

    c += 1

outer fold 1/5 | tuning KNN      | inner ACC 75.29% | outer ACC 74.13%
outer fold 1/5 | tuning Logistic | inner ACC 75.29% | outer ACC 71.73%
outer fold 1/5 | tuning RandomForest | inner ACC 80.40% | outer ACC 78.32%
outer fold 2/5 | tuning KNN      | inner ACC 75.94% | outer ACC 76.02%
outer fold 2/5 | tuning Logistic | inner ACC 74.37% | outer ACC 75.82%
outer fold 2/5 | tuning RandomForest | inner ACC 80.77% | outer ACC 81.42%
outer fold 3/5 | tuning KNN      | inner ACC 75.52% | outer ACC 75.20%
outer fold 3/5 | tuning Logistic | inner ACC 74.65% | outer ACC 72.20%
outer fold 3/5 | tuning RandomForest | inner ACC 80.95% | outer ACC 79.60%
outer fold 4/5 | tuning KNN      | inner ACC 75.28% | outer ACC 76.58%
outer fold 4/5 | tuning Logistic | inner ACC 74.53% | outer ACC 75.38%
outer fold 4/5 | tuning RandomForest | inner ACC 80.70% | outer ACC 81.08%
outer fold 5/5 | tuning KNN      | inner ACC 74.78% | outer ACC 77.78%
outer fold 5/5 | tuning Logistic | inner ACC 73.68% | outer A

In [21]:
# Looking at the results: mean and standard deviation of the outer-fold
# accuracies collected for each algorithm.
for name in cv_scores:
    # '+/-' replaces the former '+\-', which is an invalid escape sequence and
    # printed a stray backslash.
    print('%-8s | outer CV acc. %.2f%% +/- %.3f' % (
          name, 100 * np.mean(cv_scores[name]), 100 * np.std(cv_scores[name])))
print()
# NOTE: every GridSearchCV object was refitted on each outer fold in turn, so
# best_params_ here reflects the last outer fold only, not a consensus.
for name in cv_scores:
    print('{} best parameters'.format(name), gridcvs[name].best_params_)

KNN      | outer CV acc. 75.94% +\- 1.235
RandomForest | outer CV acc. 80.76% +\- 1.715
Logistic | outer CV acc. 74.16% +\- 1.805

KNN best parameters {'classifier__n_neighbors': 20, 'classifier__weights': 'distance'}
RandomForest best parameters {'classifier__max_features': 4}
Logistic best parameters {'classifier__C': 0.1}


In [22]:
# Fitting a model to the whole training set, once per algorithm: rerun the grid
# search on the full training split, then score the tuned model on both the
# training split and the held-out test split.
# Each run appends to the accuracy lists, so re-running this cell records the
# trial a second time.
final_runs = (
    # using the "best" KNN algorithm
    ('KNN', 'Accuracy %.2f%% (average over CV test folds)',
     knn_accuracy, knn_accuracy_train),
    # using the "best" RandomForest algorithm
    ('RandomForest', '\nAccuracy %.2f%% (average over CV test folds)',
     randforest_accuracy, randforest_accuracy_train),
    # using the "best" Logistic algorithm
    ('Logistic', '\nAccuracy %.2f%% (average over CV test folds)',
     logreg_accuracy, logreg_accuracy_train),
)

for algo_name, cv_line, test_history, train_history in final_runs:
    best_algo = gridcvs[algo_name]

    best_algo.fit(X_train, y_train)
    train_acc = accuracy_score(y_true=y_train, y_pred=best_algo.predict(X_train))
    test_acc = accuracy_score(y_true=y_test, y_pred=best_algo.predict(X_test))

    print(cv_line % (100 * best_algo.best_score_))
    print('Best Parameters: %s' % gridcvs[algo_name].best_params_)
    print('Training Accuracy: %.2f%%' % (100 * train_acc))
    print('Test Accuracy: %.2f%%' % (100 * test_acc))
    test_history.append(test_acc)
    train_history.append(train_acc)

Accuracy 76.00% (average over CV test folds)
Best Parameters: {'classifier__n_neighbors': 20, 'classifier__weights': 'distance'}
Training Accuracy: 100.00%
Test Accuracy: 77.41%

Accuracy 81.34% (average over CV test folds)
Best Parameters: {'classifier__max_features': 6}
Training Accuracy: 100.00%
Test Accuracy: 82.22%

Accuracy 74.64% (average over CV test folds)
Best Parameters: {'classifier__C': 100.0}
Training Accuracy: 75.14%
Test Accuracy: 75.59%


In [23]:
# Trial 3 test accuracy, in the order KNN, random forest, logistic regression.
for per_trial_scores in (knn_accuracy, randforest_accuracy, logreg_accuracy):
    print(per_trial_scores[2])

0.7741297056311327
0.8221842600501379
0.755926959855003


In [24]:
# Report the average train accuracy per classifier (with best parameters),
# averaged over the trials recorded in the *_accuracy_train lists.
# The labels now say "Train": they previously read "Test" although the values
# printed are training accuracies.
print("Average KNN Train Accuracy: ", np.mean(knn_accuracy_train))
print("Average Random Forest Train Accuracy: ", np.mean(randforest_accuracy_train))
print("Average Logistic Regression Train Accuracy: ", np.mean(logreg_accuracy_train))

Average KNN Test Accuracy:  1.0
Average Random Forest Test Accuracy:  1.0
Average Logistic Regression Test Accuracy:  0.7543333333333333


In [25]:
# Report the average test accuracy per classifier (with best parameters),
# averaged over the trials recorded in the accuracy lists.
test_summaries = (
    ("Average KNN Test Accuracy: ", knn_accuracy),
    ("Average Random Forest Test Accuracy: ", randforest_accuracy),
    ("Average Logistic Regression Test Accuracy: ", logreg_accuracy),
)
for caption, trial_scores in test_summaries:
    print(caption, np.mean(trial_scores))

Average KNN Test Accuracy:  0.7717437599679636
Average Random Forest Test Accuracy:  0.82261307056103
Average Logistic Regression Test Accuracy:  0.7520537766574308


## T-test for the Covertype dataset

In [26]:
from scipy import stats
import numpy as np

In [27]:
# T-test between the different algorithms: each pair of classifiers is compared
# on its per-trial test accuracies using scipy's independent two-sample t-test
# with default settings.
# NOTE(review): the accuracies being compared come from the same three
# train/test splits, so a paired test (stats.ttest_rel) may be the better
# fit - confirm the intended analysis.
knn_forest, forest_logistic, logistic_knn = (
    stats.ttest_ind(first_scores, second_scores)
    for first_scores, second_scores in (
        (knn_accuracy, randforest_accuracy),
        (randforest_accuracy, logreg_accuracy),
        (logreg_accuracy, knn_accuracy),
    )
)

In [28]:
# Results of the T-tests: one line per classifier pair (statistic and p-value).
ttest_report = (
    ('KNN and RandomForest: ', knn_forest),
    ('\nRandomForest and Logistic Regression: ', forest_logistic),
    ('\nLogistic Regression and KNN: ', logistic_knn),
)
for caption, ttest_result in ttest_report:
    print(caption, ttest_result)

KNN and RandomForest:  Ttest_indResult(statistic=-38.59381360746943, pvalue=2.692397259865767e-06)

RandomForest and Logistic Regression:  Ttest_indResult(statistic=35.63298684574943, pvalue=3.702247880570312e-06)

Logistic Regression and KNN:  Ttest_indResult(statistic=-8.465804406727974, pvalue=0.0010668985208018861)
