In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn import datasets
from sklearn.metrics import accuracy_score
from pandas import read_csv
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
headers = ['age',
            'wrk-cls',
            'fnlwgt',
            'edu-lvl',
            'edu-num',
            'marriage',
            'occupation',
            'relationship',
            'race',
            'sex',
            'cap-gain',
            'cap-loss',
            'hr-per-wk',
            'country',
            'income']

df = pd.read_csv('adult.data', header = None, names = headers, na_values = ' ?')

In [3]:
df = df.dropna()

In [4]:
# Replacing the labels with 0's and 1's
numeric = df['income'].tolist()
for i, j in enumerate(numeric):
    if j == ' >50K':
        numeric[i] = 1
    else:
        numeric[i] = 0
        
df['income'] = numeric

In [5]:
# Seperating features and labels
X_p = pd.get_dummies(df.iloc[:, :-1])
y_p = df.iloc[:, -1]

In [6]:
# accuracy vectors for train set
logreg_accuracy_train = []
randforest_accuracy_train = []
knn_accuracy_train = []

In [7]:
# accuracy vectors for test set
logreg_accuracy = []
randforest_accuracy = []
knn_accuracy = []

## Trial 1

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold

# take all our penguin data, and reserve 20% of it for testing 
X_train, X_test, y_train, y_test = train_test_split(X_p, y_p,
                                                    train_size=5000,
                                                    random_state=12345,
                                                    stratify=y_p)


# Initializing Classifiers
clf1 = KNeighborsClassifier()
clf2 = RandomForestClassifier(n_estimators = 1024)
clf3 = LogisticRegression(solver='liblinear')

# Declaring parameters
K_list = np.array([n*20 for n in range(1,26)])
F_list = np.array([1, 2, 4, 6, 8, 12, 16, 20])
C_list = np.array([10**(-8), 10**(-7), 10**(-6), 10**(-5), 10**(-4), 10**(-3), 10**(-2), 10**(-1), 
                       10**(0), 10**(1), 10**(2), 10**(3), 10**(4)])

# Building the pipelines
pipe1 = Pipeline([('std', StandardScaler()),
                  ('classifier', clf1)])

pipe2 = Pipeline([('std', StandardScaler()),
                  ('classifier', clf2)])

pipe3 = Pipeline([('std', StandardScaler()),
                  ('classifier', clf3)])


# Setting up the parameter grids
param_grid1 = [{'classifier__weights': ['uniform', 'distance'],
                'classifier__n_neighbors': K_list}]

param_grid2 = [{'classifier__max_features': F_list}]

param_grid3 = [{'classifier__C': C_list}]


# Setting up multiple GridSearchCV objects, 1 for each algorithm
gridcvs = {}

for pgrid, est, name in zip((param_grid1, param_grid2, param_grid3),
                            (pipe1, pipe2, pipe3),
                            ('KNN', 'RandomForest', 'Logistic')):
    gcv = GridSearchCV(estimator=est,
                       param_grid=pgrid,
                       scoring='accuracy',
                       n_jobs=3,
                       cv=5, # just 2-fold inner loop, i.e. train/test
                       verbose=0,
                       return_train_score=True,
                       refit='accuracy')
    gridcvs[name] = gcv

In [9]:
%%time 
# ^^ this handy Jupyter magic times the execution of the cell for you

cv_scores = {name: [] for name, gs_est in gridcvs.items()}

skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# The outer loop for algorithm selection
c = 1
for outer_train_idx, outer_valid_idx in skfold.split(X_train,y_train):
    for name, gs_est in sorted(gridcvs.items()):
        print('outer fold %d/5 | tuning %-8s' % (c, name), end='')

        # The inner loop for hyperparameter tuning
        gs_est.fit(X_train.iloc[outer_train_idx], y_train.iloc[outer_train_idx])
        y_pred = gs_est.predict(X_train.iloc[outer_valid_idx])
        acc = accuracy_score(y_true=y_train.iloc[outer_valid_idx], y_pred=y_pred)
        print(' | inner ACC %.2f%% | outer ACC %.2f%%' %
              (gs_est.best_score_ * 100, acc * 100))
        cv_scores[name].append(acc)

    c += 1

outer fold 1/5 | tuning KNN      | inner ACC 82.80% | outer ACC 82.90%
outer fold 1/5 | tuning Logistic | inner ACC 84.08% | outer ACC 84.80%
outer fold 1/5 | tuning RandomForest | inner ACC 85.17% | outer ACC 86.20%
outer fold 2/5 | tuning KNN      | inner ACC 82.83% | outer ACC 82.40%
outer fold 2/5 | tuning Logistic | inner ACC 84.47% | outer ACC 84.00%
outer fold 2/5 | tuning RandomForest | inner ACC 85.52% | outer ACC 84.60%
outer fold 3/5 | tuning KNN      | inner ACC 82.80% | outer ACC 82.50%
outer fold 3/5 | tuning Logistic | inner ACC 84.35% | outer ACC 84.10%
outer fold 3/5 | tuning RandomForest | inner ACC 85.60% | outer ACC 84.30%
outer fold 4/5 | tuning KNN      | inner ACC 83.05% | outer ACC 82.10%
outer fold 4/5 | tuning Logistic | inner ACC 84.10% | outer ACC 83.60%
outer fold 4/5 | tuning RandomForest | inner ACC 85.38% | outer ACC 85.50%
outer fold 5/5 | tuning KNN      | inner ACC 82.88% | outer ACC 83.40%
outer fold 5/5 | tuning Logistic | inner ACC 84.25% | outer A

In [10]:
# Looking at the results
for name in cv_scores:
    print('%-8s | outer CV acc. %.2f%% +\- %.3f' % (
          name, 100 * np.mean(cv_scores[name]), 100 * np.std(cv_scores[name])))
print()
for name in cv_scores:
    print('{} best parameters'.format(name), gridcvs[name].best_params_)

KNN      | outer CV acc. 82.66% +\- 0.450
RandomForest | outer CV acc. 85.08% +\- 0.685
Logistic | outer CV acc. 84.16% +\- 0.393

KNN best parameters {'classifier__n_neighbors': 80, 'classifier__weights': 'distance'}
RandomForest best parameters {'classifier__max_features': 12}
Logistic best parameters {'classifier__C': 0.1}


In [11]:
# Fitting a model to the whole training set
# using the "best" KNN algorithm
best_algo = gridcvs['KNN']

best_algo.fit(X_train, y_train)
train_acc = accuracy_score(y_true=y_train, y_pred=best_algo.predict(X_train))
test_acc = accuracy_score(y_true=y_test, y_pred=best_algo.predict(X_test))

print('Accuracy %.2f%% (average over CV test folds)' %
      (100 * best_algo.best_score_))
print('Best Parameters: %s' % gridcvs['KNN'].best_params_)
print('Training Accuracy: %.2f%%' % (100 * train_acc))
print('Test Accuracy: %.2f%%' % (100 * test_acc))
knn_accuracy.append(test_acc)
knn_accuracy_train.append(train_acc)

# using the "best" RandomForest algorithm
best_algo = gridcvs['RandomForest']

best_algo.fit(X_train, y_train)
train_acc = accuracy_score(y_true=y_train, y_pred=best_algo.predict(X_train))
test_acc = accuracy_score(y_true=y_test, y_pred=best_algo.predict(X_test))

print('\nAccuracy %.2f%% (average over CV test folds)' %
      (100 * best_algo.best_score_))
print('Best Parameters: %s' % gridcvs['RandomForest'].best_params_)
print('Training Accuracy: %.2f%%' % (100 * train_acc))
print('Test Accuracy: %.2f%%' % (100 * test_acc))
randforest_accuracy.append(test_acc)
randforest_accuracy_train.append(train_acc)

# using the "best" Logistic algorithm
best_algo = gridcvs['Logistic']

best_algo.fit(X_train, y_train)
train_acc = accuracy_score(y_true=y_train, y_pred=best_algo.predict(X_train))
test_acc = accuracy_score(y_true=y_test, y_pred=best_algo.predict(X_test))


print('\nAccuracy %.2f%% (average over CV test folds)' %
      (100 * best_algo.best_score_))
print('Best Parameters: %s' % gridcvs['Logistic'].best_params_)
print('Training Accuracy: %.2f%%' % (100 * train_acc))
print('Test Accuracy: %.2f%%' % (100 * test_acc))
logreg_accuracy.append(test_acc)
logreg_accuracy_train.append(train_acc)

Accuracy 83.10% (average over CV test folds)
Best Parameters: {'classifier__n_neighbors': 80, 'classifier__weights': 'distance'}
Training Accuracy: 100.00%
Test Accuracy: 82.37%

Accuracy 85.76% (average over CV test folds)
Best Parameters: {'classifier__max_features': 16}
Training Accuracy: 100.00%
Test Accuracy: 85.01%

Accuracy 84.36% (average over CV test folds)
Best Parameters: {'classifier__C': 1.0}
Training Accuracy: 85.28%
Test Accuracy: 84.25%


In [12]:
print(knn_accuracy[0])
print(randforest_accuracy[0])
print(logreg_accuracy[0]) 

0.8237024083936094
0.8501311501470471
0.8424608536682299


## Trial 2

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold

# take all our penguin data, and reserve 20% of it for testing 
X_train, X_test, y_train, y_test = train_test_split(X_p, y_p,
                                                    train_size=5000,
                                                    random_state=54321,
                                                    stratify=y_p)


# Initializing Classifiers
clf1 = KNeighborsClassifier()
clf2 = RandomForestClassifier(n_estimators = 1024)
clf3 = LogisticRegression(solver='liblinear')

# Declaring parameters
K_list = np.array([n*20 for n in range(1,26)])
F_list = np.array([1, 2, 4, 6, 8, 12, 16, 20])
C_list = np.array([10**(-8), 10**(-7), 10**(-6), 10**(-5), 10**(-4), 10**(-3), 10**(-2), 10**(-1), 
                       10**(0), 10**(1), 10**(2), 10**(3), 10**(4)])

# Building the pipelines
pipe1 = Pipeline([('std', StandardScaler()),
                  ('classifier', clf1)])

pipe2 = Pipeline([('std', StandardScaler()),
                  ('classifier', clf2)])

pipe3 = Pipeline([('std', StandardScaler()),
                  ('classifier', clf3)])


# Setting up the parameter grids
param_grid1 = [{'classifier__weights': ['uniform', 'distance'],
                'classifier__n_neighbors': K_list}]

param_grid2 = [{'classifier__max_features': F_list}]

param_grid3 = [{'classifier__C': C_list}]


# Setting up multiple GridSearchCV objects, 1 for each algorithm
gridcvs = {}

for pgrid, est, name in zip((param_grid1, param_grid2, param_grid3),
                            (pipe1, pipe2, pipe3),
                            ('KNN', 'RandomForest', 'Logistic')):
    gcv = GridSearchCV(estimator=est,
                       param_grid=pgrid,
                       scoring='accuracy',
                       n_jobs=3,
                       cv=5, # just 2-fold inner loop, i.e. train/test
                       verbose=0,
                       return_train_score=True,
                       refit='accuracy')
    gridcvs[name] = gcv

In [14]:
%%time 
# ^^ this handy Jupyter magic times the execution of the cell for you

cv_scores = {name: [] for name, gs_est in gridcvs.items()}

skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# The outer loop for algorithm selection
c = 1
for outer_train_idx, outer_valid_idx in skfold.split(X_train,y_train):
    for name, gs_est in sorted(gridcvs.items()):
        print('outer fold %d/5 | tuning %-8s' % (c, name), end='')

        # The inner loop for hyperparameter tuning
        gs_est.fit(X_train.iloc[outer_train_idx], y_train.iloc[outer_train_idx])
        y_pred = gs_est.predict(X_train.iloc[outer_valid_idx])
        acc = accuracy_score(y_true=y_train.iloc[outer_valid_idx], y_pred=y_pred)
        print(' | inner ACC %.2f%% | outer ACC %.2f%%' %
              (gs_est.best_score_ * 100, acc * 100))
        cv_scores[name].append(acc)

    c += 1

outer fold 1/5 | tuning KNN      | inner ACC 82.88% | outer ACC 83.10%
outer fold 1/5 | tuning Logistic | inner ACC 84.78% | outer ACC 84.40%
outer fold 1/5 | tuning RandomForest | inner ACC 85.35% | outer ACC 84.90%
outer fold 2/5 | tuning KNN      | inner ACC 83.28% | outer ACC 82.10%
outer fold 2/5 | tuning Logistic | inner ACC 84.40% | outer ACC 85.10%
outer fold 2/5 | tuning RandomForest | inner ACC 85.62% | outer ACC 84.80%
outer fold 3/5 | tuning KNN      | inner ACC 83.23% | outer ACC 82.70%
outer fold 3/5 | tuning Logistic | inner ACC 84.40% | outer ACC 85.20%
outer fold 3/5 | tuning RandomForest | inner ACC 84.95% | outer ACC 85.90%
outer fold 4/5 | tuning KNN      | inner ACC 83.35% | outer ACC 82.60%
outer fold 4/5 | tuning Logistic | inner ACC 84.62% | outer ACC 83.70%
outer fold 4/5 | tuning RandomForest | inner ACC 85.75% | outer ACC 84.50%
outer fold 5/5 | tuning KNN      | inner ACC 83.12% | outer ACC 81.30%
outer fold 5/5 | tuning Logistic | inner ACC 84.55% | outer A

In [15]:
# Looking at the results
for name in cv_scores:
    print('%-8s | outer CV acc. %.2f%% +\- %.3f' % (
          name, 100 * np.mean(cv_scores[name]), 100 * np.std(cv_scores[name])))
print()
for name in cv_scores:
    print('{} best parameters'.format(name), gridcvs[name].best_params_)

KNN      | outer CV acc. 82.36% +\- 0.618
RandomForest | outer CV acc. 85.06% +\- 0.476
Logistic | outer CV acc. 84.70% +\- 0.576

KNN best parameters {'classifier__n_neighbors': 20, 'classifier__weights': 'distance'}
RandomForest best parameters {'classifier__max_features': 20}
Logistic best parameters {'classifier__C': 0.1}


In [16]:
# Fitting a model to the whole training set
# using the "best" KNN algorithm
best_algo = gridcvs['KNN']

best_algo.fit(X_train, y_train)
train_acc = accuracy_score(y_true=y_train, y_pred=best_algo.predict(X_train))
test_acc = accuracy_score(y_true=y_test, y_pred=best_algo.predict(X_test))

print('Accuracy %.2f%% (average over CV test folds)' %
      (100 * best_algo.best_score_))
print('Best Parameters: %s' % gridcvs['KNN'].best_params_)
print('Training Accuracy: %.2f%%' % (100 * train_acc))
print('Test Accuracy: %.2f%%' % (100 * test_acc))
knn_accuracy.append(test_acc)
knn_accuracy_train.append(train_acc)

# using the "best" RandomForest algorithm
best_algo = gridcvs['RandomForest']

best_algo.fit(X_train, y_train)
train_acc = accuracy_score(y_true=y_train, y_pred=best_algo.predict(X_train))
test_acc = accuracy_score(y_true=y_test, y_pred=best_algo.predict(X_test))

print('\nAccuracy %.2f%% (average over CV test folds)' %
      (100 * best_algo.best_score_))
print('Best Parameters: %s' % gridcvs['RandomForest'].best_params_)
print('Training Accuracy: %.2f%%' % (100 * train_acc))
print('Test Accuracy: %.2f%%' % (100 * test_acc))
randforest_accuracy.append(test_acc)
randforest_accuracy_train.append(train_acc)

# using the "best" Logistic algorithm
best_algo = gridcvs['Logistic']

best_algo.fit(X_train, y_train)
train_acc = accuracy_score(y_true=y_train, y_pred=best_algo.predict(X_train))
test_acc = accuracy_score(y_true=y_test, y_pred=best_algo.predict(X_test))


print('\nAccuracy %.2f%% (average over CV test folds)' %
      (100 * best_algo.best_score_))
print('Best Parameters: %s' % gridcvs['Logistic'].best_params_)
print('Training Accuracy: %.2f%%' % (100 * train_acc))
print('Test Accuracy: %.2f%%' % (100 * test_acc))
logreg_accuracy.append(test_acc)
logreg_accuracy_train.append(train_acc)

Accuracy 83.06% (average over CV test folds)
Best Parameters: {'classifier__n_neighbors': 60, 'classifier__weights': 'distance'}
Training Accuracy: 100.00%
Test Accuracy: 82.30%

Accuracy 85.64% (average over CV test folds)
Best Parameters: {'classifier__max_features': 20}
Training Accuracy: 100.00%
Test Accuracy: 84.38%

Accuracy 84.78% (average over CV test folds)
Best Parameters: {'classifier__C': 0.1}
Training Accuracy: 85.48%
Test Accuracy: 84.33%


In [17]:
print(knn_accuracy[1])
print(randforest_accuracy[1])
print(logreg_accuracy[1]) 

0.8230267864239726
0.8438120976075034
0.8433351879818775


## Trial 3

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold

# take all our penguin data, and reserve 20% of it for testing 
X_train, X_test, y_train, y_test = train_test_split(X_p, y_p,
                                                    train_size=5000,
                                                    random_state=13245,
                                                    stratify=y_p)


# Initializing Classifiers
clf1 = KNeighborsClassifier()
clf2 = RandomForestClassifier(n_estimators = 1024)
clf3 = LogisticRegression(solver='liblinear')

# Declaring parameters
K_list = np.array([n*20 for n in range(1,26)])
F_list = np.array([1, 2, 4, 6, 8, 12, 16, 20])
C_list = np.array([10**(-8), 10**(-7), 10**(-6), 10**(-5), 10**(-4), 10**(-3), 10**(-2), 10**(-1), 
                       10**(0), 10**(1), 10**(2), 10**(3), 10**(4)])

# Building the pipelines
pipe1 = Pipeline([('std', StandardScaler()),
                  ('classifier', clf1)])

pipe2 = Pipeline([('std', StandardScaler()),
                  ('classifier', clf2)])

pipe3 = Pipeline([('std', StandardScaler()),
                  ('classifier', clf3)])


# Setting up the parameter grids
param_grid1 = [{'classifier__weights': ['uniform', 'distance'],
                'classifier__n_neighbors': K_list}]

param_grid2 = [{'classifier__max_features': F_list}]

param_grid3 = [{'classifier__C': C_list}]


# Setting up multiple GridSearchCV objects, 1 for each algorithm
gridcvs = {}

for pgrid, est, name in zip((param_grid1, param_grid2, param_grid3),
                            (pipe1, pipe2, pipe3),
                            ('KNN', 'RandomForest', 'Logistic')):
    gcv = GridSearchCV(estimator=est,
                       param_grid=pgrid,
                       scoring='accuracy',
                       n_jobs=3,
                       cv=5, # just 2-fold inner loop, i.e. train/test
                       verbose=0,
                       return_train_score=True,
                       refit='accuracy')
    gridcvs[name] = gcv

In [19]:
%%time 
# ^^ this handy Jupyter magic times the execution of the cell for you

cv_scores = {name: [] for name, gs_est in gridcvs.items()}

skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# The outer loop for algorithm selection
c = 1
for outer_train_idx, outer_valid_idx in skfold.split(X_train,y_train):
    for name, gs_est in sorted(gridcvs.items()):
        print('outer fold %d/5 | tuning %-8s' % (c, name), end='')

        # The inner loop for hyperparameter tuning
        gs_est.fit(X_train.iloc[outer_train_idx], y_train.iloc[outer_train_idx])
        y_pred = gs_est.predict(X_train.iloc[outer_valid_idx])
        acc = accuracy_score(y_true=y_train.iloc[outer_valid_idx], y_pred=y_pred)
        print(' | inner ACC %.2f%% | outer ACC %.2f%%' %
              (gs_est.best_score_ * 100, acc * 100))
        cv_scores[name].append(acc)

    c += 1

outer fold 1/5 | tuning KNN      | inner ACC 81.95% | outer ACC 80.80%
outer fold 1/5 | tuning Logistic | inner ACC 83.20% | outer ACC 84.20%
outer fold 1/5 | tuning RandomForest | inner ACC 84.20% | outer ACC 84.10%
outer fold 2/5 | tuning KNN      | inner ACC 81.73% | outer ACC 82.60%
outer fold 2/5 | tuning Logistic | inner ACC 83.10% | outer ACC 83.60%
outer fold 2/5 | tuning RandomForest | inner ACC 84.08% | outer ACC 84.30%
outer fold 3/5 | tuning KNN      | inner ACC 81.62% | outer ACC 83.40%
outer fold 3/5 | tuning Logistic | inner ACC 83.12% | outer ACC 84.60%
outer fold 3/5 | tuning RandomForest | inner ACC 83.45% | outer ACC 85.00%
outer fold 4/5 | tuning KNN      | inner ACC 81.95% | outer ACC 80.60%
outer fold 4/5 | tuning Logistic | inner ACC 83.47% | outer ACC 82.70%
outer fold 4/5 | tuning RandomForest | inner ACC 84.17% | outer ACC 84.10%
outer fold 5/5 | tuning KNN      | inner ACC 82.05% | outer ACC 81.60%
outer fold 5/5 | tuning Logistic | inner ACC 83.38% | outer A

In [20]:
# Looking at the results
for name in cv_scores:
    print('%-8s | outer CV acc. %.2f%% +\- %.3f' % (
          name, 100 * np.mean(cv_scores[name]), 100 * np.std(cv_scores[name])))
print()
for name in cv_scores:
    print('{} best parameters'.format(name), gridcvs[name].best_params_)

KNN      | outer CV acc. 81.80% +\- 1.066
RandomForest | outer CV acc. 84.22% +\- 0.453
Logistic | outer CV acc. 83.58% +\- 0.749

KNN best parameters {'classifier__n_neighbors': 60, 'classifier__weights': 'distance'}
RandomForest best parameters {'classifier__max_features': 8}
Logistic best parameters {'classifier__C': 0.1}


In [21]:
# Fitting a model to the whole training set
# using the "best" KNN algorithm
best_algo = gridcvs['KNN']

best_algo.fit(X_train, y_train)
train_acc = accuracy_score(y_true=y_train, y_pred=best_algo.predict(X_train))
test_acc = accuracy_score(y_true=y_test, y_pred=best_algo.predict(X_test))

print('Accuracy %.2f%% (average over CV test folds)' %
      (100 * best_algo.best_score_))
print('Best Parameters: %s' % gridcvs['KNN'].best_params_)
print('Training Accuracy: %.2f%%' % (100 * train_acc))
print('Test Accuracy: %.2f%%' % (100 * test_acc))
knn_accuracy.append(test_acc)
knn_accuracy_train.append(train_acc)

# using the "best" RandomForest algorithm
best_algo = gridcvs['RandomForest']

best_algo.fit(X_train, y_train)
train_acc = accuracy_score(y_true=y_train, y_pred=best_algo.predict(X_train))
test_acc = accuracy_score(y_true=y_test, y_pred=best_algo.predict(X_test))

print('\nAccuracy %.2f%% (average over CV test folds)' %
      (100 * best_algo.best_score_))
print('Best Parameters: %s' % gridcvs['RandomForest'].best_params_)
print('Training Accuracy: %.2f%%' % (100 * train_acc))
print('Test Accuracy: %.2f%%' % (100 * test_acc))
randforest_accuracy.append(test_acc)
randforest_accuracy_train.append(train_acc)

# using the "best" Logistic algorithm
best_algo = gridcvs['Logistic']

best_algo.fit(X_train, y_train)
train_acc = accuracy_score(y_true=y_train, y_pred=best_algo.predict(X_train))
test_acc = accuracy_score(y_true=y_test, y_pred=best_algo.predict(X_test))


print('\nAccuracy %.2f%% (average over CV test folds)' %
      (100 * best_algo.best_score_))
print('Best Parameters: %s' % gridcvs['Logistic'].best_params_)
print('Training Accuracy: %.2f%%' % (100 * train_acc))
print('Test Accuracy: %.2f%%' % (100 * test_acc))
logreg_accuracy.append(test_acc)
logreg_accuracy_train.append(train_acc)

Accuracy 82.06% (average over CV test folds)
Best Parameters: {'classifier__n_neighbors': 40, 'classifier__weights': 'uniform'}
Training Accuracy: 82.80%
Test Accuracy: 81.86%

Accuracy 83.82% (average over CV test folds)
Best Parameters: {'classifier__max_features': 12}
Training Accuracy: 100.00%
Test Accuracy: 84.72%

Accuracy 83.38% (average over CV test folds)
Best Parameters: {'classifier__C': 1.0}
Training Accuracy: 84.20%
Test Accuracy: 84.64%


In [22]:
print(knn_accuracy[2])
print(randforest_accuracy[2])
print(logreg_accuracy[2]) 

0.8185756299181305
0.8471902074556872
0.846395358079644


In [23]:
#report average train accuracy per classifier (with best parameter)
print("Average KNN Test Accuracy: ", np.mean(knn_accuracy_train))
print("Average Random Forest Test Accuracy: ", np.mean(randforest_accuracy_train))
print("Average Logistic Regression Test Accuracy: ", np.mean(logreg_accuracy_train))

Average KNN Test Accuracy:  0.9426666666666667
Average Random Forest Test Accuracy:  1.0
Average Logistic Regression Test Accuracy:  0.8498666666666667


In [24]:
#report average test accuracy per classifier (with best parameter)
print("Average KNN Test Accuracy: ", np.mean(knn_accuracy))
print("Average Random Forest Test Accuracy: ", np.mean(randforest_accuracy))
print("Average Logistic Regression Test Accuracy: ", np.mean(logreg_accuracy))

Average KNN Test Accuracy:  0.8217682749119043
Average Random Forest Test Accuracy:  0.8470444850700792
Average Logistic Regression Test Accuracy:  0.8440637999099171


## T - test for ADULT dataset

In [25]:
from scipy import stats
import numpy as np

In [26]:
# T-test between the different algorithms

knn_forest = stats.ttest_ind(knn_accuracy, randforest_accuracy)

forest_logistic = stats.ttest_ind(randforest_accuracy, logreg_accuracy)

logistic_knn = stats.ttest_ind(logreg_accuracy, knn_accuracy)

In [27]:
# Results of the T-tests

print('KNN and RandomForest: ', knn_forest)
print('\nRandomForest and Logistic Regression: ', forest_logistic)
print('\nLogistic Regression and KNN: ', logistic_knn)

KNN and RandomForest:  Ttest_indResult(statistic=-10.389223178576424, pvalue=0.00048468528084998807)

RandomForest and Logistic Regression:  Ttest_indResult(statistic=1.3668281006611518, pvalue=0.2434617939743712)

Logistic Regression and KNN:  Ttest_indResult(statistic=11.135200471769094, pvalue=0.00037013662334016575)
