In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
import statistics
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [2]:
abalone_data = pd.read_csv('abalone.csv')
abalone_data = abalone_data[abalone_data['Type'].isin(['M', 'F'])].reset_index().drop(columns = ['index'])
X = abalone_data.iloc[:, 1:-1]
y = abalone_data.iloc[:, 0]

In [3]:
all_models = [
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "QDA"]

all_classifiers = [KNeighborsClassifier(n_neighbors = 3, weights = 'uniform'),
                SVC(kernel = "linear", C = 0.025),
                SVC(gamma = 2, C = 1),
                GaussianProcessClassifier(1.0 * RBF(1.0)),
                DecisionTreeClassifier(max_depth = 5),
                RandomForestClassifier(max_depth = 5, n_estimators = 10, max_features = 1),
                MLPClassifier(alpha = 1, max_iter = 1000),
                AdaBoostClassifier(),
                GaussianNB(),
                QuadraticDiscriminantAnalysis()]


cv_classifiers = pd.DataFrame(columns = ['Model', 'CV_1', 'CV_2', 'CV_3', 'CV_4', 'CV_5', 'AVG_CV'])
for i, model in enumerate(all_classifiers):
    cv_scores = cross_val_score(model, X, y, cv = 5)
    to_append = {
        'Model': all_models[i],
        'CV_1': cv_scores[0],
        'CV_2': cv_scores[1],
        'CV_3': cv_scores[2],
        'CV_4': cv_scores[3],
        'CV_5': cv_scores[4],
        'AVG_CV': statistics.mean(cv_scores)}
    cv_classifiers = cv_classifiers.append(to_append, ignore_index = True)

In [4]:
cv_classifiers

Unnamed: 0,Model,CV_1,CV_2,CV_3,CV_4,CV_5,AVG_CV
0,Nearest Neighbors,0.56261,0.499118,0.541446,0.488536,0.566138,0.53157
1,Linear SVM,0.539683,0.539683,0.539683,0.537919,0.537919,0.538977
2,RBF SVM,0.548501,0.550265,0.555556,0.550265,0.557319,0.552381
3,Gaussian Process,0.54321,0.550265,0.520282,0.557319,0.571429,0.548501
4,Decision Tree,0.559083,0.564374,0.514991,0.518519,0.564374,0.544268
5,Random Forest,0.536155,0.548501,0.527337,0.502646,0.559083,0.534744
6,Neural Net,0.541446,0.544974,0.546737,0.537919,0.525573,0.53933
7,AdaBoost,0.520282,0.530864,0.546737,0.54321,0.57672,0.543563
8,Naive Bayes,0.546737,0.500882,0.516755,0.514991,0.536155,0.523104
9,QDA,0.529101,0.537919,0.520282,0.534392,0.532628,0.530864


In [5]:
stdevs = []
for row in cv_classifiers.iterrows():
    cvs = []
    for i in range(1, 6):
        cvs.append(row[1]['CV_' + str(i)])
    var = statistics.variance(cvs)
    stdevs.append(var**2)
cv_classifiers['STDeviation'] = stdevs

In [7]:
np.quantile(cv_classifiers['AVG_CV'], 0.20)

0.5314285714285715

In [8]:
pull_models = np.quantile(cv_classifiers['AVG_CV'], 0.25)
best_model_indices = [i for i, row in enumerate(cv_classifiers.iterrows()) if row[1]['AVG_CV'] > pull_models]
print('Indices to pull: ' + str(best_model_indices))

use_models = []
for index in best_model_indices: 
    model = all_classifiers[index]
    name = all_models[index]
    
    to_append = (name, model)
    use_models.append(to_append)
print(use_models)

Indices to pull: [1, 2, 3, 4, 5, 6, 7]
[('Linear SVM', SVC(C=0.025, kernel='linear')), ('RBF SVM', SVC(C=1, gamma=2)), ('Gaussian Process', GaussianProcessClassifier(kernel=1**2 * RBF(length_scale=1))), ('Decision Tree', DecisionTreeClassifier(max_depth=5)), ('Random Forest', RandomForestClassifier(max_depth=5, max_features=1, n_estimators=10)), ('Neural Net', MLPClassifier(alpha=1, max_iter=1000)), ('AdaBoost', AdaBoostClassifier())]


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

vote_classifier = VotingClassifier(estimators = use_models, 
                                   voting = 'hard')
full_model = vote_classifier.fit(X_train, y_train)
accuracy = round(full_model.score(X_test, y_test), 3)

print('Voting classifier accuracy: ' + str(accuracy))

Voting classifier accuracy: 0.532


In [12]:
stacked_classifier = StackingClassifier(estimators = use_models, 
                                        final_estimator = LogisticRegression())
full_model2 = stacked_classifier.fit(X_train, y_train)
accuracy = round(full_model2.score(X_test, y_test), 3)

print('Stacked classifier accuracy: ' + str(accuracy))

Stacked classifier accuracy: 0.544


In [13]:
selected_model = cv_classifiers[cv_classifiers['AVG_CV'] == max(cv_classifiers['AVG_CV'])]
model_index = selected_model.index[0]
model_name = all_models[model_index]
print('Best model is: ' + str(model_name))


model = SVC()
parameters = {'kernel':('linear', 'poly', 'rbf', 'sigmoid', 'precomputed'), 
              'C':[1, 10], 
              'gamma': ('scale', 'auto')}
GridSearchCV(model, parameters)

Best model is: RBF SVM


GridSearchCV(estimator=SVC(),
             param_grid={'C': [1, 10], 'gamma': ('scale', 'auto'),
                         'kernel': ('linear', 'poly', 'rbf', 'sigmoid',
                                    'precomputed')})

2