In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import make_scorer, f1_score, confusion_matrix, roc_curve, accuracy_score, auc, roc_auc_score, classification_report
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

# Search space for the SVC grid search. Each dict is an independent sub-grid,
# so kernel-specific parameters (gamma, degree, coef0) are only combined with
# the kernel they are listed next to.
tuned_parameters = [
    {
        'kernel': ['rbf'],
        'gamma': [0.001, 0.0001],
        'C': [0.1, 1],
    },
    {
        'kernel': ['linear'],
        'C': [0.01, 0.1, 1],
    },
    {
        'kernel': ['poly'],
        'C': [0.01, 0.1],
        'degree': [3, 4],
        'coef0': [0.1, 0.5],
    },
]

# Search space for the MLP grid search: network shape, activation function,
# `alpha` values and iteration budget (50, 100, 150).
nn_tuned_parameters = [
    {
        'hidden_layer_sizes': [(30,), (50,), (50, 10), (100, 10, 10)],
        'activation': ['tanh', 'logistic', 'relu'],
        'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10],
        'max_iter': range(50, 200, 50),
    },
]

# Base estimator handed to the first grid search (default SVC settings).
clf = SVC()

# Named scorer mapping; the searches in the cells shown here pass
# scoring='f1' directly rather than using this dict.
scorers = dict(f1_score=make_scorer(f1_score))


In [4]:
# Load the processed mushroom dataset; the 'class' column is the target and
# every remaining column is used as a feature.
data = pd.read_csv('data/processed/mushrooms_pca.csv')
y = data['class']
X = data.drop(columns='class')

# Hold out 20% of the rows for the final evaluation. The seed is fixed so the
# split is identical on every run.
# `Series.ravel()` is deprecated in recent pandas releases; `to_numpy()` gives
# the same 1-D ndarray of labels without the deprecation warning.
X_train, X_test, y_train, y_test = train_test_split(
    X, y.to_numpy(), test_size=.2, random_state=42)


In [5]:
# 10-fold cross-validated grid search over the SVC parameter grids, scored by
# F1, using 3 parallel workers. Train-fold scores are kept as well so they can
# be compared with the validation scores in cv_results_.
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=tuned_parameters,
    scoring='f1',
    cv=10,
    return_train_score=True,
    n_jobs=3,
)
grid_search.fit(X_train, y_train)


GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=3,
       param_grid=[{'kernel': ['rbf'], 'gamma': [0.001, 0.0001], 'C': [0.1, 1]}, {'kernel': ['linear'], 'C': [0.01, 0.1, 1]}, {'kernel': ['poly'], 'C': [0.01, 0.1], 'degree': [3, 4], 'coef0': [0.1, 0.5]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='f1', verbose=0)

In [13]:
# Evaluate the SVC search result on the held-out test split.
y_true = y_test
y_pred = grid_search.predict(X_test)

# Winning parameter combination, followed by per-class precision/recall/F1.
print(grid_search.best_params_)
report = classification_report(y_true, y_pred)
print(report)

{'C': 1, 'kernel': 'linear'}
              precision    recall  f1-score   support

           0       0.87      0.88      0.88       843
           1       0.87      0.85      0.86       782

   micro avg       0.87      0.87      0.87      1625
   macro avg       0.87      0.87      0.87      1625
weighted avg       0.87      0.87      0.87      1625



In [11]:
# Rows are the true classes and columns the predicted classes (the row sums
# equal the per-class support printed by the classification report).
confusion_matrix(y_true=y_true, y_pred=y_pred)

array([[746,  97],
       [114, 668]], dtype=int64)

In [51]:
# Per-candidate cross-validation results (fit/score timings, parameters and
# per-split scores) for the SVC search.
# The frame is built once and left as the cell's last expression so the
# notebook renders it as a table; the previous version built it twice,
# discarded the first copy and print()ed the second, which wraps wide frames
# across several text chunks.
pd.DataFrame(grid_search.cv_results_)

    mean_fit_time  std_fit_time  mean_score_time  std_score_time param_C  \
0       10.547579      0.325011         0.736031        0.051377     0.1   
1       10.534812      0.567531         0.738425        0.062746     0.1   
2        5.947686      0.117515         0.450894        0.020432       1   
3        9.706729      0.114025         0.712893        0.009322       1   
4        3.759839      0.144940         0.263296        0.006479    0.01   
5        3.303958      0.146280         0.213829        0.014204     0.1   
6        5.265312      0.225599         0.218815        0.014433       1   
7        9.835284      0.713873         0.694044        0.071563    0.01   
8        9.306998      0.605169         0.637295        0.055834    0.01   
9        9.356765      0.379068         0.690054        0.052139    0.01   
10       9.823616      0.388589         0.715985        0.058978    0.01   
11      10.137775      0.448015         0.700526        0.051927     0.1   
12      10.3

In [None]:

# ROC curve for the best SVC found by the grid search, on the test split.
# roc_curve needs a continuous score per sample so it can sweep the decision
# threshold. Passing the hard 0/1 predictions (y_pred) gives only a single
# operating point, so the signed distance to the separating hyperplane from
# decision_function is used instead.
y_score = grid_search.decision_function(X_test)
fpr, tpr, thresholds = roc_curve(y_test, y_score, pos_label=1)
roc_auc = auc(fpr, tpr)
plt.figure()

lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal = performance of a random classifier, drawn for reference.
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

In [69]:
# NOTE: this rebinds `clf`, which held the SVC estimator in earlier cells.
# The MLP's weight initialisation and batch shuffling are random; the seed is
# fixed (same value as the train/test split) so the search is reproducible
# from run to run.
clf = MLPClassifier(random_state=42)

# 10-fold cross-validated grid search over the MLP parameter grid, scored by
# F1 and run on all available cores (n_jobs=-1).
grid_search_nn = GridSearchCV(clf, nn_tuned_parameters, scoring='f1',
                              cv=10, return_train_score=True, n_jobs=-1)
grid_search_nn.fit(X_train, y_train)


In [1]:
# Evaluate the MLP search result on the held-out test split. This overwrites
# the y_true / y_pred values produced for the SVC model, and it needs the
# data-loading and MLP-search cells to have been run in the current kernel.
y_true = y_test
y_pred = grid_search_nn.predict(X_test)

# Winning parameter combination, followed by per-class precision/recall/F1.
print(grid_search_nn.best_params_)
report = classification_report(y_true, y_pred)
print(report)

NameError: name 'y_test' is not defined

In [67]:
# Plain-text dump of the per-candidate cross-validation results for the MLP
# search (timings, parameters and per-split scores).
print(pd.DataFrame(data=grid_search_nn.cv_results_))

   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0       9.778337      0.056631         0.003690        0.000638   
1      12.667707      0.580131         0.003889        0.000299   
2       7.798036      0.701444         0.004488        0.000669   
3       6.302837      0.571219         0.004887        0.000829   

  param_hidden_layer_sizes                                 params  \
0                    (30,)          {'hidden_layer_sizes': (30,)}   
1                    (50,)          {'hidden_layer_sizes': (50,)}   
2                 (50, 10)       {'hidden_layer_sizes': (50, 10)}   
3            (100, 10, 10)  {'hidden_layer_sizes': (100, 10, 10)}   

   split0_test_score  split1_test_score  split2_test_score  split3_test_score  \
0           0.896661           0.892744           0.902516           0.876582   
1           0.886400           0.895238           0.902087           0.897314   
2           0.910236           0.898413           0.897314           0.8892