## Training classifier based on PCA data

In this notebook, using the PCA-transformed arrays of the train and test image sets, several classification algorithms are trained and evaluated to recognize the type of tumor (malignant or benign). To find the optimal hyperparameters for each classifier, a grid search with 5-fold cross-validation is used. For each model, the best hyperparameters and several classification metrics (accuracy, recall, F$_1$ score, and the recognition rate, i.e. the per-patient fraction of correctly classified images averaged over all patients) are computed and stored in a data frame.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import recall_score, fbeta_score

In [None]:
# Load the train/test metadata tables; df_test is later used for its
# 'Label' and 'Patient_Id' columns when scoring predictions.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('train_data.csv', 'test_data.csv')
)

In [None]:
# Open the archive holding the PCA-transformed features and the labels.
data = np.load(file='pca_data.npz')

In [None]:
# Pull the four arrays out of the archive by key.
# NOTE(review): assumes the file stores, in this order, train features,
# test features, train labels, test labels -- confirm against the notebook
# that wrote pca_data.npz.
Z_train, Z_test, y_train, y_test = (
    data[key] for key in ('arr_0', 'arr_1', 'arr_2', 'arr_3')
)

In [None]:
# Candidate classifiers, keyed by the name used to look up their
# hyperparameter grid. Estimators with a random component get a fixed
# random_state so repeated runs give the same result.
models = {}
models['LogisticRegression'] = LogisticRegression()
models['KNN'] = KNeighborsClassifier()
models['GaussianNB'] = GaussianNB()
models['RandomForest'] = RandomForestClassifier(random_state=42)
models['AdaBoost'] = AdaBoostClassifier(random_state=42)
models['GradientBoost'] = GradientBoostingClassifier(random_state=42)
models['SVM'] = SVC(random_state=42)

In [None]:
# Hyperparameter grid searched for each classifier; the keys must match the
# keys of `models`. An empty dict means the estimator is fitted once with its
# default settings.
params_model = {
    # The solver is pinned to liblinear because the grid includes the L1
    # penalty: scikit-learn's default solver (lbfgs since 0.22) only supports
    # L2/none, so every 'l1' candidate would otherwise fail to fit.
    'LogisticRegression': {'penalty': ['l1', 'l2'],
                           'C': np.logspace(-3, 3, 14),
                           'solver': ['liblinear']},
    'KNN': {'n_neighbors': [5, 7, 15],
            'weights': ['uniform', 'distance']},
    'GaussianNB': {},
    'RandomForest': {'n_estimators': [100, 150, 200],
                     'max_depth': [None, 5, 7, 11]},
    'AdaBoost': {'n_estimators': [100, 150, 200]},
    'GradientBoost': {'n_estimators': [100, 150, 200]},
    'SVM': {'C': np.logspace(-3, 3, 14),
            'gamma': np.logspace(-5, 0, 12)},
}

In [None]:
# Empty results table; the grid-search loop fills in one row per model.
result_columns = [
    'model',
    'best_params',
    'train_accuracy',
    'test_accuracy',
    'train_recall',
    'test_recall',
    'train_f1',
    'test_f1',
    'RR',
]
df = pd.DataFrame(columns=result_columns)

In [None]:
# Run a 5-fold grid search for every model, score the refit best estimator on
# the train and test sets, and write the accumulated results to
# 'results_pca.csv' after each model (so partial results survive an
# interrupted run).
#
# Rows are collected in a list and the results frame is rebuilt from it, which
# avoids DataFrame.append (removed in pandas 2.0) and makes the cell
# idempotent: re-running it no longer duplicates rows.
records = []

for model_name, model in models.items():

    print('\nGrid search using {} model ...\n'.format(model_name))

    params = params_model[model_name]

    grid = GridSearchCV(model,
                        param_grid=params,
                        cv=5,
                        scoring='accuracy',
                        verbose=1,
                        n_jobs=-1,
                        return_train_score=True)

    grid.fit(Z_train, y_train)

    # Predict once per split with the refit best estimator and reuse the
    # predictions for every metric below.
    train_pred = grid.predict(Z_train)
    df_test['pred'] = grid.predict(Z_test)

    # Recognition rate: per-patient fraction of correctly classified rows,
    # averaged over patients.
    # NOTE(review): assumes the rows of df_test are in the same order as the
    # rows of Z_test -- confirm against the notebook that produced them.
    df_test['correct'] = (df_test['Label'] == df_test['pred']).astype(int)
    RR = df_test.groupby('Patient_Id')['correct'].mean().mean()

    output = {
        'model': model_name,
        'best_params': grid.best_params_,
        # NOTE: best_score_ is the mean cross-validated accuracy of the best
        # parameter setting, not the accuracy on the full training set.
        'train_accuracy': grid.best_score_,
        'test_accuracy': grid.score(Z_test, y_test),
        'train_recall': recall_score(y_train, train_pred),
        'test_recall': recall_score(y_test, df_test['pred']),
        # beta must be passed by keyword in current scikit-learn releases.
        'train_f1': fbeta_score(y_train, train_pred, beta=1),
        'test_f1': fbeta_score(y_test, df_test['pred'], beta=1),
        'RR': RR,
    }
    records.append(output)

    df = pd.DataFrame(records, columns=df.columns)

    df.to_csv('results_pca.csv', index=False)