<a href="https://colab.research.google.com/github/bjkeulen/TM10007_PROJECT/blob/master/assignment_1_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TM10007 Assignment template

In [0]:
# Run this to use from colab environment
!pip install -q --upgrade git+https://github.com/bjkeulen/TM10007_PROJECT

  Building wheel for brats (setup.py) ... [?25l[?25hdone


In [0]:
# General packages
import numpy as np 
import pandas as pd
from sklearn import metrics
from sklearn import model_selection
from sklearn.decomposition import PCA

# Classifiers
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
# from sklearn.ensemble import BaggingClassifier
# from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC

import warnings
warnings.filterwarnings('ignore')

## Data loading and cleaning

Below are functions to load the dataset of your choice. After that, it is all up to you to create and evaluate a classification method. Beware, there may be missing values in these datasets. Good luck!

In [0]:
# Data loading functions. Uncomment the one you want to use
from hn.load_data import load_data

data = load_data()
features = data.drop('label', 1)
label = data['label']

print(f'The number of samples of raw data: {len(data.index)}')
print(f'The number of columns of raw data: {len(data.columns)}')

# Binarize labels
label[label=='T12'] = 0
label[label=='T34'] = 1
label = label.astype(int)

# Feature selection

# Principal component analysis (PCA)
p = PCA(n_components=20)
p = p.fit(feat_design)
feat_design_pca = p.transform(feat_design)
feat_test_pca = p.transform(feat_test)

print(f'The number of samples after selection and PCA: {len(data.index)}')
print(f'The number of columns after selection and PCA: {len(data.columns)}')

cv_10fold = model_selection.StratifiedKFold(n_splits=10)
fold_N = 1

for design_index, test_index in cv_10fold.split(features, label):
    feat_design = features.ix[design_index]
    label_design = label.ix[design_index]

    feat_test = features.ix[test_index]
    label_test = label.ix[test_index]

    # Classifiers to be used
    clsfs = [LinearDiscriminantAnalysis(), QuadraticDiscriminantAnalysis(), GaussianNB(), 
            LogisticRegression(), SGDClassifier()]
    col_names = ['LDA', 'QDA', 'Gaussian NB', 'Log Regression', 'SGDC']

    kernels = ['rbf', 'sigmoid']
    for kernel in kernels:
        clsfs.append(SVC(kernel=kernel))
        col_names.append(f'SVM {kernel}')

    index_names = ['AUC', 'Accuracy', 'F1 score', 'Precision', 'Recall']
    performance = pd.DataFrame(index=index_names)

    # Train, test and evaluate classifiers
    N = 0
    for clf in clsfs:
        # Train and test classifier
        clf.fit(feat_design_pca, label_design)
        pred = clf.predict(feat_test_pca)

        if hasattr(clf, 'predict_proba'):
        # The first column gives the probability for class = 0, so we take
        # the second which gives the probability class = 1:
            score = clf.predict_proba(feat_test_pca)[:, 1]
        else:
            score = pred

        # Performance
        auc = metrics.roc_auc_score(label_test, score)
        accuracy = metrics.accuracy_score(label_test, pred)
        F1 = metrics.f1_score(label_test, pred)
        precision = metrics.precision_score(label_test, pred)
        recall = metrics.recall_score(label_test, pred)
        
        # Put into dataframe
        performance[col_names[N]] = [auc, accuracy, F1, precision, recall]
        N += 1

    # SVM polynomial kernel, Random Forests and SVM
    N = 0
    # Determine best parameter of the classifier using a grid search and accuracy
    clsfs = [SVC(kernel='poly'), RandomForestClassifier(), KNeighborsClassifier()]
    parameters = [{'degree': list(range(1, 10))}, {'n_estimators': list(range(5, 200, 5))}, {'n_neighbors': list(range(1, 30, 2))}]
    col_names = ['SVM poly d=', 'RF n=', 'KNN k=']

    for clf in clsfs:
        cv_10fold_grid = model_selection.StratifiedKFold(n_splits=10)
        grid_search = model_selection.GridSearchCV(clf, parameters[N], cv=cv_10fold_grid, scoring='accuracy')
        grid_search.fit(feat_design_pca, label_design)
        best_clf = grid_search.best_estimator_
        
        if N == 0:
            best_parameter = best_clf.degree
        elif N == 1:
            best_parameter = best_clf.n_estimators
        elif N == 2:
            best_parameter = best_clf.n_neighbors

        # Predict classes using the best RF classifier
        pred = best_clf.predict(feat_test_pca)

        if hasattr(clf, 'predict_proba'):
            score = best_clf.predict_proba(feat_test_pca)[:, 1]
        else:
            score = pred

        auc = metrics.roc_auc_score(label_test, score)
        accuracy = metrics.accuracy_score(label_test, pred)
        F1 = metrics.f1_score(label_test, pred)
        precision = metrics.precision_score(label_test, pred)
        recall = metrics.recall_score(label_test, pred)
        
        performance[(col_names[N] + str(best_parameter))] = [auc, accuracy, F1, precision, recall]

        N += 1

    performance = performance.transpose()
    print(f'Fold {fold_N}')
    print(performance)
    print('\n')

    fold_N += 1

The number of samples: 113
The number of columns: 160
Fold 0
                     AUC  Accuracy  F1 score  Precision    Recall
LDA             0.861111  0.750000  0.769231   0.714286  0.833333
QDA             0.944444  0.833333  0.833333   0.833333  0.833333
Gaussian NB     0.805556  0.583333  0.285714   1.000000  0.166667
Log Regression  0.888889  0.833333  0.833333   0.833333  0.833333
SGDC            0.583333  0.583333  0.285714   1.000000  0.166667
SVM rbf         0.666667  0.666667  0.500000   1.000000  0.333333
SVM sigmoid     0.583333  0.583333  0.285714   1.000000  0.166667
SVM poly d=1    0.583333  0.583333  0.285714   1.000000  0.166667
RF n=110        0.833333  0.666667  0.666667   0.666667  0.666667
KNN k=7         0.736111  0.750000  0.769231   0.714286  0.833333


Fold 1
                     AUC  Accuracy  F1 score  Precision    Recall
LDA             0.638889  0.500000  0.400000       0.50  0.333333
QDA             0.708333  0.666667  0.600000       0.75  0.500000
Gaussi

In [0]:
# Avaraging dataframes

