# COMP5318 - Machine Learning and Data Mining: Assignment 1

In [64]:
import pandas as pd
import os
print(os.listdir("./Input/train"))
pd.set_option('display.max_columns', 10)

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_recall_fscore_support
from sklearn.ensemble import BaggingClassifier

%matplotlib inline

['train.csv']


In [65]:
# Load train.csv, which holds both the feature columns and the label column
# used to train and evaluate every model in this notebook.
data_train_df = pd.read_csv(filepath_or_buffer='./Input/train/train.csv')

In [66]:
# Output labels: the `label` column as a NumPy array.
data_train_label = data_train_df["label"].to_numpy()

# Input features: every column from "v1" through "v784" as a NumPy array
# (a label-based .loc slice includes both end columns).
data_train_feature = data_train_df.loc[:, "v1":"v784"].to_numpy()

In [67]:
# Split the labelled data into a training part and a held-out test part.
# Stratifying on the labels keeps the class proportions alike in both parts, and the
# fixed random_state makes the split repeatable. No test_size is given, so
# scikit-learn's default hold-out fraction applies.
X_train, X_test, y_train, y_test = train_test_split(
    data_train_feature,
    data_train_label,
    stratify=data_train_label,
    random_state=0,
)

In [68]:
# Helper that prints the evaluation metrics for one set of predictions
def performance(y_true, y_pred, type):
    """Print accuracy and macro-averaged precision, recall and F-score.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Labels predicted by a classifier for the same samples.
    type : str
        Name of the data split (e.g. 'test' or 'train'); it is inserted
        into every printed line.

    Returns
    -------
    None
        The four metrics are written to standard output, rounded to
        three decimal places.
    """
    # With average='macro' the fourth element (support) carries no per-class counts,
    # so it is discarded.
    precision, recall, fscore, _ = precision_recall_fscore_support(
        y_true, y_pred, average='macro')

    report = (
        ("Accuracy", accuracy_score(y_true, y_pred)),
        ("Precision", precision),
        ("Recall", recall),
        ("F-Score", fscore),
    )
    for metric_name, value in report:
        print(metric_name + " on " + type + " set: " + "{:.3f}".format(value))

## ACCURACIES BEFORE PRE-PROCESSING

Before applying any pre-processing, we ran each of the following classifiers to measure its baseline accuracy. This is not a requirement of the assignment spec, but it lets us draw comparisons in the report.

<img src="./static/BeforePreProcessing/knearest.png"/>

<img src="./static/BeforePreProcessing/logreg.png"/>

<img src="./static/BeforePreProcessing/nb.png"/>

<img src="./static/BeforePreProcessing/svm.png"/>

## DATA PRE-PROCESSING FOR TRAINING DATA

In [69]:
# Min-max normalisation: the scaler learns each feature's range from the training
# split only, and that same mapping is then applied to both splits.
scaler = MinMaxScaler().fit(X_train)

X_train_norm, X_test_norm = (scaler.transform(split) for split in (X_train, X_test))

# Display the normalised training features as a DataFrame (this is the cell's output).
pd.DataFrame(X_train_norm)

Unnamed: 0,0,1,2,3,4,...,779,780,781,782,783
0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.00,0.000000,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,...,0.462745,0.38,0.082353,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.00,0.000000,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.00,0.000000,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.00,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
22495,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.00,0.000000,0.0,0.0
22496,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.00,0.000000,0.0,0.0
22497,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.00,0.000000,0.0,0.0
22498,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.00,0.000000,0.0,0.0


In [70]:
# Dimensionality reduction with PCA. A fractional n_components asks PCA to keep as
# many components as are needed to explain that share (here 90%) of the variance.
# The projection is fitted on the normalised training split only.
pca = PCA(n_components=0.9)
pca.fit(X_train_norm)

X_train_pca, X_test_pca = map(pca.transform, (X_train_norm, X_test_norm))

# Display the reduced training features as a DataFrame (this is the cell's output).
pd.DataFrame(X_train_pca)

Unnamed: 0,0,1,2,3,4,...,79,80,81,82,83
0,2.927449,-5.553593,4.019518,-1.395135,1.064630,...,-0.011555,-0.025837,-0.095283,-0.109759,-0.024098
1,1.992421,0.442999,-2.368003,0.811313,-1.177249,...,-0.479448,0.015362,-0.228318,-0.181703,-0.037604
2,2.916436,-4.533643,2.285697,-2.685905,0.292165,...,-0.053102,0.513047,-0.128389,0.073825,-0.079035
3,-6.189816,1.348044,-0.645078,-2.423576,1.313570,...,-0.153498,-0.184887,0.326095,0.042007,-0.248368
4,-2.974141,-4.625361,1.107169,0.583185,0.232016,...,0.265475,0.093921,0.410109,0.028499,0.002914
...,...,...,...,...,...,...,...,...,...,...,...
22495,1.257244,5.409011,5.380687,3.621838,-2.909050,...,-0.069430,0.024253,-0.329398,-0.153290,0.420994
22496,1.754955,-4.031853,0.866494,-0.421730,0.518514,...,-0.398757,-0.208873,0.206416,-0.112603,0.073324
22497,-6.374198,0.561054,-0.780198,-2.838871,1.389735,...,0.245362,0.187905,-0.085312,-0.223798,-0.003016
22498,-5.730576,2.660773,-0.073577,-3.724108,1.928655,...,0.021028,0.012773,0.234765,0.361081,0.209532


## KNN

### After Pre-Processing

In [71]:
%%time
# Baseline KNN: default hyperparameters, trained on the PCA-reduced features.
knn = KNeighborsClassifier().fit(X_train_pca, y_train)

# Predict for the held-out split and for the training split, then report both so the
# gap between them is visible.
y_pred, y_pred_train = knn.predict(X_test_pca), knn.predict(X_train_pca)
performance(y_test, y_pred, 'test')
performance(y_train, y_pred_train, 'train')

Accuracy on test set: 0.850
Precision on test set: 0.851
Recall on test set: 0.850
F-Score on test set: 0.850
Accuracy on train set: 0.895
Precision on train set: 0.895
Recall on train set: 0.895
F-Score on train set: 0.894
CPU times: user 16.6 s, sys: 6.73 s, total: 23.4 s
Wall time: 15.4 s


### Parameter Tuning

<img src="./static/ParameterTuning/knearest.png"/>

In [72]:
%%time
# KNN refitted with the setting chosen during parameter tuning: p=1, i.e. the
# Manhattan distance for the default Minkowski metric.
knn = KNeighborsClassifier(p=1)
knn.fit(X=X_train_pca, y=y_train)

y_pred, y_pred_train = (knn.predict(split) for split in (X_test_pca, X_train_pca))
performance(y_test, y_pred, 'test')
performance(y_train, y_pred_train, 'train')

Accuracy on test set: 0.851
Precision on test set: 0.852
Recall on test set: 0.851
F-Score on test set: 0.851
Accuracy on train set: 0.898
Precision on train set: 0.899
Recall on train set: 0.898
F-Score on train set: 0.898
CPU times: user 52.6 s, sys: 3.88 s, total: 56.5 s
Wall time: 56.9 s


## LOGISTIC REGRESSION

### After Pre-Processing

In [73]:
%%time
# Logistic regression on the PCA-reduced features. Hyperparameters are the library
# defaults except max_iter, which is raised to 5000 (presumably so the solver has
# enough iterations to converge).
logreg = LogisticRegression(max_iter=5000).fit(X_train_pca, y_train)

y_pred, y_pred_train = map(logreg.predict, (X_test_pca, X_train_pca))
performance(y_test, y_pred, 'test')
performance(y_train, y_pred_train, 'train')

Accuracy on test set: 0.843
Precision on test set: 0.842
Recall on test set: 0.843
F-Score on test set: 0.842
Accuracy on train set: 0.854
Precision on train set: 0.853
Recall on train set: 0.854
F-Score on train set: 0.853
CPU times: user 34.3 s, sys: 3.37 s, total: 37.7 s
Wall time: 9.59 s


### Parameter Tuning

<img src="./static/ParameterTuning/logreg.png"/>

There is no need to refit a tuned classifier here: as the parameter-tuning results above show, the default parameters are already the best ones for LogisticRegression.

## NAIVE BAYES

### After Pre-Processing

In [74]:
%%time
# Baseline Gaussian Naive Bayes: default hyperparameters, PCA-reduced features.
nb = GaussianNB()
nb.fit(X=X_train_pca, y=y_train)

# Score the held-out split and the training split with the same fitted model.
y_pred, y_pred_train = nb.predict(X_test_pca), nb.predict(X_train_pca)
performance(y_test, y_pred, 'test')
performance(y_train, y_pred_train, 'train')

Accuracy on test set: 0.771
Precision on test set: 0.778
Recall on test set: 0.771
F-Score on test set: 0.772
Accuracy on train set: 0.773
Precision on train set: 0.778
Recall on train set: 0.773
F-Score on train set: 0.773
CPU times: user 184 ms, sys: 44.6 ms, total: 228 ms
Wall time: 178 ms


### Parameter Tuning

<img src="./static/ParameterTuning/nb.png"/>

In [75]:
%%time
# Gaussian Naive Bayes refitted with the var_smoothing value chosen during
# parameter tuning, then scored on both splits.
nb = GaussianNB(var_smoothing=0.0002848035868435802).fit(X_train_pca, y_train)

y_pred, y_pred_train = (nb.predict(split) for split in (X_test_pca, X_train_pca))
performance(y_test, y_pred, 'test')
performance(y_train, y_pred_train, 'train')

Accuracy on test set: 0.772
Precision on test set: 0.780
Recall on test set: 0.772
F-Score on test set: 0.774
Accuracy on train set: 0.774
Precision on train set: 0.781
Recall on train set: 0.774
F-Score on train set: 0.775
CPU times: user 139 ms, sys: 32.4 ms, total: 171 ms
Wall time: 170 ms


## SVM

### After Pre-Processing

In [76]:
%%time
# Baseline support vector classifier: default hyperparameters, PCA-reduced features.
svm = SVC().fit(X_train_pca, y_train)

# Score the held-out split and the training split with the same fitted model.
y_pred, y_pred_train = map(svm.predict, (X_test_pca, X_train_pca))
performance(y_test, y_pred, 'test')
performance(y_train, y_pred_train, 'train')

Accuracy on test set: 0.878
Precision on test set: 0.877
Recall on test set: 0.878
F-Score on test set: 0.877
Accuracy on train set: 0.898
Precision on train set: 0.898
Recall on train set: 0.898
F-Score on train set: 0.898
CPU times: user 1min 9s, sys: 389 ms, total: 1min 9s
Wall time: 1min 10s


### Parameter Tuning

<img src="./static/ParameterTuning/svm.png"/>

In [77]:
%%time
# Support vector classifier refitted with the C and gamma values chosen during
# parameter tuning, then scored on both splits.
svm = SVC(C=10, gamma=0.01)
svm.fit(X=X_train_pca, y=y_train)

y_pred, y_pred_train = svm.predict(X_test_pca), svm.predict(X_train_pca)
performance(y_test, y_pred, 'test')
performance(y_train, y_pred_train, 'train')

Accuracy on test set: 0.887
Precision on test set: 0.886
Recall on test set: 0.887
F-Score on test set: 0.886
Accuracy on train set: 0.934
Precision on train set: 0.934
Recall on train set: 0.934
F-Score on train set: 0.933
CPU times: user 1min, sys: 397 ms, total: 1min
Wall time: 1min 1s


## BAGGING (SVM)

<img src="./static/ParameterTuning/bagging.png"/>

In [78]:
%%time
# Bagging ensemble of two SVCs that use the tuned C and gamma values.
# The wrapped model is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was later
# removed; the positional form works on both sides of that rename.
# NOTE(review): with bootstrap=False and the default max_samples, each member appears
# to be trained on the full training set, so the ensemble may add run time without
# adding diversity — confirm this is the intended configuration.
bclf = BaggingClassifier(SVC(C=10, gamma=0.01),
    bootstrap=False, n_estimators=2).fit(X_train_pca, y_train)

# Score the held-out split and the training split with the fitted ensemble.
y_pred = bclf.predict(X_test_pca)
y_pred_train = bclf.predict(X_train_pca)
performance(y_test, y_pred, 'test')
performance(y_train, y_pred_train, 'train')

Accuracy on test set: 0.887
Precision on test set: 0.886
Recall on test set: 0.887
F-Score on test set: 0.886
Accuracy on train set: 0.934
Precision on train set: 0.934
Recall on train set: 0.934
F-Score on train set: 0.933
CPU times: user 2min, sys: 891 ms, total: 2min
Wall time: 2min 2s


## PERFORMANCE COMPARISON OF CLASSIFIERS

<p align="center">
    <img src="./static/Performance/all_metrics.jpg"/>
</p>

<br>

<p align="center">
    <img src="./static/Performance/computation.jpg" align="center"/>
</p>