# COMP5318 - Machine Learning and Data Mining: Assignment 1

In [1]:
import pandas as pd
import os
# Sanity check: list the files present in the training-data folder.
print(os.listdir("./Input/train"))
# Cap DataFrame rendering at 10 columns so the wide frames shown below stay compact.
pd.set_option('display.max_columns', 10)

from IPython.display import set_matplotlib_formats, display
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_recall_fscore_support
import graphviz

# TODO: the ensemble classifiers imported on the next line are not used in the cells below yet.
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

%matplotlib inline

['train.csv']


In [2]:
# Load train.csv, which contains both the input features and the label column
# used for training the models.
data_train_df = pd.read_csv('./Input/train/train.csv') 

In [3]:
# Output labels: the "label" column as a NumPy array.
data_train_label = data_train_df["label"].to_numpy()

# Input features: every column from "v1" through "v784" (label-based slice,
# both end points included) as a NumPy array.
data_train_feature = data_train_df.loc[:, "v1":"v784"].to_numpy()

In [4]:
# Stratified train/test split with a fixed seed.
# NOTE(review): test_size=0.001 holds out only 0.1% of the rows, so any score
# computed on X_test/y_test rests on very few samples.
X_train, X_test, y_train, y_test = train_test_split(
    data_train_feature,
    data_train_label,
    test_size=0.001,
    stratify=data_train_label,
    random_state=0,
)

In [5]:
# Performance Metrics Calculator Helper 
def performance(y_true, y_pred, type):
    """Print accuracy plus macro-averaged precision, recall and F-score.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        type: Name of the evaluated split (e.g. ``'train'``). It is
            concatenated into every printed line, so it must be a string.

    Returns:
        None. Four lines are printed, each value formatted to three decimals.
    """
    # With average='macro' the call yields (precision, recall, f-score, support).
    macro_precision, macro_recall, macro_fscore, _ = precision_recall_fscore_support(
        y_true, y_pred, average='macro')
    accuracy = accuracy_score(y_true, y_pred)

    report_rows = (
        ("Accuracy", accuracy),
        ("Precision", macro_precision),
        ("Recall", macro_recall),
        ("F-Score", macro_fscore),
    )
    for metric_name, metric_value in report_rows:
        print(metric_name + " on " + type + " set: {:.3f}".format(metric_value))

## DATA PRE-PROCESSING FOR TRAINING DATA

In [6]:
# Normalisation: fit the min-max scaler on the training split only, then apply
# that same fitted scaling to both the training and the hold-out split.
scaler = MinMaxScaler().fit(X_train)

X_train_norm, X_test_norm = (scaler.transform(split) for split in (X_train, X_test))

# Display the normalised training features.
pd.DataFrame(X_train_norm)

Unnamed: 0,0,1,2,3,4,...,779,780,781,782,783
0,0.0,0.0,0.000000,0.000000,0.0,...,0.333333,0.164,0.000000,0.0,0.0
1,0.0,0.0,0.000000,0.000000,0.0,...,0.000000,0.000,0.000000,0.0,0.0
2,0.0,0.0,0.000000,0.000000,0.0,...,0.000000,0.040,0.000000,0.0,0.0
3,0.0,0.0,0.000000,0.000000,0.0,...,0.000000,0.000,0.000000,0.0,0.0
4,0.0,0.0,0.000000,0.000000,0.0,...,0.000000,0.000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
29965,0.0,0.0,0.008403,0.006849,0.0,...,0.443137,0.420,0.090196,0.0,0.0
29966,0.0,0.0,0.000000,0.000000,0.0,...,0.000000,0.000,0.000000,0.0,0.0
29967,0.0,0.0,0.000000,0.000000,0.0,...,0.000000,0.000,0.000000,0.0,0.0
29968,0.0,0.0,0.000000,0.000000,0.0,...,0.000000,0.000,0.000000,0.0,0.0


In [7]:
# Dimension Reduction: a fractional n_components asks PCA to keep as many
# components as are needed to explain 95% of the variance. The projection is
# fitted on the normalised training split only.
pca = PCA(n_components=0.95)
pca.fit(X_train_norm)

# Project both splits with the same fitted components.
X_train_pca, X_test_pca = map(pca.transform, (X_train_norm, X_test_norm))

# Display the reduced training features.
pd.DataFrame(X_train_pca)

Unnamed: 0,0,1,2,3,4,...,182,183,184,185,186
0,0.928057,3.363762,0.341743,2.604983,5.795079,...,-0.090215,-0.067542,-0.315335,0.067186,-0.040930
1,0.054835,-1.412850,0.425954,2.556890,5.716199,...,-0.148328,-0.057979,-0.289384,0.033659,-0.096069
2,2.989437,0.663117,-1.844089,1.103248,-0.393202,...,0.158920,-0.082482,0.102989,-0.202741,-0.053536
3,-7.235179,-0.828898,-0.993131,2.690812,-1.054486,...,-0.086163,-0.378754,-0.098232,-0.235219,0.086510
4,-6.781096,-0.282193,-0.536756,0.757324,-0.761154,...,0.127089,0.079232,-0.173081,-0.039916,-0.055110
...,...,...,...,...,...,...,...,...,...,...,...
29965,7.514883,3.233630,-2.228791,0.627272,0.315171,...,-0.420866,0.111722,-0.270691,0.190237,0.233265
29966,-6.637647,-0.110831,-2.079483,-0.963641,0.704240,...,0.005914,-0.172712,-0.064717,-0.071830,-0.114470
29967,-7.330914,-0.583270,-2.425845,0.086672,0.647989,...,0.078979,0.141195,0.023047,0.037365,-0.081759
29968,2.431063,-5.513875,1.575352,-1.644387,-0.849300,...,0.026965,-0.003460,-0.024646,0.014324,-0.017806


## KNN

In [8]:
%%time
# Parameter Tuning: 5-fold cross-validated grid search over the number of
# neighbours and the Minkowski power parameter p, using all available cores.
param_grid = {
    'n_neighbors': [1, 3, 5, 11, 15],
    'p': [1, 2],
}

grid_search = GridSearchCV(
    KNeighborsClassifier(),
    param_grid,
    cv=5,
    return_train_score=True,
    n_jobs=-1,
)
grid_search.fit(X_train_pca, y_train)

# Summarise the search: hold-out score of the refitted best model, then the
# winning parameters, their mean CV score and the estimator itself.
print(f"Test set score: {grid_search.score(X_test_pca, y_test):.2f}")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.2f}")
print(f"Best estimator:\n{grid_search.best_estimator_}")

Test set score: 0.87
Best parameters: {'n_neighbors': 5, 'p': 1}
Best cross-validation score: 0.85
Best estimator:
KNeighborsClassifier(p=1)
CPU times: user 512 ms, sys: 569 ms, total: 1.08 s
Wall time: 30min 44s


In [9]:
%%time
# Build a KNN classifier from the parameters selected by the grid search and
# fit it on the PCA-reduced training data.
best_n, best_p = (grid_search.best_params_[key] for key in ('n_neighbors', 'p'))

knn = KNeighborsClassifier(n_neighbors=best_n, p=best_p).fit(X_train_pca, y_train)

# Report the metrics on the training split itself.
y_pred_train = knn.predict(X_train_pca)
performance(y_train, y_pred_train, 'train')

Accuracy on train set: 0.899
Precision on train set: 0.900
Recall on train set: 0.899
F-Score on train set: 0.898
CPU times: user 4min 45s, sys: 10.4 s, total: 4min 56s
Wall time: 5min 46s


## SVM

In [10]:
%%time
# Parameter Tuning: 5-fold cross-validated grid search over C, gamma and the
# kernel type, using all available cores.
# NOTE(review): gamma is not used by the 'linear' kernel, so each linear
# candidate is re-fitted once per gamma value; a list of per-kernel grids
# would avoid that repeated work.
param_grid = {
    'C': [100, 10, 1.0, 0.1, 0.01],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
}

grid_search = GridSearchCV(
    SVC(),
    param_grid,
    cv=5,
    return_train_score=True,
    n_jobs=-1,
)
grid_search.fit(X_train_pca, y_train)

# Summarise the search: hold-out score of the refitted best model, then the
# winning parameters, their mean CV score and the estimator itself.
print(f"Test set score: {grid_search.score(X_test_pca, y_test):.2f}")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.2f}")
print(f"Best estimator:\n{grid_search.best_estimator_}")

Test set score: 0.83
Best parameters: {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
Best cross-validation score: 0.89
Best estimator:
SVC(C=10, gamma=0.01)
CPU times: user 40.5 s, sys: 831 ms, total: 41.3 s
Wall time: 11h 6min 43s


In [11]:
%%time
# Create a SVM Classifier using best parameters.
# The grid search tunes the kernel as well as C and gamma, so the selected
# kernel has to be passed on explicitly; leaving it out would make SVC fall
# back to its default kernel even when a different one won the search.
best_C = grid_search.best_params_['C']
best_gamma = grid_search.best_params_['gamma']
best_kernel = grid_search.best_params_['kernel']

svm = SVC(C=best_C, gamma=best_gamma, kernel=best_kernel)
svm.fit(X_train_pca, y_train)

# Report the metrics on the training split itself.
y_pred_train = svm.predict(X_train_pca)
performance(y_train, y_pred_train, 'train')

Accuracy on train set: 0.955
Precision on train set: 0.955
Recall on train set: 0.955
F-Score on train set: 0.955
CPU times: user 2min 44s, sys: 1.13 s, total: 2min 45s
Wall time: 3min 19s


## DATA PRE-PROCESSING FOR BLIND TESTING DATA

In [12]:
# test_input.csv holds the 5000 unlabelled samples whose labels have to be
# predicted. index_col=0 turns the first CSV column into the DataFrame index,
# so it is not treated as a feature.
data_test_df = pd.read_csv('./Input/test/test_input.csv', index_col=0) 

In [13]:
# Data Normalisation: reuse the scaler that was fitted on the training split
# (transform only, no re-fitting) so the blind test data is scaled identically.
output_test_norm = scaler.transform(data_test_df.to_numpy())

# Display the normalised blind-test features.
pd.DataFrame(output_test_norm)

Unnamed: 0,0,1,2,3,4,...,779,780,781,782,783
0,0.0,0.0,0.00000,0.000000,0.0,...,0.000000,0.000,0.000000,0.0,0.0
1,0.0,0.0,0.00000,0.000000,0.0,...,0.000000,0.000,0.000000,0.0,0.0
2,0.0,0.0,0.00000,0.000000,0.0,...,0.000000,0.000,0.000000,0.0,0.0
3,0.0,0.0,0.00000,0.000000,0.0,...,0.000000,0.000,0.000000,0.0,0.0
4,0.0,0.0,0.00000,0.000000,0.0,...,0.000000,0.000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
4995,0.0,0.0,0.00000,0.000000,0.0,...,0.000000,0.000,0.000000,0.0,0.0
4996,0.0,0.0,0.00000,0.000000,0.0,...,0.000000,0.000,0.000000,0.0,0.0
4997,0.0,0.0,0.00000,0.000000,0.0,...,0.000000,0.000,0.000000,0.0,0.0
4998,0.0,0.0,0.00000,0.000000,0.0,...,0.000000,0.000,0.000000,0.0,0.0


In [14]:
# Dimension Reduction: project the normalised blind test data with the PCA
# that was fitted on the training split (transform only, no re-fitting).
output_test_pca = pca.transform(output_test_norm)

# Display the reduced blind-test features.
pd.DataFrame(output_test_pca)

Unnamed: 0,0,1,2,3,4,...,182,183,184,185,186
0,1.969176,-5.465572,1.963855,-1.911317,-1.070610,...,0.152434,-0.141998,-0.126998,0.048715,0.005861
1,1.910601,-5.687303,1.310701,-1.384259,-0.643309,...,0.004067,-0.018628,-0.005452,0.076242,0.024554
2,0.464712,-5.857919,1.600550,-1.983144,-2.696122,...,-0.003344,-0.097612,-0.147449,-0.040962,0.034372
3,-0.377191,-3.352772,-0.785884,1.747031,0.141522,...,0.070164,-0.142563,0.006056,0.151542,0.009402
4,-1.535240,-2.582608,-0.898018,0.405198,0.699272,...,-0.223594,0.124800,0.069084,0.042946,-0.058030
...,...,...,...,...,...,...,...,...,...,...,...
4995,2.744622,-3.227836,0.800372,0.581077,-0.293090,...,0.080491,0.043873,-0.039090,0.216866,-0.107188
4996,2.521653,5.929234,-0.288751,1.213837,3.842388,...,-0.231467,-0.088642,-0.108862,-0.071018,0.010272
4997,-5.671694,-0.559945,-1.204842,3.379102,-0.944850,...,-0.234581,-0.072824,-0.740899,-0.141401,-0.298439
4998,7.081308,1.454545,0.472432,1.113097,0.354064,...,-0.014282,0.192556,-0.009050,-0.175727,-0.156855


In [15]:
# Helper function to export csv file storing predictions of a classifier on the blind test set 
def export_predictions(filename, classifier, features=None, output_dir='./Output'):
    """Write a classifier's predictions for the blind test set to a CSV file.

    Args:
        filename: Base name of the output file, without the ``.csv`` extension.
        classifier: Fitted estimator exposing a ``predict`` method.
        features: Optional 2-D array of samples to predict on. When omitted,
            the notebook-level ``output_test_pca`` is used.
        output_dir: Directory the CSV is written to. It is created when it
            does not exist yet.

    Returns:
        None. The file ``<output_dir>/<filename>.csv`` is written with an
        ``id`` index column and a ``label`` column.
    """
    if features is None:
        # Default to the PCA-reduced blind test set prepared earlier in the notebook.
        features = output_test_pca

    # Predict every sample in one batched call rather than invoking predict()
    # once per row.
    predictions = classifier.predict(features)

    # Make sure the target folder exists so to_csv does not fail on a fresh checkout.
    os.makedirs(output_dir, exist_ok=True)
    filepath = os.path.join(output_dir, filename + '.csv')

    output_df = pd.DataFrame(predictions, columns=['label'])
    output_df.to_csv(filepath, sep=",", float_format='%d', index_label="id")


In [16]:
# Export the blind-test predictions of both fitted classifiers; the files are
# written as knn_kaggle.csv and svm_kaggle.csv in the Output folder.
export_predictions('knn_kaggle', knn)
export_predictions('svm_kaggle', svm)

Example Usage:

`export_predictions('knn', knn)` creates the file `knn.csv` in the `Output` folder, containing the labels that the KNN classifier predicts for the blind test data.