# COMP5318 - Machine Learning and Data Mining: Assignment 1

In [4]:
import pandas as pd
import os
print(os.listdir("./Input/train"))
pd.set_option('display.max_columns', 10)

from IPython.display import set_matplotlib_formats, display
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import skew
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import graphviz

%matplotlib inline

['train.csv']


In [5]:
# train.csv including feature and label using for training model.
data_train_df = pd.read_csv('./Input/train/train.csv') 

In [6]:
# Selecting input feature
data_train_feature = data_train_df.loc[:, "v1":"v784"].to_numpy()

# Selecting output lable 
data_train_label = data_train_df.label.to_numpy()

## DATA PRE-PROCESSING FOR TRAINING DATA

In [7]:
data_train_feature.shape

(30000, 784)

In [8]:
# Data Normalisation
data_train_feature = pd.DataFrame(data_train_feature)
skewed_feats = pd.DataFrame(data_train_feature).apply(lambda x: skew(x.dropna()))
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index

data_train_feature[skewed_feats] = np.log1p(data_train_feature[skewed_feats])

In [9]:
# Filling NAs with the mean of the columns
data_train_feature 

Unnamed: 0,0,1,2,3,4,...,779,780,781,782,783
0,0.0,0.0,0.0,0.000000,0.693147,...,0.000000,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.693147,0.000000,...,4.127134,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
29995,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.0,0.0,0.0,0.0
29996,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.0,0.0,0.0,0.0
29997,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.0,0.0,0.0,0.0
29998,0.0,0.0,0.0,0.000000,0.000000,...,0.000000,0.0,0.0,0.0,0.0


In [10]:
# Data Standardisation
data_train_feature = StandardScaler().fit_transform(data_train_feature)

In [11]:
pca = PCA(n_components=0.9)
principalComponents = pca.fit_transform(data_train_feature)
principalDf = pd.DataFrame(data = principalComponents)

In [12]:
finalDf = pd.concat([principalDf, data_train_df.label], axis = 1)

## DATA PRE-PROCESSING FOR TESTING DATA

In [13]:
# test_input.csv includes 5000 samples used for label prediction. Test samples do not have labels.
data_test_df = pd.read_csv('./Input/test/test_input.csv', index_col=0) 
data_test_df

Unnamed: 0_level_0,v1,v2,v3,v4,v5,...,v780,v781,v782,v783,v784
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,0,0,0,0,0,...,0,0,0,0,0
1,0,0,0,0,0,...,0,0,0,0,0
2,0,0,0,0,0,...,0,0,0,0,0
3,0,0,0,0,0,...,0,0,0,0,0
4,0,0,0,0,0,...,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
4995,0,0,0,0,0,...,0,0,0,0,0
4996,0,0,0,0,0,...,0,0,0,0,0
4997,0,0,0,0,0,...,0,0,0,0,0
4998,0,0,0,0,0,...,0,0,0,0,0


In [14]:
# Data Normalisation
skewed_feats = data_test_df.apply(lambda x: skew(x.dropna()))
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index

data_test_df[skewed_feats] = np.log1p(data_test_df[skewed_feats])

In [15]:
# Filling NAs with the mean of the columns
data_test_df

Unnamed: 0_level_0,v1,v2,v3,v4,v5,...,v780,v781,v782,v783,v784
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,0,0.0,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.0,0.0
1,0,0.0,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.0,0.0
2,0,0.0,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.0,0.0
3,0,0.0,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.0,0.0
4,0,0.0,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
4995,0,0.0,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.0,0.0
4996,0,0.0,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.0,0.0
4997,0,0.0,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.0,0.0
4998,0,0.0,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.0,0.0


In [16]:
# Data Standardisation
data_test_df = StandardScaler().fit_transform(data_test_df)

In [17]:
# Dimension Reduction
pca = PCA(n_components=140) # Based on 90% explain variance for the training data 
principalComponents_test = pca.fit_transform(data_test_df)
principalDf_test = pd.DataFrame(data = principalComponents_test)

## KNN

In [18]:
X_train, X_test, y_train, y_test = train_test_split(
    principalDf.to_numpy(), data_train_label, random_state=0)

In [19]:
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

X_train shape: (22500, 140)
y_train shape: (22500,)


In [20]:
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_test shape: (7500, 140)
y_test shape: (7500,)


In [None]:
param_grid = {'n_neighbors': [1, 3, 5, 11, 15], 'p': [1, 2]}
print("Parameter grid:\n{}".format(param_grid))

grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=10, return_train_score=True, n_jobs=-1)


grid_search.fit(X_train, y_train)

print("Test set score: {:.2f}".format(grid_search.score(X_test, y_test)))
print("Best parameters: {}".format(grid_search.best_params_))
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))
print("Best estimator:\n{}".format(grid_search.best_estimator_))

In [30]:
predictions = []

for i in principalDf_test.to_numpy():
    prediction = knn.predict([list(i)])
    predictions.append(prediction[0])

predictions

In [31]:
# output_df = pd.DataFrame(predictions, columns = ['label'])
# output_df.to_csv('./Output/test_output.csv', sep=",", float_format='%d', index_label="id")

Unnamed: 0,id,label
0,0,3
1,1,3
2,2,3
3,3,0
4,4,3
...,...,...
4995,4995,0
4996,4996,4
4997,4997,5
4998,4998,0


## LOGISTIC REGRESSION

In [33]:
X_train, X_test, y_train, y_test = train_test_split(principalDf.to_numpy(), data_train_label, 
                                                    stratify=data_train_label, random_state=42)

In [34]:
logreg = LogisticRegression(max_iter = 5000)
logreg.fit(X_train, y_train)
print("Accuracy on trainig set:", logreg.score(X_train, y_train)*100)
print("Accuracy on test set:", logreg.score(X_test, y_test)*100)

Accuracy on trainig set: 87.36888888888889
Accuracy on test set: 85.01333333333334


## NAIVE BAYES

In [35]:
X_train, X_test, y_train, y_test = train_test_split(principalDf.to_numpy(), data_train_label, 
                                                    stratify=data_train_label, random_state=42)

nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)
print("Accuracy on test set: {:.3f}".format(accuracy_score(y_test, y_pred)))

Accuracy on test set: 0.669


In [36]:
# Parameter Tuning 
param_grid_nb = { 'var_smoothing': np.logspace(0,-9, num=100) }

nbModel_grid = GridSearchCV(estimator=GaussianNB(), param_grid=param_grid_nb, verbose=1, cv=10, n_jobs=-1)
nbModel_grid.fit(X_train, y_train)

print("Test set score: ", nbModel_grid.score(X_test, y_test))
print("Best parameters: {}".format(nbModel_grid.best_params_))
print("Best cross-validation score: ", nbModel_grid.best_score_)

ideal_var_smoothing = nbModel_grid.best_params_['var_smoothing']
nb = GaussianNB(var_smoothing=ideal_var_smoothing)
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)
print("\nAccuracy on test set after parameter tuning: {:.3f}".format(accuracy_score(y_test, y_pred)))

Fitting 10 folds for each of 100 candidates, totalling 1000 fits
Test set score:  0.6686666666666666
Best parameters: {'var_smoothing': 6.579332246575683e-06}
Best cross-validation score:  0.6711111111111111

Accuracy on test set after parameter tuning: 0.669


## DECISION TREE

In [37]:
X_train, X_test, y_train, y_test = train_test_split(principalDf.to_numpy(), data_train_label, 
                                                    stratify=data_train_label, random_state=42)

tree = DecisionTreeClassifier(criterion='entropy', max_depth=12, random_state=42)
tree.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=12, random_state=42)

In [38]:
print("Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(tree.score(X_test, y_test)))

Accuracy on training set: 0.908
Accuracy on test set: 0.776
