In [1]:
import numpy as np
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.model_selection import cross_validate
from sklearn import metrics
from sklearn.metrics import f1_score


# Training data set has 80000 lines of data
# Load train data and split into two new sets: 
#      another train data set with front 80% = 6400 lines of data
#      validate data set with back 20% = 1600 lines of data

train_data = np.loadtxt("./training/dataset_training.txt", delimiter = ",")

train=[]
validate=[]
# get new train data set with 6400 lines
for i in range(0, 6400):
    train.append(train_data[i])
print("Traing dataset length: ", len(train))

# get validate data set with 1600 lines
for i in range(6400, 8000):
    validate.append(train_data[i])
print("Validate dataset length: ", len(validate))

# transfer into numpy array
train=np.array(train)
validate=np.array(validate)

# split train and validate data set with targert and features sets
train_t = train[:, -1]
train_f = train[:, :-1]

validate_t = validate[:, -1]
validate_f = validate[:, :-1]

# Normalize feature attributes [0,1]
# normalize train set on each non-zero feature with "max" if axis=0
train_f_n = preprocessing.normalize(train[:, :-1], norm='max', axis=0)

# normalize test set on each non-zero feature with "max" if axis=0
validate_f_n = preprocessing.normalize(validate[:, :-1], norm='max', axis=0)


Traing dataset length:  6400
Validate dataset length:  1600


In [2]:
# Using Default arguments in KNN classifer and see the accuracy in tranning / validate dataset

# Train the model with the training sets
knn_default = KNeighborsClassifier()
knn_default.fit(train_f_n, train_t)

# Predict the response for train dataset
pred_train_t = knn_default.predict(train_f_n)

# Predict the response for validate dataset
pred_validate_t = knn_default.predict(validate_f_n)

print("After normalization:")
print("Accuracy on training dataset:",metrics.accuracy_score(train_t, pred_train_t))
print("F1-score on training dataset:", f1_score(train_t, pred_train_t))

print("---------------")


print("Accuracy on validate dataset:",metrics.accuracy_score(validate_t, pred_validate_t))
print("F1-score on validate  dataset:", f1_score(validate_t, pred_validate_t))

After normalization:
Accuracy on training dataset: 0.913125
F1-score on training dataset: 0.8738656987295825
---------------
Accuracy on validate dataset: 0.86625
F1-score on validate  dataset: 0.7834008097165992


In [3]:
# Tunning the parameters in KNN to get a better accuracy

num_of_neighbours = range(3, 21)
p=[1, 2, 3]
weight_array=['uniform', 'distance']

validate_accuracy = []
# use iteration to caclulator different k in models, then return the average accuracy 
sum_array = []
index=0
max_index=0
max_accuracy=0
max_n=0
max_i=0
max_j='uniform'
for n in num_of_neighbours:
    for i in p:
        for j in weight_array:
            knn_test= KNeighborsClassifier(n_neighbors=n, p=i, weights=j)
            knn_test.fit(train_f_n, train_t)
            pred_validate_t = knn_test.predict(validate_f_n)
            pred_train_t = knn_test.predict(train_f_n)
            
            t_accuracy =metrics.accuracy_score(train_t, pred_train_t)
            v_accuracy =metrics.accuracy_score(validate_t, pred_validate_t)
            validate_accuracy.append(v_accuracy)
            print("index:", index,"num_of_neighbours=", n,"p=", i, "weights=", j, "Accuracy on train dataset:", t_accuracy, "Accuracy on validate dataset:", v_accuracy)
            sum_array.append((index, n, i, j, metrics.accuracy_score(validate_t, pred_validate_t)))
            if v_accuracy > max_accuracy:
                max_accuracy=v_accuracy
                max_index=index
                max_n=n
                max_i=i
                max_j=j
            index=index+1
print("Best parameters:", max_n, max_i, max_j)
print("MAX accuracy: ", max_index, max_accuracy)

index: 0 num_of_neighbours= 3 p= 1 weights= uniform Accuracy on train dataset: 0.92515625 Accuracy on validate dataset: 0.8475
index: 1 num_of_neighbours= 3 p= 1 weights= distance Accuracy on train dataset: 1.0 Accuracy on validate dataset: 0.8475
index: 2 num_of_neighbours= 3 p= 2 weights= uniform Accuracy on train dataset: 0.92890625 Accuracy on validate dataset: 0.856875
index: 3 num_of_neighbours= 3 p= 2 weights= distance Accuracy on train dataset: 1.0 Accuracy on validate dataset: 0.856875
index: 4 num_of_neighbours= 3 p= 3 weights= uniform Accuracy on train dataset: 0.93 Accuracy on validate dataset: 0.855625
index: 5 num_of_neighbours= 3 p= 3 weights= distance Accuracy on train dataset: 1.0 Accuracy on validate dataset: 0.855625
index: 6 num_of_neighbours= 4 p= 1 weights= uniform Accuracy on train dataset: 0.8909375 Accuracy on validate dataset: 0.84125
index: 7 num_of_neighbours= 4 p= 1 weights= distance Accuracy on train dataset: 1.0 Accuracy on validate dataset: 0.850625
inde

In [4]:
# test the knn with best parameters:
knn_2 = KNeighborsClassifier(n_neighbors=16, p=1, weights='distance')
# Train the model with the training sets
knn_2.fit(train_f_n, train_t)

# Predict the response for train dataset
pred_train_t = knn_2.predict(train_f_n)
print("After normalization:")
# Model Accuracy, how often is the classifier correct?
print("Accuracy on training dataset:",metrics.accuracy_score(train_t, pred_train_t))
print("F1-score on training dataset:", f1_score(train_t, pred_train_t))
print("---------------")

# Predict the response for validate dataset
pred_validate_t = knn_2.predict(validate_f_n)

# Model Accuracy, how often is the classifier correct?
print("Accuracy on validate dataset:",metrics.accuracy_score(validate_t, pred_validate_t))
print("F1-score on validate dataset:", f1_score(validate_t, pred_validate_t))

After normalization:
Accuracy on training dataset: 1.0
F1-score on training dataset: 1.0
---------------
Accuracy on validate dataset: 0.884375
F1-score on validate dataset: 0.8070907194994786


In [5]:
#### cross validation  
from sklearn.metrics import confusion_matrix

# Create a new KNN model for best performance paramter obtained previously
knn_cv = KNeighborsClassifier(n_neighbors=16, weights='distance', p=1)

# Train model with cross-validation of 10 
n = 10
cv_result = cross_validate(knn_cv, train_f_n, train_t, cv=n, return_estimator=True)
estimator = cv_result['estimator']

print("Training data set - cross validation:")
accuracy_scores = []
for i in range(n):
    pred_train_cv = estimator[i].predict(train_f_n)
    # Model Accuracy, how often is the classifier correct?
    accuracy_score = metrics.accuracy_score(train_t, pred_train_cv)
    accuracy_scores.append(accuracy_score)
    print("acccuracy on model %d: %f" % (i+1,accuracy_score))
accuracy_scores = np.array(accuracy_scores)
print("Average Accuracy: %f " % (accuracy_scores.mean()))
print("-----------------------")
print("Validate data set - cross validation:")
maxscore = -1
best_result_test = []
accuracy_scores = []
model = -1
for i in range(n):
    pred_validate_cv = estimator[i].predict(validate_f_n)
    # Model Accuracy, how often is the classifier correct?
    accuracy_score = metrics.accuracy_score(validate_t, pred_validate_cv)
    if accuracy_score > maxscore:
        maxscore = accuracy_score
        best_result = pred_validate_cv
        model = i
    accuracy_scores.append(accuracy_score)
    print("acccuracy on model %d: %f" % (i+1,accuracy_score))
accuracy_scores = np.array(accuracy_scores)
print("Average Accuracy: %f" % (accuracy_scores.mean()))
print("-------------------------------")

pred_train_cv = estimator[model-1].predict(train_f_n)
pred_validate_cv = estimator[model-1].predict(validate_f_n)
# Model Accuracy, how often is the classifier correct?
print("Final Evaluation on training data set:")
print("Accuracy:",metrics.accuracy_score(train_t, pred_train_cv))

tn, fp, fn, tp = confusion_matrix(train_t, pred_train_cv).ravel()
print("tn", "fp", "fn", "tp")
print(tn, fp, fn, tp)

print('F1-SCORE = %f' %(f1_score(train_t, pred_train_cv)))
print("--------------------")

# Model Accuracy, how often is the classifier correct?
print("Final Evaluation on validate data set:")
print("Accuracy:",metrics.accuracy_score(validate_t, pred_validate_cv))

tn, fp, fn, tp = confusion_matrix(validate_t, pred_validate_cv).ravel()
print("tn", "fp", "fn", "tp")
print(tn, fp, fn, tp)
print('F1-SCORE = %f' %(f1_score(validate_t, pred_validate_cv)))

Training data set - cross validation:
acccuracy on model 1: 0.987656
acccuracy on model 2: 0.989844
acccuracy on model 3: 0.988125
acccuracy on model 4: 0.988594
acccuracy on model 5: 0.987812
acccuracy on model 6: 0.986719
acccuracy on model 7: 0.985625
acccuracy on model 8: 0.988750
acccuracy on model 9: 0.986250
acccuracy on model 10: 0.987656
Average Accuracy: 0.987703 
-----------------------
Validate data set - cross validation:
acccuracy on model 1: 0.876250
acccuracy on model 2: 0.877500
acccuracy on model 3: 0.878125
acccuracy on model 4: 0.873125
acccuracy on model 5: 0.876250
acccuracy on model 6: 0.878125
acccuracy on model 7: 0.883750
acccuracy on model 8: 0.878750
acccuracy on model 9: 0.880000
acccuracy on model 10: 0.878125
Average Accuracy: 0.878000
-------------------------------
Final Evaluation on training data set:
Accuracy: 0.98671875
tn fp fn tp
4056 17 68 2259
F1-SCORE = 0.981534
--------------------
Final Evaluation on validate data set:
Accuracy: 0.878125
tn f

In [6]:
test_data2 = np.loadtxt("./testing/dataset_testing.txt", delimiter = ",")
test_t2 = test_data2[:, -1]
test_x2 = test_data2[:, :-1]

test_x2 = preprocessing.normalize(test_data2[:, :-1], norm='max', axis=0)

In [7]:
pred_test_cv = estimator[model-1].predict(test_x2)

print("Final Evaluation on testing data set:")
print("Accuracy:",metrics.accuracy_score(test_t2, pred_test_cv))

tn, fp, fn, tp = confusion_matrix(test_t2, pred_test_cv).ravel()
print("tn", "fp", "fn", "tp")
print(tn, fp, fn, tp)
print('F1-SCORE = %f' %(f1_score(test_t2, pred_test_cv)))


Final Evaluation on testing data set:
Accuracy: 0.861
tn fp fn tp
1198 49 229 524
F1-SCORE = 0.790347


In [9]:

## update predictor_2_y.txt
a_file = open("predictor_2_y.txt", "w")
for row in pred_test_cv:
    a_file.write(str(row))
    a_file.write("\n")

a_file.close()