# Dataset one


In [52]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler

In [53]:
# Load dataset one. The files are read as tab-separated text with no header
# row, so pandas labels the columns with integers (0, 1, 2, ...).
trainData = pd.read_csv(
    '../DataFiles/TrainData1.txt', sep='\t', header=None
)
trainLabels = pd.read_csv(
    '../DataFiles/TrainLabel1.txt', sep='\t', header=None
)
testData = pd.read_csv(
    '../DataFiles/TestData1.txt', sep='\t', header=None
)


In [54]:
# 1e99 is treated as the missing-value marker: swap it for NaN (in place, on
# both frames) so the imputer below recognises those cells as gaps.
trainData.replace(to_replace=1.00000000000000e+99, value=np.nan, inplace=True)
testData.replace(to_replace=1.00000000000000e+99, value=np.nan, inplace=True)

# Learn the per-column means from the training data only, then fill the gaps
# in both the training and the test set with those training means.
myImputer = SimpleImputer(strategy="mean")
myImputer.fit(trainData)

trainDataImputed = pd.DataFrame(
    myImputer.transform(trainData),
    columns=trainData.columns,
)
testDataImputed = pd.DataFrame(
    myImputer.transform(testData),
    columns=testData.columns,
)

In [55]:
# Min-max scale every feature. The scaler is fitted on the imputed training
# data only; the test set is transformed with those same fitted ranges.
myScaler = MinMaxScaler()
myScaler.fit(trainDataImputed)

trainDataScaled = pd.DataFrame(
    myScaler.transform(trainDataImputed),
    columns=trainData.columns,
)
testDataScaled = pd.DataFrame(
    myScaler.transform(testDataImputed),
    columns=testData.columns,
)

In [56]:
# "Final" marks the last pre-processing step: once this cell has run, dataset
# one is ready to be passed to the machine learning model.

# Keep only the features whose variance on the scaled training data exceeds
# the threshold; the same fitted column mask is then applied to the test set.
mySelector = VarianceThreshold(threshold=0.03)
mySelector.fit(trainDataScaled)

trainDataFinal = mySelector.transform(trainDataScaled)
testDataFinal = mySelector.transform(testDataScaled)

# Report how many features survived the selection.
print(
    "The original number of features for TrainData1 was:",
    trainDataScaled.shape[1],
)
print(
    "The number of features after feature selection:",
    trainDataFinal.shape[1],
)

The original number of features for TrainData1 was: 3312
The number of features after feature selection: 2085


In [57]:
# This cell fits the KNN model on the pre-processed training data and then
# evaluates it with accuracy_score and classification_report from
# sklearn.metrics (both imported in the notebook's top import cell). Once the
# report has been generated, the test data is fed to the model in the next cell.

# Flatten the single-column label frame once so that fitting and both metrics
# all receive the same 1-D array.
trainLabelsFlat = trainLabels.values.ravel()

KNN = KNeighborsClassifier(n_neighbors=3)
KNN.fit(trainDataFinal, trainLabelsFlat)

# AC1028 -- trainTest is the prediction made by the KNN model on the very
# samples it was fitted on.
trainTest = KNN.predict(trainDataFinal)

# AC1028 -- d1 stands for dataset one
d1Accuracy = accuracy_score(trainLabelsFlat, trainTest)
d1_report = classification_report(trainLabelsFlat, trainTest)


# AC1028 -- These metrics are computed on the training data only. Every sample
# is one of its own nearest neighbours, so the scores are optimistic and are
# not an estimate of accuracy on unseen data.
print("Accuracy score:", d1Accuracy)
print("Classification report:\n", d1_report)


Accuracy score: 0.98
Classification report:
               precision    recall  f1-score   support

           1       0.98      0.99      0.99       108
           2       0.93      0.93      0.93        14
           3       1.00      0.91      0.95        11
           4       1.00      1.00      1.00        14
           5       1.00      1.00      1.00         3

    accuracy                           0.98       150
   macro avg       0.98      0.97      0.97       150
weighted avg       0.98      0.98      0.98       150



In [58]:
# Predict a class label for every row of the pre-processed test set with the
# fitted KNN model; the bare name on the last line displays the result.
testDataPrediction = KNN.predict(X=testDataFinal)
testDataPrediction

array([2, 1, 1, 1, 1, 2, 1, 1, 3, 1, 3, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 4,
       3, 3, 4, 1, 5, 4, 1, 3, 1, 1, 4, 1, 1, 1, 1, 4, 3, 5, 3, 1, 4, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1])