# Dataset four

In [25]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_selection import RFE
import os

In [26]:
""" 
TrainData 4 contains 112 features with 2547 samples. Testdata4 contains 112 features with 1092 samples. There are 9 classes in this dataset.
"""


trainData = pd.read_csv('../DataFiles/TrainData4.txt', sep='\\s+', header=None)
trainLabels = pd.read_csv('../DataFiles/TrainLabel4.txt', sep='\\s+', header=None)
testData = pd.read_csv('../DataFiles/TestData4.txt', sep='\\s+', header=None)
trainData

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,102,103,104,105,106,107,108,109,110,111
0,-0.449870,-0.200490,-0.487290,-0.061085,-0.051024,-0.021653,0.307880,-0.057097,-0.015610,0.132410,...,4.5912,23.2100,146.23,-178.08,152.01,-129.720,126.480,-147.33,168.65,180.33
1,-0.528430,-0.259120,-0.425840,-0.061339,-0.075853,-0.027442,0.301660,-0.064007,-0.042905,0.330570,...,9.7736,-4.6825,103.02,-182.73,168.97,-151.290,124.890,-118.42,125.41,203.31
2,-0.498230,-0.263460,-0.406830,-0.050683,-0.066742,-0.024397,0.275000,-0.130610,-0.105670,0.285960,...,38.8230,-53.3400,161.33,-180.05,151.52,-127.850,117.960,-125.76,139.76,193.23
3,-0.424050,-0.183920,-0.337610,-0.035511,-0.048362,-0.008383,0.234870,-0.197570,-0.075233,0.133230,...,81.5990,-93.0770,145.09,172.44,-196.78,-135.790,124.880,-134.61,145.45,194.52
4,-0.214870,-0.245080,-0.252040,-0.111790,-0.045751,-0.035225,0.216660,-0.216510,-0.085224,0.331200,...,-178.4200,-149.6900,154.25,-168.03,172.94,161.300,-164.670,179.68,-194.52,192.15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2542,-0.055413,-0.028708,-0.050070,-0.092470,-0.087130,-0.037582,0.044267,-0.384240,-0.281220,0.051134,...,-129.1200,-183.1200,172.83,199.73,-226.64,158.590,-97.553,127.84,-185.16,203.81
2543,-0.044765,-0.028708,-0.020877,-0.034470,-0.014174,-0.037582,0.043463,-0.386750,-0.293210,0.026051,...,190.4200,-171.6100,170.07,150.35,-188.96,214.620,-172.170,168.19,-190.24,197.18
2544,-0.053542,-0.089060,-0.090247,-0.172140,-0.162600,-0.081363,0.049554,-0.413170,-0.293190,0.121840,...,168.5300,-170.9900,181.53,154.11,-175.15,179.540,-172.070,204.38,-203.96,189.57
2545,-0.104460,-0.231630,-0.228320,-0.260140,-0.243730,-0.184850,0.055340,-0.424890,-0.290430,0.226890,...,180.4800,-212.7500,182.87,176.90,-216.82,-85.159,131.410,135.09,-138.10,179.36


In [27]:
trainData.replace(1.00000000000000e+99, np.nan, inplace=True)
testData.replace(1.00000000000000e+99, np.nan, inplace=True)

# There are no missing values in any of the txt files
print("Missing values in trainData:", trainData.isnull().sum().sum())
print("Missing values in testData:", testData.isnull().sum().sum())

Missing values in trainData: 0
Missing values in testData: 0


In [28]:
myScaler = StandardScaler()         

trainDataScaled = myScaler.fit_transform(trainData)
testDataScaled = myScaler.transform(testData)
trainDataScaled


array([[-4.90621523, -1.51505757, -5.77090401, ..., -0.74893122,
         0.94272041,  1.05676578],
       [-5.98790689, -2.33139224, -4.85284398, ..., -0.57140259,
         0.69268576,  1.18668064],
       [-5.57208347, -2.39182022, -4.56883549, ..., -0.61647558,
         0.77566442,  1.12969449],
       ...,
       [ 0.55082005,  0.03643778,  0.16089926, ...,  1.41082639,
        -1.21189138,  1.10900309],
       [-0.15026925, -1.94863529, -1.9019047 , ...,  0.98533492,
        -0.83105691,  1.051282  ],
       [ 0.65012199,  0.23664375,  0.38643277, ...,  1.0788583 ,
        -1.04911442,  1.12986409]])

In [29]:
""" 
Recall: TrainData 4 contains 112 features with 2547 samples. Testdata4 contains 112 features with 1092 samples. There are 9 classes in this dataset.
"""

X_train, X_val, y_train, y_val = train_test_split(trainDataScaled, trainLabels, shuffle=True, test_size=0.2, random_state=50)

mySVM = SVC(kernel='linear')                        # AC1103 -- This is for feature selection which utlizes RFE(wrapper method) with Support Vector Machine Classification
myRFE = RFE(estimator=mySVM, n_features_to_select=65, step=10)
myRFE.fit(X_train, y_train.values.ravel())

# AC1103 -- After RFE, 65 features are selected and then the training and validation sets are transformed to have those 65 models, now the data can be classified using 'rbf' kernel
X_trainRFE = myRFE.transform(X_train)
X_valRFE = myRFE.transform(X_val)


#AC1103 -- This cell is just feature selecting, there is no classification happening --> went from 112 features to 65

In [30]:
mySVM = SVC(C=3)

mySVM.fit(X_trainRFE, y_train.values.ravel())
X_valPrediction = mySVM.predict(X_valRFE)

X_valAccuracy = accuracy_score(y_val, X_valPrediction)
X_valReport = classification_report(y_val, X_valPrediction)

print(f"The validation set had an accuracy of {X_valAccuracy}")

The validation set had an accuracy of 0.9411764705882353


In [31]:
fullRFE = RFE(estimator=SVC(kernel='linear'), n_features_to_select=65, step=10)         # AC1103 -- Applying RFE to the whole training set

fullRFE.fit(trainDataScaled, trainLabels.values.ravel())
trainDataRFE = fullRFE.transform(trainDataScaled)
testDataRFE = fullRFE.transform(testDataScaled)

print("Shape of trainData after RFE:", trainDataRFE.shape)
print("Shape of testData after RFE:", testDataRFE.shape)


mySVM.fit(trainDataRFE, trainLabels.values.ravel())
testDataPredictions = mySVM.predict(testDataRFE)

print("\nPredictions on Test Data:", testDataPredictions)

Shape of trainData after RFE: (2547, 65)
Shape of testData after RFE: (1092, 65)

Predictions on Test Data: [1 1 1 ... 8 8 8]


In [32]:
output_dir = r"C:\Users\alexs\OneDrive\Desktop\PythonProjects\CSC4850_Project\PredictionResults"

os.makedirs(output_dir, exist_ok=True)

output_path = os.path.join(output_dir, "CastroClassification4.txt")
np.savetxt(output_path, testDataPredictions, fmt='%d')



print(f"Predictions saved to {output_path}.")

Predictions saved to C:\Users\alexs\OneDrive\Desktop\PythonProjects\CSC4850_Project\PredictionResults\CastroClassification4.txt.
