# Dataset two

In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_selection import RFE
import os

In [9]:
trainData = pd.read_csv('../DataFiles/TrainData2.txt', sep='\\s+', header=None)
trainLabels = pd.read_csv('../DataFiles/TrainLabel2.txt', sep='\\s+', header=None)
testData = pd.read_csv('../DataFiles/TestData2.txt', sep='\\s+', header=None)


trainData.replace(1.00000000000000e+99, np.nan, inplace=True)
testData.replace(1.00000000000000e+99, np.nan, inplace=True)

missing_count = trainData.isnull().sum()
print(missing_count)                                        # AC1030 -- Seems like there are no missing values in the columns


0       0
1       0
2       0
3       0
4       0
       ..
9177    0
9178    0
9179    0
9180    0
9181    0
Length: 9182, dtype: int64


In [10]:
myScaler = StandardScaler()
trainDataScaled = pd.DataFrame(myScaler.fit_transform(trainData), columns=trainData.columns)
testDataScaled = pd.DataFrame(myScaler.transform(testData), columns=testData.columns)

trainDataScaled

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9172,9173,9174,9175,9176,9177,9178,9179,9180,9181
0,-1.313080,-1.164162,-0.779650,-0.475662,0.899782,-0.280217,-1.193035,-0.429770,1.611351,1.889286,...,-1.906047,-0.590314,1.929592,-2.058137,-0.853503,-0.492580,0.912588,0.601876,-1.283092,-0.233477
1,-2.759333,-0.691264,-0.083329,1.682208,1.757625,-0.280217,-0.256175,-0.893120,0.921040,0.145660,...,-1.006441,0.004877,-0.095919,-1.382224,-0.701428,-0.021511,2.653455,2.480532,-0.948575,-0.233477
2,-3.023660,-1.916416,-0.858908,-0.475662,1.364528,-0.280217,-0.072487,-0.089946,0.906323,0.588239,...,-0.098816,0.298460,-0.560174,-1.061871,-0.324529,0.040183,0.298640,0.098596,-0.930896,-0.233477
3,-1.832215,-0.807015,-0.570201,2.052349,1.420727,-0.280217,0.034051,-0.390852,0.924123,-0.702770,...,-0.860045,0.310411,0.623988,-1.025413,-0.876269,0.624282,0.261103,0.005309,-0.984524,-0.233477
4,-3.686274,-0.728925,-1.245444,2.555487,1.574455,-0.280217,0.315748,-0.826483,1.461213,1.156419,...,-2.829919,-0.605763,2.646978,-1.426403,-1.094628,-0.021511,0.261103,-0.305643,-1.283092,-0.233477
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.304233,-0.449868,-1.245444,-0.475662,-0.104473,-0.280217,0.279345,-0.766669,0.010632,1.292742,...,-0.917135,-0.246854,3.885169,-0.788293,-1.312634,0.819839,-0.120271,-0.097966,-1.021290,-0.233477
96,0.136172,-0.482202,1.382376,-0.072358,-1.143398,-0.280217,0.347270,1.880320,0.450546,-0.702770,...,1.098924,-0.670200,-0.560174,1.407445,-0.411903,0.113385,-0.996559,-1.190265,0.333307,-0.233477
97,-0.610147,-0.418173,-1.245444,-0.475662,0.187462,4.024032,0.549409,-0.515695,-0.515132,-0.702770,...,-0.080774,0.427579,2.450107,0.606226,-0.484851,0.633842,-0.292311,-0.249388,-0.129376,-0.233477
98,-0.763761,-0.654468,-0.216028,-0.475662,0.187462,2.378364,0.176983,-0.563178,0.982455,0.765621,...,-0.976161,-1.453242,-0.560174,-0.414424,-0.765101,0.460875,-0.996559,-0.451253,-0.732571,-0.233477


In [11]:
""" 
Recall: TrainData 2 contains 9182 features with 100 samples. Testdata2 contains 9182 features with 74 samples. There are 11 classes in this dataset.
"""

X_train, X_val, y_train, y_val = train_test_split(trainDataScaled, trainLabels, shuffle=True, test_size=0.2, random_state=50)
mySVM = SVC(kernel='linear')
myRFE = RFE(estimator=mySVM, n_features_to_select=5050, step=10)
myRFE.fit(X_train, y_train.values.ravel())

X_trainRFE = myRFE.transform(X_train)
X_valRFE = myRFE.transform(X_val)


In [12]:
mySVM = SVC(C=3)

mySVM.fit(X_trainRFE, y_train.values.ravel())
X_valPrediction = mySVM.predict(X_valRFE)

X_valAccuracy = accuracy_score(y_val, X_valPrediction)
X_valReport = classification_report(y_val, X_valPrediction)

print(f"The validation set had an accuracy of {X_valAccuracy}")

The validation set had an accuracy of 0.9


In [13]:
fullRFE = RFE(estimator=SVC(kernel='linear'), n_features_to_select=5000, step=10)         # AC1103 -- Applying RFE to the whole training set

fullRFE.fit(trainDataScaled, trainLabels.values.ravel())
trainDataRFE = fullRFE.transform(trainDataScaled)
testDataRFE = fullRFE.transform(testDataScaled)

print("Shape of trainData after RFE:", trainDataRFE.shape)
print("Shape of testData after RFE:", testDataRFE.shape)


mySVM.fit(trainDataRFE, trainLabels.values.ravel())
testDataPredictions = mySVM.predict(testDataRFE)

print("\nPredictions on Test Data:\n", testDataPredictions)

Shape of trainData after RFE: (100, 5000)
Shape of testData after RFE: (74, 5000)

Predictions on Test Data:
 [ 3 11 10  4  8  5  8  8  4  5 10 11  1  5 10  3 10  3  8 10  4  4  4  4
  4  4  4  4  5  4 10 11 10 11 11 10 11  3  3  3  3  3  3  3  3  3  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  8  8  8  8  8  8  8  8  8  8  8  8
  8  8]


In [14]:
output_dir = r"C:\Users\alexs\OneDrive\Desktop\PythonProjects\CSC4850_Project\PredictionResults"

os.makedirs(output_dir, exist_ok=True)

output_path = os.path.join(output_dir, "CastroClassification2.txt")
np.savetxt(output_path, testDataPredictions, fmt='%d')



print(f"Predictions saved to {output_path}.")

Predictions saved to C:\Users\alexs\OneDrive\Desktop\PythonProjects\CSC4850_Project\PredictionResults\CastroClassification2.txt.
