# Εργασία - Αναγνώριση Προτύπων και Μηχανική Μάθηση, Μέρος Δ
## Ομάδα 11
**Παναγιώτης Μπελαντάκης**, AEM: 10305  
**Αλέξανδρος Φωτιάδης**, AEM: 10392

### Προεπεξεργασία δεδομένων

In [206]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier
from sklearn.decomposition import PCA,KernelPCA
# Load the datasets
#C:\Users\mpela\Desktop\HMMY\9ο εξάμηνο\Αναγνώριση Προτύπων\ERGASIA\
#C:\Users\mpela\Desktop\HMMY\9ο εξάμηνο\Αναγνώριση Προτύπων\ERGASIA\
train_path = "datasetTV.csv"
test_path = "datasetTest.csv"
datasetTV = pd.read_csv(train_path)
datasetTest = pd.read_csv(test_path, header=None)

# Separate features and labels in the training set
X = datasetTV.iloc[:, :-1].values  # Features (all but the last column)
y = datasetTV.iloc[:, -1].values   # Labels (last column)

# Test set features
X_test = datasetTest.values

# Split the training set into training and validation set
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


### Δοκιμή διαφόρων classifiers χωρίς hyperparameter tuning

In [209]:
"""
Applying the classifiers. They will be trained in the training set and their accuracy will be computed based on how well they perform in thw
validation set
"""

# Naive Bayes Classifier
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)
y_val_pred_nb = nb_classifier.predict(X_val)
nb_accuracy = accuracy_score(y_val, y_val_pred_nb)
print(f"Naive Bayes Validation Accuracy without cross validation: {nb_accuracy * 100:.2f}%")

# Neural Network Classifier
nn_classifier = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=500, random_state=42)
nn_classifier.fit(X_train, y_train)
y_val_pred_nn = nn_classifier.predict(X_val)
nn_accuracy = accuracy_score(y_val, y_val_pred_nn)
print(f"Neural Network Validation Accuracy without cross validation: {nn_accuracy * 100:.2f}%")

# Logistic Regression Classifier
lr_classifier = LogisticRegression(max_iter=500, random_state=42)
lr_classifier.fit(X_train, y_train)
y_val_pred_lr = lr_classifier.predict(X_val)
lr_accuracy = accuracy_score(y_val, y_val_pred_lr)
print(f"Logistic Regression Validation Accuracy without cross validation: {lr_accuracy * 100:.2f}%")

# k-Nearest Neighbors Classifier
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X_train, y_train)
y_val_pred_knn = knn_classifier.predict(X_val)
knn_accuracy = accuracy_score(y_val, y_val_pred_knn)
print(f"k-Nearest Neighbors Validation Accuracy without cross validation: {knn_accuracy * 100:.2f}%")

# Support Vector Machine (SVM) Classifier
svm_classifier = SVC(kernel='poly', random_state=42)
svm_classifier.fit(X_train, y_train)
y_val_pred_svm = svm_classifier.predict(X_val)
svm_accuracy = accuracy_score(y_val, y_val_pred_svm)
print(f"SVM Validation Accuracy without cross validation: {svm_accuracy * 100:.2f}%")

# Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, bootstrap=True, random_state=42)
rf_classifier.fit(X_train, y_train)
y_val_pred_rf = rf_classifier.predict(X_val)
rf_accuracy = accuracy_score(y_val, y_val_pred_rf)
print(f"Random Forest Validation Accuracy without cross validation: {rf_accuracy * 100:.2f}%\n")

"""
Now, we will apply k-fold cross validation so than results are more reliable and realistic
"""

# Define classifiers
classifiers = {
    'Naive_Bayes': GaussianNB(),
    'Neural_Network': MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=500, random_state=42),
    'Logistic_Regression': LogisticRegression(max_iter=500, random_state=42),
    'k-NN': KNeighborsClassifier(n_neighbors=5),
    'SVM': SVC(kernel='poly', random_state=42),
    'Random_Forest': RandomForestClassifier(n_estimators=100, random_state=42,bootstrap=True)
}

# Perform k-fold cross-validation for each classifier using cross_val_score function
cv = 3
results = {}
for name, clf in classifiers.items():
    scores = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')
    results[name] = np.mean(scores)
    print(f"{name} Cross-Validation Accuracy: {np.mean(scores) * 100:.2f}%")

# Find and print the best-performing classifier
best_classifier = max(results, key=results.get)
print(f"Best Classifier: {best_classifier} with Accuracy: {results[best_classifier] * 100:.2f}%")

Naive Bayes Validation Accuracy without cross validation: 69.93%
Neural Network Validation Accuracy without cross validation: 83.65%
Logistic Regression Validation Accuracy without cross validation: 78.44%
k-Nearest Neighbors Validation Accuracy without cross validation: 84.05%
SVM Validation Accuracy without cross validation: 86.96%
Random Forest Validation Accuracy without cross validation: 81.36%

Naive_Bayes Cross-Validation Accuracy: 70.34%
Neural_Network Cross-Validation Accuracy: 82.27%
Logistic_Regression Cross-Validation Accuracy: 76.23%
k-NN Cross-Validation Accuracy: 83.91%
SVM Cross-Validation Accuracy: 85.86%
Random_Forest Cross-Validation Accuracy: 80.19%
Best Classifier: SVM with Accuracy: 85.86%


### Hyperparameter tuning στον SVM ταξινομητή

In [211]:
# Support Vector Machine (SVM) Classifier with Grid Search
svm_parameters = {
    'C': [0.1, 1, 10, 100],
    'gamma' : ['scale'],
    'kernel': ['rbf']
}
"""
NOTES:
- gamma = 'scale': It adjusts the influence of each data point based on the dataset's scale.
- kernel = 'rbf'. We tested it with both 'rbf' and 'poly'. The best result seemed to be with poly(86.56%). However, when we set the kernel
equal to 'rbf' only, we get a slightly better accuracy(86.74%). The reason why we don't get the same result the first time with both kernels might be
the cross-validation variability and since random state=42, we have reproducibility.
"""
svm_classifier = GridSearchCV(SVC(random_state=42), svm_parameters, cv=3, scoring='accuracy')
svm_classifier.fit(X_train, y_train)

# Best SVM model
best_svm = svm_classifier.best_estimator_
y_val_pred_svm = best_svm.predict(X_val)
svm_accuracy = accuracy_score(y_val, y_val_pred_svm)
print(f"SVM (RBF Kernel) Validation Accuracy after Grid Search: {svm_accuracy * 100:.2f}%")
print(svm_classifier.best_params_)

SVM (RBF Kernel) Validation Accuracy after Grid Search: 86.74%
{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}


### SVM με μείωση διάστασης(Kernel PCA)

In [214]:
# Apply Kernel PCA for dimensionality reduction
kpca = KernelPCA(kernel='rbf', n_components=100, random_state=42)
X_kpca = kpca.fit_transform(X)
X_test_kpca = kpca.transform(X_test)

# Split the reduced training set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_kpca, y, test_size=0.2, random_state=42)

# SVM Classifier without Bagging
svm_kpca_classifier = SVC(C=10, gamma='scale', kernel='rbf', random_state=42)
svm_kpca_classifier.fit(X_train, y_train)
y_val_pred_svm_kpca = svm_kpca_classifier.predict(X_val)
svm_kpca_accuracy = accuracy_score(y_val, y_val_pred_svm_kpca)
print(f"SVM with Kernel PCA Validation Accuracy: {svm_kpca_accuracy * 100:.2f}%")


SVM with Kernel PCA Validation Accuracy: 87.99%


### SVM με μείωση διάστασης(PCA)

In [217]:
# Apply Classic PCA for dimensionality reduction
pca = PCA(n_components=50, random_state=42)  # Adjust n_components as needed
X_pca = pca.fit_transform(X)
X_test_pca = pca.transform(X_test)

# Split the reduced training set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# SVM Classifier without Bagging
svm_pca_classifier = SVC(C=10, gamma='scale', kernel='rbf', random_state=42)
svm_pca_classifier.fit(X_train, y_train)
y_val_pred_svm_pca = svm_pca_classifier.predict(X_val)
svm_pca_accuracy = accuracy_score(y_val, y_val_pred_svm_pca)
print(f"SVM with Classic PCA Validation Accuracy: {svm_pca_accuracy * 100:.2f}%")

SVM with Classic PCA Validation Accuracy: 88.56%


### SVM με Bagging

In [225]:
# SVM Classifier with Bagging (without dimensionality reduction)
base_svm = SVC(C=10, gamma='scale', kernel='rbf', random_state=42)
bagging_classifier = BaggingClassifier(base_svm, n_estimators=10, random_state=42)
bagging_classifier.fit(X_train, y_train)
y_val_pred_bagging = bagging_classifier.predict(X_val)
bagging_accuracy = accuracy_score(y_val, y_val_pred_bagging)
print(f"Bagging with SVM (No Dimensionality Reduction) Validation Accuracy: {bagging_accuracy * 100:.2f}%")

Bagging with SVM (No Dimensionality Reduction) Validation Accuracy: 88.05%


### SVM με PCA και Bagging

In [238]:
# Apply Kernel PCA for dimensionality reduction
pca = PCA(n_components=50, random_state=42)
X_pca = pca.fit_transform(X)
X_test_pca = pca.transform(X_test)

# Split the training set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# SVM Classifier with Bagging
base_svm = SVC(C=10, gamma='scale', kernel='rbf', random_state=42)
bagging_classifier_pca = BaggingClassifier(estimator=base_svm, n_estimators=10, random_state=42)
bagging_classifier_pca.fit(X_train, y_train)
y_val_pred_bagging_pca = bagging_classifier_pca.predict(X_val)
bagging_pca_accuracy = accuracy_score(y_val, y_val_pred_bagging_pca)
print(f"Bagging with SVM Validation Accuracy: {bagging_pca_accuracy * 100:.2f}%")

Bagging with SVM Validation Accuracy: 89.02%


### Προβλέψεις για το τελικό μοντέλo(SVM με PCA) στο test set

In [248]:
# Predict on test set and save predictions
y_test_pred_svm_pca = svm_pca_classifier.predict(X_test_pca)

# Save predictions as an npy file with the name Labels11
np.save('labels11.npy', y_test_pred_svm_pca)

6955
