In [6]:
import scipy.io as sio
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import svm
from sklearn import tree
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

In [7]:
# Load the wine dataset
def load_dataset(filepath, columns):
    data = pd.read_csv(filepath, header=None)
    data.columns = columns
    data = shuffle(data, random_state=42)
    return data

# Define wine dataset path and column names
wine_dataset = {
    "path": "wine.data",
    "columns": [
        "Class", "Alcohol", "Malic_Acid", "Ash", "Alcalinity_of_Ash", "Magnesium", 
        "Total_Phenols", "Flavanoids", "Nonflavanoid_Phenols", "Proanthocyanins", 
        "Color_Intensity", "Hue", "OD280/OD315", "Proline"
    ]
}

# Hyperparameter grid search for each classifier
def perform_grid_search(clf, param_grid, X_train, y_train):
    grid_search = GridSearchCV(clf, param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_, grid_search.best_params_

# Define classifiers and their parameter grids
classifiers = {
    "SVM": {
        "model": svm.SVC(),
        "params": {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"]}
    },
    "RandomForest": {
        "model": RandomForestClassifier(random_state=42),
        "params": {"n_estimators": [50, 100, 200], "max_depth": [None, 10, 20]}
    },
    "DecisionTree": {
        "model": tree.DecisionTreeClassifier(random_state=42),
        "params": {"max_depth": [None, 10, 20], "min_samples_split": [2, 5, 10]}
    }
}

# Process the wine dataset
print("\nProcessing dataset: wine\n")
data = load_dataset(wine_dataset["path"], wine_dataset["columns"])

X = data.drop("Class", axis=1)
y = data["Class"]

# Normalize features
X = (X - X.mean()) / X.std()

splits = {
    "20/80": 0.2,
    "50/50": 0.5,
    "80/20": 0.8
}

average_weighted_accuracies = {"20/80": {}, "50/50": {}, "80/20": {}}

for split_name, test_size in splits.items():
    print(f"\nSplit: {split_name}\n")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    for clf_name, clf_info in classifiers.items():
        print(f"\nClassifier: {clf_name}\n")
        best_model, best_params = perform_grid_search(clf_info["model"], clf_info["params"], X_train, y_train)
        y_pred = best_model.predict(X_test)
        report = classification_report(y_test, y_pred, output_dict=True)
        average_weighted_accuracies[split_name][clf_name] = report["weighted avg"]["f1-score"]
        print(f"Best Parameters: {best_params}")
        print(classification_report(y_test, y_pred))

print("\nAverage Weighted Accuracy by Partition:\n")
for split, accuracies in average_weighted_accuracies.items():
    print(f"{split}: {accuracies}")


Processing dataset: wine


Split: 20/80


Classifier: SVM

Best Parameters: {'C': 1, 'kernel': 'rbf'}
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        12
           2       1.00      1.00      1.00        16
           3       1.00      1.00      1.00         8

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36


Classifier: RandomForest

Best Parameters: {'max_depth': None, 'n_estimators': 50}
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        12
           2       1.00      1.00      1.00        16
           3       1.00      1.00      1.00         8

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36


Classifier: DecisionTree

Best Parameters: {'max_d