# PCA — per ridurre l'overfitting dei modelli nei problemi di classificazione multiclasse, applichiamo la Principal Component Analysis

In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
import joblib
import sys
sys.path.append('../Scripts')
from utility import evaluate_and_save_model_multiclass

# === 1. Load the pre-split train/test data
# Feature matrices are kept as DataFrames; the label files are flattened
# to 1-D arrays, which is the shape the classifiers below are fit on.
X_train, X_test = (
    pd.read_csv("../data/splitted_category/X_train.csv"),
    pd.read_csv("../data/splitted_category/X_test.csv"),
)
y_train, y_test = (
    pd.read_csv("../data/splitted_category/y_train.csv").to_numpy().ravel(),
    pd.read_csv("../data/splitted_category/y_test.csv").to_numpy().ravel(),
)

# Standardize the features. The scaler is fit on the training split only and
# then applied unchanged to both splits, so no test statistics leak in.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# A float n_components asks PCA to keep as many components as needed to reach
# that fraction of explained variance (here 90%). As with the scaler, the
# projection is learned on the training split and reused for the test split.
pca = PCA(n_components=0.90)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Report how many features went in and how many components came out.
print(f"Dimensionalità ridotta da {X_train_scaled.shape[1]} a {X_train_pca.shape[1]} componenti principali")

# Registry of the classifiers to compare on the PCA-reduced features.
# Keys are the display names; the training loop also uses them to build the
# saved-model file name. Insertion order determines the training order.
models = {}
models["SVM"] = SVC(
    kernel='rbf',
    C=1.0,
    gamma='scale',
    probability=True,
    random_state=42,
)
models["MLP"] = MLPClassifier(
    hidden_layer_sizes=(64, 32),
    alpha=0.01,
    max_iter=300,
    early_stopping=True,
    random_state=42,
)
models["Random Forest"] = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)

# Train every registered classifier on the PCA features, predict on both
# splits, and hand everything to the project's evaluation helper.
for name in models:
    model = models[name]
    print(f"\n== {name} con PCA === ")

    # fit() returns the estimator itself, so training and the train-split
    # predictions can be chained; the test split is predicted afterwards.
    y_pred_train = model.fit(X_train_pca, y_train).predict(X_train_pca)
    y_pred_test = model.predict(X_test_pca)

    # NOTE(review): presumably the helper writes its reports under the results
    # directory and serializes the model to the .joblib path — confirm against
    # Scripts/utility.py. Arguments are positional, so their order matters.
    model_path = f"../models/{name}_pca.joblib"
    evaluate_and_save_model_multiclass(
        model, name,
        y_train, y_pred_train,
        y_test, y_pred_test,
        "../results/classification_category/pca",
        model_path,
    )


Dimensionalità ridotta da 997 a 661 componenti principali

== SVM con PCA === 

== MLP con PCA === 

== Random Forest con PCA === 
