# Classificazione multiclasse Pre processing
In questa fase è stato preparato il dataset originale per la classificazione multiclasse sulla variabile `category`, senza alcuna aggregazione delle classi. Operazioni eseguite:
- Caricamento e pulizia iniziale
- Trasformazione delle variabili numeriche
- Gestione delle variabili categoriche
- Creazione dataset finale e salvataggio

Dopo il preprocessing sono stati applicati diversi modelli di classificazione per valutare le performance sulla variabile `category`, composta da 18 classi. L'obiettivo è verificare quanto bene i modelli riescano a classificare le 18 classi originali prima di effettuare l'aggregazione (che si trova in 10_preprocessing_category.ipynb). Infine, le performance dei modelli addestrati sul dataset originale e su quello con aggregazioni saranno messe a confronto, per valutare se l'aggregazione ha portato a un miglioramento. Modelli utilizzati:
- Decision tree sia con criterio gini che entropy
- Random forest
- SVM
- Naive bayes
- KNN
- ANN

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
import os
import sys
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
sys.path.append('../Scripts')
from utility import evaluate_and_save_model_multiclass

# Source CSV, destination of the train/test split, and destination of the
# fitted label encoder.
INPUT_PATH = "../data/raw/Billionaires Statistics Dataset.csv"
OUTPUT_DIR = "../data/splitted_senza_aggregazione/"
ENCODER_PATH = "../models/label_encoder_category_original.joblib"
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Make sure the encoder's parent directory exists before dumping into it.
os.makedirs(os.path.dirname(ENCODER_PATH), exist_ok=True)

df = pd.read_csv(INPUT_PATH)
df = df.drop_duplicates()

# Discard rows without a target *before* fitting the encoder, so that a
# missing value can never end up as one of the encoded classes.
target = 'category'
df = df[df[target].notna()].copy()

# gdp_country is parsed from text: strip "$" and "," and surrounding
# whitespace, then coerce to a number (unparsable values become NaN and are
# imputed after the split).
gdp_text = df['gdp_country'].astype(str).str.replace("$", "", regex=False)
gdp_text = gdp_text.str.replace(",", "", regex=False).str.strip()
df['gdp_country'] = pd.to_numeric(gdp_text, errors='coerce')

# log1p-transformed versions of finalWorth and gdp_country, plus a 0/1
# encoding of the boolean selfMade flag.
df["log_finalWorth"] = np.log1p(df["finalWorth"])
df["log_gdp_country"] = np.log1p(df["gdp_country"])
df["selfMade_encoded"] = df["selfMade"].map({True: 1, False: 0})

numerical_cols = ['selfMade_encoded', 'log_finalWorth', 'age', 'log_gdp_country', 'cpi_country',
                  'life_expectancy_country', 'gross_tertiary_education_enrollment',
                  'total_tax_rate_country', 'population_country']

categorical_cols = ['country', 'source', 'gender', 'status']

# One-hot encode the categorical columns, dropping the first level of each.
categorical_encoded = pd.get_dummies(df[categorical_cols], drop_first=True)

# Encode the target as integers and persist the encoder so predictions can
# be mapped back to category names later.
y = df[target]
le = LabelEncoder()
y_encoded = le.fit_transform(y)
joblib.dump(le, ENCODER_PATH)

X = pd.concat([df[numerical_cols], categorical_encoded], axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.3, random_state=42, stratify=y_encoded
)

# Impute missing numeric values with medians computed on the training split
# only: using medians of the full dataset would leak test-set information
# into training.
train_medians = X_train[numerical_cols].median()
X_train = X_train.copy()
X_test = X_test.copy()
X_train[numerical_cols] = X_train[numerical_cols].fillna(train_medians)
X_test[numerical_cols] = X_test[numerical_cols].fillna(train_medians)

X_train.to_csv(os.path.join(OUTPUT_DIR, "X_train.csv"), index=False)
X_test.to_csv(os.path.join(OUTPUT_DIR, "X_test.csv"), index=False)
pd.Series(y_train, name="category_encoded").to_csv(os.path.join(OUTPUT_DIR, "y_train.csv"), index=False)
pd.Series(y_test, name="category_encoded").to_csv(os.path.join(OUTPUT_DIR, "y_test.csv"), index=False)

print(f"\n Dataset pronto per la classificazione su 'category'. Salvato in '{OUTPUT_DIR}'")


 Dataset pronto per la classificazione su 'category'. Salvato in '../data/splitted_senza_aggregazione/'


In [4]:
# Reload the train/test split that was written to disk.
X_train = pd.read_csv("../data/splitted_senza_aggregazione/X_train.csv")
X_test = pd.read_csv("../data/splitted_senza_aggregazione/X_test.csv")

# The label files are read as DataFrames; flatten them to 1-D arrays.
y_train_frame = pd.read_csv("../data/splitted_senza_aggregazione/y_train.csv")
y_test_frame = pd.read_csv("../data/splitted_senza_aggregazione/y_test.csv")
y_train = y_train_frame.to_numpy().ravel()
y_test = y_test_frame.to_numpy().ravel()

# Decision tree con criterio gini
Viene utilizzato `class_weight='balanced'` per gestire le classi sbilanciate.

In [6]:
from sklearn.tree import DecisionTreeClassifier
import sys
sys.path.append('../Scripts')
from utility import evaluate_and_save_model_multiclass

# Decision tree with scikit-learn's default split criterion.
# class_weight='balanced' is used to compensate for the imbalanced classes.
model = DecisionTreeClassifier(class_weight='balanced', random_state=42).fit(X_train, y_train)

# Predictions on both splits, so train and test results can be compared.
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

# NOTE(review): evaluate_and_save_model_multiclass is a project helper from
# ../Scripts/utility; presumably it reports metrics under the results
# directory and stores the model at the given path -- confirm there.
results_dir = "../results/classification_category/senza aggregazione/decision tree"
model_path = "../models/decision_tree_category_original"
evaluate_and_save_model_multiclass(
    model,
    "decision tree category default gini",
    y_train, y_train_pred,
    y_test, y_test_pred,
    results_dir,
    model_path,
)

# Decision tree con criterio entropy

In [7]:
# Decision tree splitting on entropy; class_weight='balanced' is used to
# compensate for the imbalanced classes.
model = DecisionTreeClassifier(class_weight='balanced', criterion='entropy', random_state=42)

model.fit(X_train, y_train)

# Recompute the predictions with the entropy tree. Without this step the
# evaluation below would reuse predictions left over from a previously
# fitted model and report them under the entropy label.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# NOTE(review): evaluate_and_save_model_multiclass is a project helper from
# ../Scripts/utility; presumably it reports metrics under the results
# directory and stores the model at the given path -- confirm there.
evaluate_and_save_model_multiclass(
    model,
    "decision tree category default entropy",
    y_train,
    y_train_pred,
    y_test,
    y_test_pred,
    "../results/classification_category/senza aggregazione/decision tree",
    "../models/decision_tree_category_original_entropy"
)

# Random forest

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees; class_weight='balanced' is used to compensate
# for the imbalanced classes, random_state fixes the seed.
model = RandomForestClassifier(class_weight='balanced', n_estimators=100, random_state=42)
model = model.fit(X_train, y_train)

# Predictions on both splits, so train and test results can be compared.
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

# NOTE(review): evaluate_and_save_model_multiclass is a project helper from
# ../Scripts/utility; presumably it reports metrics under the results
# directory and stores the model at the given path -- confirm there.
results_dir = "../results/classification_category/senza aggregazione/random forest"
model_path = "../models/randomforest_original"
evaluate_and_save_model_multiclass(
    model,
    "random forest",
    y_train, y_train_pred,
    y_test, y_test_pred,
    results_dir,
    model_path,
)

# KNN

In [10]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier


# Standardise the features before KNN; the scaler is fitted on the training
# split only and then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Number of neighbours; also the value recorded with the results below.
k = 5
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train_scaled, y_train)

y_pred_train = knn.predict(X_train_scaled)
y_pred_test = knn.predict(X_test_scaled)


# NOTE(review): evaluate_and_save_model_multiclass is a project helper from
# ../Scripts/utility; presumably it reports metrics under the results
# directory and stores the model at the given path -- confirm there.
evaluate_and_save_model_multiclass(
    knn,
    "knn",
    y_train,
    y_pred_train,
    y_test,
    y_pred_test,
    "../results/classification_category/senza aggregazione/knn",
    "../models/knn_original",
    # Use k itself rather than a second literal, so the recorded value
    # cannot drift from the fitted model.
    {"k": k}
)

# Naive bayes

In [11]:
from sklearn.naive_bayes import GaussianNB

# Standardise the features: statistics are learned on the training split
# only and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Gaussian naive Bayes with default settings.
model = GaussianNB().fit(X_train_scaled, y_train)

y_pred_train, y_pred_test = model.predict(X_train_scaled), model.predict(X_test_scaled)

# NOTE(review): evaluate_and_save_model_multiclass is a project helper from
# ../Scripts/utility; presumably it reports metrics under the results
# directory and stores the model at the given path -- confirm there.
results_dir = "../results/classification_category/senza aggregazione/naibayes"
model_path = "../models/bayes_original"
evaluate_and_save_model_multiclass(
    model,
    "naive bayes",
    y_train, y_pred_train,
    y_test, y_pred_test,
    results_dir,
    model_path,
)


# ANN

In [12]:
from sklearn.neural_network import MLPClassifier

# Standardise the features: statistics are learned on the training split
# only and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hyperparameters of the multi-layer perceptron.
mlp_settings = dict(
    hidden_layer_sizes=(100, 50),  # two hidden layers: 100 and 50 units
    activation='relu',
    solver='adam',
    alpha=0.05,                    # L2 regularisation strength
    # NOTE(review): per the scikit-learn docs, learning_rate is only used
    # when solver='sgd', so it has no effect with 'adam' -- confirm intent.
    learning_rate='adaptive',
    max_iter=300,
    early_stopping=True,           # stop when the validation score stalls
    validation_fraction=0.2,       # share of training data held out for it
    random_state=42,
)
mlp = MLPClassifier(**mlp_settings).fit(X_train_scaled, y_train)

y_pred_train, y_pred_test = mlp.predict(X_train_scaled), mlp.predict(X_test_scaled)

# NOTE(review): evaluate_and_save_model_multiclass is a project helper from
# ../Scripts/utility; presumably it reports metrics under the results
# directory and stores the model at the given path -- confirm there.
results_dir = "../results/classification_category/senza aggregazione/ann"
model_path = "../models/ann_original"
evaluate_and_save_model_multiclass(
    mlp,
    "ANN",
    y_train, y_pred_train,
    y_test, y_pred_test,
    results_dir,
    model_path,
)

# SVM

In [None]:
from sklearn.svm import SVC

# Standardise the features: statistics are learned on the training split
# only and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Kernel settings, shared between the classifier and the parameters handed
# to the evaluation helper.
svm_params = {"kernel": "rbf", "gamma": "scale", "C": 1.0}

# class_weight='balanced' is used to compensate for the imbalanced classes.
model = SVC(class_weight='balanced', random_state=42, **svm_params)
model = model.fit(X_train_scaled, y_train)

y_pred_train, y_pred_test = model.predict(X_train_scaled), model.predict(X_test_scaled)

# NOTE(review): evaluate_and_save_model_multiclass is a project helper from
# ../Scripts/utility; presumably it reports metrics under the results
# directory and stores the model at the given path -- confirm there.
results_dir = "../results/classification_category/senza aggregazione/svm"
model_path = "../models/svm_original"
evaluate_and_save_model_multiclass(
    model,
    "SVM",
    y_train, y_pred_train,
    y_test, y_pred_test,
    results_dir,
    model_path,
    svm_params,
)