In [6]:
import sys
import subprocess
# Make sure plotly is importable from this kernel's environment.
# The probe has to import plotly itself: probing a different package would
# skip the install whenever that other package happens to be present.
try:
    import plotly
except ImportError:
    # Install with the interpreter running this kernel so the package lands
    # in the same environment the notebook uses.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "plotly", "--quiet"])

In [13]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.metrics import accuracy_score
from xgboost import XGBRegressor
from xgboost import XGBClassifier
import xgboost as xgb

Caricamento dataset

In [None]:
# Load the Titanic training CSV; the path is relative to the working directory.
df = pd.read_csv('titanic_train.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

Pulizia del dataset

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Fill NaNs in numeric columns with that column's median, then drop exact
# duplicate rows. Non-numeric columns are left as they are.
# NOTE(review): the medians are computed on the full frame, before the
# train/test split performed later in the notebook; the preprocessing pipeline
# also contains a median SimpleImputer, so this step may be redundant - confirm.
df = df.fillna(df.median(numeric_only=True)).drop_duplicates()

# Show the resulting column dtypes.
df.dtypes

Encoding delle variabili

In [None]:
# Disabled code: the bare string literal below is never executed. It holds an
# earlier label-encoding loop over the object-dtype columns of df.
""" for col in df.select_dtypes(include='object'):
    df[col] = LabelEncoder().fit_transform(df[col]) """

Feature Engineering

In [None]:
# Drop exact duplicate rows (a no-op when the frame is already de-duplicated).
df = df.drop_duplicates()

# Family = SibSp + Parch + 1 (the +1 presumably counts the passenger).
df['Family'] = df['SibSp'] + df['Parch'] + 1
# IsAlone is 1 when Family == 1, else 0.
df['IsAlone'] = (df['Family'] == 1).astype(int)

# Title feature: the first alphabetic word in Name that is preceded by a space
# and followed by a '.' (e.g. "Mr", "Miss").
df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
# Fold spelling variants into the common titles.
df['Title'] = df['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
# Collapse infrequent titles into a single 'Rare' bucket. replace() matches
# whole values, and the regex captures the whole word before the '.', so a
# name containing "Countess." yields 'Countess' - which the entry 'Count'
# alone would never match. 'Countess' and 'Dona' are therefore listed too;
# 'Count' is kept for backward compatibility.
rare_titles = ['Dr', 'Rev', 'Col', 'Major', 'Count', 'Countess', 'Lady', 'Sir',
               'Jonkheer', 'Don', 'Dona', 'Capt']
df['Title'] = df['Title'].replace(rare_titles, 'Rare')

# Deck feature: first character of Cabin, or 'Unknown' when Cabin is missing.
df['Deck'] = df['Cabin'].str[0].fillna('Unknown')

Gestione degli outlier (capping ai limiti IQR)

In [None]:
# Disabled code: the bare string literal below is never executed. It holds an
# earlier approach that filtered out every row having any value outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
""" num_cols = df.select_dtypes(include = 'number').columns
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1 

df = df[~((df < (Q1 - 1.5*IQR)) | (df > (Q3 + 1.5*IQR))).any(axis=1)] """

In [None]:
# Cap Fare and Age at the IQR fences: values above Q3 + 1.5*IQR are set to
# that upper fence, values below Q1 - 1.5*IQR to the lower fence; everything
# in between is left unchanged. No rows are removed.
for col in ['Fare', 'Age']:
    # Quartiles of the column as it stands now (numeric NaNs were already
    # median-filled in the cleaning cell).
    Q1, Q3 = df[col].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)

Divisione feature e target

In [None]:
# Target vector: the 'Survived' column.
y = df['Survived']
# Feature matrix: every column except the target and the three listed below.
X = df.drop(['Survived', 'PassengerId', 'Name', 'Embarked'], axis='columns')

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# List the columns currently present in df (original plus engineered ones).
df.columns

In [None]:
# Visual check for outliers.
# Only numeric feature columns are plotted: text columns (Name, Sex, Ticket,
# Cabin, Embarked) cannot be drawn as boxes, and the identifier / target
# columns (PassengerId, Survived) are not features and would distort the
# shared y-axis.
boxplot_cols = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df[boxplot_cols], ax=ax)
plt.show()

In [None]:
# Label-encode every object-dtype column of df in place (one fresh encoder
# per column).
# NOTE: X / X_train / X_test were derived from df in earlier cells, so this
# encoding only changes df itself and does not reach the model inputs.
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
for col in categorical_cols:
    df[col] = LabelEncoder().fit_transform(df[col])

# Preview the encoded frame. This must sit outside the loop and be the last
# expression of the cell, otherwise the notebook displays nothing.
df.head()

Preprocessor

In [None]:
# Split the training columns by dtype so each group gets its own transformer.
numerical_features = X_train.select_dtypes(include='number').columns.tolist()
categorical_features = X_train.select_dtypes(include='object').columns.tolist()

# Numeric branch: median imputation followed by standardisation.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: most-frequent imputation followed by dense one-hot
# encoding; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column group to its branch. Any column that appears in neither
# list is dropped.
preprocessor_raw = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='drop',
)

Modelli

In [None]:
def objective_raw(trial):
    """Optuna objective: mean cross-validated accuracy of an XGBoost pipeline.

    Samples one hyperparameter configuration from ``trial``, wraps an
    ``XGBClassifier`` with the shared ``preprocessor_raw`` in a pipeline, and
    scores it on ``X_train`` / ``y_train`` with shuffled 5-fold stratified
    cross-validation.

    Args:
        trial: the Optuna trial used to draw hyperparameter values.

    Returns:
        The mean accuracy across the five folds.
    """
    # Hyperparameters explored by the study (suggestion order is preserved).
    tuned = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 600),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
    }
    # Settings held constant for every trial.
    fixed = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'n_jobs': -1,
        'random_state': 42,
    }

    pipeline = Pipeline([
        ('preprocessor', preprocessor_raw),
        ('model', xgb.XGBClassifier(**tuned, **fixed)),
    ])

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_accuracy = cross_val_score(
        pipeline, X_train, y_train, cv=folds, scoring='accuracy'
    )
    return fold_accuracy.mean()


# Run the hyperparameter search, maximising the CV accuracy returned by
# objective_raw. The sampler is seeded so that re-running the notebook
# explores the same sequence of configurations; TPE is Optuna's default
# sampler, so only the seeding differs from create_study's default.
study_raw = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_raw.optimize(objective_raw, n_trials=50, show_progress_bar=True)

print(f"\nBest CV Accuracy: {study_raw.best_value:.4f}")
print("Best params:", study_raw.best_params)

In [None]:
# ================================
# FINAL RAW-MODEL TRAINING
# ================================

# Hyperparameters selected by the Optuna study (only the tuned ones).
best_params_raw = study_raw.best_params

# Rebuild the pipeline with the winning hyperparameters plus the same fixed
# settings that were used inside the objective, so the final model matches
# what was cross-validated. `use_label_encoder` is not passed: it is
# deprecated in XGBoost and no longer accepted as a model option.
final_pipeline_raw = Pipeline(steps=[
    ('preprocessor', preprocessor_raw),
    ('model', xgb.XGBClassifier(
        **best_params_raw,
        objective='binary:logistic',
        eval_metric='logloss',
        n_jobs=-1,
        random_state=42,
    ))
])

# Fit on the training split.
final_pipeline_raw.fit(X_train, y_train)

# Predict on the held-out test split.
y_pred_raw = final_pipeline_raw.predict(X_test)

# Test-set accuracy.
acc_raw = accuracy_score(y_test, y_pred_raw)

# ================================
# PRINT RESULTS
# ================================
# NOTE(review): hard-coded reference accuracy; presumably taken from an
# earlier run of the feature-engineering model - confirm its provenance.
prev_fe_acc = 0.8400

print("\n" + "="*50)
print(" RISULTATI DELLO STUDIO DI ABLAZIONE ")
print("="*50)
print(f"Modello con Feature Engineering (Prev.):  {prev_fe_acc:.4f}")
print(f"Modello su Dati Raw (Attuale):           {acc_raw:.4f}")
print("-" * 50)
# Positive delta: the raw model is worse than the reference ("persi").
# Negative delta: it is better, so report the gain instead of a negative loss.
delta = (prev_fe_acc - acc_raw) * 100
delta_label = "persi" if delta >= 0 else "guadagnati"
print(f"Delta Performance: {abs(delta):.2f} punti percentuali {delta_label}")
print("="*50)

In [None]:
### A. Confusion matrix
from sklearn.metrics import confusion_matrix

print("\nGenerazione Matrice di Confusione...")
cm = confusion_matrix(y_test, y_pred_raw)

# Same tick labels on both axes: rows are the true classes, columns the predictions.
class_labels = ['Deceduto (0)', 'Sopravvissuto (1)']

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title("Matrice di Confusione")
ax.set_ylabel("Valori Reali")
ax.set_xlabel("Predizioni del Modello")
plt.show()

In [None]:
## B. Learning curve
print("\nGenerazione Curva di Apprendimento (Diagnosi Bias/Varianza)...")
# NOTE(review): the broad except below reports any failure (computation or
# plotting) as a printed message instead of raising, so the cell never stops
# the notebook.
try:
    # Score the final pipeline at 5 training-set fractions from 10% to 100%,
    # each with shuffled 5-fold stratified CV.
    train_sizes, train_scores, test_scores = learning_curve(
        estimator=final_pipeline_raw,
        X=X_train,
        y=y_train,
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        n_jobs=-1,
        train_sizes=np.linspace(0.1, 1.0, 5),
        scoring='accuracy',
    )

    # Average over the folds (axis 1) to get one score per training size.
    train_scores_mean = train_scores.mean(axis=1)
    test_scores_mean = test_scores.mean(axis=1)

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Punteggio Training")
    ax.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Punteggio Cross-Validation")
    ax.set_title("Curva di Apprendimento (Diagnosi Performance)")
    ax.set_xlabel("Dimensione del Set di Addestramento")
    ax.set_ylabel("Accuratezza")
    ax.grid()
    ax.legend(loc="best")
    plt.show()

except Exception as e:
    print(f"Errore nella Curva di Apprendimento: {e}")