In [None]:
# IMPORT LIBRARIES
# 📦 Import principali librerie
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# 📦 Scikit-learn per modellazione e metriche
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, 
    f1_score, confusion_matrix, classification_report, 
    precision_recall_curve, average_precision_score
)

# 📦 Scikit-learn per modelli di classificazione
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier, VotingClassifier, 
    StackingClassifier
)
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.inspection import permutation_importance

# 📦 XGBoost
import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor

# 📦 TensorFlow per modelli di rete neurale
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# 📦 IPython Widgets (per interfacce interattive)
import ipywidgets as widgets
from ipywidgets import Layout
from IPython.display import display, clear_output

# 📊 Tabulate per visualizzare tabelle
from tabulate import tabulate

import requests
import io
from pyexcel_xls import get_data 

In [None]:
# 1) Download the .xls workbook from GitHub.
url = "https://raw.githubusercontent.com/aporrini/Financial-Product-Recommender/main/Dataset2_Needs.xls"
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as a workbook.
response = requests.get(url, timeout=30)
response.raise_for_status()
xls_bytes = response.content

# 2) Parse the whole workbook in memory with pyexcel-xls.
workbook = get_data(io.BytesIO(xls_bytes))

# 3) Turn the "Needs" and "Products" sheets into DataFrames
#    (first row = column names, remaining rows = data).
needs_df = pd.DataFrame(workbook['Needs'][1:],    columns=workbook['Needs'][0])
products_df = pd.DataFrame(workbook['Products'][1:], columns=workbook['Products'][0])

# 4) Metadata sheet: the first row with at least two cells is used as header.
meta_rows = workbook['Metadata']
header_idx = next(i for i, row in enumerate(meta_rows) if len(row) > 1)
meta_header = meta_rows[header_idx]
meta_data   = meta_rows[header_idx+1:]

# Build the DataFrame and explicitly rename its first two columns, whatever
# their original names are, so later lookups can rely on these two names.
metadata_df = pd.DataFrame(meta_data, columns=meta_header)
metadata_df = metadata_df.rename(
    columns={
        metadata_df.columns[0]: 'Metadata',
        metadata_df.columns[1]: 'Unnamed: 1'
    }
)

# 5) Cleanup: drop the 'ID' column from needs_df when it is present.
if 'ID' in needs_df.columns:
    needs_df = needs_df.drop(columns='ID')

# 6) Quick sanity check of the three shapes.
print(needs_df.shape, products_df.shape, metadata_df.shape)

In [None]:
def create_variable_summary(df, metadata_df):
    """Build a one-row-per-column summary table for ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose columns are summarised.
    metadata_df : pandas.DataFrame
        Lookup table; its 'Metadata' column holds variable names and its
        'Unnamed: 1' column the matching descriptions.

    Returns
    -------
    pandas.DataFrame
        Columns 'Variable', 'Description', 'Mean', 'Std', 'Missing', 'Min'
        and 'Max'. Numeric statistics are strings with two decimals; for
        non-numeric columns (and unknown descriptions) the value is 'N/A'.
    """
    layout = ['Variable', 'Description', 'Mean', 'Std', 'Missing', 'Min', 'Max']
    placeholder = 'N/A'

    # Variable name -> description
    descriptions = dict(zip(metadata_df['Metadata'], metadata_df['Unnamed: 1']))

    records = []
    for name in df.columns:
        series = df[name]

        if pd.api.types.is_numeric_dtype(series):
            mean_txt = f"{series.mean():.2f}"
            std_txt = f"{series.std():.2f}"
            min_txt = f"{series.min():.2f}"
            max_txt = f"{series.max():.2f}"
        else:
            mean_txt = std_txt = min_txt = max_txt = placeholder

        records.append({
            'Variable': name,
            'Description': descriptions.get(name, placeholder),
            'Mean': mean_txt,
            'Std': std_txt,
            'Missing': series.isna().sum(),
            'Min': min_txt,
            'Max': max_txt,
        })

    # Pivot the per-variable records into one list per output column.
    return pd.DataFrame({key: [rec[key] for rec in records] for key in layout})


# Build and render one summary table per sheet; each table is shown
# left-aligned and without its index column.
print("NEEDS VARIABLES SUMMARY:")
needs_summary = create_variable_summary(needs_df, metadata_df)
needs_styled = needs_summary.style.set_properties(**{'text-align': 'left'})
display(needs_styled.hide(axis='index'))

print("\nPRODUCTS VARIABLES SUMMARY:")
products_summary = create_variable_summary(products_df, metadata_df)
products_styled = products_summary.style.set_properties(**{'text-align': 'left'})
display(products_styled.hide(axis='index'))


In [None]:
# Step 1: Feature engineering and transformation function
def prepare_features(df):
    """Derive engineered columns from ``df`` and min-max scale two feature sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Wealth', 'Income ' (note the trailing space), 'Age',
        'Gender', 'FamilyMembers', 'FinancialEducation' and 'RiskPropensity'.
        It is copied, not modified.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_base, X_engineered)``: the seven base features, and the base
        features plus five engineered ones. Both are scaled to [0, 1] and
        carry a fresh default index.

    Notes
    -----
    The scaler is fitted on every row of ``df``. NOTE(review): in this
    notebook the train/test split happens afterwards, so the scaling
    statistics include the test rows.
    """
    X = df.copy()

    # Log transformation for Wealth and Income
    X['Wealth_log'] = np.log1p(X['Wealth'])
    X['Income_log'] = np.log1p(X['Income '])

    # Income-to-wealth ratio: zero wealth is mapped to NaN to avoid dividing by
    # zero, and missing ratios are then replaced by the largest income.
    X['Income_Wealth_Ratio'] = X['Income '] / X['Wealth'].replace(0, np.nan)
    X['Income_Wealth_Ratio_log'] = np.log1p(X['Income_Wealth_Ratio'].fillna(X['Income '].max()))

    # Binary indicators and one interaction term
    X['Is_Single'] = (X['FamilyMembers'] == 1).astype(int)
    X['Is_Senior'] = (X['Age'] > 65).astype(int)
    X['Has_Education'] = (X['FinancialEducation'] > 0).astype(int)
    X['Risk_Age_Interaction'] = X['RiskPropensity'] * X['Age']

    features_base = ['Age', 'Gender', 'FamilyMembers', 'FinancialEducation',
                     'RiskPropensity', 'Wealth_log', 'Income_log']

    features_engineered = features_base + [
        'Income_Wealth_Ratio_log',
        'Is_Single',
        'Is_Senior',
        'Has_Education',
        'Risk_Age_Interaction'
    ]

    # The same scaler object is refitted for each feature set.
    scaler = MinMaxScaler()
    X_base = pd.DataFrame(scaler.fit_transform(X[features_base]), columns=features_base)
    X_engineered = pd.DataFrame(scaler.fit_transform(X[features_engineered]), columns=features_engineered)

    return X_base, X_engineered

# Step 2: Data split function
def split_data(X, y, test_size=0.2, random_state=42):
    """Split ``X`` and ``y`` into train and test parts, stratified on ``y``.

    Returns the tuple ``(X_train, X_test, y_train, y_test)``.
    """
    parts = train_test_split(
        X,
        y,
        stratify=y,
        test_size=test_size,
        random_state=random_state,
    )
    return tuple(parts)

# Step 3: Model training and evaluation function
def train_evaluate_model(X_train, y_train, X_test, y_test, model, k_folds=5):
    """Cross-validate ``model`` on the training data, then score it on the test set.

    Parameters
    ----------
    X_train, y_train : pandas objects
        Training features and labels; they are indexed positionally with ``.iloc``.
    X_test, y_test
        Hold-out features and labels.
    model
        Estimator exposing ``fit`` and ``predict``. It is fitted in place: once
        per fold and finally on the whole training set.
    k_folds : int, default 5
        Number of shuffled K-fold splits (fixed ``random_state=42``).

    Returns
    -------
    dict
        ``{'cv_metrics': {metric: {'mean': ..., 'std': ...}}, 'test_metrics': {metric: ...}}``
        for the metrics 'accuracy', 'precision', 'recall' and 'f1'.
    """
    scorers = {
        'accuracy': accuracy_score,
        'precision': precision_score,
        'recall': recall_score,
        'f1': f1_score,
    }
    fold_scores = {name: [] for name in scorers}

    splitter = KFold(n_splits=k_folds, shuffle=True, random_state=42)
    for fit_rows, val_rows in splitter.split(X_train):
        model.fit(X_train.iloc[fit_rows], y_train.iloc[fit_rows])

        val_truth = y_train.iloc[val_rows]
        val_pred = model.predict(X_train.iloc[val_rows])
        for name, scorer in scorers.items():
            fold_scores[name].append(scorer(val_truth, val_pred))

    # Final fit on the complete training set before touching the test set.
    model.fit(X_train, y_train)
    test_pred = model.predict(X_test)

    cv_summary = {
        name: {'mean': np.mean(values), 'std': np.std(values)}
        for name, values in fold_scores.items()
    }
    test_summary = {name: scorer(y_test, test_pred) for name, scorer in scorers.items()}

    return {'cv_metrics': cv_summary, 'test_metrics': test_summary}

# Step 4: Display results function
def display_results_table(results_dict, model_name, feature_type):
    """Print the CV and test metrics of one model as a formatted table.

    ``results_dict`` is expected to have the layout returned by
    ``train_evaluate_model``; ``model_name`` and ``feature_type`` only appear
    in the printed heading. Values are rounded to three decimals.
    """
    metric_keys = ['accuracy', 'precision', 'recall', 'f1']
    cv_part = results_dict['cv_metrics']
    test_part = results_dict['test_metrics']

    table = pd.DataFrame({
        'Metric': ['Accuracy', 'Precision', 'Recall', 'F1'],
        'CV Mean': [cv_part[key]['mean'] for key in metric_keys],
        'CV Std': [cv_part[key]['std'] for key in metric_keys],
        'Test Set': [test_part[key] for key in metric_keys],
    }).round(3)

    print(f"\n{model_name} - {feature_type}")
    print("=" * 60)
    print(tabulate(table, headers='keys', tablefmt='pretty'))


In [None]:
# Step 5: Run full analysis
# Feature matrices and the two binary targets
X_base, X_engineered = prepare_features(needs_df)
y_income = needs_df['IncomeInvestment']
y_accum = needs_df['AccumulationInvestment']

# Candidate models, keyed by the name shown in the result tables
models = dict(
    SVM=SVC(),
    NaiveBayes=GaussianNB(),
    DecisionTree=DecisionTreeClassifier(random_state=42),
)

# Evaluate every model on both feature sets, one target at a time
targets = (
    ('Income Investment', y_income),
    ('Accumulation Investment', y_accum),
)
for target_name, y in targets:
    print(f"\nTarget Variable: {target_name}")
    print("=" * 80)

    # Both calls use the same target and split settings.
    X_base_train, X_base_test, y_train, y_test = split_data(X_base, y)
    X_eng_train, X_eng_test, _, _ = split_data(X_engineered, y)

    for model_name, model in models.items():
        results_base = train_evaluate_model(
            X_base_train, y_train, X_base_test, y_test, model
        )
        display_results_table(results_base, model_name, "Base Features")

        results_eng = train_evaluate_model(
            X_eng_train, y_train, X_eng_test, y_test, model
        )
        display_results_table(results_eng, model_name, "Engineered Features")

In [None]:
# --- Feature Engineering Avanzato ---
def prepare_features_advanced(df):
    """Return min-max scaled base and engineered feature sets for ``df``.

    Column names are stripped of surrounding whitespace first, so the income
    column is read as 'Income'. ``df`` is not modified.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_base, X_engineered)``, each scaled to [0, 1] with a fresh default index.
    """
    frame = df.copy()
    frame.columns = frame.columns.str.strip()

    income = frame['Income']
    wealth = frame['Wealth']

    # Zero wealth becomes NaN before dividing; every missing ratio is then
    # replaced by the largest income in the data.
    ratio = income / wealth.replace(0, np.nan)

    frame = frame.assign(
        Wealth_log=np.log1p(wealth),
        Income_log=np.log1p(income),
        Income_Wealth_Ratio=ratio,
        Income_Wealth_Ratio_log=np.log1p(ratio.fillna(income.max())),
        Is_Single=frame['FamilyMembers'].eq(1).astype(int),
        Is_Senior=frame['Age'].gt(65).astype(int),
        Has_Education=frame['FinancialEducation'].gt(0).astype(int),
        Risk_Age_Interaction=frame['RiskPropensity'] * frame['Age'],
    )

    base_cols = ['Age', 'Gender', 'FamilyMembers', 'FinancialEducation',
                 'RiskPropensity', 'Wealth_log', 'Income_log']
    engineered_cols = base_cols + ['Income_Wealth_Ratio_log', 'Is_Single', 'Is_Senior',
                                   'Has_Education', 'Risk_Age_Interaction']

    def _scaled(columns):
        # Each feature set is scaled independently of the other.
        values = MinMaxScaler().fit_transform(frame[columns])
        return pd.DataFrame(values, columns=columns)

    return _scaled(base_cols), _scaled(engineered_cols)

# --- Build the feature matrices and targets ---
X_base, X_engineered = prepare_features_advanced(needs_df)
y_income = needs_df['IncomeInvestment']
y_accum = needs_df['AccumulationInvestment']

# --- Model definitions ---
models = dict(
    XGBoost=xgb.XGBClassifier(random_state=42, eval_metric='logloss', use_label_encoder=False),
    RandomForest=RandomForestClassifier(random_state=42),
)

# --- Full analysis: every model, both feature sets, one target at a time ---
targets = (
    ('Income Investment', y_income),
    ('Accumulation Investment', y_accum),
)
for target_name, y in targets:
    print(f"\n📈 Target Variable: {target_name}")
    print("=" * 90)

    # Both calls use the same target and split settings.
    X_base_train, X_base_test, y_train, y_test = split_data(X_base, y)
    X_eng_train, X_eng_test, _, _ = split_data(X_engineered, y)

    for model_name, model in models.items():
        results_base = train_evaluate_model(
            X_base_train, y_train, X_base_test, y_test, model
        )
        display_results_table(results_base, model_name, "Base Features")

        results_eng = train_evaluate_model(
            X_eng_train, y_train, X_eng_test, y_test, model
        )
        display_results_table(results_eng, model_name, "Engineered Features")


MULTI_OUTPUT CLASSIFIER

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Encode each row's pair of targets as an "accumulation,income" string
accumulation_txt = needs_df['AccumulationInvestment'].astype(str)
income_txt = needs_df['IncomeInvestment'].astype(str)
needs_df['Combo'] = accumulation_txt + ',' + income_txt

# Count how often each combination occurs, ordered by combination
combo_counts = needs_df['Combo'].value_counts().sort_index()
combo_df = (
    combo_counts
    .rename_axis('Combination')
    .reset_index(name='Count')
)

# Human-readable label for every combination
label_map = {
    '0,0': '(0,0) - Nessun investimento',
    '0,1': '(0,1) - Solo Income',
    '1,0': '(1,0) - Solo Accumulation',
    '1,1': '(1,1) - Entrambi',
}
combo_df['Label'] = combo_df['Combination'].map(label_map)

# Bar chart of the combination counts
plt.figure(figsize=(8,5))
sns.barplot(x='Label', y='Count', data=combo_df, palette='viridis')
plt.title('Distribuzione delle combinazioni di investimento')
plt.xlabel('Combinazione')
plt.ylabel('Numero di osservazioni')
plt.xticks(rotation=20)
plt.tight_layout()
plt.show()

# (Optional) print the same counts as a table
print(combo_df[['Label', 'Count']])


In [None]:
# --- Funzione per il feature engineering (copiata dal tuo codice) ---
def prepare_features(df):
    """Return the engineered feature set of ``df``, min-max scaled to [0, 1].

    Reads 'Wealth', 'Income ' (note the trailing space), 'Age', 'Gender',
    'FamilyMembers', 'FinancialEducation' and 'RiskPropensity'. ``df`` is
    copied, not modified; the result carries a fresh default index.
    """
    X = df.copy()
    X['Wealth_log'] = np.log1p(X['Wealth'])
    X['Income_log'] = np.log1p(X['Income '])
    # Zero wealth is mapped to NaN to avoid dividing by zero; missing ratios are
    # then replaced by the largest income.
    X['Income_Wealth_Ratio'] = X['Income '] / X['Wealth'].replace(0, np.nan)
    X['Income_Wealth_Ratio_log'] = np.log1p(X['Income_Wealth_Ratio'].fillna(X['Income '].max()))
    X['Is_Single'] = (X['FamilyMembers'] == 1).astype(int)
    X['Is_Senior'] = (X['Age'] > 65).astype(int)
    X['Has_Education'] = (X['FinancialEducation'] > 0).astype(int)
    X['Risk_Age_Interaction'] = X['RiskPropensity'] * X['Age']

    features_base = ['Age', 'Gender', 'FamilyMembers', 'FinancialEducation',
                     'RiskPropensity', 'Wealth_log', 'Income_log']

    features_engineered = features_base + [
        'Income_Wealth_Ratio_log',
        'Is_Single',
        'Is_Senior',
        'Has_Education',
        'Risk_Age_Interaction'
    ]

    # MinMaxScaler comes from the notebook's top-level import cell.
    scaler = MinMaxScaler()
    X_engineered = pd.DataFrame(scaler.fit_transform(X[features_engineered]), columns=features_engineered)

    return X_engineered

# --- Funzione per plottare confusion matrix ---
def plot_confusion_matrices(y_true, y_pred, target_names):
    """Draw one confusion-matrix heatmap per target.

    ``y_true`` is indexed with ``.iloc[:, i]`` and ``y_pred`` with ``[:, i]``,
    where ``i`` is the position of the target in ``target_names``. Each target
    gets its own figure, shown immediately.
    """
    binary_ticks = ['0', '1']
    for position, target in enumerate(target_names):
        matrix = confusion_matrix(y_true.iloc[:, position], y_pred[:, position])

        plt.figure(figsize=(5, 4))
        sns.heatmap(
            matrix,
            annot=True,
            fmt='d',
            cmap='Blues',
            cbar=False,
            xticklabels=binary_ticks,
            yticklabels=binary_ticks,
        )
        plt.title(f"Confusion Matrix - {target}")
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.tight_layout()
        plt.show()

# --- Funzione di valutazione F1 medio ---
def evaluate_test_f1(y_true, y_pred):
    """Return the F1 score of each of the two targets and their mean.

    Column 0 is reported as 'Accumulation F1' and column 1 as 'Income F1';
    'Macro Avg F1' is the plain mean of the two.
    """
    per_target = [
        f1_score(y_true.iloc[:, column], y_pred[:, column])
        for column in (0, 1)
    ]
    accumulation_f1, income_f1 = per_target
    return {
        'Accumulation F1': accumulation_f1,
        'Income F1': income_f1,
        'Macro Avg F1': np.mean(per_target),
    }

# --- Prepare the data ---
X_engineered = prepare_features(needs_df)
y = needs_df[['AccumulationInvestment', 'IncomeInvestment']]
X_train, X_test, y_train, y_test = train_test_split(
    X_engineered,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y['AccumulationInvestment'],  # stratified on the accumulation target only
)

# --- Train and predict ---
forest = RandomForestClassifier(n_estimators=100, random_state=42)
model = MultiOutputClassifier(forest)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# --- Show the confusion matrices ---
plot_confusion_matrices(y_test, y_pred, ['AccumulationInvestment', 'IncomeInvestment'])

# --- Show the per-target and mean F1 scores ---
f1_results = evaluate_test_f1(y_test, y_pred)
print("\nTest F1 Scores:")
for k, v in f1_results.items():
    print(f"{k}: {v:.3f}")

# --- (Optional) textual classification reports ---
print("\nClassification Reports:")
for i, col in enumerate(y.columns):
    print(f"\n{col}")
    print(classification_report(y_test[col], y_pred[:, i]))


In [None]:
# --- Feature engineering ---
def prepare_features(df):
    """Return the twelve model features derived from ``df``, min-max scaled.

    Reads 'Wealth', 'Income ' (note the trailing space), 'Age', 'Gender',
    'FamilyMembers', 'FinancialEducation' and 'RiskPropensity'. ``df`` is not
    modified; the returned DataFrame carries a fresh default index.
    """
    income = df['Income ']
    wealth = df['Wealth']

    # Zero wealth becomes NaN before dividing; every missing ratio is then
    # replaced by the largest income in the data.
    ratio = income / wealth.replace(0, np.nan)

    enriched = df.assign(
        Wealth_log=np.log1p(wealth),
        Income_log=np.log1p(income),
        Income_Wealth_Ratio_log=np.log1p(ratio.fillna(income.max())),
        Is_Single=df['FamilyMembers'].eq(1).astype(int),
        Is_Senior=df['Age'].gt(65).astype(int),
        Has_Education=df['FinancialEducation'].gt(0).astype(int),
        Risk_Age_Interaction=df['RiskPropensity'] * df['Age'],
    )

    selected = [
        'Age', 'Gender', 'FamilyMembers', 'FinancialEducation',
        'RiskPropensity', 'Wealth_log', 'Income_log',
        'Income_Wealth_Ratio_log', 'Is_Single', 'Is_Senior',
        'Has_Education', 'Risk_Age_Interaction',
    ]
    scaled = MinMaxScaler().fit_transform(enriched[selected])
    return pd.DataFrame(scaled, columns=selected)

# --- Encode target ---
def encode_target(y_df):
    """Collapse the two binary targets into a single class id.

    The id is ``2 * AccumulationInvestment + IncomeInvestment``, so 0/1 targets
    map to 0 = (0,0), 1 = (0,1), 2 = (1,0) and 3 = (1,1).
    """
    accumulation = y_df['AccumulationInvestment']
    income = y_df['IncomeInvestment']
    return accumulation * 2 + income

# --- Prepare the data ---
X = prepare_features(needs_df)
y_encoded = encode_target(needs_df[['AccumulationInvestment', 'IncomeInvestment']])
y_cat = to_categorical(y_encoded, num_classes=4)  # one-hot over the 4 combinations

X_train, X_test, y_train, y_test = train_test_split(
    X, y_cat, test_size=0.2, random_state=42
)

# --- Network: three hidden blocks followed by a 4-way softmax ---
network_layers = [
    Input(shape=(X.shape[1],)),

    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    Dense(32, activation='relu'),
    Dropout(0.2),

    Dense(4, activation='softmax'),
]
model = Sequential(network_layers)

model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Stop after 5 epochs without improvement and roll back to the best weights.
early_stop = EarlyStopping(patience=5, restore_best_weights=True)

# 20% of the training rows are held out for validation during fitting.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=32,
    callbacks=[early_stop],
)


# --- Plot the training and validation loss curves ---
plt.figure(figsize=(8,5))
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# --- Predictions: most probable class vs. the one-hot encoded truth ---
y_pred_probs = model.predict(X_test)
y_pred_classes = np.argmax(y_pred_probs, axis=1)
y_test_classes = np.argmax(y_test, axis=1)

# --- Class labels, listed in class-id order ---
labels = ['(0,0)', '(0,1)', '(1,0)', '(1,1)']
# Passing the class ids explicitly keeps the matrix 4x4 and the report aligned
# with `labels` even when a class is absent from both y_test and the
# predictions; without it the tick labels shift and classification_report
# raises on the target_names length mismatch.
class_ids = list(range(len(labels)))

# --- Confusion matrix ---
cm = confusion_matrix(y_test_classes, y_pred_classes, labels=class_ids)
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
plt.title("Confusion Matrix (Output Combinations)")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.tight_layout()
plt.show()

# --- Classification report with readable class names ---
print(classification_report(y_test_classes, y_pred_classes,
                            labels=class_ids, target_names=labels))


In [None]:
# Correlation between the two target columns
accumulation_target = needs_df['AccumulationInvestment']
income_target = needs_df['IncomeInvestment']
corr = accumulation_target.corr(income_target)
print(f"Correlazione tra Accumulation e Income: {corr:.3f}")


Classificatori separati - Voting Classifier

In [None]:
# 🛠️ FUNZIONE DI FEATURE ENGINEERING
def prepare_features(df):
    """Build the min-max scaled feature matrix used by the voting classifiers.

    Reads 'Wealth', 'Income ' (note the trailing space), 'Age', 'Gender',
    'FamilyMembers', 'FinancialEducation' and 'RiskPropensity'. ``df`` is not
    modified; the returned DataFrame carries a fresh default index.
    """
    income = df['Income ']
    wealth = df['Wealth']

    # Zero wealth becomes NaN before dividing; every missing ratio is then
    # replaced by the largest income in the data.
    ratio = income / wealth.replace(0, np.nan)

    enriched = df.assign(
        Wealth_log=np.log1p(wealth),
        Income_log=np.log1p(income),
        Income_Wealth_Ratio_log=np.log1p(ratio.fillna(income.max())),
        Is_Single=df['FamilyMembers'].eq(1).astype(int),
        Is_Senior=df['Age'].gt(65).astype(int),
        Has_Education=df['FinancialEducation'].gt(0).astype(int),
        Risk_Age_Interaction=df['RiskPropensity'] * df['Age'],
    )

    selected = [
        'Age', 'Gender', 'FamilyMembers', 'FinancialEducation',
        'RiskPropensity', 'Wealth_log', 'Income_log',
        'Income_Wealth_Ratio_log', 'Is_Single', 'Is_Senior',
        'Has_Education', 'Risk_Age_Interaction',
    ]
    scaled = MinMaxScaler().fit_transform(enriched[selected])
    return pd.DataFrame(scaled, columns=selected)

# 📥 CARICAMENTO DATI: sostituisci questa parte con il tuo file CSV o DataFrame
# needs_df = pd.read_csv('/path/to/your/data.csv')
# Per test temporaneo:
# from sklearn.datasets import make_classification
# needs_df = pd.DataFrame(...)

# 🧠 DATA PREPARATION
X = prepare_features(needs_df)
y_acc = needs_df['AccumulationInvestment']
y_inc = needs_df['IncomeInvestment']

# One stratified split per target
split_options = dict(test_size=0.2, random_state=42)
X_train_a, X_test_a, y_train_a, y_test_a = train_test_split(X, y_acc, stratify=y_acc, **split_options)
X_train_i, X_test_i, y_train_i, y_test_i = train_test_split(X, y_inc, stratify=y_inc, **split_options)


def _build_soft_voter():
    # Fresh soft-voting ensemble: random forest + logistic regression + naive Bayes.
    return VotingClassifier(
        estimators=[
            ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
            ('lr', LogisticRegression(max_iter=1000)),
            ('nb', GaussianNB()),
        ],
        voting='soft',
    )


# 🧩 VOTING CLASSIFIERS (one per target, same configuration)
voting_acc = _build_soft_voter()
voting_inc = _build_soft_voter()

# 🏋️‍♂️ TRAINING
voting_acc.fit(X_train_a, y_train_a)
voting_inc.fit(X_train_i, y_train_i)

# 🔮 PREDICTION
y_pred_acc = voting_acc.predict(X_test_a)
y_pred_inc = voting_inc.predict(X_test_i)

# 🎯 F1-score on class 1 only
f1_acc = f1_score(y_test_a, y_pred_acc, pos_label=1)
f1_inc = f1_score(y_test_i, y_pred_inc, pos_label=1)
macro_f1 = np.mean([f1_acc, f1_inc])

print("F1-score Accumulation (class 1):", round(f1_acc, 3))
print("F1-score Income (class 1):", round(f1_inc, 3))
print("Macro F1-score (class 1):", round(macro_f1, 3))

# 📊 CONFUSION MATRICES (side by side)
fig, axs = plt.subplots(1, 2, figsize=(12, 5))
panels = [
    (axs[0], y_test_a, y_pred_acc, 'Blues', "Confusion Matrix - Accumulation"),
    (axs[1], y_test_i, y_pred_inc, 'Greens', "Confusion Matrix - Income"),
]
for panel_ax, truth, predicted, colormap, panel_title in panels:
    sns.heatmap(confusion_matrix(truth, predicted), annot=True, fmt='d', cmap=colormap, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Predicted")
    panel_ax.set_ylabel("Actual")

plt.tight_layout()
plt.show()

# 📄 CLASSIFICATION REPORTS (optional)
print("\n--- Classification Report: AccumulationInvestment ---")
print(classification_report(y_test_a, y_pred_acc))
print("\n--- Classification Report: IncomeInvestment ---")
print(classification_report(y_test_i, y_pred_inc))


In [None]:
# ⚙️ FEATURE ENGINEERING
def prepare_features(df):
    """Derive the engineered columns from ``df`` and return them min-max scaled.

    Reads 'Wealth', 'Income ' (note the trailing space), 'Age', 'Gender',
    'FamilyMembers', 'FinancialEducation' and 'RiskPropensity'. ``df`` is not
    modified; the returned DataFrame carries a fresh default index.
    """
    income = df['Income ']
    wealth = df['Wealth']

    # Zero wealth becomes NaN before dividing; every missing ratio is then
    # replaced by the largest income in the data.
    ratio = income / wealth.replace(0, np.nan)

    enriched = df.assign(
        Wealth_log=np.log1p(wealth),
        Income_log=np.log1p(income),
        Income_Wealth_Ratio_log=np.log1p(ratio.fillna(income.max())),
        Is_Single=df['FamilyMembers'].eq(1).astype(int),
        Is_Senior=df['Age'].gt(65).astype(int),
        Has_Education=df['FinancialEducation'].gt(0).astype(int),
        Risk_Age_Interaction=df['RiskPropensity'] * df['Age'],
    )

    selected = [
        'Age', 'Gender', 'FamilyMembers', 'FinancialEducation',
        'RiskPropensity', 'Wealth_log', 'Income_log',
        'Income_Wealth_Ratio_log', 'Is_Single', 'Is_Senior',
        'Has_Education', 'Risk_Age_Interaction',
    ]
    scaled = MinMaxScaler().fit_transform(enriched[selected])
    return pd.DataFrame(scaled, columns=selected)

# 🧠 DATA PREPARATION
X = prepare_features(needs_df)
y_acc = needs_df['AccumulationInvestment']
y_inc = needs_df['IncomeInvestment']

# One stratified split per target
X_train_a, X_test_a, y_train_a, y_test_a = train_test_split(X, y_acc, test_size=0.2, random_state=42, stratify=y_acc)
X_train_i, X_test_i, y_train_i, y_test_i = train_test_split(X, y_inc, test_size=0.2, random_state=42, stratify=y_inc)

# 🧩 BASE MODELS
rf = RandomForestClassifier(n_estimators=100, random_state=42)
# Deliberately not named `xgb`: that name is the alias of the xgboost module
# imported at the top of the notebook, and rebinding it to an estimator would
# break every later `xgb.XGBClassifier(...)` call when cells are re-run.
xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
knn = KNeighborsClassifier(n_neighbors=5)

# ✅ VotingClassifier for ACCUMULATION
voting_acc = VotingClassifier(
    estimators=[('rf', rf), ('xgb', xgb_clf), ('knn', knn)],
    voting='soft'
)

# ✅ VotingClassifier for INCOME
voting_inc = VotingClassifier(
    estimators=[('rf', rf), ('xgb', xgb_clf), ('knn', knn)],
    voting='soft'
)

# 🏋️‍♂️ TRAINING
voting_acc.fit(X_train_a, y_train_a)
voting_inc.fit(X_train_i, y_train_i)

# 🔮 PREDICT
y_pred_acc = voting_acc.predict(X_test_a)
y_pred_inc = voting_inc.predict(X_test_i)

# 🎯 F1 METRICS on class 1
f1_acc = f1_score(y_test_a, y_pred_acc, pos_label=1)
f1_inc = f1_score(y_test_i, y_pred_inc, pos_label=1)
macro_f1 = np.mean([f1_acc, f1_inc])

print("F1 Accumulation (class 1):", round(f1_acc, 3))
print("F1 Income (class 1):", round(f1_inc, 3))
print("Macro F1 (class 1):", round(macro_f1, 3))

# 📊 CONFUSION MATRICES
fig, axs = plt.subplots(1, 2, figsize=(12, 5))

sns.heatmap(confusion_matrix(y_test_a, y_pred_acc), annot=True, fmt='d', cmap='Blues', ax=axs[0])
axs[0].set_title("Confusion Matrix - Accumulation")
axs[0].set_xlabel("Predicted")
axs[0].set_ylabel("Actual")

sns.heatmap(confusion_matrix(y_test_i, y_pred_inc), annot=True, fmt='d', cmap='Greens', ax=axs[1])
axs[1].set_title("Confusion Matrix - Income")
axs[1].set_xlabel("Predicted")
axs[1].set_ylabel("Actual")

plt.tight_layout()
plt.show()

# 📄 CLASSIFICATION REPORT (optional)
print("\n--- Classification Report: AccumulationInvestment ---")
print(classification_report(y_test_a, y_pred_acc))
print("\n--- Classification Report: IncomeInvestment ---")
print(classification_report(y_test_i, y_pred_inc))


In [None]:
# ⚙️ FEATURE ENGINEERING
def prepare_features(df):
    """Return the scaled feature matrix for the class-balanced voting models.

    Reads 'Wealth', 'Income ' (note the trailing space), 'Age', 'Gender',
    'FamilyMembers', 'FinancialEducation' and 'RiskPropensity'. ``df`` is not
    modified; the returned DataFrame carries a fresh default index.
    """
    income = df['Income ']
    wealth = df['Wealth']

    # Zero wealth becomes NaN before dividing; every missing ratio is then
    # replaced by the largest income in the data.
    ratio = income / wealth.replace(0, np.nan)

    enriched = df.assign(
        Wealth_log=np.log1p(wealth),
        Income_log=np.log1p(income),
        Income_Wealth_Ratio_log=np.log1p(ratio.fillna(income.max())),
        Is_Single=df['FamilyMembers'].eq(1).astype(int),
        Is_Senior=df['Age'].gt(65).astype(int),
        Has_Education=df['FinancialEducation'].gt(0).astype(int),
        Risk_Age_Interaction=df['RiskPropensity'] * df['Age'],
    )

    selected = [
        'Age', 'Gender', 'FamilyMembers', 'FinancialEducation',
        'RiskPropensity', 'Wealth_log', 'Income_log',
        'Income_Wealth_Ratio_log', 'Is_Single', 'Is_Senior',
        'Has_Education', 'Risk_Age_Interaction',
    ]
    scaled = MinMaxScaler().fit_transform(enriched[selected])
    return pd.DataFrame(scaled, columns=selected)

# ⚖️ scale_pos_weight for XGBoost
def get_scale_pos_weight(y):
    """Return the ratio of class-0 to class-1 entries in ``y``.

    The result is passed as ``scale_pos_weight`` to XGBoost in this notebook.
    """
    negatives = (y == 0).sum()
    positives = (y == 1).sum()
    return negatives / positives

# 🧠 DATA PREPARATION
# Scaled feature matrix plus the two binary targets.
X = prepare_features(needs_df)
y_acc = needs_df['AccumulationInvestment']
y_inc = needs_df['IncomeInvestment']

# One stratified 80/20 split per target (the two test sets are drawn independently).
X_train_a, X_test_a, y_train_a, y_test_a = train_test_split(X, y_acc, test_size=0.2, random_state=42, stratify=y_acc)
X_train_i, X_test_i, y_train_i, y_test_i = train_test_split(X, y_inc, test_size=0.2, random_state=42, stratify=y_inc)

# 🧩 MODELS WITH CLASS BALANCING
# Each XGBoost model gets a scale_pos_weight computed from its own training labels.
rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
xgb_acc = XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                        scale_pos_weight=get_scale_pos_weight(y_train_a), random_state=42)
xgb_inc = XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                        scale_pos_weight=get_scale_pos_weight(y_train_i), random_state=42)
knn = KNeighborsClassifier(n_neighbors=5)

# ✅ VotingClassifier for ACCUMULATION
voting_acc = VotingClassifier(
    estimators=[('rf', rf), ('xgb', xgb_acc), ('knn', knn)],
    voting='soft'
)

# ✅ VotingClassifier for INCOME
voting_inc = VotingClassifier(
    estimators=[('rf', rf), ('xgb', xgb_inc), ('knn', knn)],
    voting='soft'
)

# 🏋️‍♂️ TRAINING
voting_acc.fit(X_train_a, y_train_a)
voting_inc.fit(X_train_i, y_train_i)

# 🔮 PREDICT ACCUMULATION (default decision rule of predict())
y_pred_acc = voting_acc.predict(X_test_a)

# 🔮 PREDICT INCOME with a custom threshold on the class-1 probability
probs_inc = voting_inc.predict_proba(X_test_i)
# NOTE(review): the else branch handles a non-ndarray return value — confirm
# whether predict_proba can actually return one here.
probs_class1 = probs_inc[:, 1] if isinstance(probs_inc, np.ndarray) else probs_inc[1][:, 1]

# 👇 CHANGE THE THRESHOLD HERE
threshold = 0.3
# A row is labelled 1 when its class-1 probability reaches the threshold.
y_pred_inc = (probs_class1 >= threshold).astype(int)

# 🎯 F1 METRICS (class 1) and their mean across the two targets
f1_acc = f1_score(y_test_a, y_pred_acc, pos_label=1)
f1_inc = f1_score(y_test_i, y_pred_inc, pos_label=1)
macro_f1 = np.mean([f1_acc, f1_inc])

print("F1 Accumulation (class 1):", round(f1_acc, 3))
print("F1 Income (class 1):", round(f1_inc, 3))
print("Macro F1 (class 1):", round(macro_f1, 3))

# 📊 CONFUSION MATRICES (side by side, one per target)
fig, axs = plt.subplots(1, 2, figsize=(12, 5))

sns.heatmap(confusion_matrix(y_test_a, y_pred_acc), annot=True, fmt='d', cmap='Blues', ax=axs[0])
axs[0].set_title("Confusion Matrix - Accumulation")
axs[0].set_xlabel("Predicted")
axs[0].set_ylabel("Actual")

sns.heatmap(confusion_matrix(y_test_i, y_pred_inc), annot=True, fmt='d', cmap='Greens', ax=axs[1])
axs[1].set_title(f"Confusion Matrix - Income (soglia {threshold})")
axs[1].set_xlabel("Predicted")
axs[1].set_ylabel("Actual")

plt.tight_layout()
plt.show()

# 📄 CLASSIFICATION REPORTS (three decimals)
print("\n--- Classification Report: AccumulationInvestment ---")
print(classification_report(y_test_a, y_pred_acc, digits=3))
print(f"\n--- Classification Report: IncomeInvestment (soglia {threshold}) ---")
print(classification_report(y_test_i, y_pred_inc, digits=3))



Massimizzo la recall con una threshold personalizzata a 0.25 e, al posto del KNN, uso la Logistic Regression, più adatta al soft voting.

In [None]:
# ⚙️ FEATURE ENGINEERING
def prepare_features(df):
    """Return the scaled feature matrix for the recall-oriented voting models.

    Reads 'Wealth', 'Income ' (note the trailing space), 'Age', 'Gender',
    'FamilyMembers', 'FinancialEducation' and 'RiskPropensity'. ``df`` is not
    modified; the returned DataFrame carries a fresh default index.
    """
    income = df['Income ']
    wealth = df['Wealth']

    # Zero wealth becomes NaN before dividing; every missing ratio is then
    # replaced by the largest income in the data.
    ratio = income / wealth.replace(0, np.nan)

    enriched = df.assign(
        Wealth_log=np.log1p(wealth),
        Income_log=np.log1p(income),
        Income_Wealth_Ratio_log=np.log1p(ratio.fillna(income.max())),
        Is_Single=df['FamilyMembers'].eq(1).astype(int),
        Is_Senior=df['Age'].gt(65).astype(int),
        Has_Education=df['FinancialEducation'].gt(0).astype(int),
        Risk_Age_Interaction=df['RiskPropensity'] * df['Age'],
    )

    selected = [
        'Age', 'Gender', 'FamilyMembers', 'FinancialEducation',
        'RiskPropensity', 'Wealth_log', 'Income_log',
        'Income_Wealth_Ratio_log', 'Is_Single', 'Is_Senior',
        'Has_Education', 'Risk_Age_Interaction',
    ]
    scaled = MinMaxScaler().fit_transform(enriched[selected])
    return pd.DataFrame(scaled, columns=selected)

# ⚖️ Calcolo scale_pos_weight per XGBoost
def get_scale_pos_weight(y):
    """Return the negative/positive class ratio used as XGBoost ``scale_pos_weight``.

    Parameters
    ----------
    y : array-like
        Binary labels, 0 for the negative class and 1 for the positive class.

    Returns
    -------
    float
        ``count(y == 0) / count(y == 1)``.

    Raises
    ------
    ValueError
        If ``y`` has no positive label, because the ratio is undefined.
    """
    labels = np.asarray(y)
    n_negative = int((labels == 0).sum())
    n_positive = int((labels == 1).sum())
    if n_positive == 0:
        # Fail loudly instead of handing inf/nan to the classifier.
        raise ValueError("y contains no positive samples (label 1): scale_pos_weight is undefined")
    return n_negative / n_positive

# 🧠 PREPARAZIONE DATI
# Feature matrix and the two binary targets.
# NOTE(review): prepare_features fits its MinMaxScaler on all rows of needs_df,
# i.e. before the split below, so test rows take part in the scaling.
X = prepare_features(needs_df)
y_acc = needs_df['AccumulationInvestment']
y_inc = needs_df['IncomeInvestment']

# Two independent 80/20 splits, each stratified on its own target. The seed is
# the same but the stratification differs, so the two test sets are not
# guaranteed to contain the same rows.
X_train_a, X_test_a, y_train_a, y_test_a = train_test_split(X, y_acc, test_size=0.2, random_state=42, stratify=y_acc)
X_train_i, X_test_i, y_train_i, y_test_i = train_test_split(X, y_inc, test_size=0.2, random_state=42, stratify=y_inc)

# Base models with class balancing: class_weight for RF / LR, and a
# target-specific scale_pos_weight for each XGBoost model.
rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
xgb_acc = XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                        scale_pos_weight=get_scale_pos_weight(y_train_a), random_state=42)
xgb_inc = XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                        scale_pos_weight=get_scale_pos_weight(y_train_i), random_state=42)
lr = LogisticRegression(max_iter=1000, class_weight='balanced')

# Soft-voting ensemble for the accumulation target.
voting_acc = VotingClassifier(
    estimators=[('rf', rf), ('xgb', xgb_acc), ('lr', lr)],
    voting='soft'
)

# Soft-voting ensemble for the income target (uses the income-specific XGBoost).
voting_inc = VotingClassifier(
    estimators=[('rf', rf), ('xgb', xgb_inc), ('lr', lr)],
    voting='soft'
)

# Training
voting_acc.fit(X_train_a, y_train_a)
voting_inc.fit(X_train_i, y_train_i)

# Accumulation: default decision rule of the ensemble.
y_pred_acc = voting_acc.predict(X_test_a)

# Income: take the positive-class probability and apply a custom threshold.
# NOTE(review): the non-ndarray fallback looks unreachable for a single-output
# VotingClassifier - confirm before relying on it.
probs_inc = voting_inc.predict_proba(X_test_i)
probs_class1 = probs_inc[:, 1] if isinstance(probs_inc, np.ndarray) else probs_inc[1][:, 1]

# Hand-picked threshold below 0.5 to favour recall on the income target.
# NOTE(review): it is chosen and evaluated on the same test split.
threshold = 0.25
y_pred_inc = (probs_class1 >= threshold).astype(int)

# Positive-class F1 per target and their unweighted mean.
f1_acc = f1_score(y_test_a, y_pred_acc, pos_label=1)
f1_inc = f1_score(y_test_i, y_pred_inc, pos_label=1)
macro_f1 = np.mean([f1_acc, f1_inc])

print("F1-score Accumulation (class 1):", round(f1_acc, 3))
print("F1-score Income (class 1):", round(f1_inc, 3))
print("Macro F1-score (class 1):", round(macro_f1, 3))

# Confusion matrices, side by side (accumulation left, income right).
fig, axs = plt.subplots(1, 2, figsize=(12, 5))

sns.heatmap(confusion_matrix(y_test_a, y_pred_acc), annot=True, fmt='d', cmap='Blues', ax=axs[0])
axs[0].set_title("Confusion Matrix - Accumulation")
axs[0].set_xlabel("Predicted")
axs[0].set_ylabel("Actual")

sns.heatmap(confusion_matrix(y_test_i, y_pred_inc), annot=True, fmt='d', cmap='Greens', ax=axs[1])
axs[1].set_title(f"Confusion Matrix - Income (soglia {threshold})")
axs[1].set_xlabel("Predicted")
axs[1].set_ylabel("Actual")

plt.tight_layout()
plt.show()

# Per-class precision / recall / F1 reports.
print("\n--- Classification Report: AccumulationInvestment ---")
print(classification_report(y_test_a, y_pred_acc, digits=3))
print(f"\n--- Classification Report: IncomeInvestment (soglia {threshold}) ---")
print(classification_report(y_test_i, y_pred_inc, digits=3))



Massimizzo la recall con una soglia personalizzata a 0.3; al posto di KNN uso la regressione logistica, più adatta al soft voting.

In [None]:
# ⚙️ FEATURE ENGINEERING
def prepare_features(df):
    """Build the min-max scaled model matrix from the raw client table.

    Derived columns: log1p of wealth and income, log1p of the income/wealth
    ratio, three binary flags (single household, senior, has financial
    education) and a risk-by-age interaction. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Age', 'Gender', 'FamilyMembers', 'FinancialEducation',
        'RiskPropensity', 'Wealth' and 'Income ' (with the trailing space).

    Returns
    -------
    pandas.DataFrame
        The 12 selected features scaled with a freshly fitted MinMaxScaler.
        The result carries a default integer index, not ``df.index``.

    NOTE(review): the scaler is fitted on every row passed in, so calling this
    before a train/test split lets test rows influence the scaling.
    """
    income = df['Income ']
    wealth = df['Wealth']

    # A zero wealth would divide by zero: the ratio becomes NaN there and is
    # then replaced by the largest income value before taking the log.
    ratio = income / wealth.replace(0, np.nan)

    enriched = df.assign(
        Wealth_log=np.log1p(wealth),
        Income_log=np.log1p(income),
        Income_Wealth_Ratio=ratio,
        Income_Wealth_Ratio_log=np.log1p(ratio.fillna(income.max())),
        Is_Single=(df['FamilyMembers'] == 1).astype(int),
        Is_Senior=(df['Age'] > 65).astype(int),
        Has_Education=(df['FinancialEducation'] > 0).astype(int),
        Risk_Age_Interaction=df['RiskPropensity'] * df['Age'],
    )

    features = [
        'Age', 'Gender', 'FamilyMembers', 'FinancialEducation',
        'RiskPropensity', 'Wealth_log', 'Income_log',
        'Income_Wealth_Ratio_log', 'Is_Single', 'Is_Senior',
        'Has_Education', 'Risk_Age_Interaction',
    ]

    scaled_values = MinMaxScaler().fit_transform(enriched[features])
    return pd.DataFrame(scaled_values, columns=features)

# ⚖️ Calcolo scale_pos_weight per XGBoost
def get_scale_pos_weight(y):
    """Return the negative/positive class ratio used as XGBoost ``scale_pos_weight``.

    Parameters
    ----------
    y : array-like
        Binary labels, 0 for the negative class and 1 for the positive class.

    Returns
    -------
    float
        ``count(y == 0) / count(y == 1)``.

    Raises
    ------
    ValueError
        If ``y`` has no positive label, because the ratio is undefined.
    """
    labels = np.asarray(y)
    n_negative = int((labels == 0).sum())
    n_positive = int((labels == 1).sum())
    if n_positive == 0:
        # Fail loudly instead of handing inf/nan to the classifier.
        raise ValueError("y contains no positive samples (label 1): scale_pos_weight is undefined")
    return n_negative / n_positive

# 🧠 PREPARAZIONE DATI
# Feature matrix and the two binary targets.
# NOTE(review): prepare_features fits its MinMaxScaler on all rows of needs_df,
# i.e. before the split below, so test rows take part in the scaling.
X = prepare_features(needs_df)
y_acc = needs_df['AccumulationInvestment']
y_inc = needs_df['IncomeInvestment']

# Two independent 80/20 splits, each stratified on its own target. The seed is
# the same but the stratification differs, so the two test sets are not
# guaranteed to contain the same rows.
X_train_a, X_test_a, y_train_a, y_test_a = train_test_split(X, y_acc, test_size=0.2, random_state=42, stratify=y_acc)
X_train_i, X_test_i, y_train_i, y_test_i = train_test_split(X, y_inc, test_size=0.2, random_state=42, stratify=y_inc)

# Base models with class balancing: class_weight for RF / LR, and a
# target-specific scale_pos_weight for each XGBoost model.
rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
xgb_acc = XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                        scale_pos_weight=get_scale_pos_weight(y_train_a), random_state=42)
xgb_inc = XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                        scale_pos_weight=get_scale_pos_weight(y_train_i), random_state=42)
lr = LogisticRegression(max_iter=1000, class_weight='balanced')

# Soft-voting ensemble for the accumulation target.
voting_acc = VotingClassifier(
    estimators=[('rf', rf), ('xgb', xgb_acc), ('lr', lr)],
    voting='soft'
)

# Soft-voting ensemble for the income target (uses the income-specific XGBoost).
voting_inc = VotingClassifier(
    estimators=[('rf', rf), ('xgb', xgb_inc), ('lr', lr)],
    voting='soft'
)

# Training
voting_acc.fit(X_train_a, y_train_a)
voting_inc.fit(X_train_i, y_train_i)

# Accumulation: default decision rule of the ensemble.
y_pred_acc = voting_acc.predict(X_test_a)

# Income: take the positive-class probability and apply a custom threshold.
# NOTE(review): the non-ndarray fallback looks unreachable for a single-output
# VotingClassifier - confirm before relying on it.
probs_inc = voting_inc.predict_proba(X_test_i)
probs_class1 = probs_inc[:, 1] if isinstance(probs_inc, np.ndarray) else probs_inc[1][:, 1]

# Hand-picked threshold below 0.5 to favour recall on the income target.
# NOTE(review): it is chosen and evaluated on the same test split.
threshold = 0.3
y_pred_inc = (probs_class1 >= threshold).astype(int)

# Positive-class F1 per target and their unweighted mean.
f1_acc = f1_score(y_test_a, y_pred_acc, pos_label=1)
f1_inc = f1_score(y_test_i, y_pred_inc, pos_label=1)
macro_f1 = np.mean([f1_acc, f1_inc])

print("F1-score Accumulation (class 1):", round(f1_acc, 3))
print("F1-score Income (class 1):", round(f1_inc, 3))
print("Macro F1-score (class 1):", round(macro_f1, 3))

# Confusion matrices, side by side (accumulation left, income right).
fig, axs = plt.subplots(1, 2, figsize=(12, 5))

sns.heatmap(confusion_matrix(y_test_a, y_pred_acc), annot=True, fmt='d', cmap='Blues', ax=axs[0])
axs[0].set_title("Confusion Matrix - Accumulation")
axs[0].set_xlabel("Predicted")
axs[0].set_ylabel("Actual")

sns.heatmap(confusion_matrix(y_test_i, y_pred_inc), annot=True, fmt='d', cmap='Greens', ax=axs[1])
axs[1].set_title(f"Confusion Matrix - Income (soglia {threshold})")
axs[1].set_xlabel("Predicted")
axs[1].set_ylabel("Actual")

plt.tight_layout()
plt.show()

# Per-class precision / recall / F1 reports.
print("\n--- Classification Report: AccumulationInvestment ---")
print(classification_report(y_test_a, y_pred_acc, digits=3))
print(f"\n--- Classification Report: IncomeInvestment (soglia {threshold}) ---")
print(classification_report(y_test_i, y_pred_inc, digits=3))


Massimizzo la recall con una soglia personalizzata a 0.20; al posto di KNN uso la regressione logistica, più adatta al soft voting.

In [None]:
# ⚙️ FEATURE ENGINEERING
def prepare_features(df):
    """Build the min-max scaled model matrix from the raw client table.

    Derived columns: log1p of wealth and income, log1p of the income/wealth
    ratio, three binary flags (single household, senior, has financial
    education) and a risk-by-age interaction. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Age', 'Gender', 'FamilyMembers', 'FinancialEducation',
        'RiskPropensity', 'Wealth' and 'Income ' (with the trailing space).

    Returns
    -------
    pandas.DataFrame
        The 12 selected features scaled with a freshly fitted MinMaxScaler.
        The result carries a default integer index, not ``df.index``.

    NOTE(review): the scaler is fitted on every row passed in, so calling this
    before a train/test split lets test rows influence the scaling.
    """
    income = df['Income ']
    wealth = df['Wealth']

    # A zero wealth would divide by zero: the ratio becomes NaN there and is
    # then replaced by the largest income value before taking the log.
    ratio = income / wealth.replace(0, np.nan)

    enriched = df.assign(
        Wealth_log=np.log1p(wealth),
        Income_log=np.log1p(income),
        Income_Wealth_Ratio=ratio,
        Income_Wealth_Ratio_log=np.log1p(ratio.fillna(income.max())),
        Is_Single=(df['FamilyMembers'] == 1).astype(int),
        Is_Senior=(df['Age'] > 65).astype(int),
        Has_Education=(df['FinancialEducation'] > 0).astype(int),
        Risk_Age_Interaction=df['RiskPropensity'] * df['Age'],
    )

    features = [
        'Age', 'Gender', 'FamilyMembers', 'FinancialEducation',
        'RiskPropensity', 'Wealth_log', 'Income_log',
        'Income_Wealth_Ratio_log', 'Is_Single', 'Is_Senior',
        'Has_Education', 'Risk_Age_Interaction',
    ]

    scaled_values = MinMaxScaler().fit_transform(enriched[features])
    return pd.DataFrame(scaled_values, columns=features)

# ⚖️ Calcolo scale_pos_weight per XGBoost
def get_scale_pos_weight(y):
    """Return the negative/positive class ratio used as XGBoost ``scale_pos_weight``.

    Parameters
    ----------
    y : array-like
        Binary labels, 0 for the negative class and 1 for the positive class.

    Returns
    -------
    float
        ``count(y == 0) / count(y == 1)``.

    Raises
    ------
    ValueError
        If ``y`` has no positive label, because the ratio is undefined.
    """
    labels = np.asarray(y)
    n_negative = int((labels == 0).sum())
    n_positive = int((labels == 1).sum())
    if n_positive == 0:
        # Fail loudly instead of handing inf/nan to the classifier.
        raise ValueError("y contains no positive samples (label 1): scale_pos_weight is undefined")
    return n_negative / n_positive

# 🧠 PREPARAZIONE DATI
# Feature matrix and the two binary targets.
# NOTE(review): prepare_features fits its MinMaxScaler on all rows of needs_df,
# i.e. before the split below, so test rows take part in the scaling.
X = prepare_features(needs_df)
y_acc = needs_df['AccumulationInvestment']
y_inc = needs_df['IncomeInvestment']

# Two independent 80/20 splits, each stratified on its own target. The seed is
# the same but the stratification differs, so the two test sets are not
# guaranteed to contain the same rows.
X_train_a, X_test_a, y_train_a, y_test_a = train_test_split(X, y_acc, test_size=0.2, random_state=42, stratify=y_acc)
X_train_i, X_test_i, y_train_i, y_test_i = train_test_split(X, y_inc, test_size=0.2, random_state=42, stratify=y_inc)

# Base models with class balancing: class_weight for RF / LR, and a
# target-specific scale_pos_weight for each XGBoost model.
rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
xgb_acc = XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                        scale_pos_weight=get_scale_pos_weight(y_train_a), random_state=42)
xgb_inc = XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                        scale_pos_weight=get_scale_pos_weight(y_train_i), random_state=42)
lr = LogisticRegression(max_iter=1000, class_weight='balanced')

# Soft-voting ensemble for the accumulation target.
voting_acc = VotingClassifier(
    estimators=[('rf', rf), ('xgb', xgb_acc), ('lr', lr)],
    voting='soft'
)

# Soft-voting ensemble for the income target (uses the income-specific XGBoost).
voting_inc = VotingClassifier(
    estimators=[('rf', rf), ('xgb', xgb_inc), ('lr', lr)],
    voting='soft'
)

# Training
voting_acc.fit(X_train_a, y_train_a)
voting_inc.fit(X_train_i, y_train_i)

# Accumulation: default decision rule of the ensemble.
y_pred_acc = voting_acc.predict(X_test_a)

# Income: take the positive-class probability and apply a custom threshold.
# NOTE(review): the non-ndarray fallback looks unreachable for a single-output
# VotingClassifier - confirm before relying on it.
probs_inc = voting_inc.predict_proba(X_test_i)
probs_class1 = probs_inc[:, 1] if isinstance(probs_inc, np.ndarray) else probs_inc[1][:, 1]

# Hand-picked threshold below 0.5 to favour recall on the income target.
# NOTE(review): it is chosen and evaluated on the same test split.
threshold = 0.2
y_pred_inc = (probs_class1 >= threshold).astype(int)

# Positive-class F1 per target and their unweighted mean.
f1_acc = f1_score(y_test_a, y_pred_acc, pos_label=1)
f1_inc = f1_score(y_test_i, y_pred_inc, pos_label=1)
macro_f1 = np.mean([f1_acc, f1_inc])

print("F1-score Accumulation (class 1):", round(f1_acc, 3))
print("F1-score Income (class 1):", round(f1_inc, 3))
print("Macro F1-score (class 1):", round(macro_f1, 3))

# Confusion matrices, side by side (accumulation left, income right).
fig, axs = plt.subplots(1, 2, figsize=(12, 5))

sns.heatmap(confusion_matrix(y_test_a, y_pred_acc), annot=True, fmt='d', cmap='Blues', ax=axs[0])
axs[0].set_title("Confusion Matrix - Accumulation")
axs[0].set_xlabel("Predicted")
axs[0].set_ylabel("Actual")

sns.heatmap(confusion_matrix(y_test_i, y_pred_inc), annot=True, fmt='d', cmap='Greens', ax=axs[1])
axs[1].set_title(f"Confusion Matrix - Income (soglia {threshold})")
axs[1].set_xlabel("Predicted")
axs[1].set_ylabel("Actual")

plt.tight_layout()
plt.show()

# Per-class precision / recall / F1 reports.
print("\n--- Classification Report: AccumulationInvestment ---")
print(classification_report(y_test_a, y_pred_acc, digits=3))
print(f"\n--- Classification Report: IncomeInvestment (soglia {threshold}) ---")
print(classification_report(y_test_i, y_pred_inc, digits=3))

Curva precision-recall per individuare la soglia ottimale.

In [None]:
# Precision / recall at every candidate threshold for the income target.
# y_test_i and probs_class1 are not defined in this cell: they are whatever the
# most recently executed modelling cell left in the kernel.
precision, recall, thresholds = precision_recall_curve(y_test_i, probs_class1)

# Average precision, shown in the legend.
aps = average_precision_score(y_test_i, probs_class1)

# Plot
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, label=f'PR Curve (AP = {aps:.3f})', linewidth=2)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve - IncomeInvestment')
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()

# (Optional) Print every 30th threshold with its precision and recall.
# All three arrays are sliced with the same step, so zipped items stay aligned.
for r, p, t in zip(recall[::30], precision[::30], thresholds[::30]):
    print(f"Soglia: {t:.2f} → Precision: {p:.2f}, Recall: {r:.2f}")


Classifier con Random Forest, XGBoost e regressione logistica.

In [None]:
# ⚙️ FEATURE ENGINEERING
def prepare_features(df):
    """Build the min-max scaled model matrix from the raw client table.

    Derived columns: log1p of wealth and income, log1p of the income/wealth
    ratio, three binary flags (single household, senior, has financial
    education) and a risk-by-age interaction. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Age', 'Gender', 'FamilyMembers', 'FinancialEducation',
        'RiskPropensity', 'Wealth' and 'Income ' (with the trailing space).

    Returns
    -------
    pandas.DataFrame
        The 12 selected features scaled with a freshly fitted MinMaxScaler.
        The result carries a default integer index, not ``df.index``.

    NOTE(review): the scaler is fitted on every row passed in, so calling this
    before a train/test split lets test rows influence the scaling.
    """
    income = df['Income ']
    wealth = df['Wealth']

    # A zero wealth would divide by zero: the ratio becomes NaN there and is
    # then replaced by the largest income value before taking the log.
    ratio = income / wealth.replace(0, np.nan)

    enriched = df.assign(
        Wealth_log=np.log1p(wealth),
        Income_log=np.log1p(income),
        Income_Wealth_Ratio=ratio,
        Income_Wealth_Ratio_log=np.log1p(ratio.fillna(income.max())),
        Is_Single=(df['FamilyMembers'] == 1).astype(int),
        Is_Senior=(df['Age'] > 65).astype(int),
        Has_Education=(df['FinancialEducation'] > 0).astype(int),
        Risk_Age_Interaction=df['RiskPropensity'] * df['Age'],
    )

    features = [
        'Age', 'Gender', 'FamilyMembers', 'FinancialEducation',
        'RiskPropensity', 'Wealth_log', 'Income_log',
        'Income_Wealth_Ratio_log', 'Is_Single', 'Is_Senior',
        'Has_Education', 'Risk_Age_Interaction',
    ]

    scaled_values = MinMaxScaler().fit_transform(enriched[features])
    return pd.DataFrame(scaled_values, columns=features)

# ⚖️ Calcolo scale_pos_weight per XGBoost
def get_scale_pos_weight(y):
    """Return the negative/positive class ratio used as XGBoost ``scale_pos_weight``.

    Parameters
    ----------
    y : array-like
        Binary labels, 0 for the negative class and 1 for the positive class.

    Returns
    -------
    float
        ``count(y == 0) / count(y == 1)``.

    Raises
    ------
    ValueError
        If ``y`` has no positive label, because the ratio is undefined.
    """
    labels = np.asarray(y)
    n_negative = int((labels == 0).sum())
    n_positive = int((labels == 1).sum())
    if n_positive == 0:
        # Fail loudly instead of handing inf/nan to the classifier.
        raise ValueError("y contains no positive samples (label 1): scale_pos_weight is undefined")
    return n_negative / n_positive

# 🧠 PREPARAZIONE DATI
# Feature matrix and the two binary targets.
# NOTE(review): prepare_features fits its MinMaxScaler on all rows of needs_df,
# i.e. before the split below, so test rows take part in the scaling.
X = prepare_features(needs_df)
y_acc = needs_df['AccumulationInvestment']
y_inc = needs_df['IncomeInvestment']

# Two independent 80/20 splits, each stratified on its own target. The seed is
# the same but the stratification differs, so the two test sets are not
# guaranteed to contain the same rows.
X_train_a, X_test_a, y_train_a, y_test_a = train_test_split(X, y_acc, test_size=0.2, random_state=42, stratify=y_acc)
X_train_i, X_test_i, y_train_i, y_test_i = train_test_split(X, y_inc, test_size=0.2, random_state=42, stratify=y_inc)

# Models included in the voting ensembles: class_weight balancing for RF / LR,
# and a target-specific scale_pos_weight for each XGBoost model.
rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
xgb_acc = XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                        scale_pos_weight=get_scale_pos_weight(y_train_a), random_state=42)
xgb_inc = XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                        scale_pos_weight=get_scale_pos_weight(y_train_i), random_state=42)
lr = LogisticRegression(max_iter=1000, class_weight='balanced')

# Soft-voting ensemble for the accumulation target.
voting_acc = VotingClassifier(
    estimators=[('rf', rf), ('xgb', xgb_acc), ('lr', lr)],
    voting='soft'
)

# Soft-voting ensemble for the income target.
voting_inc = VotingClassifier(
    estimators=[('rf', rf), ('xgb', xgb_inc), ('lr', lr)],
    voting='soft'
)

# Training
voting_acc.fit(X_train_a, y_train_a)
voting_inc.fit(X_train_i, y_train_i)

# Accumulation: default decision rule of the ensemble.
y_pred_acc = voting_acc.predict(X_test_a)

# Income: take the positive-class probability and apply a custom threshold.
# NOTE(review): the non-ndarray fallback looks unreachable for a single-output
# VotingClassifier - confirm before relying on it.
probs_inc = voting_inc.predict_proba(X_test_i)
probs_class1 = probs_inc[:, 1] if isinstance(probs_inc, np.ndarray) else probs_inc[1][:, 1]

# Hand-picked threshold below 0.5 to favour recall on the income target.
# NOTE(review): it is chosen and evaluated on the same test split.
threshold = 0.25
y_pred_inc = (probs_class1 >= threshold).astype(int)

# Positive-class F1 per target and their unweighted mean.
f1_acc = f1_score(y_test_a, y_pred_acc, pos_label=1)
f1_inc = f1_score(y_test_i, y_pred_inc, pos_label=1)
macro_f1 = np.mean([f1_acc, f1_inc])

print("F1 Accumulation (class 1):", round(f1_acc, 3))
print("F1 Income (class 1):", round(f1_inc, 3))
print("Macro F1 (class 1):", round(macro_f1, 3))

# Confusion matrices, side by side (accumulation left, income right).
fig, axs = plt.subplots(1, 2, figsize=(12, 5))

sns.heatmap(confusion_matrix(y_test_a, y_pred_acc), annot=True, fmt='d', cmap='Blues', ax=axs[0])
axs[0].set_title("Confusion Matrix - Accumulation")
axs[0].set_xlabel("Predicted")
axs[0].set_ylabel("Actual")

sns.heatmap(confusion_matrix(y_test_i, y_pred_inc), annot=True, fmt='d', cmap='Greens', ax=axs[1])
axs[1].set_title(f"Confusion Matrix - Income (soglia {threshold})")
axs[1].set_xlabel("Predicted")
axs[1].set_ylabel("Actual")

plt.tight_layout()
plt.show()

# Per-class precision / recall / F1 reports.
print("\n--- Classification Report: AccumulationInvestment ---")
print(classification_report(y_test_a, y_pred_acc, digits=3))
print(f"\n--- Classification Report: IncomeInvestment (soglia {threshold}) ---")
print(classification_report(y_test_i, y_pred_inc, digits=3))


In [None]:
# Precision / recall at every candidate threshold, plus average precision.
# y_test_i and probs_class1 are not defined in this cell: they are whatever the
# most recently executed modelling cell left in the kernel.
precision, recall, thresholds = precision_recall_curve(y_test_i, probs_class1)
aps = average_precision_score(y_test_i, probs_class1)

# Plot
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, label=f'PR Curve (AP = {aps:.3f})', linewidth=2)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve - IncomeInvestment')
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()

# (Optional) Print about ten evenly spaced thresholds; the step is at least 1
# so the range is valid even when there are fewer than ten thresholds.
for i in range(0, len(thresholds), max(1, len(thresholds) // 10)):
    print(f"Soglia = {thresholds[i]:.2f} → Precision = {precision[i]:.2f}, Recall = {recall[i]:.2f}")


In [None]:
# ⚙️ FEATURE ENGINEERING
def prepare_features(df):
    """Build the min-max scaled model matrix from the raw client table.

    Derived columns: log1p of wealth and income, log1p of the income/wealth
    ratio, three binary flags (single household, senior, has financial
    education) and a risk-by-age interaction. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Age', 'Gender', 'FamilyMembers', 'FinancialEducation',
        'RiskPropensity', 'Wealth' and 'Income ' (with the trailing space).

    Returns
    -------
    pandas.DataFrame
        The 12 selected features scaled with a freshly fitted MinMaxScaler.
        The result carries a default integer index, not ``df.index``.

    NOTE(review): the scaler is fitted on every row passed in, so calling this
    before a train/test split lets test rows influence the scaling.
    """
    income = df['Income ']
    wealth = df['Wealth']

    # A zero wealth would divide by zero: the ratio becomes NaN there and is
    # then replaced by the largest income value before taking the log.
    ratio = income / wealth.replace(0, np.nan)

    enriched = df.assign(
        Wealth_log=np.log1p(wealth),
        Income_log=np.log1p(income),
        Income_Wealth_Ratio=ratio,
        Income_Wealth_Ratio_log=np.log1p(ratio.fillna(income.max())),
        Is_Single=(df['FamilyMembers'] == 1).astype(int),
        Is_Senior=(df['Age'] > 65).astype(int),
        Has_Education=(df['FinancialEducation'] > 0).astype(int),
        Risk_Age_Interaction=df['RiskPropensity'] * df['Age'],
    )

    features = [
        'Age', 'Gender', 'FamilyMembers', 'FinancialEducation',
        'RiskPropensity', 'Wealth_log', 'Income_log',
        'Income_Wealth_Ratio_log', 'Is_Single', 'Is_Senior',
        'Has_Education', 'Risk_Age_Interaction',
    ]

    scaled_values = MinMaxScaler().fit_transform(enriched[features])
    return pd.DataFrame(scaled_values, columns=features)

# ⚖️ Bilanciamento per XGBoost
def get_scale_pos_weight(y):
    """Return the negative/positive class ratio used as XGBoost ``scale_pos_weight``.

    Parameters
    ----------
    y : array-like
        Binary labels, 0 for the negative class and 1 for the positive class.

    Returns
    -------
    float
        ``count(y == 0) / count(y == 1)``.

    Raises
    ------
    ValueError
        If ``y`` has no positive label, because the ratio is undefined.
    """
    labels = np.asarray(y)
    n_negative = int((labels == 0).sum())
    n_positive = int((labels == 1).sum())
    if n_positive == 0:
        # Fail loudly instead of handing inf/nan to the classifier.
        raise ValueError("y contains no positive samples (label 1): scale_pos_weight is undefined")
    return n_negative / n_positive

# 🧠 DATI
# Feature matrix and income target, followed by a stratified 80/20 split.
# NOTE(review): prepare_features fits its MinMaxScaler on all rows of needs_df,
# i.e. before the split, so test rows take part in the scaling.
X = prepare_features(needs_df)
y_inc = needs_df['IncomeInvestment']
X_train_i, X_test_i, y_train_i, y_test_i = train_test_split(X, y_inc, test_size=0.2, random_state=42, stratify=y_inc)

# Base models: Random Forest, XGBoost and a single-hidden-layer MLP.
# NOTE(review): the name `xgb` is rebound to a classifier here, while the
# notebook's first cell does `import xgboost as xgb`; after this cell the
# module alias is no longer reachable under that name.
rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                    scale_pos_weight=get_scale_pos_weight(y_train_i), random_state=42)
mlp = MLPClassifier(hidden_layer_sizes=(50,), max_iter=300, random_state=42)

# Soft-voting ensemble over the three base models.
voting_inc = VotingClassifier(
    estimators=[('rf', rf), ('xgb', xgb), ('mlp', mlp)],
    voting='soft'
)

# Training
voting_inc.fit(X_train_i, y_train_i)

# Positive-class probability with a hand-picked decision threshold.
# NOTE(review): the threshold is chosen and evaluated on the same test split.
probs = voting_inc.predict_proba(X_test_i)
probs_class1 = probs[:, 1]
threshold = 0.28
y_pred_inc = (probs_class1 >= threshold).astype(int)

# Positive-class F1 at the chosen threshold.
f1_inc = f1_score(y_test_i, y_pred_inc, pos_label=1)
print(f"F1-score Income (class 1) con soglia {threshold}: {f1_inc:.3f}")

# Confusion matrix
sns.heatmap(confusion_matrix(y_test_i, y_pred_inc), annot=True, fmt='d', cmap='Greens')
plt.title(f"Confusion Matrix - IncomeInvestment (soglia {threshold})")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.tight_layout()
plt.show()

# Per-class precision / recall / F1 report.
print(classification_report(y_test_i, y_pred_inc, digits=3))


STACKING CLASSIFIER

In [None]:
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.preprocessing import MinMaxScaler

import ipywidgets as widgets
from IPython.display import display, clear_output
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.preprocessing import MinMaxScaler

import ipywidgets as widgets
from ipywidgets import Layout
from IPython.display import display, clear_output

# ⚖️ Calcolo scale_pos_weight
def get_scale_pos_weight(y):
    """Return the negative/positive class ratio used as XGBoost ``scale_pos_weight``.

    Parameters
    ----------
    y : array-like
        Binary labels, 0 for the negative class and 1 for the positive class.

    Returns
    -------
    float
        ``count(y == 0) / count(y == 1)``.

    Raises
    ------
    ValueError
        If ``y`` has no positive label, because the ratio is undefined.
    """
    labels = np.asarray(y)
    n_negative = int((labels == 0).sum())
    n_positive = int((labels == 1).sum())
    if n_positive == 0:
        # Fail loudly instead of handing inf/nan to the classifier.
        raise ValueError("y contains no positive samples (label 1): scale_pos_weight is undefined")
    return n_negative / n_positive

# 🔍 Funzione per trovare la soglia ottimale F1
def find_best_threshold(y_true, probs, verbose=True):
    """Grid-search the decision threshold that maximises positive-class F1.

    100 evenly spaced thresholds in [0.05, 0.95] are tried; among equal F1
    values the lowest threshold wins. If no threshold reaches an F1 above
    zero, the defaults (0.5, 0) are returned.

    Parameters
    ----------
    y_true : array-like of true binary labels.
    probs : numpy array of positive-class probabilities.
    verbose : bool, print the selected threshold and its F1 when True.

    Returns
    -------
    tuple
        ``(best_thresh, best_f1)``.
    """
    grid = np.linspace(0.05, 0.95, 100)
    scores = [f1_score(y_true, (probs >= cut).astype(int), pos_label=1) for cut in grid]

    best_thresh, best_f1 = 0.5, 0
    top = int(np.argmax(scores))  # first occurrence of the maximum
    if scores[top] > best_f1:
        best_thresh, best_f1 = grid[top], scores[top]

    if verbose:
        print(f"✓ Best threshold = {best_thresh:.3f}, F1 = {best_f1:.3f}")
    return best_thresh, best_f1

# 🧠 Prepara i dati
# Targets for the two stacking models.
y_income = needs_df['IncomeInvestment']
y_accum  = needs_df['AccumulationInvestment']

# X is not built in this cell: it is the feature matrix left in the kernel by
# an earlier cell. Each target gets its own stratified 80/20 split.
X_train_i, X_test_i, y_train_i, y_test_i = train_test_split(X, y_income, test_size=0.2, random_state=42, stratify=y_income)
X_train_a, X_test_a, y_train_a, y_test_a = train_test_split(X, y_accum, test_size=0.2, random_state=42, stratify=y_accum)

# 🧱 Modelli base
def make_stack_model(y_train):
    """Create an unfitted stacking ensemble for one binary target.

    Base learners are a class-balanced Random Forest, an XGBoost classifier
    whose ``scale_pos_weight`` is derived from ``y_train``, and a
    class-balanced SVC with probability outputs. A logistic regression
    combines them using 5-fold cross-validated predictions.

    Parameters
    ----------
    y_train : training labels, used only to compute the XGBoost class weight.

    Returns
    -------
    StackingClassifier
        The configured, not yet fitted, ensemble.
    """
    forest = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
    booster = XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        scale_pos_weight=get_scale_pos_weight(y_train),
        random_state=42,
    )
    support_vector = SVC(probability=True, class_weight='balanced', random_state=42)

    return StackingClassifier(
        estimators=[('rf', forest), ('xgb', booster), ('svm', support_vector)],
        final_estimator=LogisticRegression(max_iter=1000),
        cv=5,
    )

# 🔍 Ottimizzazione soglia: recall >= 0.7, massimizza F1
def find_threshold_recall_constrained(y_true, probs, min_recall=0.7, verbose=True):
    """Pick the threshold with the best positive-class F1 among those meeting a recall floor.

    200 evenly spaced thresholds in [0.05, 0.95] are scanned. Thresholds whose
    positive-class recall is below ``min_recall`` are ignored; among the rest
    the first one with the highest F1 is kept.

    Parameters
    ----------
    y_true : array-like of true binary labels.
    probs : numpy array of positive-class probabilities.
    min_recall : float, minimum acceptable positive-class recall.
    verbose : bool, print the outcome of the search when True.

    Returns
    -------
    tuple
        ``(best_thresh, best_f1)``; ``best_thresh`` is None (and ``best_f1``
        is 0) when no threshold satisfies the recall constraint.
    """
    best_thresh, best_f1 = None, 0

    for cut in np.linspace(0.05, 0.95, 200):
        labels = (probs >= cut).astype(int)
        if not recall_score(y_true, labels, pos_label=1) >= min_recall:
            continue  # recall floor not met at this threshold
        score = f1_score(y_true, labels, pos_label=1)
        if score > best_f1:
            best_thresh, best_f1 = cut, score

    if verbose:
        if best_thresh is None:
            print("⚠️ Nessuna soglia soddisfa il vincolo di recall.")
        else:
            print(f"✓ Best threshold with recall ≥ {min_recall:.2f}: {best_thresh:.3f} → F1 = {best_f1:.3f}")
    return best_thresh, best_f1

# 🏦 StackingClassifier - IncomeInvestment (con vincolo di recall)
# recall_score is needed by find_threshold_recall_constrained; this import sits
# after that function's definition but before its first call.
from sklearn.metrics import recall_score

print("\n🏦 StackingClassifier - IncomeInvestment (recall ≥ 0.70)")

# Fit the income stack, then search a threshold under the recall constraint.
# NOTE(review): the threshold is searched on the test split itself.
stack_income = make_stack_model(y_train_i)
stack_income.fit(X_train_i, y_train_i)
probs_i = stack_income.predict_proba(X_test_i)[:, 1]
best_t_i, best_f1_i = find_threshold_recall_constrained(y_test_i, probs_i, min_recall=0.7)

# NOTE(review): preds_i is only assigned when a threshold was found, yet the
# recommendation cell further down reads preds_i unconditionally.
if best_t_i is not None:
    preds_i = (probs_i >= best_t_i).astype(int)

    print("\n📄 Classification Report - IncomeInvestment")
    print(classification_report(y_test_i, preds_i, digits=3))
    sns.heatmap(confusion_matrix(y_test_i, preds_i), annot=True, fmt='d', cmap='Greens')
    plt.title(f"Confusion Matrix - IncomeInvestment (soglia {best_t_i:.2f})")
    plt.xlabel("Predicted"); plt.ylabel("Actual"); plt.show()
else:
    print("⚠️ Nessuna soglia raggiunge recall ≥ 0.70")


# Accumulation target: same stack, threshold chosen by unconstrained best F1.
print("\n💰 StackingClassifier - AccumulationInvestment")
stack_accum = make_stack_model(y_train_a)
stack_accum.fit(X_train_a, y_train_a)
probs_a = stack_accum.predict_proba(X_test_a)[:, 1]
best_t_a, _ = find_best_threshold(y_test_a, probs_a)
preds_a = (probs_a >= best_t_a).astype(int)

print("\n📄 Classification Report - AccumulationInvestment")
print(classification_report(y_test_a, preds_a, digits=3))
sns.heatmap(confusion_matrix(y_test_a, preds_a), annot=True, fmt='d', cmap='Blues')
plt.title(f"Confusion Matrix - AccumulationInvestment (best threshold {best_t_a:.2f})")
plt.xlabel("Predicted"); plt.ylabel("Actual"); plt.show()


In [None]:
# 🧠 Mappa ID → ProductName
# Product ID -> display name lookup.
# NOTE(review): IDs 1 and 5 carry the same name - confirm this is intended.
product_name_map = {
    1: "Balanced Mutual Fund",
    2: "Income Conservative Unit-Linked (Life Insurance)",
    3: "Fixed Income Mutual Fund",
    4: "Balanced High Dividend Mutual Fund",
    5: "Balanced Mutual Fund",
    6: "Defensive Flexible Allocation Unit-Linked (Life Insurance)",
    7: "Aggressive Flexible Allocation Unit-Linked (Life Insurance)",
    8: "Balanced Flexible Allocation Unit-Linked (Life Insurance)",
    9: "Cautious Allocation Segregated Account",
    10: "Fixed Income Segregated Account",
    11: "Total Return Aggressive Allocation Segregated Account"
}
# Adds a ProductName column to products_df in place; IDs missing from the map
# become NaN.
products_df['ProductName'] = products_df['IDProduct'].map(product_name_map)

# ⚙️ Funzione raccomandazione
def build_nba_df(y_pred, X_test, product_type_label, epsilon=0.05):
    """Build the next-best-action table for clients predicted to have a need.

    For every test client with ``y_pred == 1`` the product of the requested
    type with the highest ``Risk`` not exceeding ``client risk + epsilon`` is
    recommended. Clients with no suitable product get the placeholder
    product ID 0, risk 0 and name 'N/A'.

    Reads the notebook-level ``needs_df`` and ``products_df``.

    Parameters
    ----------
    y_pred : numpy array of 0/1 predictions, aligned row-by-row with X_test.
    X_test : pandas.DataFrame
        Test features; its index values are used as row positions of
        ``needs_df``.
    product_type_label : value matched against ``products_df['Type']``.
    epsilon : float, tolerated excess of product risk over client risk.

    Returns
    -------
    pandas.DataFrame
        Columns ClientID, ClientRiskPropensity, RecommendedProductID,
        ProductRiskLevel and ProductName. The columns are present even when
        no client is selected.
    """
    columns = ['ClientID', 'ClientRiskPropensity', 'RecommendedProductID',
               'ProductRiskLevel', 'ProductName']

    client_indices = np.where(y_pred == 1)[0]
    clients = needs_df.iloc[X_test.index[client_indices]]
    client_ids = clients.index.values
    # Take the risk propensity from needs_df, not from X_test: in this notebook
    # X_test comes out of prepare_features, which min-max scales every feature,
    # so its RiskPropensity is not on the same scale as products_df['Risk'].
    client_risks = clients['RiskPropensity'].values

    product_pool = products_df[products_df['Type'] == product_type_label]
    nba_records = []

    for cid, crisk in zip(client_ids, client_risks):
        suitable = product_pool[product_pool['Risk'] <= crisk + epsilon]
        if suitable.empty:
            # No product within the client's risk budget.
            nba_records.append({
                'ClientID': cid,
                'ClientRiskPropensity': crisk,
                'RecommendedProductID': 0,
                'ProductRiskLevel': 0,
                'ProductName': 'N/A'
            })
            continue

        # Riskiest product that is still within the budget.
        best_product = suitable.loc[suitable['Risk'].idxmax()]
        nba_records.append({
            'ClientID': cid,
            'ClientRiskPropensity': crisk,
            'RecommendedProductID': best_product['IDProduct'],
            'ProductRiskLevel': best_product['Risk'],
            'ProductName': best_product['ProductName']
        })

    # Explicit columns keep the schema stable when nba_records is empty.
    return pd.DataFrame(nba_records, columns=columns)

# 📊 Analisi
def plot_recommendation_analysis(nba_df, label, epsilon=0.05, color='#4682B4'):
    """Plot and print a summary of a recommendation table.

    Produces a histogram of client risk propensity, coverage statistics, a
    bar chart and top-3 listing of recommended products (when at least one
    valid recommendation exists), and a client-risk vs product-risk scatter
    with the ``epsilon`` excess-risk band.

    Reads the notebook-level ``products_df`` for product details.

    Parameters
    ----------
    nba_df : pandas.DataFrame
        Table with ClientRiskPropensity, RecommendedProductID and
        ProductRiskLevel columns; RecommendedProductID 0 means no
        recommendation.
    label : str, need name used in titles and printed headers.
    epsilon : float, width of the excess-risk band drawn on the scatter.
    color : str, colour for the histogram and the bar chart.

    Returns
    -------
    None
    """
    # Nothing to draw: avoids a KeyError on a column-less frame and a
    # division by zero in the percentage below.
    if nba_df.empty:
        print(f"No clients to analyze for {label}; skipping plots.")
        return

    plt.figure(figsize=(10, 6))
    plt.hist(nba_df['ClientRiskPropensity'], bins=10, color=color, alpha=0.7)
    plt.title(f'Risk Propensity of Target Clients ({label})')
    plt.xlabel('Risk Propensity')
    plt.ylabel('Frequency')
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.show()

    total = len(nba_df)
    valid = (nba_df['RecommendedProductID'] > 0).sum()
    print(f"\n📊 Recommendation statistics ({label}):")
    print(f"Total customers analyzed: {total}")
    print(f"Customers with valid recommendations: {valid} ({100 * valid / total:.2f}%)")
    print(f"Customers without suitable recommendations: {total - valid}")

    if valid > 0:
        plt.figure(figsize=(12, 6))
        counts = nba_df[nba_df['RecommendedProductID'] > 0]['RecommendedProductID'].value_counts().sort_index()
        plt.bar(counts.index.astype(str), counts.values, color=color)
        plt.title(f'Frequency distribution of recommended {label.lower()} products')
        plt.xlabel('Product ID')
        plt.ylabel('Number of recommendations')
        plt.xticks(rotation=45)
        plt.grid(axis='y', alpha=0.3)
        plt.tight_layout()
        plt.show()

        top3 = counts.nlargest(3).index
        print(f"\n🏅 Top 3 recommended {label.lower()} products:")
        for pid in top3:
            row = products_df[products_df['IDProduct'] == pid].iloc[0]
            print(f"\n🔹 Product ID: {pid}")
            print(f"Name: {row['ProductName']}")
            print(f"Risk: {row['Risk']}")
            print(f"Recommended to {counts[pid]} clients")

    # Suitability scatter: points above the dashed diagonal carry more risk
    # than the client's propensity; the shaded band is the epsilon tolerance.
    plt.figure(figsize=(10, 8))
    sc = plt.scatter(nba_df['ClientRiskPropensity'], nba_df['ProductRiskLevel'],
                     c=nba_df['ProductRiskLevel'], cmap='viridis', s=80, alpha=0.9)
    max_val = max(nba_df['ClientRiskPropensity'].max(), nba_df['ProductRiskLevel'].max()) + epsilon + 0.05
    x_vals = np.linspace(0, max_val, 500)
    plt.fill_between(x_vals, x_vals, x_vals + epsilon, color='red', alpha=0.2, label=f'Excess Risk Zone (+ε={epsilon})')
    plt.plot(x_vals, x_vals, 'r--', label='Perfect match', alpha=0.8)
    plt.plot(x_vals, x_vals + epsilon, 'r:', alpha=0.5)
    plt.colorbar(sc, label='Product Risk Level')
    plt.title(f'Suitability: Client Risk Propensity vs Product Risk ({label})')
    plt.xlabel('Client Risk Propensity')
    plt.ylabel('Product Risk Level')
    plt.legend()
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.show()

# 🧾 Costruisci e visualizza
# Risk tolerance shared by the recommendation step and the plots.
epsilon = 0.025
# preds_i / preds_a and the test frames come from the stacking cell above.
# Product type 0 is used for the income need, type 1 for accumulation.
nba_income = build_nba_df(preds_i, X_test_i, product_type_label=0, epsilon=epsilon)
nba_accum  = build_nba_df(preds_a, X_test_a, product_type_label=1, epsilon=epsilon)

plot_recommendation_analysis(nba_income, 'Income-Investment', epsilon=epsilon, color='#2E8B57')
plot_recommendation_analysis(nba_accum,  'Accumulation-Investment', epsilon=epsilon, color='#4682B4')


In [None]:
# Brute-force addition of the 5 extra products.
# NOTE(review): file_path is not defined in this cell; the first cells load the
# workbook from a URL, so confirm file_path exists on a fresh kernel.
products_df = pd.read_excel(file_path, sheet_name='Products')
new_products = pd.DataFrame({
    'IDProduct': [12, 13, 14, 15, 16],
    'Type': [0, 0, 1, 1, 0],
    'Risk': [0.55, 0.70, 0.70, 0.15, 0.85]
})
products_df = pd.concat([products_df, new_products], ignore_index=True)

# Drop the rows with Risk = 0.12. The explicit copy makes the result an
# independent frame, so adding columns to it later (e.g. ProductName) does not
# write into a filtered slice and trigger SettingWithCopyWarning.
products_df = products_df[products_df['Risk'] != 0.12].copy()


In [None]:
# 🧠 Mappa ID → ProductName
# Product ID -> display name lookup, including the five added products (12-16).
# NOTE(review): IDs 1 and 5 carry the same name - confirm this is intended.
product_name_map = {
    1: "Balanced Mutual Fund",
    2: "Income Conservative Unit-Linked (Life Insurance)",
    3: "Fixed Income Mutual Fund",
    4: "Balanced High Dividend Mutual Fund",
    5: "Balanced Mutual Fund",
    6: "Defensive Flexible Allocation Unit-Linked (Life Insurance)",
    7: "Aggressive Flexible Allocation Unit-Linked (Life Insurance)",
    8: "Balanced Flexible Allocation Unit-Linked (Life Insurance)",
    9: "Cautious Allocation Segregated Account",
    10: "Fixed Income Segregated Account",
    11: "Total Return Aggressive Allocation Segregated Account",
    12:	"Global Diversified Income Fund",
    13:	"Emerging Markets High Yield Bond Fund",
    14:	"Sustainable Growth Equity Portfolio",
    15:	"Short-Term Government Bond Accumulation Fund",
    16:	"Tranche Equity CDO"
}
# Adds a ProductName column to products_df in place; IDs missing from the map
# become NaN.
products_df['ProductName'] = products_df['IDProduct'].map(product_name_map)

# ⚙️ Funzione raccomandazione
def build_nba_df(y_pred, X_test, product_type_label, epsilon=0.05):
    """Build the next-best-action table for clients predicted to have a need.

    For every test client with ``y_pred == 1`` the product of the requested
    type with the highest ``Risk`` not exceeding ``client risk + epsilon`` is
    recommended. Clients with no suitable product get the placeholder
    product ID 0, risk 0 and name 'N/A'.

    Reads the notebook-level ``needs_df`` and ``products_df``.

    Parameters
    ----------
    y_pred : numpy array of 0/1 predictions, aligned row-by-row with X_test.
    X_test : pandas.DataFrame
        Test features; its index values are used as row positions of
        ``needs_df``.
    product_type_label : value matched against ``products_df['Type']``.
    epsilon : float, tolerated excess of product risk over client risk.

    Returns
    -------
    pandas.DataFrame
        Columns ClientID, ClientRiskPropensity, RecommendedProductID,
        ProductRiskLevel and ProductName. The columns are present even when
        no client is selected.
    """
    columns = ['ClientID', 'ClientRiskPropensity', 'RecommendedProductID',
               'ProductRiskLevel', 'ProductName']

    client_indices = np.where(y_pred == 1)[0]
    clients = needs_df.iloc[X_test.index[client_indices]]
    client_ids = clients.index.values
    # Take the risk propensity from needs_df, not from X_test: in this notebook
    # X_test comes out of prepare_features, which min-max scales every feature,
    # so its RiskPropensity is not on the same scale as products_df['Risk'].
    client_risks = clients['RiskPropensity'].values

    product_pool = products_df[products_df['Type'] == product_type_label]
    nba_records = []

    for cid, crisk in zip(client_ids, client_risks):
        suitable = product_pool[product_pool['Risk'] <= crisk + epsilon]
        if suitable.empty:
            # No product within the client's risk budget.
            nba_records.append({
                'ClientID': cid,
                'ClientRiskPropensity': crisk,
                'RecommendedProductID': 0,
                'ProductRiskLevel': 0,
                'ProductName': 'N/A'
            })
            continue

        # Riskiest product that is still within the budget.
        best_product = suitable.loc[suitable['Risk'].idxmax()]
        nba_records.append({
            'ClientID': cid,
            'ClientRiskPropensity': crisk,
            'RecommendedProductID': best_product['IDProduct'],
            'ProductRiskLevel': best_product['Risk'],
            'ProductName': best_product['ProductName']
        })

    # Explicit columns keep the schema stable when nba_records is empty.
    return pd.DataFrame(nba_records, columns=columns)

# 📊 Analysis
def plot_recommendation_analysis(nba_df, label, epsilon=0.05, color='#4682B4'):
    """Plot and print a summary of a next-best-action table.

    Produces, in order: a histogram of the targeted clients' risk propensity,
    printed coverage statistics, a bar chart plus the top-3 list of the
    recommended products (only when at least one valid recommendation exists),
    and a client-risk vs product-risk scatter with the ``+epsilon`` band.
    Product details for the top-3 list are read from the module-level
    ``products_df``.

    Parameters
    ----------
    nba_df : pandas.DataFrame
        Table with ``ClientRiskPropensity``, ``RecommendedProductID`` and
        ``ProductRiskLevel`` columns; ``RecommendedProductID`` values that are
        not greater than 0 are counted as "no suitable recommendation".
    label : str
        Name of the investment need, used in titles and messages.
    epsilon : float, default 0.05
        Width of the excess-risk band drawn above the diagonal.
    color : str, default '#4682B4'
        Colour of the histogram and bar chart.

    Returns
    -------
    None
    """
    # Nothing to plot when no client was targeted: bail out instead of failing
    # on missing columns or printing NaN percentages.
    if nba_df.empty:
        print(f"\nNo target clients for {label}: nothing to analyze.")
        return

    plt.figure(figsize=(10, 6))
    plt.hist(nba_df['ClientRiskPropensity'], bins=10, color=color, alpha=0.7)
    plt.title(f'Risk Propensity of Target Clients ({label})')
    plt.xlabel('Risk Propensity')
    plt.ylabel('Frequency')
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.show()

    total = len(nba_df)
    has_product = nba_df['RecommendedProductID'] > 0
    valid = has_product.sum()
    print(f"\n📊 Recommendation statistics ({label}):")
    print(f"Total customers analyzed: {total}")
    print(f"Customers with valid recommendations: {valid} ({100 * valid / total:.2f}%)")
    print(f"Customers without suitable recommendations: {total - valid}")

    if valid > 0:
        plt.figure(figsize=(12, 6))
        counts = nba_df[has_product]['RecommendedProductID'].value_counts().sort_index()
        plt.bar(counts.index.astype(str), counts.values, color=color)
        plt.title(f'Frequency distribution of recommended {label.lower()} products')
        plt.xlabel('Product ID')
        plt.ylabel('Number of recommendations')
        plt.xticks(rotation=45)
        plt.grid(axis='y', alpha=0.3)
        plt.tight_layout()
        plt.show()

        top3 = counts.nlargest(3).index
        print(f"\n🏅 Top 3 recommended {label.lower()} products:")
        for pid in top3:
            row = products_df[products_df['IDProduct'] == pid].iloc[0]
            print(f"\n🔹 Product ID: {pid}")
            print(f"Name: {row['ProductName']}")
            print(f"Risk: {row['Risk']}")
            print(f"Recommended to {counts[pid]} clients")

    plt.figure(figsize=(10, 8))
    sc = plt.scatter(nba_df['ClientRiskPropensity'], nba_df['ProductRiskLevel'],
                     c=nba_df['ProductRiskLevel'], cmap='viridis', s=80, alpha=0.9)
    # Extend the diagonal slightly past the largest observed value.
    max_val = max(nba_df['ClientRiskPropensity'].max(), nba_df['ProductRiskLevel'].max()) + epsilon + 0.05
    x_vals = np.linspace(0, max_val, 500)
    plt.fill_between(x_vals, x_vals, x_vals + epsilon, color='red', alpha=0.2, label=f'Excess Risk Zone (+ε={epsilon})')
    plt.plot(x_vals, x_vals, 'r--', label='Perfect match', alpha=0.8)
    plt.plot(x_vals, x_vals + epsilon, 'r:', alpha=0.5)
    plt.colorbar(sc, label='Product Risk Level')
    plt.title(f'Suitability: Client Risk Propensity vs Product Risk ({label})')
    plt.xlabel('Client Risk Propensity')
    plt.ylabel('Product Risk Level')
    plt.legend()
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.show()

# 🧾 Build the two recommendation tables, then visualize each of them
epsilon = 0.025
nba_income = build_nba_df(preds_i, X_test_i, product_type_label=0, epsilon=epsilon)
nba_accum = build_nba_df(preds_a, X_test_a, product_type_label=1, epsilon=epsilon)

# Same analysis for both needs; only the table, the label and the colour change.
for nba_table, need_label, need_color in (
    (nba_income, 'IncomeInvestment', '#2E8B57'),
    (nba_accum, 'AccumulationInvestment', '#4682B4'),
):
    plot_recommendation_analysis(nba_table, need_label, epsilon=epsilon, color=need_color)


RISK PROPENSITY PREDICTION + MANUAL FEATURE = RECOMMENDATION PROCESS

In [None]:
# ⚠️ Make sure needs_df and products_df are already loaded

# 🔁 Model that estimates RiskPropensity (used only by the widget).
# The income column is addressed as 'Income ' (with a trailing space).
risk_train_X = needs_df[['Age', 'Gender', 'FamilyMembers', 'FinancialEducation', 'Income ', 'Wealth']]
risk_train_y = needs_df['RiskPropensity']
risk_model = XGBRegressor()
risk_model.fit(risk_train_X, risk_train_y)

# 🧠 Feature engineering for a single client
# (intended to mirror the features used by the classifiers — training code not shown here)
def feature_engineering(input_dict):
    """Turn one client's raw attributes into a single-row feature frame.

    Parameters
    ----------
    input_dict : mapping
        Must provide the keys ``Income``, ``Wealth``, ``RiskPropensity``,
        ``Age``, ``FamilyMembers``, ``FinancialEducation`` and ``Gender``.

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``Age``, ``Gender``, ``FamilyMembers``,
        ``FinancialEducation``, ``RiskPropensity``, ``Wealth_log``,
        ``Income_log``, ``Income_Wealth_Ratio_log``, ``Is_Single``,
        ``Is_Senior``, ``Has_Education`` and ``Risk_Age_Interaction``,
        in that order.
    """
    income = input_dict['Income']
    wealth = input_dict['Wealth']
    risk = input_dict['RiskPropensity']
    age = input_dict['Age']
    household_size = input_dict['FamilyMembers']
    fin_edu = input_dict['FinancialEducation']

    # Raw attributes first, in the column order of the output frame.
    row = {
        "Age": age,
        "Gender": input_dict['Gender'],
        "FamilyMembers": household_size,
        "FinancialEducation": fin_edu,
        "RiskPropensity": risk,
    }

    # Log-transformed monetary features.
    row["Wealth_log"] = np.log1p(wealth)
    row["Income_log"] = np.log1p(income)
    if wealth != 0:
        row["Income_Wealth_Ratio_log"] = np.log1p(income / wealth)
    else:
        # Ratio undefined with zero wealth: fall back to log1p(income).
        row["Income_Wealth_Ratio_log"] = np.log1p(income)

    # Binary flags and the risk/age interaction term.
    row["Is_Single"] = int(household_size == 1)
    row["Is_Senior"] = int(age > 65)
    row["Has_Education"] = int(fin_edu > 0.1)
    row["Risk_Age_Interaction"] = risk * age

    return pd.DataFrame([row])

# 📊 Recommendation system
def recommend_products(input_data, epsilon=0.05):
    """Recommend at most one income and one accumulation product for a client.

    The client's features are built with ``feature_engineering``; the
    module-level ``stack_income`` / ``stack_accum`` classifiers give the
    probability of each need, which is compared with ``best_t_i`` /
    ``best_t_a``. For every predicted need, the product of the matching type
    (0 = income, 1 = accumulation) with the highest ``Risk`` not exceeding
    ``RiskPropensity + epsilon`` is selected from ``products_df``.

    Parameters
    ----------
    input_data : mapping
        Raw client attributes accepted by ``feature_engineering``.
    epsilon : float, default 0.05
        Tolerance added to the client's risk propensity.

    Returns
    -------
    pandas.DataFrame
        One row per recommendation. If nothing is recommended — either no need
        was predicted or no product fits the risk limit — a single placeholder
        row ("No Investment Needed") is returned instead.
    """
    feats = feature_engineering(input_data)
    prob_income = stack_income.predict_proba(feats)[0, 1]
    prob_accum = stack_accum.predict_proba(feats)[0, 1]

    need_income = int(prob_income >= best_t_i)
    need_accum = int(prob_accum >= best_t_a)

    # (need flag, product type code, label shown to the user, probability)
    candidates = (
        (need_income, 0, "Income", prob_income),
        (need_accum, 1, "Accumulation", prob_accum),
    )

    rows = []
    for need_flag, type_code, type_name, prob in candidates:
        if need_flag != 1:
            continue
        client_risk = feats['RiskPropensity'].iloc[0]
        risk_limit = client_risk + epsilon
        matches = products_df[(products_df['Type'] == type_code) & (products_df['Risk'] <= risk_limit)]
        if matches.empty:
            continue
        # Riskiest product still within the client's limit.
        pick = matches.loc[matches['Risk'].idxmax()]
        rows.append({
            "InvestmentType": type_name,
            "RecommendedProductID": int(pick['IDProduct']),
            "ProductName": pick['ProductName'],
            "ProductRisk": pick['Risk'],
            "PredictedProb": round(prob, 3)
        })

    if len(rows) == 0:
        rows.append({
            "InvestmentType": "None",
            "RecommendedProductID": 0,
            "ProductName": "No Investment Needed",
            "ProductRisk": "-",
            "PredictedProb": "-"
        })

    return pd.DataFrame(rows)



# 🧩 Interactive widget
def launch_widget():
    """Show a slider-based form and print a product recommendation on click.

    The form collects age, gender, family size, financial education, income
    and wealth. On click, ``risk_model`` predicts the client's risk propensity
    from those six values and ``recommend_products`` is called with the
    completed input.

    NOTE: a later cell of this notebook defines another ``launch_widget``;
    once that cell runs, it replaces this definition.
    """
    age = widgets.IntSlider(value=35, min=18, max=90, description='Age:')
    gender = widgets.Dropdown(options=[('Male', 0), ('Female', 1)], value=0, description='Gender:')
    family = widgets.IntSlider(value=2, min=1, max=10, description='Family:')
    education = widgets.FloatSlider(value=0.5, min=0, max=1, step=0.01, description='Education:')
    income = widgets.FloatSlider(value=60, min=5, max=250, step=1.0, description='Income:')
    wealth = widgets.FloatSlider(value=80, min=5, max=1000, step=1.0, description='Wealth:')
    button = widgets.Button(description="Get Recommendation")

    controls = [age, gender, family, education, income, wealth, button]
    ui = widgets.VBox(controls)

    def on_click(b):
        # Redraw the form so the previous result is replaced.
        clear_output(wait=True)
        display(ui)

        user_input = dict(
            Age=age.value,
            Gender=gender.value,
            FamilyMembers=family.value,
            FinancialEducation=education.value,
            Income=income.value,
            Wealth=wealth.value,
        )

        # 🔍 RiskPropensity prediction: the regressor expects the income column
        # under the name 'Income ' (trailing space); the other names already match.
        X_risk = pd.DataFrame([user_input]).rename(columns={'Income': 'Income '})
        predicted_risk = float(risk_model.predict(X_risk)[0])
        user_input['RiskPropensity'] = predicted_risk

        print(f"\n📊 Predicted Risk Propensity: {user_input['RiskPropensity']:.3f}")

        # 🎯 Recommendations
        recommendation = recommend_products(user_input)
        display(recommendation)

    button.on_click(on_click)
    display(ui)

# ▶️ Run
launch_widget()


In [None]:
# ✅ Choose the model and the data (Income or Accumulation)
model = stack_income  # or stack_accum
X_eval = X_test_i.copy()  # or X_test_a
y_eval = y_test_i.copy()  # or y_test_a

# ⚙️ Permutation importance, scored with F1
result = permutation_importance(model, X_eval, y_eval, scoring='f1', n_repeats=10, random_state=42)

# 📊 Collect the results, most important feature first
importances = (
    pd.DataFrame({
        'Feature': X_eval.columns,
        'Importance Mean': result.importances_mean,
        'Importance Std': result.importances_std,
    })
    .sort_values(by='Importance Mean', ascending=False)
)

# 🔝 Plot
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(importances['Feature'], importances['Importance Mean'],
        xerr=importances['Importance Std'], color='skyblue')
ax.invert_yaxis()  # most important feature at the top
ax.set_title("Permutation Importance (F1-score drop)")
ax.set_xlabel("Decrease in F1-score when feature is permuted")
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()

# 📄 (optional) Also show the table
importances.reset_index(drop=True).head(10)


In [None]:
# ✅ 1. Permutation importance on stack_accum, scored with F1
model = stack_accum
X_eval = X_test_a.copy()
y_eval = y_test_a.copy()

result = permutation_importance(model, X_eval, y_eval, scoring='f1', n_repeats=10, random_state=42)

# Most important feature first
importances = (
    pd.DataFrame({
        'Feature': X_eval.columns,
        'Importance Mean': result.importances_mean,
        'Importance Std': result.importances_std,
    })
    .sort_values(by='Importance Mean', ascending=False)
)

# ✅ 2. Plot of the importances
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(importances['Feature'], importances['Importance Mean'],
        xerr=importances['Importance Std'], color='salmon')
ax.invert_yaxis()  # most important feature at the top
ax.set_title("Permutation Importance - AccumulationInvestment (F1-score drop)")
ax.set_xlabel("Decrease in F1-score when feature is permuted")
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()




PARTE DI ALE:
Unire quanto fatto da Cami e Cecio (predizione della risk propensity a partire dai dati inseriti con gli slider) con le informazioni fornite dal professore, tramite un questionario in stile MiFID II.

Faccio domande per stabilire la financial literacy, così da non doverla chiedere tramite slider al cliente.
Poi usiamo algoritmo già fatto da Cami e Cecio per predire risk propensity da quei dati e facciamo media pesata con lo score che esce dal questionario MiFID per la risk propensity.
Infine, come prima, consigliamo i prodotti adeguati.


In [None]:
# ———————————————————————————————————————————
# ⚠️ Make sure needs_df, products_df, stack_income, stack_accum,
# best_t_i, best_t_a are already loaded/ready
# ———————————————————————————————————————————

# 🔁 Model that estimates RiskPropensity (used only by the widget).
# The income column is addressed as 'Income ' (with a trailing space).
risk_train_X = needs_df[['Age', 'Gender', 'FamilyMembers', 'FinancialEducation', 'Income ', 'Wealth']]
risk_train_y = needs_df['RiskPropensity']
risk_model = XGBRegressor()
risk_model.fit(risk_train_X, risk_train_y)

# 👇 Global output widget, shared by the UI and the click handler
output = widgets.Output()

# 🧩 MiFID II question definitions
# Financial-literacy questionnaire: ten questions, each a dict with the question
# text and an 'options' mapping from answer label to its score. The highest
# score available in every question is 0.1.
financial_lit_questions = [
    { 'question': 'What is your education title?',
      'options': { 'No': 0.0, 'High School Diploma': 0.015, 'Bachelor Degree': 0.025,
                   'Bachelor Degree in economic/financial subjects': 0.075,
                   'Master Degree': 0.05, 'Master Degree in economic/financial subjects': 0.1 }
    },
    { 'question': 'Have you worked in the financial industry?',
      'options': {'Yes': 0.1, 'No': 0.0}
    },
    { 'question': 'Flag the most risky financial instruments in which you have invested',
      'options': {'Equity': 0.04, 'Mutual funds/Sicav/ETFs': 0.015, 'Bonds': 0.02,
                  'Government Bonds': 0.015,
                  'Structured Bonds (equity linked, reverse floater, reverse convertible)': 0.06,
                  'Insurance Products': 0.008, 'Covered Warrants/Warrants/Investment Certificates': 0.06,
                  'Portfolio Management': 0.04,
                  'Financial Derivatives (e.g. Options/Swaps/leveraged instruments)': 0.1 }
    },
    { 'question': 'With what frequency did you invest in financial products in the last 5 years?',
      'options': {'More than 10 times a year': 0.1, 'Between 5 and 10': 0.05, 'Less than 5': 0.0}
    },
    { 'question': 'The rating is a score expressed by an independent third party entity that measures?',
      'options': {'The solidity of an enterprise': 0.1, 'The productivity rate of an enterprise': 0.015,
                  'The revenues of a company': 0.0}
    },
    { 'question': 'What is an option?',
      'options': {'It is a financial contract whose value depends on the movements of an underlying asset': 0.1,
                  'An investment contract similar to equity and/or Bonds': 0.06,
                  'An instrument with guaranteed capital': 0.0}
    },
    { 'question': 'What happens to the owners of subordinated bonds in insolvency of the issuer?',
      'options': {'They never get reimbursed': 0.05,
                  'They get reimbursed just after the owners of non-subordinated bonds': 0.1,
                  'They get reimbursed with stocks': 0.0}
    },
    { 'question': 'What is a FX Swap?',
      'options': {'A swap on interest rates': 0.01,
                  'A product combining a spot and a forward currency contract': 0.1,
                  'Do not know': 0.0}
    },
    { 'question': 'What is the frequency of publication of the NAV of Alternative funds?',
      'options': {'At least twice a year': 0.1, 'Daily': 0.03, 'Do not know': 0.0}
    },
    { 'question': 'In a Credit Linked Note (CLN), what is the reimbursement of the capital tied to?',
      'options': {'The risk of default of the issuer': 0.03,
                  'The risk of default of the issuer and the reference entity': 0.1,
                  'The risk of default of the reference entity only': 0.0}
    }
]

# Risk-propensity questionnaire: four questions in the same format as
# financial_lit_questions ('question' text plus an 'options' mapping from
# answer label to score). The highest score available in every question is 0.25.
risk_propensity_questions = [
    { 'question': 'How would you react to a loss of 10% on your investment portfolio?',
      'options': {'I would sell everything': 0.0,
                  'I would wait and see what happens': 0.12,
                  'I would buy more': 0.25}
    },
    { 'question': 'What is your investment goal on a 5 year horizon?',
      'options': {'Low returns but minimal risk of loss (gain 1%, loss 1%)': 0.04,
                  'Normal returns with limited loss (gain 5%, loss 5%)': 0.1,
                  'High return with high risk (gain 50%, loss 50%)': 0.25}
    },
    { 'question': 'Which investment strategy aligns with your goals?',
      'options': {'Liquidity: protect capital (≤1 year horizon)': 0.0,
                  'Short term: protect capital with modest growth (≤3 years)': 0.09,
                  'Savings: high protection with growth (≤5 years)': 0.15,
                  'Long-medium term: significant growth (>5 years)': 0.2,
                  'Speculative': 0.25}
    },
    { 'question': 'If a diversified portfolio showed -25% tech equities, -15% high-yield bonds, +5% commodities, what would you do?',
      'options': {'Rebalance towards defensive assets': 0.04,
                  'Buy more at lower prices': 0.25,
                  'Exit the markets': 0.0,
                  'Maintain original strategy': 0.15}
    }
]

# Build the widgets for every question once
def _make_question_widgets(questions):
    """Create one radio-button group (with a bold title) per question.

    Returns a pair ``(radios, boxes)``: the RadioButtons widgets, used to read
    the answers, and the VBox containers (title + radio group) to be laid out.
    """
    radios, boxes = [], []
    for item in questions:
        title = widgets.HTML(f"<div style='font-size:12px; font-weight:bold; width:600px;'>{item['question']}</div>",
                             layout=Layout(margin='10px 0px'))
        choice = widgets.RadioButtons(options=list(item['options'].keys()), description='',
                                      layout=Layout(width='600px', margin='5px 0px'))
        radios.append(choice)
        boxes.append(widgets.VBox([title, choice], layout=Layout(margin='20px 0px')))
    return radios, boxes

lit_widgets, lit_ui = _make_question_widgets(financial_lit_questions)
risk_widgets, risk_ui = _make_question_widgets(risk_propensity_questions)

# Main UI (build_ui)
def build_ui():
    """Assemble the questionnaire form.

    Creates the demographic/financial input controls and the submit button,
    then stacks them with the pre-built question boxes (``lit_ui``,
    ``risk_ui``) and the global ``output`` widget.

    Returns
    -------
    tuple
        ``(ui, age, gender, family, income, wealth, button)`` — the container
        followed by the controls the click handler needs to read.
    """
    def field_layout():
        # A fresh Layout per control, as each widget owns its own layout.
        return Layout(margin='10px 0px', width='600px')

    def section_header(html):
        return widgets.HTML(value=html, layout=Layout(margin='20px 0px'))

    age = widgets.IntSlider(value=35, min=18, max=90, description='Età:', layout=field_layout())
    gender = widgets.Dropdown(options=[('Maschio', 0), ('Femmina', 1)], value=0,
                              description='Genere:', layout=field_layout())
    family = widgets.IntSlider(value=2, min=1, max=10, description='Componenti famiglia:',
                               layout=field_layout())
    income = widgets.FloatSlider(value=60, min=5, max=500, step=1.0, description='Reddito:',
                                 layout=field_layout())
    wealth = widgets.FloatSlider(value=80, min=5, max=5000, step=1.0, description='Patrimonio:',
                                 layout=field_layout())
    button = widgets.Button(description="Ottieni raccomandazione",
                            layout=Layout(margin='30px 0px', width='200px'))

    header_lit = section_header("<h4>Questionario Financial Literacy</h4>")
    header_risk = section_header("<h4>Questionario Risk Propensity</h4>")

    # Inputs, then the two questionnaires, then the button and the shared output area.
    children = [age, gender, family, income, wealth]
    children += [header_lit] + list(lit_ui)
    children += [header_risk] + list(risk_ui)
    children += [button, output]

    ui = widgets.VBox(children, layout=Layout(row_gap='10px', align_items='flex-start'))

    return ui, age, gender, family, income, wealth, button

# Launch the widget and wire up the scoring logic
def launch_widget():
    """Display the questionnaire form and recommend products on click.

    On click, the handler:
      1. reads the demographic/financial inputs;
      2. scores the two questionnaires (financial literacy = 10 x mean answer
         score, risk = 4 x mean answer score);
      3. predicts the risk propensity with ``risk_model``, using the
         questionnaire literacy score as ``FinancialEducation``;
      4. blends model and questionnaire risk (70% / 30%);
      5. prints the intermediate values, displays the output of
         ``recommend_products`` and prints the literacy / risk levels.

    Everything is rendered inside the global ``output`` widget.
    """
    ui, age, gender, family, income, wealth, button = build_ui()
    display(ui)

    def on_click(b):
        clear_output(wait=True)
        display(ui)
        # Drop the previous result shown in the output widget
        output.clear_output()
        with output:
            # 1) Base inputs
            user_input = {"Age": age.value, "Gender": gender.value,
                          "FamilyMembers": family.value,
                          "Income": income.value, "Wealth": wealth.value}

            # 2) MiFID scoring: look up the score of each selected answer
            lit_scores = [q['options'][w.value]
                          for q, w in zip(financial_lit_questions, lit_widgets)]
            mifid_lit = 10 * np.mean(lit_scores) if lit_scores else 0
            risk_scores = [q['options'][w.value]
                           for q, w in zip(risk_propensity_questions, risk_widgets)]
            mifid_risk = 4 * np.mean(risk_scores) if risk_scores else 0

            user_input['FinancialEducation'] = mifid_lit

            # 3) Model input, built BY NAME in the column order the regressor was
            #    fitted on. FinancialEducation is the last key added to user_input,
            #    so relabelling the frame's columns by position would feed Income
            #    as FinancialEducation, Wealth as Income and the literacy score as
            #    Wealth. The income column is named 'Income ' (trailing space).
            X_risk = pd.DataFrame([{
                'Age': user_input['Age'],
                'Gender': user_input['Gender'],
                'FamilyMembers': user_input['FamilyMembers'],
                'FinancialEducation': user_input['FinancialEducation'],
                'Income ': user_input['Income'],
                'Wealth': user_input['Wealth'],
            }])
            model_risk = float(risk_model.predict(X_risk)[0])

            # 4) Combine model estimate and questionnaire score
            combined_risk = 0.7 * model_risk + 0.3 * mifid_risk
            user_input['RiskPropensity'] = combined_risk

            # 5) Print intermediate values
            print(f"📊 Modello Risk (XGB): {model_risk:.3f}")
            print(f"📊 MiFID Risk: {mifid_risk:.3f}")
            print(f"📊 Risk combinato: {combined_risk:.3f}")
            print(f"📚 Financial Literacy MiFID: {mifid_lit:.3f}\n")

            # 6) Recommendation
            rec = recommend_products(user_input)
            display(rec)

            # 7) Levels: bucket the scores into 6 and 4 classes, clamped to the
            #    valid range of each label list
            edu_idx = min(max(int(np.ceil(mifid_lit*6)), 1), 6)
            edu_lbl = ['Low', 'Medium', 'Medium-High', 'High', 'Very High', 'Advanced'][edu_idx-1]
            risk_idx = min(max(int(np.ceil(combined_risk*4)), 1), 4)
            risk_lbl = ['Low','Medium-Low','Medium-High','High'][risk_idx-1]

            print(f"💡 Financial Literacy Level: {edu_idx} ({edu_lbl})")
            print(f"💡 Risk Propensity Level: {risk_idx} ({risk_lbl})")

    button.on_click(on_click)

# ▶️ Run the widget
launch_widget()


In [None]:
# TO RUN VOILA, use the following
# voila Final_proj.ipynb --TagRemovePreprocessor.enabled=True --TagRemovePreprocessor.remove_input_tags='{"remove_input"}' --TagRemovePreprocessor.remove_cell_tags='{"remove_cell"}'
