# Hist Gradient Boosting
Usa histogramas para acelerar el entrenamiento, y fue inspirado en implementaciones como LightGBM. Soporta directamente variables categóricas, sin requerir one-hot encoding.

In [2]:
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
import numpy as np
import shap
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import confusion_matrix, roc_curve, auc, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the dataset that still holds the categorical columns un-encoded.
# NOTE(review): if the CSV was written together with its index, an
# 'Unnamed: 0' column would end up among the features — confirm against the file.
df_encoded = pd.read_csv('dataset/df_no_encoding.csv')

# Every column except the label is used as a predictor.
target = 'decoded_target'
features = df_encoded.columns.drop(target).tolist()

X, y = df_encoded[features], df_encoded[target]

# No scaling is applied here: X_scaled is just another name for X, kept in
# case other cells still refer to it.
X_scaled = X

# Stratified 75/25 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

In [4]:
# Display the loaded frame as the cell output (the rendering below is
# abbreviated to its first and last rows).
df_encoded

Unnamed: 0,job,default,contact,month,poutcome,cons.conf.idx,nr.employed,decoded_target,pdays_was_contacted,contact_type
0,housemaid,no,telephone,may,nonexistent,-36.4,5191.0,0,0,0
1,services,unknown,telephone,may,nonexistent,-36.4,5191.0,0,0,0
2,services,no,telephone,may,nonexistent,-36.4,5191.0,0,0,0
3,admin.,no,telephone,may,nonexistent,-36.4,5191.0,0,0,0
4,services,no,telephone,may,nonexistent,-36.4,5191.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
40099,retired,no,cellular,nov,nonexistent,-50.8,4963.6,1,0,0
40100,blue-collar,no,cellular,nov,nonexistent,-50.8,4963.6,0,0,0
40101,retired,no,cellular,nov,nonexistent,-50.8,4963.6,0,0,0
40102,technician,no,cellular,nov,nonexistent,-50.8,4963.6,1,0,0


In [7]:

# Train the final histogram-based gradient boosting classifier.
# The columns listed below are declared as categorical so the estimator
# handles them natively instead of requiring one-hot encoding.
categorical_columns = [
    'job',
    'default',
    'contact',
    'month',
    'poutcome',
    'pdays_was_contacted',
    'contact_type',
]
final_model = HistGradientBoostingClassifier(
    learning_rate=0.01,
    max_iter=500,
    max_leaf_nodes=15,
    early_stopping=False,  # always run the full max_iter boosting rounds
    categorical_features=categorical_columns,
    random_state=42,
)
final_model.fit(X_train, y_train)

# Hard class predictions and the probability of the positive class (column 1)
# on the held-out set.
y_pred = final_model.predict(X_test)
y_proba = final_model.predict_proba(X_test)[:, 1]

# --- 1. Confusion Matrix ---
# Heatmap of actual (rows) versus predicted (columns) classes on the test set.
# NOTE(review): the tick labels assume class 0 means 'No' and class 1 means
# 'Yes' — confirm against how decoded_target was built.
labels = ['No', 'Yes']
cm = confusion_matrix(y_test, y_pred)
cm_fig = px.imshow(
    cm,
    x=labels,
    y=labels,
    text_auto=True,
    title="Confusion Matrix",
    labels={"x": "Predicted", "y": "Actual", "color": "Count"},
)
cm_fig.update_layout(height=400, width=500)
cm_fig.show()

# --- 2. ROC Curve ---
# ROC points from the positive-class probabilities; the thresholds returned
# by roc_curve are not needed here.
fpr, tpr, _ = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)

# Model curve plus the diagonal of a random classifier as reference.
roc_fig = go.Figure(data=[
    go.Scatter(x=fpr, y=tpr, mode='lines', name='ROC Curve', line={'color': 'blue'}),
    go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='Random', line={'dash': 'dash'}),
])
roc_fig.update_layout(
    title=f"ROC Curve (AUC = {roc_auc:.4f})",
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    height=500,
    width=600,
)
roc_fig.show()

# --- 3. Lift Curve ---
# Rank the test rows from highest to lowest predicted probability and track
# which share of all positives has been captured after contacting the top-k.
lift_df = (
    pd.DataFrame({'y_true': y_test.values, 'y_proba': y_proba})
    .sort_values('y_proba', ascending=False)
)

# Share of the test set contacted after each ranked row: 1/n, 2/n, ..., 1.
n_clients = len(lift_df)
contacted_share = np.arange(1, n_clients + 1) / n_clients

lift_df['cum_response'] = lift_df['y_true'].cumsum()
lift_df['total_positives'] = lift_df['y_true'].sum()
# Random targeting captures positives in proportion to the share contacted,
# so the baseline must be the diagonal y = x on the same x grid as the curve
# (a separate linspace(0, 1, n) grid is shifted by up to 1/n against it).
lift_df['baseline'] = contacted_share
lift_df['cum_gain'] = lift_df['cum_response'] / lift_df['total_positives']
lift_df['percent_contacted'] = contacted_share

# np.trapezoid only exists in NumPy >= 2.0; older releases expose the same
# routine as np.trapz.
integrate = np.trapezoid if hasattr(np, 'trapezoid') else np.trapz
alift = integrate(lift_df['cum_gain'], lift_df['percent_contacted'])

lift_fig = go.Figure()
lift_fig.add_trace(go.Scatter(
    x=lift_df['percent_contacted'],
    y=lift_df['cum_gain'],
    mode='lines',
    name='Lift Curve',
    line=dict(color='blue')
))
lift_fig.add_trace(go.Scatter(
    x=lift_df['percent_contacted'],
    y=lift_df['baseline'],
    mode='lines',
    name='Random Targeting',
    line=dict(dash='dash', color='gray')
))
lift_fig.update_layout(title=f'Cumulative Lift Curve (ALIFT = {alift:.4f})',
                       xaxis_title='Proportion of Contacted Clients',
                       yaxis_title='Cumulative Gain',
                       width=700,
                       height=500)
lift_fig.show()

Obtenemos un rendimiento muy similar, evitando el procesamiento de encoding. Aun así, en este caso mantengo el encoding porque quiero conservar la gráfica SHAP y:
- SHAP calcula la contribución de cada columna numérica en el input al output del modelo.
- Las columnas categóricas de tipo `category` no tienen una representación numérica clara si no han sido codificadas.
- Incluso si HistGradientBoostingClassifier puede manejar category, SHAP no sabe cómo interpretarlas directamente.