## LIBRERÍAS

In [15]:
# Manejo de datos
import pandas as pd
import numpy as np

# Baselines y optimización
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV


from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, AdaBoostClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, precision_score, recall_score, \
roc_curve, roc_auc_score, ConfusionMatrixDisplay, multilabel_confusion_matrix


# Warnings
import warnings
warnings.filterwarnings("ignore")

## CARGA DE DATOS

In [2]:
# Feature matrix and target live in two separate CSV files; the rows of both
# are presumably aligned one-to-one (verify against the preprocessing step).
ruta_features = "./data/X_NEW.csv"
ruta_target = "./data/transformed_df1.csv"

X = pd.read_csv(ruta_features)

# Only the "income" column of the second file is used as the label vector.
df_y = pd.read_csv(ruta_target)
y = df_y.loc[:, "income"]

In [8]:
# Count the rows per target class to see how balanced the labels are.
y.value_counts()

income
0.0    29265
1.0     9975
Name: count, dtype: int64

## SEPARACIÓN DE DATOS

In [5]:
# Hold out 20% of the rows as the final test set.
# Features and labels are split in ONE call so that both are guaranteed to
# receive the same row partition; if X and y ever differ in length,
# train_test_split raises instead of silently misaligning them. With the same
# seed and sample count this yields the same partition as two separate calls.
TRAIN, TEST, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Split the remaining 80% again: 70% for fitting, 30% for validation.
# Note that y_train is rebound here to the labels of the final training subset.
X_train, X_val, y_train, y_val = train_test_split(TRAIN, y_train, test_size=0.3, random_state=42)

## BASELINES

In [18]:
# Baseline candidates, all with default hyperparameters.
# The estimators that involve randomness are seeded so that a full re-run
# reproduces the same table (same seed as the other cells of this notebook).
modelos = {
    "LogisticRegression": LogisticRegression(),
    "SVC": SVC(),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "RandomForest": RandomForestClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "GradientBoost": GradientBoostingClassifier(random_state=42),
    "HistGradientBoosting": HistGradientBoostingClassifier(random_state=42)
}

# Metrics evaluated for every model during cross-validation.
metricas = ["accuracy", "f1_macro"]

# Maps "<model>_<metric>" to the mean score over the 5 CV folds.
resultados_dict = {}

for nombre_modelo, modelo in modelos.items():
    cv_resultados = cross_validate(modelo, X_train, y_train, cv=5, scoring=metricas)

    # One entry per metric; cross_validate exposes the fold scores of each
    # metric under the key "test_<metric>".
    for metrica in metricas:
        clave = f"{nombre_modelo}_{metrica}"
        resultados_dict[clave] = cv_resultados[f"test_{metrica}"].mean()

# Single-row frame, transposed so each model/metric pair is shown on its own row.
resultados_df = pd.DataFrame([resultados_dict])

resultados_df.T

Unnamed: 0,0
LogisticRegression_accuracy,0.799582
LogisticRegression_f1_macro,0.687531
SVC_accuracy,0.745244
SVC_f1_macro,0.427014
DecisionTree_accuracy,0.799126
DecisionTree_f1_macro,0.738486
RandomForest_accuracy,0.836671
RandomForest_f1_macro,0.774057
AdaBoost_accuracy,0.850505
AdaBoost_f1_macro,0.785294


In [19]:
# The three candidates are wrapped in a VotingClassifier so that a single
# fit() call trains all of them on the same training data.
candidatos = [
    ("dt", DecisionTreeClassifier(random_state=42)),
    ("hgb", HistGradientBoostingClassifier(random_state=42)),
    ("rf", RandomForestClassifier(random_state=42)),
]
VotingC = VotingClassifier(estimators=candidatos)

# Train the ensemble (and therefore each member).
VotingC.fit(X_train, y_train)

# Report the validation score of every fitted member individually; the
# ensemble's own vote is not evaluated here.
for nombre, estimador in VotingC.named_estimators_.items():
    puntuacion = estimador.score(X_val, y_val)
    print(nombre, "=", puntuacion)

dt = 0.8042047143767255
hgb = 0.8593119558292631
rf = 0.8385007432575918


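Elegimos hgb (HistGradientBoostingClassifier), ya que obtiene la mejor puntuación en validación (0.859 frente a 0.839 de rf y 0.804 de dt).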

In [20]:
# Hyperparameter grid to explore.
# max_bins is limited to 255: HistGradientBoostingClassifier rejects larger
# values, so a 512 candidate only produces failed fits with NaN scores.
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'max_iter': [100, 500, 1000],
    'max_depth': [3, 5, 7],
    'min_samples_leaf': [10, 20, 50],
    'l2_regularization': [0.0, 0.1, 0.5],
    'max_bins': [255]
}

# Base model whose hyperparameters are tuned.
hgb = HistGradientBoostingClassifier(random_state=42)

# Exhaustive search with 5-fold cross-validation.
# A classification metric is used for model selection; the previous
# 'neg_root_mean_squared_error' is a regression metric (on 0/1 labels it is
# just the square root of the error rate). 'f1_macro' is a reasonable
# alternative if the minority class should weigh more.
grid_search = GridSearchCV(hgb,
                           param_grid,
                           cv=5,
                           scoring='accuracy',
                           n_jobs=-1
                          )

# Run the search on the training split only.
grid_search.fit(X_train, y_train)

# Keep the best model (refit on the whole training split) for the cells below.
hgb = grid_search.best_estimator_

# Show the winning hyperparameter combination.
print(grid_search.best_params_)

{'l2_regularization': 0.1, 'learning_rate': 0.1, 'max_bins': 255, 'max_depth': 3, 'max_iter': 500, 'min_samples_leaf': 10}


## MÉTRICAS Y VALIDACIÓN 

In [28]:
# Score the tuned model on the same data it was fitted on; used as the
# reference point for the validation and test scores.
pred = hgb.predict(X_train)

train_accuracy = accuracy_score(y_train, pred)
train_f1 = f1_score(y_train, pred)

print("train accuracy:", train_accuracy)
print('train f1:', train_f1)

train accuracy: 0.8650678074087558
train f1: 0.703648175912044


In [27]:
# Score the tuned model on the validation split (not used for fitting).
pred = hgb.predict(X_val)

for etiqueta, metrica in (("val accuracy:", accuracy_score), ('val f1:', f1_score)):
    print(etiqueta, metrica(y_val, pred))

val accuracy: 0.8603737523890422
val f1: 0.6905154153918569


In [26]:
# Final check on the held-out test split.
pred_test = hgb.predict(TEST)

test_accuracy = accuracy_score(y_test, pred_test)
test_f1 = f1_score(y_test, pred_test)

print("accuracy:", test_accuracy)
print("f1:", test_f1)

accuracy: 0.8688837920489296
f1: 0.7110362257792755


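La bondad del modelo es buena: la accuracy es muy similar en train (0.865), validación (0.860) y test (0.869), por lo que no se aprecia overfitting ni underfitting. El F1 de la clase positiva (entre 0.69 y 0.71) es más bajo que la accuracy, algo esperable dado el desbalance de clases.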