In [2]:
import pandas as pd
from sklearn.exceptions import ConvergenceWarning
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import warnings 
warnings.filterwarnings('ignore', category=ResourceWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=ConvergenceWarning)


from pipeline_function import run_pipeline
from cascade_logistic_model import CascadedLogisticRegression
from cascade_logistic_model import ThresholdedCascadedLogisticRegression
from model_submission import submit

# Read the training and test splits from the local datasets folder.
train, test = map(pd.read_csv, ('./datasets/train.csv', './datasets/test.csv'))

# Column groups passed to run_pipeline together with the two frames.
# NOTE(review): presumably label-encoded vs. one-hot-encoded columns —
# confirm against pipeline_function.run_pipeline.
LABEL_VARS = [
    'sex',
    'fbs',
    'exang',
    'slope',
]
ONEHOT_VARS = [
    'cp',
    'restecg',
]

In [3]:
import plotly.express as px
from sklearn.metrics import confusion_matrix, classification_report

def show_confusion_matrix(
    y_true,
    y_pred,
    title="Matriz de Confusión",
    labels=None,
    save_path="./images/confusion_matrix.png",
):
    """Plot, optionally save, and summarise a confusion matrix.

    The matrix is rendered as a Plotly heatmap, shown inline, written to
    ``save_path`` as a static image, and followed by the printed
    scikit-learn classification report.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.
    title : str, optional
        Figure title, centred horizontally.
    labels : list, optional
        Class labels forwarded to ``confusion_matrix`` and
        ``classification_report`` and used as the heatmap tick labels.
    save_path : str or None, optional
        Destination of the exported image. The default keeps the historical
        location; pass a distinct path per call so successive matrices do not
        overwrite each other, or ``None`` to skip the export entirely.

    Returns
    -------
    The confusion matrix returned by ``sklearn.metrics.confusion_matrix``.
    """
    import os  # local import so the function does not depend on another cell

    cm = confusion_matrix(y_true, y_pred, labels=labels)

    fig = px.imshow(
        cm,
        text_auto=True,
        aspect="auto",
        labels=dict(x="Etiqueta Predicha", y="Etiqueta Verdadera", color="Conteo"),
        x=labels,
        y=labels,
        color_continuous_scale='Blues',
    )

    fig.update_layout(
        title={'text': title, 'x': 0.5},
        title_font_size=20,
        width=700,
        height=600,
    )
    fig.update_traces(textfont_size=14)

    fig.show()

    if save_path is not None:
        # Create the target folder on demand so a fresh checkout does not fail
        # on a missing ./images directory.
        out_dir = os.path.dirname(save_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        # The export deliberately uses its own 700x700 canvas at 3x scale.
        fig.write_image(save_path, scale=3, width=700, height=700)

    print(classification_report(y_true, y_pred, labels=labels, zero_division=0))

    return cm

In [None]:
from sklearn.neural_network import MLPClassifier


# Run the project preprocessing pipeline on both splits.
# NOTE(review): presumably returns scaled train features, the train target and
# scaled test features — confirm against pipeline_function.run_pipeline.
X_train_scaled, y_train, X_test_scaled = run_pipeline(train, test, LABEL_VARS, ONEHOT_VARS)

# Hyper-parameter grids, one per model family. They are kept as separate
# names so each search space can be read (and tweaked) on its own.
logreg_grid = {
    'C': [0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10, 15, 20],
    'solver': ['lbfgs'],
}
random_forest_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
}
svm_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['rbf', 'linear'],
    'gamma': ['scale', 'auto'],
}
gradient_boosting_grid = {
    'n_estimators': [50, 100],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5],
}
knn_grid = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
}
nnet_grid = {
    'hidden_layer_sizes': [(300,), (100,), (50, 50)],
    'activation': ['relu'],
    'solver': ['adam', 'sgd'],
}

# Search space keyed by display name: each entry pairs an unfitted estimator
# with the grid explored for it. Insertion order is the evaluation order.
model_params = {
    'Regresión Logística': {
        'model': LogisticRegression(random_state=42, max_iter=1000),
        'params': logreg_grid,
    },
    'Random Forest': {
        'model': RandomForestClassifier(random_state=42),
        'params': random_forest_grid,
    },
    'SVM': {
        'model': SVC(random_state=42),
        'params': svm_grid,
    },
    'Gradient Boosting': {
        'model': GradientBoostingClassifier(random_state=42),
        'params': gradient_boosting_grid,
    },
    'KNN': {
        'model': KNeighborsClassifier(),
        'params': knn_grid,
    },
    'NNET': {
        'model': MLPClassifier(random_state=42, max_iter=5000),
        'params': nnet_grid,
    },
}

scores = []              # one summary record per model family
best_estimators = {}     # display name -> best estimator found by the search
logreg_results_df = None # per-C results; stays None if logistic regression is not searched

for model_name, spec in model_params.items():
    # Exhaustive grid search: 5-fold CV, accuracy as the criterion, all cores.
    search = GridSearchCV(
        estimator=spec['model'],
        param_grid=spec['params'],
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
    )
    search.fit(X_train_scaled, y_train)

    scores.append({
        'model': model_name,
        'best_score': search.best_score_,
        'best_params': search.best_params_,
    })
    best_estimators[model_name] = search.best_estimator_

    # Only logistic regression keeps its full per-candidate table.
    if model_name != 'Regresión Logística':
        continue

    cv_results = search.cv_results_
    logreg_results_df = pd.DataFrame({
        # param_C is a masked array; .data exposes the raw values.
        'C': cv_results['param_C'].data.astype(float),
        'mean_accuracy': cv_results['mean_test_score'],
        'std_accuracy': cv_results['std_test_score'],
    }).sort_values('C')

# Global results: one row per model family, best cross-validated score first.
results_df = (
    pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
    .sort_values(by='best_score', ascending=False)
)

display(results_df)

# The first row after sorting is the winner; fetch its stored estimator.
best_model_name = results_df['model'].iloc[0]
best_model = best_estimators[best_model_name]

print(f"\nEl mejor modelo seleccionado es: {best_model_name} con Accuracy {results_df.iloc[0]['best_score']:.4f}")

# The per-C table exists only when logistic regression was part of the search.
if logreg_results_df is not None:
    print("\nResultados de Regresión Logística por valor de C:")
    display(logreg_results_df)


from sklearn.model_selection import cross_val_predict

# Out-of-fold predictions for the selected model: every training row is
# predicted by a model that did not see it during fitting (5 folds).
y_cv_pred = cross_val_predict(
    estimator=best_model,
    X=X_train_scaled,
    y=y_train,
    cv=5,
    n_jobs=-1,
)

# Left as the final bare expression so the returned matrix is also echoed
# as the cell output.
show_confusion_matrix(
    y_true=y_train,
    y_pred=y_cv_pred,
    labels=[0, 1, 2, 3, 4],
    title=f"{best_model_name} - Matriz de Confusión",
)

Unnamed: 0,model,best_score,best_params
0,Regresión Logística,0.551906,"{'C': 0.05, 'solver': 'lbfgs'}"
1,Random Forest,0.543752,"{'max_depth': 10, 'min_samples_split': 2, 'n_e..."
3,Gradient Boosting,0.540975,"{'learning_rate': 0.01, 'max_depth': 5, 'n_est..."
2,SVM,0.530034,"{'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}"
4,KNN,0.515003,"{'n_neighbors': 9, 'weights': 'distance'}"
5,NNET,0.505414,"{'activation': 'relu', 'hidden_layer_sizes': (..."



El mejor modelo seleccionado es: Regresión Logística con Accuracy 0.5519

Resultados de Regresión Logística por valor de C:


Unnamed: 0,C,mean_accuracy,std_accuracy
0,0.01,0.550573,0.024094
1,0.05,0.551906,0.021889
2,0.1,0.551896,0.015127
3,0.5,0.534144,0.008332
4,1.0,0.532765,0.008161
5,2.0,0.527304,0.010745
6,5.0,0.525944,0.013034
7,10.0,0.520464,0.010292
8,15.0,0.519094,0.01064
9,20.0,0.517724,0.01164


              precision    recall  f1-score   support

           0       0.71      0.89      0.79       327
           1       0.40      0.41      0.40       156
           2       0.30      0.16      0.21       108
           3       0.32      0.29      0.30       107
           4       0.25      0.03      0.05        34

    accuracy                           0.55       732
   macro avg       0.39      0.36      0.35       732
weighted avg       0.50      0.55      0.52       732



array([[291,  24,   3,   9,   0],
       [ 65,  64,  12,  15,   0],
       [ 23,  37,  17,  31,   0],
       [ 26,  31,  16,  31,   3],
       [  6,   6,   9,  12,   1]])

In [6]:
import plotly.express as px
import kaleido


# Mean cross-validated accuracy of logistic regression for each value of C.
fig = px.line(
    data_frame=logreg_results_df,
    x="C",
    y="mean_accuracy",
    title="Regresión Logística: Accuracy medio por valor de C",
    markers=True,
)

# The C grid spans several orders of magnitude, hence the logarithmic x axis.
layout_options = dict(
    xaxis_title="C",
    yaxis_title="Accuracy medio (CV)",
    xaxis_type="log",
    height=500,
    title_x=0.5,
)
fig.update_layout(**layout_options)

fig.show()

# Static export at 3x scale on an 800x800 canvas.
fig.write_image("./images/logreg_accuracy_vs_C.png", scale=3, width=800, height=800)


In [7]:
%pip install -U "plotly>=6.1.1"


Note: you may need to restart the kernel to use updated packages.


In [8]:
import plotly.express as px

# Plot accuracy per model (CV best_score)
# Bar chart of the best cross-validated accuracy reached by each model family.
results_sorted = results_df.sort_values('best_score', ascending=False)

# Upper bound of the y axis: 0.6 is kept as a floor so the current figure is
# unchanged, but the axis grows with the data so that bars and their
# outside-positioned labels are never clipped if a model scores higher.
y_upper = max(0.6, float(results_sorted['best_score'].max()) * 1.1)

fig = px.bar(
    results_sorted,
    x='model',
    y='best_score',
    text='best_score',
    labels={'best_score': 'Accuracy', 'model': 'Modelo'},
    title='Accuracy por modelo (CV)',
    color_discrete_sequence=['rgba(0, 0, 255, 0.5)'],
)

# Show each score with three decimals above its bar.
fig.update_traces(texttemplate='%{text:.3f}', textposition='outside')
fig.update_layout(
    yaxis=dict(range=[0, y_upper]),
    height=400,
    title_x=0.5,
)
fig.show()

fig.write_image("./images/accuracy_models.png", scale=3, width=1200, height=700)



## Prueba de pseudo-labeling

In [9]:
from sklearn.base import clone
import numpy as np

# Minimum top-class probability a test row needs to receive a pseudo-label.
PSEUDO_THRESHOLD = 0.5

# Fit the selected model on the full training set and score the test rows.
best_model.fit(X_train_scaled, y_train)
probas_test = best_model.predict_proba(X_test_scaled)

# Keep only the test rows whose most likely class is confident enough.
max_probas = np.max(probas_test, axis=1)
mask_pseudo = max_probas >= PSEUDO_THRESHOLD
X_pseudo = X_test_scaled[mask_pseudo]

# Map the winning probability column back to the actual class label.
y_pseudo_idx = np.argmax(probas_test[mask_pseudo], axis=1)
y_pseudo = best_model.classes_[y_pseudo_idx]

# Augmented training set: original rows followed by the pseudo-labelled ones.
X_aug = np.vstack((X_train_scaled, X_pseudo))
y_aug = np.concatenate((y_train, y_pseudo))

# Rebuild an unfitted copy of the winning configuration and train it on the
# augmented data.
best_params = results_df.iloc[0]['best_params']
base_model = model_params[best_model_name]['model']
final_model = clone(base_model).set_params(**best_params)

final_model.fit(X_aug, y_aug)

## Binary Model

In [10]:
# y_train original tiene clases 0..4, pero el modelo ya binariza internamente.
# Para las métricas / matriz de confusión, creamos la versión binaria:
from catboost import CatBoostClassifier
from cascade_logistic_model import ZeroVsRestLogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Binary target used for the metrics: class 0 stays 0, any positive class
# becomes 1.
y_train_bin = (y_train > 0).astype(int)

# Zero-vs-rest model with its default settings.
bin_model = ZeroVsRestLogisticRegression()

# ---- Per-fold accuracy with cross_val_score ----
cv_scores = cross_val_score(
    estimator=bin_model,
    X=X_train_scaled,
    y=y_train_bin,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

print(f"Cross-validated Accuracy (Zero vs Rest LR): {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# ---- Out-of-fold predictions with cross_val_predict ----
y_cv_pred = cross_val_predict(
    estimator=bin_model,
    X=X_train_scaled,
    y=y_train_bin,
    cv=5,
    n_jobs=-1,
)

print(f"OOF CV Accuracy (Zero vs Rest LR): {accuracy_score(y_train_bin, y_cv_pred):.4f}")

from charts import plot_accuracy_circle
# Accuracy of the out-of-fold binary predictions, passed to the project's
# accuracy chart helper.
binary_oof_accuracy = accuracy_score(y_train_bin, y_cv_pred)
plot_accuracy_circle(
    binary_oof_accuracy,
    title="Modelo binario Zero vs Resto - Accuracy",
)

# Final bare expression: the returned matrix is echoed as the cell output.
show_confusion_matrix(
    y_true=y_train_bin,
    y_pred=y_cv_pred,
    labels=[0, 1],  # 0 = true class 0, 1 = rest (1,2,3,4)
    title="Modelo binario Zero vs Resto - Matriz de Confusión",
)


Cross-validated Accuracy (Zero vs Rest LR): 0.8074 ± 0.0172
OOF CV Accuracy (Zero vs Rest LR): 0.8074


              precision    recall  f1-score   support

           0       0.80      0.76      0.78       327
           1       0.81      0.84      0.83       405

    accuracy                           0.81       732
   macro avg       0.81      0.80      0.80       732
weighted avg       0.81      0.81      0.81       732



array([[249,  78],
       [ 63, 342]])

In [11]:
# y_train original tiene clases 0..4, pero el modelo ya binariza internamente.
# Para las métricas / matriz de confusión, creamos la versión binaria:
from catboost import CatBoostClassifier
from cascade_logistic_model import ZeroVsRestLogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Binary target used for the metrics: class 0 stays 0, any positive class
# becomes 1.
y_train_bin = (y_train > 0).astype(int)

# Same zero-vs-rest model, this time with an explicit zero_threshold of 0.65.
bin_model = ZeroVsRestLogisticRegression(zero_threshold=0.65)

# ---- Per-fold accuracy with cross_val_score ----
cv_scores = cross_val_score(
    estimator=bin_model,
    X=X_train_scaled,
    y=y_train_bin,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

print(f"Cross-validated Accuracy (Zero vs Rest LR): {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# ---- Out-of-fold predictions with cross_val_predict ----
y_cv_pred = cross_val_predict(
    estimator=bin_model,
    X=X_train_scaled,
    y=y_train_bin,
    cv=5,
    n_jobs=-1,
)

print(f"OOF CV Accuracy (Zero vs Rest LR): {accuracy_score(y_train_bin, y_cv_pred):.4f}")

# Final bare expression: the returned matrix is echoed as the cell output.
show_confusion_matrix(
    y_true=y_train_bin,
    y_pred=y_cv_pred,
    labels=[0, 1],  # 0 = true class 0, 1 = rest (1,2,3,4)
    title="Modelo binario Zero vs Resto - Matriz de Confusión",
)


Cross-validated Accuracy (Zero vs Rest LR): 0.8020 ± 0.0422
OOF CV Accuracy (Zero vs Rest LR): 0.8019


              precision    recall  f1-score   support

           0       0.87      0.66      0.75       327
           1       0.77      0.92      0.84       405

    accuracy                           0.80       732
   macro avg       0.82      0.79      0.79       732
weighted avg       0.81      0.80      0.80       732



array([[215, 112],
       [ 33, 372]])

## Binary Model + Classification

0 / (1, 2, 3, 4) --> 1 / 2 / 3 / 4

In [12]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Hyper-parameters of the two-stage (cascaded) logistic regression.
cascade_settings = dict(
    C_zero=0.1,
    C_multi=0.1,
    solver='lbfgs',
    max_iter=1000,
    random_state=42,
)
cascade_model = CascadedLogisticRegression(**cascade_settings)

# Per-fold accuracy on the original 0..4 target.
cv_scores = cross_val_score(
    estimator=cascade_model,
    X=X_train_scaled,
    y=y_train,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

print(f"Cross-validated Accuracy (cascade LR): {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Out-of-fold predictions feeding the confusion matrix below.
y_cv_pred = cross_val_predict(
    estimator=cascade_model,
    X=X_train_scaled,
    y=y_train,
    cv=5,
    n_jobs=-1,
)

print(f"OOF CV Accuracy (cascade LR): {accuracy_score(y_train, y_cv_pred):.4f}")

show_confusion_matrix(
    y_true=y_train,
    y_pred=y_cv_pred,
    labels=[0, 1, 2, 3, 4],
    title="Regresión Logística en Cascada - Matriz de Confusión",
)

# Refit on the whole training set and report the in-sample accuracy, to be
# compared with the cross-validated figure above.
cascade_model.fit(X_train_scaled, y_train)

y_train_pred = cascade_model.predict(X_train_scaled)
print(f"Train Accuracy (cascade LR): {accuracy_score(y_train, y_train_pred):.4f}")


Cross-validated Accuracy (cascade LR): 0.5122 ± 0.0363
OOF CV Accuracy (cascade LR): 0.5123


              precision    recall  f1-score   support

           0       0.80      0.76      0.78       327
           1       0.33      0.46      0.38       156
           2       0.27      0.20      0.23       108
           3       0.27      0.28      0.28       107
           4       0.27      0.09      0.13        34

    accuracy                           0.51       732
   macro avg       0.39      0.36      0.36       732
weighted avg       0.52      0.51      0.51       732

Train Accuracy (cascade LR): 0.5505


## Binary Model + Classification

0 / (1, 2, 3, 4) --> 0 / 1 / 2 / 3 / 4

In [13]:

# Hyper-parameters of the thresholded cascade; they match the plain cascade
# except for the additional zero_threshold of 0.7.
cascade_thresh_settings = dict(
    C_zero=0.1,
    C_multi=0.1,
    solver='lbfgs',
    max_iter=1000,
    random_state=42,
    zero_threshold=0.7,
)
cascade_thresh_model = ThresholdedCascadedLogisticRegression(**cascade_thresh_settings)

# Per-fold accuracy on the original 0..4 target.
cv_scores = cross_val_score(
    estimator=cascade_thresh_model,
    X=X_train_scaled,
    y=y_train,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

print(f"Cross-validated Accuracy (cascade LR): {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Out-of-fold predictions feeding the confusion matrix below.
y_cv_pred = cross_val_predict(
    estimator=cascade_thresh_model,
    X=X_train_scaled,
    y=y_train,
    cv=5,
    n_jobs=-1,
)

print(f"OOF CV Accuracy (cascade LR): {accuracy_score(y_train, y_cv_pred):.4f}")

show_confusion_matrix(
    y_true=y_train,
    y_pred=y_cv_pred,
    labels=[0, 1, 2, 3, 4],
    title="Regresión Logística en cascada - Matriz de Confusión",
)

# Refit on the whole training set and report the in-sample accuracy, to be
# compared with the cross-validated figure above.
cascade_thresh_model.fit(X_train_scaled, y_train)

y_train_pred = cascade_thresh_model.predict(X_train_scaled)
print(f"Train Accuracy (cascade LR): {accuracy_score(y_train, y_train_pred):.4f}")

Cross-validated Accuracy (cascade LR): 0.5382 ± 0.0365
OOF CV Accuracy (cascade LR): 0.5383


              precision    recall  f1-score   support

           0       0.70      0.87      0.78       327
           1       0.35      0.37      0.36       156
           2       0.33      0.17      0.22       108
           3       0.29      0.28      0.29       107
           4       0.38      0.09      0.14        34

    accuracy                           0.54       732
   macro avg       0.41      0.36      0.36       732
weighted avg       0.50      0.54      0.51       732

Train Accuracy (cascade LR): 0.5779
