In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score, recall_score, make_scorer
from imblearn.over_sampling import SMOTE
import joblib

# Cargar y limpiar datos
df = pd.read_csv('survey lung cancer.csv')

# Remover outliers en AGE
Q1 = df['AGE'].quantile(0.25)
Q3 = df['AGE'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
df = df[(df['AGE'] >= lower_bound) & (df['AGE'] <= upper_bound)]

# Convertir columnas (2->1, 1->0)
columns_to_convert = df.columns.difference(['GENDER', 'AGE', 'LUNG_CANCER'])
df[columns_to_convert] = df[columns_to_convert].replace({2: 1, 1: 0})

# Mapear variables categóricas
df['LUNG_CANCER'] = df['LUNG_CANCER'].map({'NO': 0, 'YES': 1})
df['GENDER'] = df['GENDER'].map({'F': 0, 'M': 1})

# Preparar datos
X = df.drop('LUNG_CANCER', axis=1)
y = df['LUNG_CANCER']

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# SMOTE
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Preparar arrays
y_train_res_arr = np.ravel(y_train_res)
y_test_arr = np.ravel(y_test)

# Cross-validation y scoring
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = make_scorer(recall_score, pos_label=1)

# Modelo y parámetros
models = {
    "LogisticRegression": (
        Pipeline([
            ("scaler", StandardScaler()),
            ("clf", LogisticRegression(max_iter=2000, random_state=42))
        ]),
        {
            "clf__C": [0.01, 0.1, 1, 10],
            "clf__class_weight": [None, "balanced"],
            "clf__penalty": ["l2"],
            "clf__solver": ["lbfgs", "liblinear"],
        }
    ),
}

# Entrenamiento
all_results = []
best_models = {}

for name, (estimator, param_grid) in models.items():
    print(f"\n=== Grid Search: {name} ===")
    
    grid = GridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        scoring=scoring,
        cv=cv,
        n_jobs=-1,
        refit=True,
        verbose=0
    )
    
    grid.fit(X_train_res, y_train_res_arr)
    best_models[name] = grid.best_estimator_
    
    print("Best parameters:", grid.best_params_)
    print(f"Best recall (class 1): {grid.best_score_:.4f}")
    
    y_pred = grid.predict(X_test)
    
    print("\nClassification Report (test):")
    print(classification_report(y_test_arr, y_pred, zero_division=0))
    
    rep = classification_report(y_test_arr, y_pred, output_dict=True, zero_division=0)
    
    all_results.append({
        "model": name,
        "test_accuracy": accuracy_score(y_test_arr, y_pred),
        "test_f1_macro": f1_score(y_test_arr, y_pred, average="macro"),
        "test_recall_class1": rep["1"]["recall"],
        "test_precision_class1": rep["1"]["precision"],
        "test_f1_class1": rep["1"]["f1-score"],
        "best_params": grid.best_params_,
    })

summary_df = pd.DataFrame(all_results).sort_values(by="test_recall_class1", ascending=False)
print("\n================ Summary sorted by recall (class 1) ================")
print(summary_df.to_string(index=False))

# =====================================================
# EXPORTAR MODELO PARA STREAMLIT
# =====================================================

# Obtener el mejor modelo
best_model = best_models["LogisticRegression"]

# Guardar el modelo
joblib.dump(best_model, 'lung_cancer_model.pkl')
print("\nModelo guardado como 'lung_cancer_model.pkl'")

# Guardar nombres de características para referencia
feature_names = X.columns.tolist()
joblib.dump(feature_names, 'feature_names.pkl')
print("Nombres de características guardados como 'feature_names.pkl'")

print("\nPara usar en Streamlit, necesitarás estos archivos:")
print("   - lung_cancer_model.pkl")
print("   - feature_names.pkl")


=== Grid Search: LogisticRegression ===
Best parameters: {'clf__C': 0.1, 'clf__class_weight': None, 'clf__penalty': 'l2', 'clf__solver': 'lbfgs'}
Best recall (class 1): 0.9674

Classification Report (test):
              precision    recall  f1-score   support

           0       0.67      0.75      0.71         8
           1       0.96      0.94      0.95        54

    accuracy                           0.92        62
   macro avg       0.81      0.85      0.83        62
weighted avg       0.92      0.92      0.92        62


             model  test_accuracy  test_f1_macro  test_recall_class1  test_precision_class1  test_f1_class1                                                                              best_params
LogisticRegression       0.919355       0.829577            0.944444               0.962264        0.953271 {'clf__C': 0.1, 'clf__class_weight': None, 'clf__penalty': 'l2', 'clf__solver': 'lbfgs'}

Modelo guardado como 'lung_cancer_model.pkl'
Nombres de característic