In [None]:
import pandas as pd
# Load the pre-processed Titanic table, indexed by PassengerId.
# Prefer the local copy; fall back to the GitHub mirror only when the local
# file cannot be opened (missing file, bad path, permissions). Catching OSError
# instead of using a bare `except:` keeps KeyboardInterrupt/SystemExit and
# genuine parsing errors visible instead of silently triggering a download.
try:
    data = pd.read_csv("../data/titanic_proc.csv", index_col="PassengerId")
except OSError:
    data = pd.read_csv("https://raw.githubusercontent.com/Argentan/DMA_LAB2/master/data/titanic_proc.csv", index_col="PassengerId")
data.head()

In [None]:
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, KFold, ParameterGrid

In [None]:
# Hold out 20% of the rows as the final test set; the fixed seed keeps the
# partition identical across runs.
train, test = train_test_split(
    data,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Five shuffled, seeded folds over `train`. Each entry of `folds` holds the
# index labels (not positions) of the rows used for validation in that fold.
kfold = KFold(n_splits=5, shuffle=True, random_state=1)
folds = [
    train.index[valid_pos]
    for _train_pos, valid_pos in kfold.split(train)
]

In [None]:
# Search space: each entry pairs an estimator class ("algoritmo") with the
# hyper-parameter grid ("params") that will be expanded for it.
candidatos = [
    {
        "algoritmo": LogisticRegression,
        "params": {"C": [0.1, 1], "penalty": ["l2"]},
    },
    {
        "algoritmo": LGBMClassifier,
        "params": {"num_leaves": [10, 20], "max_depth": [4, 6, 8]},
    },
]

In [None]:
# Cross-validated predictions for every candidate / hyper-parameter combination.
#
# For each combination, one model is fitted per fold on the remaining folds:
#   * its predictions on the held-out fold are stacked into one out-of-fold
#     column covering the rows of `train`;
#   * its predictions on `test` are averaged across the folds.
# Results: `valid_probs` and `test_probs`, DataFrames with one column per
# combination, named "<Algorithm>;<param>_<value>;...".

# Feature/label views are loop-invariant, so build them once instead of
# re-dropping the label column for every fold of every combination.
X_test = test.drop("Survived", axis=1)
X_all = train.drop("Survived", axis=1)
y_all = train["Survived"]

valid_prob_cols = []
test_prob_cols = []
for candidato in candidatos:
    for params in ParameterGrid(candidato["params"]):
        test_fold_probs = []
        valid_fold_probs = []
        name = candidato["algoritmo"].__name__ + ";" + ";".join([f"{k}_{v}" for k, v in params.items()])
        for valid_idx in folds:
            X_valid = X_all.loc[valid_idx]

            X_train = X_all.drop(valid_idx)
            y_train = y_all.loc[X_train.index]

            model = candidato["algoritmo"](**params)
            model.fit(X_train, y_train)

            # The last predict_proba column is the probability of the last class.
            p = model.predict_proba(X_test)[:, -1]
            test_fold_probs.append(pd.Series(p, name=name, index=test.index))

            p = model.predict_proba(X_valid)[:, -1]
            valid_fold_probs.append(pd.Series(p, name=name, index=X_valid.index))
        # Test rows are scored by every fold's model: average them.
        test_prob_cols.append(pd.concat(test_fold_probs, axis=1).mean(axis=1).rename(name))
        # Validation rows are each scored by one fold's model: stack them.
        valid_prob_cols.append(pd.concat(valid_fold_probs))
valid_probs = pd.concat(valid_prob_cols, axis=1)
test_probs = pd.concat(test_prob_cols, axis=1)

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# ROC AUC of each candidate's out-of-fold predictions against the training labels.
y_valid_true = train.loc[valid_probs.index, "Survived"]
valid_scores = [
    roc_auc_score(y_valid_true, valid_probs[column])
    for column in valid_probs
]
valid_res = pd.Series(valid_scores, index=valid_probs.columns, name="resultados")
valid_res

In [None]:
# ROC AUC of each candidate's fold-averaged predictions against the test labels.
y_test_true = test.loc[test_probs.index, "Survived"]
test_scores = [
    roc_auc_score(y_test_true, test_probs[column])
    for column in test_probs
]
test_res = pd.Series(test_scores, index=test_probs.columns, name="resultados")
test_res

In [None]:
# Label of the candidate (algorithm + hyper-parameters) with the highest validation ROC AUC.
valid_res.idxmax()

In [None]:
# Display the fold-averaged test-set probabilities, one column per candidate.
test_probs