In [1]:
import pandas as pd
try:
    data = pd.read_csv("../data/titanic_proc.csv", index_col="PassengerId")
except:
    data = pd.read_csv("https://raw.githubusercontent.com/Argentan/DMA_LAB2/master/data/titanic_proc.csv", index_col="PassengerId")
data.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,NumFam,C,Q,S,Age_nul
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,0,3,0,22.0,1,0,0,7.25,0,1,0,0,1,0
2,1,1,1,38.0,1,0,1,71.2833,1,1,1,0,0,0
3,1,3,1,26.0,0,0,2,7.925,0,0,0,0,1,0
4,1,1,1,35.0,1,0,3,53.1,2,1,0,0,1,0
5,0,3,0,35.0,0,0,4,8.05,0,0,0,0,1,0


In [2]:
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, KFold, ParameterGrid

In [3]:
train, test = train_test_split(data, test_size=0.2, random_state=1)

In [4]:
folds = [train.index[idx] for _, idx in KFold(5, shuffle=True, random_state=1).split(train)]

In [5]:
candidatos = [
    {"algoritmo": LogisticRegression, "params": {"C": [0.1, 1], "penalty": [ "l2"]}},
    {"algoritmo": LGBMClassifier, "params": {"num_leaves": [10 , 20], "max_depth": [4 , 6, 8]}}
]

In [6]:
valid_probs = []
test_probs = []
for candidato in candidatos:
    for params in ParameterGrid(candidato["params"]):
        test_fold_probs = []
        valid_fold_probs = []
        name = candidato["algoritmo"].__name__ + ";" + ";".join([f"{k}_{v}" for k, v in params.items()])
        for valid_idx in folds:
            X_valid = train.loc[valid_idx].drop("Survived", axis=1)
            # y_valid = train.loc[valid_idx, "Survived"]

            X_train = train.drop(valid_idx).drop("Survived", axis=1)
            y_train = train.loc[X_train.index, "Survived"]

            model = candidato["algoritmo"](**params)
            model.fit(X_train, y_train)

            p = model.predict_proba(test.drop("Survived", axis=1))[:, -1]
            test_fold_probs.append(pd.Series(p, name=name, index=test.index))

            p = model.predict_proba(X_valid)[:, -1]
            valid_fold_probs.append(pd.Series(p, name=name, index=X_valid.index))
        test_probs.append(pd.concat(test_fold_probs, axis=1).mean(axis=1).rename(name))
        valid_probs.append(pd.concat(valid_fold_probs))
valid_probs = pd.concat(valid_probs, axis=1)
test_probs = pd.concat(test_probs, axis=1)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [12]:
from sklearn.metrics import roc_auc_score

In [13]:
valid_res = pd.Series([roc_auc_score(train.loc[valid_probs.index, "Survived"], valid_probs[c])
                       for c in valid_probs],
                       name="resultados", index=valid_probs.columns)
valid_res

LogisticRegression;C_0.1;penalty_l2         0.851427
LogisticRegression;C_1;penalty_l2           0.855245
LGBMClassifier;max_depth_4;num_leaves_10    0.865097
LGBMClassifier;max_depth_4;num_leaves_20    0.865919
LGBMClassifier;max_depth_6;num_leaves_10    0.862730
LGBMClassifier;max_depth_6;num_leaves_20    0.860549
LGBMClassifier;max_depth_8;num_leaves_10    0.865835
LGBMClassifier;max_depth_8;num_leaves_20    0.861346
Name: resultados, dtype: float64

In [14]:
test_res = pd.Series([roc_auc_score(test.loc[test_probs.index, "Survived"], test_probs[c])
                       for c in test_probs],
                       name="resultados", index=test_probs.columns)
test_res

LogisticRegression;C_0.1;penalty_l2         0.811321
LogisticRegression;C_1;penalty_l2           0.817136
LGBMClassifier;max_depth_4;num_leaves_10    0.834324
LGBMClassifier;max_depth_4;num_leaves_20    0.834712
LGBMClassifier;max_depth_6;num_leaves_10    0.837296
LGBMClassifier;max_depth_6;num_leaves_20    0.837426
LGBMClassifier;max_depth_8;num_leaves_10    0.830706
LGBMClassifier;max_depth_8;num_leaves_20    0.841690
Name: resultados, dtype: float64

In [15]:
valid_res.idxmax()

'LGBMClassifier;max_depth_4;num_leaves_20'

In [16]:
test_probs

Unnamed: 0_level_0,LogisticRegression;C_0.1;penalty_l2,LogisticRegression;C_1;penalty_l2,LGBMClassifier;max_depth_4;num_leaves_10,LGBMClassifier;max_depth_4;num_leaves_20,LGBMClassifier;max_depth_6;num_leaves_10,LGBMClassifier;max_depth_6;num_leaves_20,LGBMClassifier;max_depth_8;num_leaves_10,LGBMClassifier;max_depth_8;num_leaves_20
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
863,0.908777,0.956425,0.966842,0.978932,0.964195,0.979587,0.975066,0.986752
224,0.118781,0.070737,0.075699,0.070332,0.056724,0.054312,0.069612,0.023402
85,0.675857,0.796850,0.838864,0.860207,0.907216,0.897001,0.883649,0.917521
681,0.540585,0.640719,0.729419,0.712269,0.718770,0.719918,0.755097,0.717360
536,0.772679,0.852458,0.969895,0.963201,0.967985,0.958866,0.968264,0.934950
...,...,...,...,...,...,...,...,...
797,0.906250,0.954545,0.965986,0.973875,0.963485,0.974671,0.974035,0.985114
816,0.656718,0.634350,0.109249,0.112938,0.095442,0.104659,0.112057,0.077428
630,0.157774,0.107681,0.046165,0.043207,0.030708,0.029022,0.029755,0.016107
422,0.215291,0.170957,0.124697,0.117185,0.114658,0.094358,0.099402,0.088438
