This notebook just processes below:

- Fill `NaN` data simply with median, and other values
- Cross-validate two models (LightGBM, Random Forest) and simply ensemble these results

In [2]:
import optuna
import optuna.integration.lightgbm as lgb
import pandas as pd
import pathlib
from sklearn import compose
from sklearn import ensemble
from sklearn import impute
from sklearn import metrics
from sklearn import model_selection
from sklearn import preprocessing
from sklearn.pipeline import Pipeline

COMP_NAME = 'titanic'
HOME_DIR = pathlib.Path('/home/aiskay/competitions') / COMP_NAME
SEED = 42

optuna.logging.set_verbosity(optuna.logging.WARNING)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# specify 'PassengerId' as index
train = pd.read_csv(HOME_DIR / 'input' / COMP_NAME / 'train.csv', index_col=0)
test = pd.read_csv(HOME_DIR / 'input' / COMP_NAME / 'test.csv', index_col=0)
train.head(4)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S


# Arrange the Data

In [4]:
X_train = train.drop(['Survived'], axis=1).copy()
y_train = train['Survived'].copy()

In [5]:
# display any rows including Nan
X_train[X_train.isnull().any(axis=1)]

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
6,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
8,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
...,...,...,...,...,...,...,...,...,...,...
885,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.0500,,S
886,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.1250,,Q
887,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
889,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S


In [6]:
# make_column_transformer does not support in-order processing
ct = compose.make_column_transformer(
    # drop irrevant cols
    ('drop', ['Name', 'Ticket', 'Cabin']),
    # fill NaN
    (impute.SimpleImputer(strategy='constant', fill_value=-999), ['Age']),
    # encode
    (preprocessing.OrdinalEncoder(), ['Sex']),
    # encode after filling NaN
    (Pipeline(steps=[
        ('filling', impute.SimpleImputer(strategy='most_frequent')),
        ('encoding', preprocessing.OrdinalEncoder())
    ]), ['Embarked']),
    remainder='passthrough'
)

X_train = pd.DataFrame(
    ct.fit_transform(X_train),
    columns=['Age', 'Sex', 'Embarked', 'Pclass', 'SibSp', 'Parch', 'Fare']
)
X_train.head(4)

Unnamed: 0,Age,Sex,Embarked,Pclass,SibSp,Parch,Fare
0,22.0,1.0,2.0,3.0,1.0,0.0,7.25
1,38.0,0.0,0.0,1.0,1.0,0.0,71.2833
2,26.0,0.0,2.0,3.0,0.0,0.0,7.925
3,35.0,0.0,2.0,1.0,1.0,0.0,53.1


# Train

## Hyper parameter search

In [10]:
# lightGBM
params = {
    "objective": "binary",
    'metric': 'binary_logloss',
    'verbosity': -1,
    "boosting_type": "gbdt",
    'metric': 'auc'
}

tuner = lgb.LightGBMTunerCV(
    params,
    lgb.Dataset(X_train, y_train),
    verbose_eval=False,
    folds=model_selection.StratifiedShuffleSplit(n_splits=3, random_state=SEED),
    callbacks=[lgb.early_stopping(100)]
)

tuner.run()

best_params = tuner.best_params
print("Best score:", tuner.best_score)
print("  Params: ")
for key, value in best_params.items():
    print("    {}: {}".format(key, value))

feature_fraction, val_score: -inf:   0%|          | 0/7 [00:00<?, ?it/s]

Training until validation scores don't improve for 100 rounds


feature_fraction, val_score: 0.838701:  14%|#4        | 1/7 [00:00<00:01,  3.72it/s]

Early stopping, best iteration is:
[43]	cv_agg's auc: 0.838701 + 0.00839514


feature_fraction, val_score: 0.843810:  29%|##8       | 2/7 [00:00<00:01,  3.47it/s]

Early stopping, best iteration is:
[38]	cv_agg's auc: 0.84381 + 0.00728611


feature_fraction, val_score: 0.843810:  43%|####2     | 3/7 [00:00<00:01,  3.86it/s]

Early stopping, best iteration is:
[38]	cv_agg's auc: 0.84381 + 0.00728611


feature_fraction, val_score: 0.843810:  57%|#####7    | 4/7 [00:01<00:00,  3.92it/s]

Early stopping, best iteration is:
[38]	cv_agg's auc: 0.84381 + 0.00728611


feature_fraction, val_score: 0.843810:  71%|#######1  | 5/7 [00:01<00:00,  3.14it/s]

Early stopping, best iteration is:
[38]	cv_agg's auc: 0.84381 + 0.00728611


feature_fraction, val_score: 0.843810: 100%|##########| 7/7 [00:01<00:00,  3.59it/s]


Early stopping, best iteration is:
[38]	cv_agg's auc: 0.84381 + 0.00728611
Early stopping, best iteration is:
[38]	cv_agg's auc: 0.84381 + 0.00728611


num_leaves, val_score: 0.843810:  10%|#         | 2/20 [00:00<00:03,  4.95it/s]

Early stopping, best iteration is:
[38]	cv_agg's auc: 0.84381 + 0.00728611
Early stopping, best iteration is:
[38]	cv_agg's auc: 0.84381 + 0.00728611


num_leaves, val_score: 0.843810:  15%|#5        | 3/20 [00:00<00:03,  4.39it/s]

Early stopping, best iteration is:
[38]	cv_agg's auc: 0.84381 + 0.00728611


num_leaves, val_score: 0.843810:  20%|##        | 4/20 [00:00<00:03,  4.32it/s]

Early stopping, best iteration is:
[38]	cv_agg's auc: 0.84381 + 0.00728611


num_leaves, val_score: 0.843810:  25%|##5       | 5/20 [00:01<00:03,  4.15it/s]

Early stopping, best iteration is:
[38]	cv_agg's auc: 0.84381 + 0.00728611


num_leaves, val_score: 0.843810:  30%|###       | 6/20 [00:01<00:03,  4.10it/s]

Early stopping, best iteration is:
[38]	cv_agg's auc: 0.84381 + 0.00728611


num_leaves, val_score: 0.843810:  35%|###5      | 7/20 [00:01<00:03,  3.81it/s]

Early stopping, best iteration is:
[38]	cv_agg's auc: 0.84381 + 0.00728611


num_leaves, val_score: 0.843810:  40%|####      | 8/20 [00:02<00:03,  3.47it/s]

Early stopping, best iteration is:
[38]	cv_agg's auc: 0.84381 + 0.00728611


num_leaves, val_score: 0.843810:  45%|####5     | 9/20 [00:02<00:03,  3.64it/s]

Early stopping, best iteration is:
[38]	cv_agg's auc: 0.84381 + 0.00728611


num_leaves, val_score: 0.843810:  50%|#####     | 10/20 [00:02<00:02,  3.94it/s]

Early stopping, best iteration is:
[38]	cv_agg's auc: 0.84381 + 0.00728611


num_leaves, val_score: 0.843810:  55%|#####5    | 11/20 [00:02<00:02,  3.64it/s]

Early stopping, best iteration is:
[38]	cv_agg's auc: 0.84381 + 0.00728611
Early stopping, best iteration is:
[38]	cv_agg's auc: 0.84381 + 0.00728611


num_leaves, val_score: 0.843810:  65%|######5   | 13/20 [00:03<00:01,  4.06it/s]

Early stopping, best iteration is:
[38]	cv_agg's auc: 0.84381 + 0.00728611
Early stopping, best iteration is:
[38]	cv_agg's auc: 0.84381 + 0.00728611

num_leaves, val_score: 0.843810:  70%|#######   | 14/20 [00:03<00:01,  4.30it/s]




num_leaves, val_score: 0.843810:  75%|#######5  | 15/20 [00:03<00:01,  4.29it/s]

Early stopping, best iteration is:
[38]	cv_agg's auc: 0.84381 + 0.00728611


num_leaves, val_score: 0.843810:  85%|########5 | 17/20 [00:04<00:00,  4.23it/s]

Early stopping, best iteration is:
[38]	cv_agg's auc: 0.84381 + 0.00728611
Early stopping, best iteration is:
[38]	cv_agg's auc: 0.84381 + 0.00728611


num_leaves, val_score: 0.843810:  95%|#########5| 19/20 [00:04<00:00,  5.07it/s]

Early stopping, best iteration is:
[38]	cv_agg's auc: 0.84381 + 0.00728611
Early stopping, best iteration is:
[38]	cv_agg's auc: 0.84381 + 0.00728611


num_leaves, val_score: 0.843810: 100%|##########| 20/20 [00:04<00:00,  4.01it/s]


Early stopping, best iteration is:
[38]	cv_agg's auc: 0.84381 + 0.00728611


bagging, val_score: 0.844069:  10%|#         | 1/10 [00:00<00:01,  4.82it/s]

Early stopping, best iteration is:
[122]	cv_agg's auc: 0.844069 + 0.0158794


bagging, val_score: 0.844069:  20%|##        | 2/10 [00:00<00:02,  3.34it/s]

Early stopping, best iteration is:
[122]	cv_agg's auc: 0.844069 + 0.0158794


bagging, val_score: 0.844069:  30%|###       | 3/10 [00:00<00:02,  3.42it/s]

Early stopping, best iteration is:
[122]	cv_agg's auc: 0.844069 + 0.0158794


bagging, val_score: 0.844069:  40%|####      | 4/10 [00:01<00:01,  3.34it/s]

Early stopping, best iteration is:
[122]	cv_agg's auc: 0.844069 + 0.0158794


bagging, val_score: 0.844069:  50%|#####     | 5/10 [00:01<00:01,  3.46it/s]

Early stopping, best iteration is:
[122]	cv_agg's auc: 0.844069 + 0.0158794


bagging, val_score: 0.844069:  60%|######    | 6/10 [00:01<00:01,  3.40it/s]

Early stopping, best iteration is:
[122]	cv_agg's auc: 0.844069 + 0.0158794


bagging, val_score: 0.844069:  70%|#######   | 7/10 [00:02<00:00,  3.22it/s]

Early stopping, best iteration is:
[122]	cv_agg's auc: 0.844069 + 0.0158794


bagging, val_score: 0.848398:  90%|######### | 9/10 [00:02<00:00,  3.64it/s]

Early stopping, best iteration is:
[122]	cv_agg's auc: 0.844069 + 0.0158794
Early stopping, best iteration is:
[66]	cv_agg's auc: 0.848398 + 0.0257744


bagging, val_score: 0.848398: 100%|##########| 10/10 [00:02<00:00,  3.54it/s]


Early stopping, best iteration is:
[66]	cv_agg's auc: 0.848398 + 0.0257744


feature_fraction_stage2, val_score: 0.848398:  17%|#6        | 1/6 [00:00<00:01,  4.97it/s]

Early stopping, best iteration is:
[66]	cv_agg's auc: 0.848398 + 0.0257744


feature_fraction_stage2, val_score: 0.848398:  33%|###3      | 2/6 [00:00<00:01,  3.24it/s]

Early stopping, best iteration is:
[66]	cv_agg's auc: 0.848398 + 0.0257744


feature_fraction_stage2, val_score: 0.848398:  50%|#####     | 3/6 [00:00<00:00,  3.80it/s]

Early stopping, best iteration is:
[66]	cv_agg's auc: 0.848398 + 0.0257744


feature_fraction_stage2, val_score: 0.848398:  83%|########3 | 5/6 [00:01<00:00,  4.34it/s]

Early stopping, best iteration is:
[66]	cv_agg's auc: 0.848398 + 0.0257744
Early stopping, best iteration is:
[66]	cv_agg's auc: 0.848398 + 0.0257744


feature_fraction_stage2, val_score: 0.848398: 100%|##########| 6/6 [00:01<00:00,  3.67it/s]


Early stopping, best iteration is:
[66]	cv_agg's auc: 0.848398 + 0.0257744


regularization_factors, val_score: 0.848398:   5%|5         | 1/20 [00:00<00:04,  4.14it/s]

Early stopping, best iteration is:
[66]	cv_agg's auc: 0.848398 + 0.0257744


regularization_factors, val_score: 0.848398:  10%|#         | 2/20 [00:00<00:04,  3.97it/s]

Early stopping, best iteration is:
[66]	cv_agg's auc: 0.848398 + 0.0257744


regularization_factors, val_score: 0.848398:  15%|#5        | 3/20 [00:00<00:05,  2.87it/s]

Early stopping, best iteration is:
[66]	cv_agg's auc: 0.848398 + 0.0257744


regularization_factors, val_score: 0.848398:  20%|##        | 4/20 [00:01<00:04,  3.43it/s]

Early stopping, best iteration is:
[66]	cv_agg's auc: 0.848398 + 0.0257744


regularization_factors, val_score: 0.848398:  30%|###       | 6/20 [00:01<00:03,  3.96it/s]

Early stopping, best iteration is:
[66]	cv_agg's auc: 0.848398 + 0.0257744
Early stopping, best iteration is:
[66]	cv_agg's auc: 0.848398 + 0.0257744


regularization_factors, val_score: 0.848398:  35%|###5      | 7/20 [00:02<00:03,  3.38it/s]

Early stopping, best iteration is:
[66]	cv_agg's auc: 0.848398 + 0.0257744


regularization_factors, val_score: 0.848398:  45%|####5     | 9/20 [00:02<00:03,  3.58it/s]

Early stopping, best iteration is:
[66]	cv_agg's auc: 0.848398 + 0.0257744
Early stopping, best iteration is:
[66]	cv_agg's auc: 0.848398 + 0.0257744


regularization_factors, val_score: 0.848398:  50%|#####     | 10/20 [00:02<00:02,  4.24it/s]

Early stopping, best iteration is:
[66]	cv_agg's auc: 0.848398 + 0.0257744


regularization_factors, val_score: 0.848745:  60%|######    | 12/20 [00:03<00:01,  4.33it/s]

Early stopping, best iteration is:
[66]	cv_agg's auc: 0.848745 + 0.0215238
Early stopping, best iteration is:
[66]	cv_agg's auc: 0.848745 + 0.0215238


regularization_factors, val_score: 0.849091:  70%|#######   | 14/20 [00:03<00:01,  4.65it/s]

Early stopping, best iteration is:
[66]	cv_agg's auc: 0.848745 + 0.0215238
Early stopping, best iteration is:
[66]	cv_agg's auc: 0.849091 + 0.0210705


regularization_factors, val_score: 0.850996:  80%|########  | 16/20 [00:03<00:00,  4.91it/s]

Early stopping, best iteration is:
[66]	cv_agg's auc: 0.850996 + 0.0215885
Early stopping, best iteration is:
[66]	cv_agg's auc: 0.850996 + 0.0215885


regularization_factors, val_score: 0.850996:  85%|########5 | 17/20 [00:04<00:00,  4.61it/s]

Early stopping, best iteration is:
[66]	cv_agg's auc: 0.850996 + 0.0215885


regularization_factors, val_score: 0.850996:  90%|######### | 18/20 [00:04<00:00,  4.26it/s]

Early stopping, best iteration is:
[66]	cv_agg's auc: 0.850996 + 0.0215885
Early stopping, best iteration is:
[66]	cv_agg's auc: 0.850996 + 0.0215885


regularization_factors, val_score: 0.850996: 100%|##########| 20/20 [00:04<00:00,  4.11it/s]


Early stopping, best iteration is:
[66]	cv_agg's auc: 0.850996 + 0.0215885


min_data_in_leaf, val_score: 0.850996:  20%|##        | 1/5 [00:00<00:00,  7.30it/s]

Early stopping, best iteration is:
[66]	cv_agg's auc: 0.850996 + 0.0215885


min_data_in_leaf, val_score: 0.850996:  40%|####      | 2/5 [00:00<00:00,  4.82it/s]

Early stopping, best iteration is:
[66]	cv_agg's auc: 0.850996 + 0.0215885


min_data_in_leaf, val_score: 0.850996:  60%|######    | 3/5 [00:00<00:00,  3.50it/s]

Early stopping, best iteration is:
[66]	cv_agg's auc: 0.850996 + 0.0215885


min_data_in_leaf, val_score: 0.850996: 100%|##########| 5/5 [00:01<00:00,  4.04it/s]

Early stopping, best iteration is:
[66]	cv_agg's auc: 0.850996 + 0.0215885
Early stopping, best iteration is:
[66]	cv_agg's auc: 0.850996 + 0.0215885
Best score: 0.8509956709956711
  Params: 
    objective: binary
    metric: auc
    verbosity: -1
    boosting_type: gbdt
    feature_pre_filter: False
    lambda_l1: 0.0003096038400881894
    lambda_l2: 3.424633663985954e-07
    num_leaves: 31
    feature_fraction: 0.8
    bagging_fraction: 0.6949791051056928
    bagging_freq: 1
    min_child_samples: 20





In [20]:
# Random Forest
rf_clf = ensemble.RandomForestClassifier(random_state=SEED)

rf_params = {
    # the number of trees in the forest
    'n_estimators': optuna.distributions.IntUniformDistribution(100, 400, 100),
    'criterion': optuna.distributions.CategoricalDistribution(['gini', 'entropy']),
    # The minimum number of samples required to split an internal node
    'min_samples_split': optuna.distributions.IntUniformDistribution(2, 20, 2),
    # The minimum number of samples required to be at a leaf node
    "min_samples_leaf": optuna.distributions.IntUniformDistribution(2, 10, 2),
    "bootstrap": optuna.distributions.CategoricalDistribution([True, False])
}

optuna_search = optuna.integration.OptunaSearchCV(
    rf_clf,
    rf_params,
    cv=3,
    random_state=SEED
)

optuna_search.fit(X_train, y_train)

print(
    f'Best score: {optuna_search.best_score_}',
    f'Best params: {optuna_search.best_params_}',
    sep='\n'
)

  optuna_search = optuna.integration.OptunaSearchCV(


Best score:  0.8271604938271605
Best params: {'n_estimators': 100, 'criterion': 'gini', 'min_samples_split': 12, 'min_samples_leaf': 4, 'bootstrap': False}


## Ensembling

In [23]:
voting_clf = ensemble.VotingClassifier(
    estimators=[('lgbm', lgb.LightGBMTuner(best_params)), ('rf', optuna_search)],
    voting='soft'
)
voting_clf.fit(X_train, y_train)

TypeError: __init__() missing 1 required positional argument: 'train_set'

## Predict

In [12]:
titanic_test = test.copy()
titanic_test = ct.transform(titanic_test)

In [154]:
test_survived = voting_clf.predict(titanic_test)
result = pd.concat(
    [pd.Series(test.index.values, name='PassengerId'),
    pd.Series(test_survived, name='Survived')],
    axis=1
)
result.to_csv(HOME_DIR / "output/ensemble_voting.csv", index=False)