In [None]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import QuantileTransformer
import lightgbm as lgb
import optuna
import warnings
from tqdm import tqdm
import time
from sklearn.decomposition import TruncatedSVD, PCA
import joblib
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

In [None]:
def label_encoding(train: pd.DataFrame, test: pd.DataFrame, encode_cols):
    """Label-encode ``encode_cols`` with one encoder shared by train and test.

    The two frames are stacked so that each column's encoder is fitted on the
    union of its train and test values, then the stacked frame is split back
    at the original train length.

    Parameters
    ----------
    train, test : pd.DataFrame
        Input frames; neither is modified in place.
    encode_cols : iterable of str
        Columns to replace by integer codes.

    Returns
    -------
    (train, test) : tuple of pd.DataFrame
        Both frames carry the union of the input columns, because they are
        cut from the stacked frame: columns that exist only in ``train`` show
        up in the returned ``test`` filled with NaN. ``test`` gets a fresh
        0-based index.

    Notes
    -----
    Encoding is best-effort: a column that cannot be encoded is reported and
    left unchanged. Only ``Exception`` subclasses are tolerated, so
    ``KeyboardInterrupt`` / ``SystemExit`` still propagate.
    """
    n_train = len(train)
    combined = pd.concat([train, test], sort=False).reset_index(drop=True)
    for f in encode_cols:
        try:
            lbl = preprocessing.LabelEncoder()
            combined[f] = lbl.fit_transform(list(combined[f].values))
        except Exception as exc:
            # Say which column was skipped and why, instead of only its name.
            print(f'{f}: label encoding skipped ({type(exc).__name__}: {exc})')
    test = combined[n_train:].reset_index(drop=True)
    train = combined[:n_train]
    return train, test

In [None]:
# Read the three competition files: training set, test set and the sample
# submission whose layout is reused for the final output.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/titanic/train.csv',
        '../input/titanic/test.csv',
        '../input/titanic/gender_submission.csv',
    )
)

## Fill NAs

In [None]:
# Predictions are later written into `sub` by position, so the sample
# submission must list passengers in exactly the same order as the test set.
assert sub.PassengerId.to_list() == test.PassengerId.to_list(), \
    'gender_submission.csv and test.csv do not list the same PassengerIds in the same order'

In [None]:
# Number of missing values per column of the training set.
train.isnull().sum()

In [None]:
# Replace missing ages in the training set with the sentinel value -999.
train['Age'] = train['Age'].fillna(-999)

In [None]:
# Missing cabins in the training set become their own 'Unknown' category.
train['Cabin'] = train['Cabin'].fillna('Unknown')

In [None]:
# Missing embarkation ports in the training set become 'Unknown'.
train['Embarked'] = train['Embarked'].fillna('Unknown')

In [None]:
# Number of missing values per column of the test set.
test.isnull().sum()

In [None]:
# Test set: missing ages get the same -999 sentinel used for the training set,
# missing fares get the mean fare of the test set.
test['Age'] = test['Age'].fillna(-999)
test['Fare'] = test['Fare'].fillna(test['Fare'].mean())

In [None]:
# Missing cabins in the test set become the same 'Unknown' category.
test['Cabin'] = test['Cabin'].fillna('Unknown')

## Split folds

In [None]:
# Five shuffled, stratified folds; the fixed random_state makes the split
# reproducible.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)


In [None]:
# Tag every training row with the id of the fold in which it is validated.
# -999 marks "not assigned yet"; every row is overwritten by the loop below.
train['fold'] = -999
for fold_id, (train_index, valid_index) in enumerate(skf.split(X=train, y=train.Survived.values)):
    # skf.split yields positional indices, so address rows by position
    # (.iloc) rather than by label (.loc); the two only coincide when the
    # frame has a default RangeIndex.
    train.iloc[valid_index, train.columns.get_loc('fold')] = fold_id

In [None]:
# Number of rows assigned to each fold.
train['fold'].value_counts()

## Label Encoding

In [None]:
# Reduce each name to the first run of letters that is directly followed by a
# dot (e.g. the "Mr" in "Mr."). Raw strings keep `\.` a regex escape instead
# of an invalid Python string escape.
# NOTE: not re-runnable -- once Name holds only the extracted letters there is
# no dot left to match and a second run yields NaN.
train.Name = train.Name.str.extract(r'([A-Za-z]+)\.', expand=False)
test.Name = test.Name.str.extract(r'([A-Za-z]+)\.', expand=False)

In [None]:
# Turn the string-valued columns into integer codes, using one encoder per
# column shared by train and test.
train, test = label_encoding(
    train,
    test,
    encode_cols=['Sex', 'Ticket', 'Cabin', 'Embarked', 'Name'],
)

In [None]:
# PassengerId is removed from both frames so it is not used as a feature.
train = train.drop(columns=['PassengerId'])
test = test.drop(columns=['PassengerId'])

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
def _run_lgb_cv(params, num_boost_round, early_stopping_rounds, n_folds=5, save_models=False):
    """Train one LightGBM booster per fold and return out-of-fold predictions.

    Reads the module-level ``train`` frame, which must contain the ``fold``
    and ``Survived`` columns; every other column is used as a feature.

    Parameters
    ----------
    params : dict
        LightGBM parameters passed to ``lgb.train``.
    num_boost_round : int
        Upper bound on the number of boosting rounds.
    early_stopping_rounds : int
        Early-stopping patience passed to ``lgb.train``.
    n_folds : int
        Fold ids ``0 .. n_folds - 1`` are looked up in ``train.fold``.
    save_models : bool
        When true, each fold's booster is dumped to ``lgb_fold{i}.pkl``.

    Returns
    -------
    np.ndarray
        Predicted probability for every row of ``train``, each produced by the
        booster that did not train on that row.
    """
    oof_train = np.zeros((len(train),))

    for i in range(n_folds):
        fit_rows = train.query(f'fold!={i}')
        valid_rows = train.query(f'fold=={i}')

        train_x = fit_rows.drop(['fold', 'Survived'], axis=1)
        train_y = fit_rows.Survived
        valid_x = valid_rows.drop(['fold', 'Survived'], axis=1)
        valid_y = valid_rows.Survived

        lgb_train = lgb.Dataset(train_x, train_y)
        lgb_eval = lgb.Dataset(valid_x, valid_y, reference=lgb_train)

        # NOTE(review): `verbose_eval` and `early_stopping_rounds` are
        # accepted by lgb.train only before LightGBM 4.0; newer releases
        # expect callbacks=[lgb.early_stopping(...), lgb.log_evaluation(...)].
        gbm = lgb.train(params,
                        lgb_train,
                        valid_sets=[lgb_train, lgb_eval],
                        num_boost_round=num_boost_round,
                        verbose_eval=-1,
                        early_stopping_rounds=early_stopping_rounds
                        )

        # valid_x.index holds row labels of `train`; using them as array
        # positions relies on `train` having a 0..n-1 index.
        oof_train[valid_x.index] = gbm.predict(valid_x,
                                               num_iteration=gbm.best_iteration)

        if save_models:
            joblib.dump(gbm, f'lgb_fold{i}.pkl')

    return oof_train


def optuna_lgb(n_trials=100, seed=42):
    """Tune LightGBM with Optuna, then refit and export one model per fold.

    Each trial samples a parameter set, runs the 5-fold loop on the
    module-level ``train`` frame and is scored by the accuracy of the rounded
    out-of-fold predictions. The best parameters are then used to retrain the
    fold models with a larger round budget; those models are written to
    ``lgb_fold{i}.pkl``.

    Note that the fold used for early stopping is the same fold the accuracy
    is measured on.

    Parameters
    ----------
    n_trials : int
        Number of Optuna trials.
    seed : int or None
        Seed of the TPE sampler so repeated runs propose the same trials;
        pass ``None` for an unseeded sampler.

    Returns
    -------
    dict
        The fixed LightGBM parameters merged with the best trial's parameters.
    """
    fixed_params = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
    }

    def objective(trial):
        params = dict(fixed_params)
        params.update({
            "lambda_l1": trial.suggest_float("lambda_l1", 1e-5, 10, log=True),
            "lambda_l2": trial.suggest_float("lambda_l2", 1e-5, 10, log=True),
            "num_leaves": trial.suggest_int("num_leaves", 2, 62),
            "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 0.9),
            "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 0.9),
            "bagging_freq": trial.suggest_int("bagging_freq", 2, 9),
            "min_child_samples": trial.suggest_int("min_child_samples", 33, 93),
            "max_depth": trial.suggest_int("max_depth", 2, 7)
        })

        oof_train = _run_lgb_cv(params,
                                num_boost_round=5000,
                                early_stopping_rounds=1000)
        return accuracy_score(train.Survived, oof_train.round())

    study = optuna.create_study(direction="maximize",
                                sampler=optuna.samplers.TPESampler(seed=seed))
    study.optimize(objective, n_trials=n_trials)

    print("Number of finished trials: {}".format(len(study.trials)))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    best_params = dict(fixed_params)
    best_params.update(study.best_params)

    ## save trained model
    print('*'*50)
    print('Exporting best models')
    print('*' * 50)
    oof_train = _run_lgb_cv(best_params,
                            num_boost_round=50000,
                            early_stopping_rounds=1001,
                            save_models=True)
    print("  OOF accuracy of exported models: {}".format(
        accuracy_score(train.Survived, oof_train.round())))

    return best_params

In [None]:
# Run the hyper-parameter search with 100 trials and export the fold models.
optuna_lgb(n_trials=100)

In [None]:
# `label_encoding` cuts `test` out of the stacked train+test frame, so it
# carries the train-only columns `Survived` and `fold`; remove them so the
# features match what the models were trained on. errors='ignore' keeps the
# cell re-runnable once the columns are gone.
test = test.drop(columns=['Survived', 'fold'], errors='ignore')

In [None]:
# Preview the feature frame fed to the models (first rows only).
test.head()

In [None]:
# Average the predictions of the five exported fold models.
# The accumulator is sized from `test` instead of a hard-coded row count.
# joblib.load unpickles the file: only load model files written by this notebook.
predictions = np.zeros(len(test))
for i in range(5):
    model = joblib.load(f'./lgb_fold{i}.pkl')
    predictions += model.predict(test) / 5

In [None]:
# Preview the submission template (first rows only).
sub.head()

In [None]:
# Round the averaged predictions to 0/1 labels and store them in the
# submission frame.
sub['Survived'] = predictions.round().astype(int)

In [None]:
# How many test rows have an averaged prediction above 0.5.
(predictions > 0.5).sum()

In [None]:
# Write the submission file without the index column.
sub.to_csv(path_or_buf='submission.csv', index=False)