In [None]:
#load the library
import numpy as np
import pandas as pd
import re
import seaborn as sns
import sklearn.datasets
import sklearn.metrics
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import QuantileTransformer
from sklearn.metrics import accuracy_score
import lightgbm as lgb
import optuna
import warnings
from tqdm import tqdm
import time
from sklearn.decomposition import TruncatedSVD, PCA
import joblib
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

In [None]:
#transfrom categorical data to numbers
def label_encoding(train: pd.DataFrame, test: pd.DataFrame, encode_cols):
    n_train = len(train)
    train = pd.concat([train, test], sort=False).reset_index(drop=True)
    for f in encode_cols:
        try:
            lbl = preprocessing.LabelEncoder()
            train[f] = lbl.fit_transform(list(train[f].values))
        except:
            print(f)
    test = train[n_train:].reset_index(drop=True)
    train = train[:n_train]
    return train, test

In [None]:
#import data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sub = pd.read_csv('gender_submission.csv')

In [None]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [None]:
sub.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


## Fill NAs

In [None]:
sub.PassengerId.to_list() == test.PassengerId.to_list()

True

In [None]:
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [None]:
g1 = train[(train['Parch']==0)&(train['SibSp']==0)&(train['Pclass']==1)]['Age']
g2 = train[((train['Parch']!=0)|(train['SibSp']!=0))&(train['Pclass']==1)]['Age']
g3 = train[(train['Parch']==0)&(train['SibSp']==0)&(train['Pclass']==2)]['Age']
g4 = train[((train['Parch']!=0)|(train['SibSp']!=0))&(train['Pclass']==2)]['Age']
g5 = train[(train['Parch']==0)&(train['SibSp']==0)&(train['Pclass']==3)]['Age']
g6 = train[((train['Parch']!=0)|(train['SibSp']!=0))&(train['Pclass']==3)]['Age']
av1 = np.mean(g1)
av2 = np.mean(g2)
av3 = np.mean(g3)
av4 = np.mean(g4)
av5 = np.mean(g5)
av6 = np.mean(g6)

In [None]:
## fill age
train['Age'] = np.where(
    (train['Age'].isna()) & (train['Parch']==0) & (train['SibSp']==0) & (train['Pclass']==1), av1, train['Age'])

train['Age'] = np.where(
    (train['Age'].isna()) & (train['Parch']==0) & (train['SibSp']==0) & (train['Pclass']==2), av3, train['Age'])

train['Age'] = np.where(
    (train['Age'].isna()) & (train['Parch']==0) & (train['SibSp']==0) & (train['Pclass']==3), av5, train['Age'])

train['Age'] = np.where(
    (train['Age'].isna()) & ((train['Parch']!=0)|(train['SibSp']!=0))&(train['Pclass']==1), av2, train['Age'])

train['Age'] = np.where(
    (train['Age'].isna()) & ((train['Parch']!=0)|(train['SibSp']!=0))&(train['Pclass']==2), av4, train['Age'])

train['Age'] = np.where(
    (train['Age'].isna()) & ((train['Parch']!=0)|(train['SibSp']!=0))&(train['Pclass']==3), av6, train['Age'])



In [None]:
train.Cabin = train.Cabin.fillna('Unknown')

In [None]:
train[(train['Cabin'].str[0]=='B')]['Embarked'].value_counts()

S    23
C    22
Name: Embarked, dtype: int64

In [None]:
train.Embarked = train.Embarked.fillna('Unknown')

In [None]:
train.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [None]:
test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [None]:
# fill test age
g1 = test[(test['Parch']==0)&(test['SibSp']==0)&(test['Pclass']==1)]['Age']
g2 = test[((test['Parch']!=0)|(test['SibSp']!=0))&(test['Pclass']==1)]['Age']
g3 = test[(test['Parch']==0)&(test['SibSp']==0)&(test['Pclass']==2)]['Age']
g4 = test[((test['Parch']!=0)|(test['SibSp']!=0))&(test['Pclass']==2)]['Age']
g5 = test[(test['Parch']==0)&(test['SibSp']==0)&(test['Pclass']==3)]['Age']
g6 = test[((test['Parch']!=0)|(test['SibSp']!=0))&(test['Pclass']==3)]['Age']
av1 = np.mean(g1)
av2 = np.mean(g2)
av3 = np.mean(g3)
av4 = np.mean(g4)
av5 = np.mean(g5)
av6 = np.mean(g6)

test['Age'] = np.where(
    (test['Age'].isna()) & (test['Parch']==0) & (test['SibSp']==0) & (test['Pclass']==1), av1, test['Age'])

test['Age'] = np.where(
    (test['Age'].isna()) & (test['Parch']==0) & (test['SibSp']==0) & (test['Pclass']==2), av3, test['Age'])

test['Age'] = np.where(
    (test['Age'].isna()) & (test['Parch']==0) & (test['SibSp']==0) & (test['Pclass']==3), av5, test['Age'])

test['Age'] = np.where(
    (test['Age'].isna()) & ((test['Parch']!=0)|(test['SibSp']!=0))&(test['Pclass']==1), av2, test['Age'])

test['Age'] = np.where(
    (test['Age'].isna()) & ((test['Parch']!=0)|(test['SibSp']!=0))&(test['Pclass']==2), av4, test['Age'])

test['Age'] = np.where(
    (test['Age'].isna()) & ((test['Parch']!=0)|(test['SibSp']!=0))&(test['Pclass']==3), av6, test['Age'])



In [None]:
test.Cabin = test.Cabin.fillna('Unknown')

In [None]:
test.Fare = test.Fare.fillna(np.mean(test.Fare))

In [None]:
test.isna().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

## Split folds

In [None]:
#do k fold validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
train['fold'] = -999
for fold_id, (train_index, valid_index) in enumerate(skf.split(X=train, y=train.Survived.values)):
    train.loc[valid_index, 'fold'] = fold_id

In [None]:
train.fold.value_counts()

0    179
1    178
2    178
3    178
4    178
Name: fold, dtype: int64

## Feature Engineering

#### 1. Name Title (Change the name to title and label encoding)

In [None]:
def find_title(entry):
    pattern = r'[a-zA-Z]+\.'
    return re.findall(pattern, entry)[0][:-1]

temp = train[['Name']]
temp['title'] =  temp['Name'].apply(find_title)
train = pd.merge(train, temp, on = 'Name')

In [None]:
train['title'].value_counts()

Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Major         2
Col           2
Mlle          2
Lady          1
Don           1
Ms            1
Sir           1
Mme           1
Jonkheer      1
Capt          1
Countess      1
Name: title, dtype: int64

In [None]:
def conversion_title(entry):
    if entry == 'Miss':
        return 'Miss'
    elif entry == 'Mrs':
        return 'Mrs'
    elif entry == 'Mr':
        return 'Mr'
    elif entry == 'Master':
        return 'Master'
    elif entry == 'Dr':
        return 'Dr'
    elif entry == 'Rev':
        return 'Rev'
    else:
        return 'ELSE'
train['title'] = train['title'].apply(conversion_title)

In [None]:
temp = test[['Name']]
temp['title'] = temp['Name'].apply(find_title)
test = pd.merge(test, temp, on = 'Name')
test['title'] = test['title'].apply(conversion_title)

In [None]:
train = train.drop('Name', axis=1)
test = test.drop('Name', axis=1)

In [None]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,fold,title
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,Unknown,S,1,Mr
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C,4,Mrs
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,Unknown,S,3,Miss
3,4,1,1,female,35.0,1,0,113803,53.1,C123,S,3,Mrs
4,5,0,3,male,35.0,0,0,373450,8.05,Unknown,S,0,Mr


In [None]:
test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,title
0,892,3,male,34.5,0,0,330911,7.8292,Unknown,Q,Mr
1,893,3,female,47.0,1,0,363272,7.0,Unknown,S,Mrs
2,894,2,male,62.0,0,0,240276,9.6875,Unknown,Q,Mr
3,895,3,male,27.0,0,0,315154,8.6625,Unknown,S,Mr
4,896,3,female,22.0,1,1,3101298,12.2875,Unknown,S,Mrs


#### 2.  Cut the Age and Fare column to different bins 

In [None]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,fold
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.51479,0.523008,0.381594,32.204208,1.997755
std,257.353842,0.486592,0.836071,13.309636,1.102743,0.806057,49.693429,1.4158
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0,0.0
25%,223.5,0.0,2.0,21.0,0.0,0.0,7.9104,1.0
50%,446.0,0.0,3.0,28.235556,0.0,0.0,14.4542,2.0
75%,668.5,1.0,3.0,36.0,1.0,0.0,31.0,3.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292,4.0


In [None]:
test.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,418.0,418.0,418.0,418.0
mean,1100.5,2.26555,29.471072,0.447368,0.392344,35.627188
std,120.810458,0.841838,12.949363,0.89676,0.981429,55.8405
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.25,0.0,0.0,7.8958
50%,1100.5,3.0,26.121053,0.0,0.0,14.4542
75%,1204.75,3.0,36.375,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


In [None]:
train['Fare'] = pd.qcut(train['Fare'], 10)
test['Fare'] = pd.qcut(test['Fare'], 10)
train['Age'] = pd.qcut(train['Age'], 10)
test['Age'] = pd.qcut(test['Age'], 10)

In [None]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,fold,title
0,1,0,3,male,"(19.784, 22.0]",1,0,A/5 21171,"(-0.001, 7.55]",Unknown,S,1,Mr
1,2,1,1,female,"(34.0, 40.0]",1,0,PC 17599,"(39.688, 77.958]",C85,C,4,Mrs
2,3,1,3,female,"(22.0, 27.0]",0,0,STON/O2. 3101282,"(7.854, 8.05]",Unknown,S,3,Miss
3,4,1,1,female,"(34.0, 40.0]",1,0,113803,"(39.688, 77.958]",C123,S,3,Mrs
4,5,0,3,male,"(34.0, 40.0]",0,0,373450,"(7.854, 8.05]",Unknown,S,0,Mr


In [None]:
test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,title
0,892,3,male,"(33.0, 39.429]",0,0,330911,"(7.796, 8.05]",Unknown,Q,Mr
1,893,3,female,"(39.429, 48.0]",1,0,363272,"(-0.001, 7.644]",Unknown,S,Mrs
2,894,2,male,"(48.0, 76.0]",0,0,240276,"(8.05, 11.342]",Unknown,Q,Mr
3,895,3,male,"(26.121, 29.0]",0,0,315154,"(8.05, 11.342]",Unknown,S,Mr
4,896,3,female,"(20.129, 23.0]",1,1,3101298,"(11.342, 14.454]",Unknown,S,Mrs


### 3. clean ticket 

In [None]:
# Ticket  value_counts first
def ticket_extract(entry):
    pattern = r'[A-Z]+'
    extracted = ''.join(re.findall(pattern, entry))
    if extracted == '':
        return 'Number'
    elif extracted == 'PC':
        return extracted
    elif extracted == 'CA':
        return extracted
    else:
        return 'ELSE'

train['Ticket'] = train['Ticket'].apply(ticket_extract)
test['Ticket'] = test['Ticket'].apply(ticket_extract)

### 4. clean number of family members 

In [None]:
#train['Alone'] = np.where((train['SibSp']==0)&(train['Parch']==0), 1, 0)
#test['Alone'] = np.where((test['SibSp']==0)&(test['Parch']==0), 1, 0)

In [None]:
train['Family_size'] = np.zeros(len(train))

train['Family_size'] = np.where((train['SibSp']+train['Parch'])<3, 0, train['Family_size'])
train['Family_size'] = np.where((train['SibSp']+train['Parch']<5) & (train['SibSp']+train['Parch']>=3), 1, train['Family_size'])
train['Family_size'] = np.where((train['SibSp']+train['Parch'])>=5, 2, train['Family_size'])

test['Family_size'] = np.zeros(len(test))
test['Family_size'] = np.where((test['SibSp']+test['Parch'])<3, 0, test['Family_size'])
test['Family_size'] = np.where((test['SibSp']+test['Parch']<5) & (test['SibSp']+test['Parch']>=3), 1, test['Family_size'])
test['Family_size'] = np.where((test['SibSp']+test['Parch'])>=5, 2, test['Family_size'])

In [None]:
train = train.drop(columns = ['SibSp', 'Parch'])
test = test.drop(columns = ['SibSp', 'Parch'])

In [None]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Ticket,Fare,Cabin,Embarked,fold,title,Family_size
0,1,0,3,male,"(19.784, 22.0]",ELSE,"(-0.001, 7.55]",Unknown,S,1,Mr,0.0
1,2,1,1,female,"(34.0, 40.0]",PC,"(39.688, 77.958]",C85,C,4,Mrs,0.0
2,3,1,3,female,"(22.0, 27.0]",ELSE,"(7.854, 8.05]",Unknown,S,3,Miss,0.0
3,4,1,1,female,"(34.0, 40.0]",Number,"(39.688, 77.958]",C123,S,3,Mrs,0.0
4,5,0,3,male,"(34.0, 40.0]",Number,"(7.854, 8.05]",Unknown,S,0,Mr,0.0


### 6. Cabin number

In [None]:
train['Cabin Num'] = train['Cabin'].str.extract(r'([0-9]+)').fillna(0).astype(float)
test['Cabin Num'] = test['Cabin'].str.extract(r'([0-9]+)').fillna(0).astype(float)

In [None]:
train['Cabin Num'] = (train['Cabin Num']-np.mean(train['Cabin Num']))/np.std(train['Cabin Num'])
test['Cabin Num'] = (test['Cabin Num']-np.mean(test['Cabin Num']))/np.std(test['Cabin Num'])


### 5. Extract cabin value

In [None]:
#cabin  get first letter
def cabin(entry):
    rare_list = ['A', 'F', 'G', 'T']
    if entry=='Unknown':
        return 'NAN'
    elif entry[0] in rare_list:
        return 'Rare'
    else:
         return entry[0]
train['Cabin'] = train['Cabin'].apply(cabin)
test['Cabin'] = test['Cabin'].apply(cabin)

## Label Encoding

In [None]:
train = train.drop('PassengerId', axis=1)
test = test.drop('PassengerId', axis=1)

In [None]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Ticket,Fare,Cabin,Embarked,fold,title,Family_size,Cabin Num
0,0,3,male,"(19.784, 22.0]",ELSE,"(-0.001, 7.55]",NAN,S,1,Mr,0.0,-0.421322
1,1,1,female,"(34.0, 40.0]",PC,"(39.688, 77.958]",C,C,4,Mrs,0.0,2.73859
2,1,3,female,"(22.0, 27.0]",ELSE,"(7.854, 8.05]",NAN,S,3,Miss,0.0,-0.421322
3,1,1,female,"(34.0, 40.0]",Number,"(39.688, 77.958]",C,S,3,Mrs,0.0,4.151256
4,0,3,male,"(34.0, 40.0]",Number,"(7.854, 8.05]",NAN,S,0,Mr,0.0,-0.421322


In [None]:
#transfrom categorical data to numbers
def label_encoding(train: pd.DataFrame, test: pd.DataFrame, encode_cols):
    n_train = len(train)
    train = pd.concat([train, test], sort=False).reset_index(drop=True)
    for f in encode_cols:
        try:
            lbl = preprocessing.LabelEncoder()
            train[f] = lbl.fit_transform(list(train[f].values))
        except:
            print(f)
    test = train[n_train:].reset_index(drop=True)
    train = train[:n_train]
    return train, test

In [None]:
train, test = label_encoding(train, test, ['Sex', 'Ticket', 'Embarked', 'Age', 'Cabin Num', 'Fare','Cabin'])

In [None]:
train = pd.get_dummies(train)
test= pd.get_dummies(test)

In [None]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Ticket,Fare,Cabin,Embarked,fold,Family_size,Cabin Num,title_Dr,title_ELSE,title_Master,title_Miss,title_Mr,title_Mrs,title_Rev
0,0.0,3,1,4,1,0,4,2,1.0,0.0,0,0,0,0,0,1,0,0
1,1.0,1,0,15,3,15,1,0,4.0,0.0,113,0,0,0,0,0,1,0
2,1.0,3,0,6,1,5,4,2,3.0,0.0,0,0,0,0,1,0,0,0
3,1.0,1,0,15,2,15,1,2,3.0,0.0,142,0,0,0,0,0,1,0
4,0.0,3,1,15,2,5,4,2,0.0,0.0,0,0,0,0,0,1,0,0


In [None]:
test.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Ticket,Fare,Cabin,Embarked,fold,Family_size,Cabin Num,title_Dr,title_ELSE,title_Master,title_Miss,title_Mr,title_Mrs,title_Rev
0,,3,1,14,2,4,4,1,,0.0,1,0,0,0,0,1,0,0
1,,3,0,16,2,1,4,2,,0.0,1,0,0,0,0,0,1,0
2,,2,1,19,2,7,4,1,,0.0,1,0,0,0,0,1,0,0
3,,3,1,9,2,7,4,2,,0.0,1,0,0,0,0,1,0,0
4,,3,0,5,2,9,4,2,,0.0,1,0,0,0,0,0,1,0


# Building Model

## LGB

In [None]:
def optuna_lgb(n_trials=100):
    
    def objective(trial):

        params = {
            "objective": "binary",
            "metric": "binary_logloss",
            "verbosity": -1,
            "boosting_type": "gbdt",
            "lambda_l1": trial.suggest_float("lambda_l1", 1e-5, 10, log=True),
            "lambda_l2": trial.suggest_float("lambda_l2", 1e-5, 10, log=True),
            "num_leaves": trial.suggest_int("num_leaves", 2, 62),
            "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 0.9),
            "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 0.9),
            "bagging_freq": trial.suggest_int("bagging_freq", 2, 9),
            "min_child_samples": trial.suggest_int("min_child_samples", 33, 93),
            "max_depth": trial.suggest_int("max_depth", 2, 7)
        }

        # initialize oof 
        oof_train = np.zeros((len(train),))

        for i in range(5):
            train_x = train.query(f'fold!={i}').drop(['fold', 'Survived'], axis=1)
            train_y = train.query(f'fold!={i}').Survived

            valid_x = train.query(f'fold=={i}').drop(['fold', 'Survived'], axis=1)
            valid_y = train.query(f'fold=={i}').Survived
            
            lgb_train = lgb.Dataset(train_x,
                                    train_y)

            lgb_eval = lgb.Dataset(valid_x,
                                   valid_y,
                                   reference=lgb_train)

            gbm = lgb.train(params,
                            lgb_train,
                            valid_sets=[lgb_train, lgb_eval],
                            num_boost_round=5000,
                            verbose_eval=-1, 
                            early_stopping_rounds=1000
                            )
            oof_preds = gbm.predict(valid_x, num_iteration=gbm.best_iteration)
            oof_train[valid_x.index] = gbm.predict(valid_x,
                                                   num_iteration=gbm.best_iteration)
            
            
        return accuracy_score(train.Survived, oof_train.round())
    
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)

    print("Number of finished trials: {}".format(len(study.trials)))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))
    
    best_params = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
    }
    best_params.update(study.best_params)
    ## save trained model
    # initialize oof 
    oof_train = np.zeros((len(train),))
    print('*'*50)
    print('Exporting best models')
    print('*' * 50)
    # split folds using RSGKF
    for i in range(5):
        train_x = train.query(f'fold!={i}').drop(['fold', 'Survived'], axis=1)
        train_y = train.query(f'fold!={i}').Survived

        valid_x = train.query(f'fold=={i}').drop(['fold', 'Survived'], axis=1)
        valid_y = train.query(f'fold=={i}').Survived

        lgb_train = lgb.Dataset(train_x,
                                train_y)

        lgb_eval = lgb.Dataset(valid_x,
                               valid_y,
                               reference=lgb_train)

        gbm = lgb.train(best_params,
                        lgb_train,
                        valid_sets=[lgb_train, lgb_eval],
                        num_boost_round=50000,
                        verbose_eval=-1, 
                        early_stopping_rounds=1001
                        )

        oof_preds = gbm.predict(valid_x, num_iteration=gbm.best_iteration)
        oof_train[valid_x.index] = gbm.predict(valid_x, num_iteration=gbm.best_iteration)

        joblib.dump(gbm, f'lgb_fold{i}.pkl')

In [None]:
optuna_lgb()

Early stopping, best iteration is:
[127]	training's binary_logloss: 0.361735	valid_1's binary_logloss: 0.455804
Training until validation scores don't improve for 1000 rounds
Early stopping, best iteration is:
[649]	training's binary_logloss: 0.312182	valid_1's binary_logloss: 0.397094
Training until validation scores don't improve for 1000 rounds
[32m[I 2021-10-03 12:58:29,789][0m Trial 39 finished with value: 0.8316498316498316 and parameters: {'lambda_l1': 0.0001895787116162189, 'lambda_l2': 0.008932702538828505, 'num_leaves': 43, 'feature_fraction': 0.6987224163891123, 'bagging_fraction': 0.67945154902962, 'bagging_freq': 5, 'min_child_samples': 67, 'max_depth': 6}. Best is trial 4 with value: 0.8372615039281706.[0m
Early stopping, best iteration is:
[1401]	training's binary_logloss: 0.276952	valid_1's binary_logloss: 0.369364
Training until validation scores don't improve for 1000 rounds
Early stopping, best iteration is:
[962]	training's binary_logloss: 0.358434	valid_1's bina

## XGBoost

In [None]:
#optuna example
#https://github.com/optuna/optuna-examples/blob/main/xgboost/xgboost_integration.py
import xgboost as xgb
def objective(trial):
    param = {
            "verbosity": 0,
            "objective": "binary:logistic",  #binary:logistic
            "eval_metric": "auc",  #map
            "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
            "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
            "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        }

    if param["booster"] == "gbtree" or param["booster"] == "dart":
        param["max_depth"] = trial.suggest_int("max_depth", 1, 9)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    for i in range(5):
        train_x = train.query(f'fold!={i}').drop(['fold', 'Survived'], axis=1)
        train_y = train.query(f'fold!={i}').Survived

        valid_x = train.query(f'fold=={i}').drop(['fold', 'Survived'], axis=1)
        valid_y = train.query(f'fold=={i}').Survived

        dtrain = xgb.DMatrix(train_x, label=train_y)
        dvalid = xgb.DMatrix(valid_x, label=valid_y)
        
        accuracy = []
        # Add a callback for pruning.
        pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation-auc")
        bst = xgb.train(param, dtrain, evals=[(dvalid, "validation")], callbacks=[pruning_callback])
        preds = bst.predict(dvalid)
        pred_labels = np.rint(preds)
        accuracy.append(sklearn.metrics.accuracy_score(valid_y, pred_labels))
        
        
        joblib.dump(bst, f'xgboost_fold{i}.pkl')
    return np.mean(accuracy)

    
if __name__ == "__main__":
    study = optuna.create_study(
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=5), direction="maximize"
    )
    study.optimize(objective, n_trials=100)
    print(study.best_trial)

[6]	validation-auc:0.87801
[7]	validation-auc:0.87654
[8]	validation-auc:0.87179
[9]	validation-auc:0.87286
[0]	validation-auc:0.82487
[1]	validation-auc:0.85308
[2]	validation-auc:0.85054
[3]	validation-auc:0.84880
[4]	validation-auc:0.85107
[5]	validation-auc:0.84626
[6]	validation-auc:0.82995
[7]	validation-auc:0.82874
[8]	validation-auc:0.82707
[9]	validation-auc:0.83135
[0]	validation-auc:0.86350
[1]	validation-auc:0.86190
[2]	validation-auc:0.86150
[3]	validation-auc:0.86110
[4]	validation-auc:0.86143
[5]	validation-auc:0.86023
[6]	validation-auc:0.86805
[7]	validation-auc:0.86765
[8]	validation-auc:0.86704
[9]	validation-auc:0.86678
[0]	validation-auc:0.84823
[1]	validation-auc:0.84756
[2]	validation-auc:0.85627
[3]	validation-auc:0.85840
[4]	validation-auc:0.86132
[5]	validation-auc:0.86139
[6]	validation-auc:0.86252
[7]	validation-auc:0.86518
[8]	validation-auc:0.86637
[9]	validation-auc:0.86451
[32m[I 2021-10-03 12:37:26,970][0m Trial 60 finished with value: 0.8146067415730

### Logistic

In [None]:
#logistic
from sklearn import linear_model

#Step 1. Define an objective function to be maximized.
def objective(trial):

    classifier_name = 'LogReg'
    logreg_c = trial.suggest_float("logreg_c", 1e-10, 1e10, log=True)
    classifier_obj = linear_model.LogisticRegression(C=logreg_c)
        
    for i in range(5):
        train_x = train.query(f'fold!={i}').drop(['fold', 'Survived'], axis=1)
        train_y = train.query(f'fold!={i}').Survived

        valid_x = train.query(f'fold=={i}').drop(['fold', 'Survived'], axis=1)
        valid_y = train.query(f'fold=={i}').Survived
    
        #save the five fold accuracy
        accuracy = []

        gbm = classifier_obj
        gbm.fit(train_x, train_y)
        preds = gbm.predict(valid_x)
        pred_labels = np.rint(preds)
        accuracy.append(accuracy_score(valid_y, pred_labels))
                
        #save the best model for each fold 
        joblib.dump(gbm, f'logistic_fold{i}.pkl')
        
    return np.mean(accuracy)

if __name__ == "__main__":
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=100, timeout=600)

    print("Number of finished trials: {}".format(len(study.trials)))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

[32m[I 2021-10-03 13:06:38,677][0m A new study created in memory with name: no-name-5d10eb6a-edaa-4fd5-9272-7ad6b88b18bf[0m
[32m[I 2021-10-03 13:06:39,084][0m Trial 0 finished with value: 0.6123595505617978 and parameters: {'logreg_c': 8.439924122911414e-09}. Best is trial 0 with value: 0.6123595505617978.[0m
[32m[I 2021-10-03 13:06:39,584][0m Trial 1 finished with value: 0.6123595505617978 and parameters: {'logreg_c': 1.9074766503214344e-10}. Best is trial 0 with value: 0.6123595505617978.[0m
[32m[I 2021-10-03 13:06:40,671][0m Trial 2 finished with value: 0.8146067415730337 and parameters: {'logreg_c': 112213.66829628668}. Best is trial 2 with value: 0.8146067415730337.[0m
[32m[I 2021-10-03 13:06:41,182][0m Trial 3 finished with value: 0.6123595505617978 and parameters: {'logreg_c': 1.1411550103568469e-08}. Best is trial 2 with value: 0.8146067415730337.[0m
[32m[I 2021-10-03 13:06:42,192][0m Trial 4 finished with value: 0.8202247191011236 and parameters: {'logreg_c': 

### Catboost

In [None]:
#optuna catboost example
#https://github.com/optuna/optuna-examples/blob/main/catboost/catboost_simple.py
import catboost as cb
def objective(trial):
    param = {
        "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        "used_ram_limit": "3gb",
    }

    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    for i in range(5):
        train_x = train.query(f'fold!={i}').drop(['fold', 'Survived'], axis=1)
        train_y = train.query(f'fold!={i}').Survived

        valid_x = train.query(f'fold=={i}').drop(['fold', 'Survived'], axis=1)
        valid_y = train.query(f'fold=={i}').Survived
        
        #save the five fold accuracy
        accuracy = []
        
        gbm = cb.CatBoostClassifier(**param)
        gbm.fit(train_x, train_y, eval_set=[(valid_x, valid_y)], verbose=0, early_stopping_rounds=100)
        preds = gbm.predict(valid_x)
        pred_labels = np.rint(preds)
        accuracy.append(accuracy_score(valid_y, pred_labels))
        
        #save the best model for each fold 
        joblib.dump(gbm, f'catboost_fold{i}.pkl')
    return np.mean(accuracy)

if __name__ == "__main__":
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=100, timeout=600)

    print("Number of finished trials: {}".format(len(study.trials)))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

[32m[I 2021-10-03 13:08:24,691][0m A new study created in memory with name: no-name-15ab1b0f-08e1-47e8-b6be-f4e20cc0912e[0m
[32m[I 2021-10-03 13:08:31,783][0m Trial 0 finished with value: 0.8089887640449438 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.055062637218881695, 'depth': 8, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 3.6933611318320017}. Best is trial 0 with value: 0.8089887640449438.[0m
[32m[I 2021-10-03 13:08:37,234][0m Trial 1 finished with value: 0.8146067415730337 and parameters: {'objective': 'CrossEntropy', 'colsample_bylevel': 0.08407485346821005, 'depth': 10, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS'}. Best is trial 1 with value: 0.8146067415730337.[0m
[32m[I 2021-10-03 13:08:39,011][0m Trial 2 finished with value: 0.8258426966292135 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.01250352878243508, 'depth': 9, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_t

## Ensemble lgb and xgboost

In [None]:
X_train = train.drop(['Survived', 'fold'], axis=1).values
y_train = train[['Survived']].values

In [None]:
test2 = test.drop(['Survived', 'fold'], axis=1).values

Unnamed: 0,Survived,Pclass,Sex,Age,Ticket,Fare,Cabin,Embarked,fold,Family_size,Cabin Num,title_Dr,title_ELSE,title_Master,title_Miss,title_Mr,title_Mrs,title_Rev
0,,3,1,14,2,4,4,1,,0.0,0,0,0,0,0,1,0,0
1,,3,0,16,2,1,4,2,,0.0,0,0,0,0,0,0,1,0
2,,2,1,19,2,7,4,1,,0.0,0,0,0,0,0,1,0,0
3,,3,1,9,2,7,4,2,,0.0,0,0,0,0,0,1,0,0
4,,3,0,5,2,9,4,2,,0.0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,,3,1,8,1,4,4,2,,0.0,0,0,0,0,0,1,0,0
414,,1,0,14,3,18,1,0,,0.0,90,0,1,0,0,0,0,0
415,,3,1,14,1,1,4,2,,0.0,0,0,0,0,0,1,0,0
416,,3,1,8,2,4,4,2,,0.0,0,0,0,0,0,1,0,0


In [None]:
#get the mean of best fold in lgb
lgb_pred = np.zeros(418)
for i in range(5):
    model = joblib.load(f'./lgb_fold{i}.pkl')
    lgb_pred += model.predict(test2) / 5
    
lgb_pred = lgb_pred.round().astype(int)

In [None]:
#get the mean of best fold in xgboost
xgboost_pred = np.zeros(418)
for i in range(5):
    model = joblib.load(f'./xgboost_fold{i}.pkl')
    model = xgb.XGBClassifier()
    model.fit(X_train, y_train)
    xgboost_pred += model.predict_proba(test2)[:,1] / 5
    
xgboost_pred = xgboost_pred.round().astype(int)

In [None]:
#get the mean of best fold in logistic  0.8426966292134831 0.78947 selected nice
logistic_pred = np.zeros(418)
for i in range(5):
    model = joblib.load(f'./logistic_fold{i}.pkl')
    logistic_pred += model.predict(test2) / 5
    
logistic_pred = logistic_pred.round().astype(int)

In [None]:
#get the mean of best fold in catboost  0.8426966292134831  nice  0.77990 selected
catboost_pred = np.zeros(418)
for i in range(5):
    model = joblib.load(f'./catboost_fold{i}.pkl')
    catboost_pred += model.predict(test2) / 5
    
catboost_pred = catboost_pred.round().astype(int)

In [None]:
#combine
f_pro = (lgb_pred + xgboost_pred + catboost_pred + logistic_pred)/4    #predict probability [0, 1]


In [None]:
pred = []
for i in f_pro:
    if i > 0.5:
        pred.append(1)
    else:
        pred.append(0)

In [None]:
sub.Survived = pred

In [None]:
sub

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [None]:
sub.to_csv('submission_lastly.csv', index=False)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=6f45f37b-2499-4526-9eb4-651deeca3084' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>