This notebook follows [How to score 0.8134 🏅 in Titanic Kaggle Challenge](https://ahmedbesbes.com/how-to-score-08134-in-titanic-kaggle-challenge.html).

In [1]:
import numpy as np
import lightgbm as lgb
import pandas as pd
import pathlib
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import ensemble
from sklearn import svm

COMP_NAME = 'titanic'
HOME_DIR = pathlib.Path('/home/aiskay/competitions') / COMP_NAME
SEED = 42

In [2]:
titanic_train = pd.read_csv(HOME_DIR / 'input' / COMP_NAME / 'train.csv')
titanic_test = pd.read_csv(HOME_DIR / 'input' / COMP_NAME / 'train.csv')

data = pd.concat([titanic_train, titanic_test], ignore_index=True, sort=False)
# fill few NaN
data['Fare'].fillna(data['Fare'].mean(), inplace=True)
data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)

data.head(4)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S


# Feature Engineering

## Name/Title

In [3]:
data_title = pd.Series([i.split(',')[1].split('.')[0].strip() for i in data['Name']])
title_dic = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir": "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess":"Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr": "Mr",
    "Mrs": "Mrs",
    "Miss": "Miss",
    "Master": "Master",
    "Lady": "Royalty",
    'Dona': 'Royality'
}
data['Title'] = data_title.map(title_dic)

## FamilySize

In [4]:
data['FSize'] = data['Parch'] + data['SibSp'] + 1

data['Single'] = data['FSize'].map(lambda s: 1 if s == 1 else 0)
data['SmallF'] = data['FSize'].map(lambda s: 1 if  s == 2  else 0)
data['MedF'] = data['FSize'].map(lambda s: 1 if 2 <= s <= 4 else 0)
data['LargeF'] = data['FSize'].map(lambda s: 1 if s >= 5 else 0)

## Age

In [5]:
index_NaN_age = data['Age'][data['Age'].isnull()].index.values
age_med = data['Age'].median()
for i in index_NaN_age:
    age_pred = data['Age'][((data['SibSp'] == data.iloc[i]['SibSp']) 
                            & (data['Parch'] == data.iloc[i]['Parch']) 
                            & (data['Pclass'] == data.iloc[i]['Pclass']))].median()
    if not np.isnan(age_pred):
        data.loc[i, 'Age'] = age_pred
    else:
        data.loc[i, 'Age'] = age_med

## Cabin

In [6]:
for i, item in data['Cabin'].iteritems():
    if not pd.isnull(item):
        data.loc[i, 'Cabin'] = item[0]
    else:
        data.loc[i, 'Cabin'] = 'X'

## Ticket

In [7]:
for i, item in data['Ticket'].iteritems():
    if not item.isdigit() :
        data.loc[i, 'Ticket'] = item.replace(".","").replace("/","").strip().split(' ')[0]
    else:
        data.loc[i, 'Ticket'] = 'X' 

## Fare

In [8]:
# qcut, cut: ビン分割処理
# qcut: 各ビンに含まれる要素数が等しくなるようにビン分割する
# cut: 最小値と最大値の間を指定した値で等間隔に分割してビン分けし、どこに属するかが出力される
# data['FareBin'] = pd.qcut(data['Fare'], 4)
# train['AgeBin'] = pd.cut(train['Age'].astype(int), 5)  # .astype() で型変換

# Modeling

In [9]:
def show_result(grid_search):
    print(grid_search.best_params_)
    print(grid_search.best_score_)

data.drop(['PassengerId', 'Name'], axis=1, inplace=True)
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,FSize,Single,SmallF,MedF,LargeF
0,0,3,male,22.0,1,0,A5,7.25,X,S,Mr,2,0,1,1,0
1,1,1,female,38.0,1,0,PC,71.2833,C,C,Mrs,2,0,1,1,0
2,1,3,female,26.0,0,0,STONO2,7.925,X,S,Miss,1,1,0,0,0
3,1,1,female,35.0,1,0,X,53.1,C,S,Mrs,2,0,1,1,0
4,0,3,male,35.0,0,0,X,8.05,X,S,Mr,1,1,0,0,0


In [10]:
# encoding
data["Sex"] = data["Sex"].map({"male": 0, "female":1})
cols = ['Ticket', 'Cabin', 'Embarked', 'Title']

# label-encoding
data_label = data.copy()
for col in cols:
    le = preprocessing.LabelEncoder()
    data_label[col] = le.fit_transform(data_label[col])
    
# one-hot encoding
data_onehot = data.copy()
for col in cols:
    data_onehot = pd.get_dummies(data_onehot, columns=[col], prefix=col)

In [11]:
train_len = len(titanic_train)
X_train_label = data_label[:train_len].drop('Survived', axis=1)
Y_train_label = data_label[:train_len]['Survived'].copy()
X_train_onehot = data_onehot[:train_len].drop('Survived', axis=1)
Y_train_onehot = data_onehot[:train_len]['Survived'].copy()

In [12]:
models = {
    'rf': ensemble.RandomForestClassifier(),
    'ext': ensemble.ExtraTreesClassifier(),
    'lgbm': lgb.LGBMClassifier(),
    'svc': svm.SVC(probability=True)
}

grid_params = {
    'rf': {
        'max_depth' : range(3, 11),
        'n_estimators': range(10, 51, 10),
        'max_features': ['sqrt', 'auto', 'log2'],
        'min_samples_split': range(2, 15),
        'min_samples_leaf': range(1, 8),
        'bootstrap': [True, False],
    },
    'ext': {
        "max_depth": [None],
        "max_features": [1, 3, 10],
        "min_samples_split": range(2, 20, 3),
        "min_samples_leaf": range(1, 10),
        "bootstrap": [False],
        "n_estimators" :[100, 200, 300],
        "criterion": ["gini"]
    },
    'lgbm': {
        'learning_rate': [1e-3, 1e-2, 1e-1],
        'n_estimators':[100, 200],
        'max_depth': [3, 4, 6, 8],
        "min_samples_split": [30, 40],
        'min_samples_leaf': [20, 20],
        'max_features': [0.05, 0.1, 0.3],
    },
    'svc': {
        'kernel': ['rbf'],
        'gamma': [ 0.001, 0.01, 0.1, 1],
        'C': [1, 10, 50, 100, 200, 300, 1000],
    }
}

# Change here for the first run
is_grid_search = {
    'rf': True,
    'ext': False,
    'lgbm': True,
    'svc': True,
}

In [13]:
estimator_best = {}
validation_scores = {}

In [14]:
for model in models:
    if is_grid_search[model]:
        grid_search = model_selection.GridSearchCV(
            models[model], grid_params[model],
            scoring='accuracy', cv=3, n_jobs=-1
        )
        grid_search.fit(X_train_label, Y_train_label)
        estimator_best[model] = grid_search.best_estimator_
        validation_scores[model] = grid_search.best_score_

        print(estimator_best[model])
        print(validation_scores[model])

RandomForestClassifier(max_depth=10, max_features='sqrt', min_samples_split=3,
                       n_estimators=50)
0.8372615039281706






































LGBMClassifier(max_depth=3, max_features=0.05, min_samples_leaf=20,
               min_samples_split=30, n_estimators=200)
0.840628507295174
SVC(C=300, gamma=0.001, probability=True)
0.7934904601571269


## Prediction

RandomForestClassifer gets the best score.

In [15]:
test_label = data_label[train_len:].drop('Survived', axis=1)
test_onehot = data_onehot[train_len:].drop('Survived', axis=1)

In [16]:
for name, clf in estimator_best.items():
    pred = pd.Series(clf.predict(test_label), name='Survived', dtype=int)
    result = pd.concat([titanic_test['PassengerId'], pred], axis=1)
    # result.to_csv(HOME_DIR / 'output/rf.csv', index=False)

## Simple Ensembling

I created simple ensemble model, but this does not improve the score.

In [19]:
voting = ensemble.VotingClassifier(
    estimators=[('rf', estimator_best['rf']), ('lgbm', estimator_best['lgbm'])],
    voting='soft',
    n_jobs=8
)
voting.fit(X_train_label, Y_train_label)

pred = pd.Series(voting.predict(test_label), name='Survived', dtype=int)
result = pd.concat([titanic_test['PassengerId'], pred], axis=1)
# result.to_csv(HOME_DIR / 'output/voting.csv', index=False)

