In [445]:
import numpy as np
import pandas as pd

This notebook follows the procedure of this URL:

https://ahmedbesbes.com/how-to-score-08134-in-titanic-kaggle-challenge.html

In [446]:
titanic_train = pd.read_csv('input/train.csv')
titanic_test = pd.read_csv('input/test.csv')
data = pd.concat([titanic_train, titanic_test], ignore_index=True, sort=False)

In [447]:
# fill few NaN
data['Fare'].fillna(data['Fare'].mean(), inplace=True)
data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)

## Feature Engineering

### Name/Title

In [448]:
data_title = pd.Series([i.split(',')[1].split('.')[0].strip() for i in data['Name']])
title_dic = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir": "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess":"Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr": "Mr",
    "Mrs": "Mrs",
    "Miss": "Miss",
    "Master": "Master",
    "Lady": "Royalty",
    'Dona': 'Royality'
}
data['Title'] = data_title.map(title_dic)

### FamilySize

In [449]:
data['FSize'] = data['Parch'] + data['SibSp'] + 1

data['Single'] = data['FSize'].map(lambda s: 1 if s == 1 else 0)
data['SmallF'] = data['FSize'].map(lambda s: 1 if  s == 2  else 0)
data['MedF'] = data['FSize'].map(lambda s: 1 if 2 <= s <= 4 else 0)
data['LargeF'] = data['FSize'].map(lambda s: 1 if s >= 5 else 0)

### Age

In [450]:
index_NaN_age = data['Age'][data['Age'].isnull()].index.values
age_med = data['Age'].median()
for i in index_NaN_age:
    age_pred = data['Age'][((data['SibSp'] == data.iloc[i]['SibSp']) 
                            & (data['Parch'] == data.iloc[i]['Parch']) 
                            & (data['Pclass'] == data.iloc[i]['Pclass']))].median()
    if not np.isnan(age_pred):
        data.loc[i, 'Age'] = age_pred
    else:
        data.loc[i, 'Age'] = age_med

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


### Cabin

In [451]:
for i, item in data['Cabin'].iteritems():
    if not pd.isnull(item):
        data.loc[i, 'Cabin'] = item[0]
    else:
        data.loc[i, 'Cabin'] = 'X'

### Ticket

In [452]:
for i, item in data['Ticket'].iteritems():
    if not item.isdigit() :
        data.loc[i, 'Ticket'] = item.replace(".","").replace("/","").strip().split(' ')[0]
    else:
        data.loc[i, 'Ticket'] = 'X' 

## Modeling

In [453]:
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import ensemble
from sklearn import svm
import lightgbm

In [454]:
def show_result(grid_search):
    print(grid_search.best_params_)
    print(grid_search.best_score_)

data.drop(['PassengerId', 'Name'], axis=1, inplace=True)
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,FSize,Single,SmallF,MedF,LargeF
0,0.0,3,male,22.0,1,0,A5,7.25,X,S,Mr,2,0,1,1,0
1,1.0,1,female,38.0,1,0,PC,71.2833,C,C,Mrs,2,0,1,1,0
2,1.0,3,female,26.0,0,0,STONO2,7.925,X,S,Miss,1,1,0,0,0
3,1.0,1,female,35.0,1,0,X,53.1,C,S,Mrs,2,0,1,1,0
4,0.0,3,male,35.0,0,0,X,8.05,X,S,Mr,1,1,0,0,0


In [455]:
# encoding
data["Sex"] = data["Sex"].map({"male": 0, "female":1})
cols = ['Ticket', 'Cabin', 'Embarked', 'Title']

# label-encoding
data_label = data.copy()
for col in cols:
    le = preprocessing.LabelEncoder()
    data_label[col] = le.fit_transform(data_label[col])
    
# one-hot encoding
data_onehot = data.copy()
for col in cols:
    data_onehot = pd.get_dummies(data_onehot, columns=[col], prefix=col)

In [456]:
train_len = len(titanic_train)
X_train_label = data_label[:train_len].drop('Survived', axis=1)
Y_train_label = data_label[:train_len]['Survived'].copy()
X_train_onehot = data_onehot[:train_len].drop('Survived', axis=1)
Y_train_onehot = data_onehot[:train_len]['Survived'].copy()

In [507]:
models = {
    'rf': ensemble.RandomForestClassifier(),
    'ext': ensemble.ExtraTreesClassifier(),
    'lgbm': lightgbm.LGBMClassifier(),
    'svc': svm.SVC(probability=True)
}

grid_params = {
    'rf': {
        'max_depth' : range(3, 11),
        'n_estimators': range(10, 51, 10),
        'max_features': ['sqrt', 'auto', 'log2'],
        'min_samples_split': range(2, 15),
        'min_samples_leaf': range(1, 8),
        'bootstrap': [True, False],
    },
    'ext': {
        "max_depth": [None],
        "max_features": [1, 3, 10],
        "min_samples_split": range(2, 20, 3),
        "min_samples_leaf": range(1, 10),
        "bootstrap": [False],
        "n_estimators" :[100, 200, 300],
        "criterion": ["gini"]
    },
    'lgbm': {
        'learning_rate': [1e-3, 1e-2, 1e-1], 
        'n_estimators':[100, 200],
        'max_depth': [3, 4, 6, 8],
        "min_samples_split": [30, 40],
        'min_samples_leaf': [20, 20],
        'max_features': [0.05, 0.1, 0.3],
    },
    '''
    'svc': {
        'kernel': ['rbf'], 
        'gamma': [ 0.001, 0.01, 0.1, 1],
        'C': [1, 10, 50, 100, 200, 300, 1000],
    }
    '''
}

# Change here for the first run
is_grid_search = {
    'rf': False,
    'ext': False,
    'lgbm': False,
    # 'svc': True,
}

In [None]:
estimator_best = {}
validation_scores = {}

In [515]:
for model in models:
    if is_grid_search[model]:
        grid_search = model_selection.GridSearchCV(models[model], grid_params[model],
                                                  scoring='accuracy', cv=5, n_jobs=8, iid='False')
        grid_search.fit(X_train_label, Y_train_label)
        estimator_best[model] = grid_search.best_estimator_
        validation_scores[model] = grid_search.best_score_
        
    print(estimator_best[model])
    print(validation_scores[model])

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
                       max_depth=10, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=7, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
0.8406285072951739
ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                     max_depth=None, max_features=3, max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=14,
                     min_weight_fraction_leaf=0.0, n_estimators=300,
                     n_jobs=None, oob_score=False, random_state=None, verbose=0,
                     warm_start=False)
0.8282828282828283

## Prediction

In [516]:
test_label = data_label[train_len:].drop('Survived', axis=1)
test_onehot = data_onehot[train_len:].drop('Survived', axis=1)
answer = pd.read_csv('answer.csv').drop('PassengerId', axis=1)
answer = answer.rename(columns={'Survived': 'Answer'})

In [519]:
for name, clf in estimator_best.items():
    pred = pd.Series(clf.predict(test_label), name='Survived', dtype=int)
    print(name+':', answer[answer['Answer'] == pred].count()/answer['Answer'].size)

rf: Answer    0.787081
dtype: float64
ext: Answer    0.782297
dtype: float64
lgbm: Answer    0.76555
dtype: float64
svc: Answer    0.72488
dtype: float64


RandomForestClassifer seems to get the best score.

In [520]:
pred = pd.Series(estimator_best['rf'].predict(test_label), name='Survived', dtype=int)
result = pd.concat([titanic_test['PassengerId'], pred], axis=1)
result.to_csv('rf.csv', index=False)

I created simple ensemble model, but this does not improve the score.

In [524]:
voting = ensemble.VotingClassifier(#estimators=estimator_best.items(), 
                                   estimators=[('rf', estimator_best['rf']), ('ext', estimator_best['ext'])],
                                   voting='soft', n_jobs=8)
voting.fit(X_train_label, Y_train_label)

pred = pd.Series(voting.predict(test_label), name='Survived', dtype=int)
print(answer[answer['Answer'] == pred].count()/answer['Answer'].size)

Answer    0.787081
dtype: float64


In [525]:
pred = pd.Series(voting.predict(test_label), name='Survived', dtype=int)
result = pd.concat([titanic_test['PassengerId'], pred], axis=1)
result.to_csv('voting.csv', index=False)