In this notebook, we add many features base on `analysis.ipynb` and make predictions.

In [354]:
import numpy as np
import pandas as pd
from collections import Counter

In [358]:
titanic_train = pd.read_csv('input/train.csv')
titanic_test = pd.read_csv('input/test.csv')

In [359]:
data = pd.concat([titanic_train, titanic_test], sort=False)

## Arrange the Data

In [360]:
# fill the less frequently appeared NaN with mean and mode
data['Fare'].fillna(data['Fare'].mean(), inplace=True)
data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)

## Fill Age with the median age of similar rows according to Pclass, Parch and SibSp
index_NaN_age = data['Age'][data['Age'].isnull()].index.values
age_med = data['Age'].median()
for i in index_NaN_age:
    age_pred = data['Age'][((data['SibSp'] == data.iloc[i]['SibSp']) 
                            & (data['Parch'] == data.iloc[i]['Parch']) 
                            & (data['Pclass'] == data.iloc[i]['Pclass']))].median()
    if not np.isnan(age_pred):
        data['Age'].loc[i] = age_pred
    else:
        data['Age'].loc[i] = age_med

In [361]:
# add Title feature from Name
data_title = [i.split(',')[1].split('.')[0].strip() for i in data['Name']]
data['Title'] = pd.Series(data_title)
data["Title"] = data["Title"].replace(['Lady', 'the Countess',
                                       'Countess','Capt', 'Col','Don',
                                       'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer',
                                       'Dona'], 'Rare')
data["Title"] = data["Title"].map({"Master":0,
                                   "Miss":1, "Ms" : 1 , "Mme":1, "Mlle":1, "Mrs":1,
                                   "Mr":2, "Rare":3})
data["Title"] = data["Title"].astype(int)

In [362]:
# add the family-size feature
data['Fsize'] = data['Parch'] + data['SibSp'] + 1

# Create new feature of family size
data['Single'] = data['Fsize'].map(lambda s: 1 if s == 1 else 0)
data['SmallF'] = data['Fsize'].map(lambda s: 1 if  s == 2  else 0)
data['MedF'] = data['Fsize'].map(lambda s: 1 if 3 <= s <= 4 else 0)
data['LargeF'] = data['Fsize'].map(lambda s: 1 if s >= 5 else 0)

# add Cabin feature
for i, item in data['Cabin'].iteritems():
    if not pd.isnull(item):
        data.loc[i, 'Cabin'] = item[0]
    else:
        data.loc[i, 'Cabin'] = 'X'
        
## Treat Ticket by extracting the ticket prefix. When there is no prefix it returns X. 
Ticket = []
for i in list(data.Ticket):
    if not i.isdigit() :
        Ticket.append(i.replace(".","").replace("/","").strip().split(' ')[0])
    else:
        Ticket.append("X")
        
data['Ticket'] = Ticket

In [363]:
data.drop(['PassengerId', 'Name'], axis=1, inplace=True)
data["Sex"] = data["Sex"].map({"male": 0, "female":1})
data = pd.get_dummies(data, columns=['Cabin'], prefix='Cabin')
data = pd.get_dummies(data, columns=['Ticket'], prefix='Ticket')
data = pd.get_dummies(data, columns=['Title'], prefix='Title')
data = pd.get_dummies(data, columns=["Embarked"], prefix="Embarked")
data = pd.get_dummies(data, columns=["Pclass"], prefix="Pclass")
data.head(4)

Unnamed: 0,Survived,Sex,Age,SibSp,Parch,Fare,Fsize,Single,SmallF,MedF,...,Title_0,Title_1,Title_2,Title_3,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3
0,0.0,0,22.0,1,0,7.25,2,0,1,0,...,0,0,1,0,0,0,1,0,0,1
1,1.0,1,38.0,1,0,71.2833,2,0,1,0,...,0,1,0,0,1,0,0,1,0,0
2,1.0,1,26.0,0,0,7.925,1,1,0,0,...,0,1,0,0,0,0,1,0,0,1
3,1.0,1,35.0,1,0,53.1,2,0,1,0,...,0,1,0,0,0,0,1,1,0,0


## Train a Model

In [369]:
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import feature_selection 
from sklearn import ensemble
from sklearn import svm
import lightgbm

In [365]:
train_len = len(titanic_train)
X_train = data[:train_len].drop(['Survived'], axis=1)
Y_train = data[:train_len]['Survived']

### Ensemble 

In [366]:
def show_result(grid_search):
    print(grid_search.best_params_)
    print(grid_search.best_score_)

In [338]:
# Random Forest
rf_clf = ensemble.RandomForestClassifier(n_estimators=50, max_features='sqrt')

rf_param_grid = {
              "max_depth": [None],
              "n_estimators" :[100,300],
              "max_features": [1, 3, 10],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [False],
              "criterion": ["gini"]
}

rf_grid_search = model_selection.GridSearchCV(rf_clf, rf_param_grid, 
                                              scoring='accuracy', cv=10, n_jobs=8)
rf_grid_search.fit(X_train, Y_train)
rf_clf_best = rf_grid_search.best_estimator_

In [339]:
show_result(rf_grid_search)

{'bootstrap': False, 'criterion': 'gini', 'max_depth': None, 'max_features': 3, 'min_samples_leaf': 3, 'min_samples_split': 3, 'n_estimators': 100}
0.8308740068104427


In [347]:
# SVM classifier
svm_clf = svm.SVC(probability=True)
svc_param_grid = {'kernel': ['rbf'], 
                  'gamma': [ 0.001, 0.01, 0.1, 1],
                  'C': [1, 10, 50, 100, 200, 300, 1000]}

svm_grid_search= model_selection.GridSearchCV(SVM_clf, svc_param_grid, 
                                      cv=10, scoring="accuracy", n_jobs=8, verbose=1)
svm_grid_search.fit(X_train,Y_train)

svm_clf_best = grid_search_svm.best_estimator_

Fitting 10 folds for each of 28 candidates, totalling 280 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  52 tasks      | elapsed:    1.5s
[Parallel(n_jobs=8)]: Done 280 out of 280 | elapsed:    8.8s finished


In [348]:
show_result(grid_search_svm)

{'C': 200, 'gamma': 0.001, 'kernel': 'rbf'}
0.8103254769921436


In [341]:
# LightGBM
lgbm = lightgbm.LGBMClassifier(silent=False)

lgbm_param_grid = {
    'learning_rate': [1e-4, 1e-3, 1e-2, 1e-1], 
    'n_estimators':[100, 200, 300],
    'max_depth': [4, 8, 15],
    'min_samples_leaf': [100,150],
    'max_features': [0.3, 0.1],
    'early_stopping_rounds': [10]
}

lgbm_grid_search = model_selection.GridSearchCV(lgbm, param_grid, 
                                                scoring='accuracy', cv=10, n_jobs=8)
lgbm_grid_search.fit(X_train, Y_train)

lgbm_best = lgbm_grid_search.best_estimator_

In [342]:
show_result(lgbm_grid_search)

{'learning_rate': 0.1, 'n_estimators': 100}
0.8274687854710556


In [336]:
# Extra Trees
ext_clf = ensemble.ExtraTreesClassifier()

ext_param_grid = {"max_depth": [None],
              "max_features": [1, 3, 10],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [False],
              "n_estimators" :[100,300],
              "criterion": ["gini"]}

ext_grid_search = model_selection.GridSearchCV(ext_clf, ext_param_grid, 
                                               scoring='accuracy', 
                                               cv=10, n_jobs=8)
ext_grid_search.fit(X_train, Y_train)
ext_clf_best = ext_grid_search.best_estimator_

In [337]:
show_result(ext_grid_search)

{'bootstrap': False, 'criterion': 'gini', 'max_depth': None, 'max_features': 3, 'min_samples_leaf': 3, 'min_samples_split': 10, 'n_estimators': 300}
0.8297389330306469


In [350]:
voting_clf = ensemble.VotingClassifier(estimators=[
    ('svm', svm_clf_best), ('lgbm', lgbm_best), 
    ('rf', rf_clf_best), ('ext', ext_clf_best)], voting='soft', n_jobs=8)

voting_clf.fit(X_train, Y_train)

VotingClassifier(estimators=[('svm',
                              SVC(C=200, cache_size=200, class_weight=None,
                                  coef0=0.0, decision_function_shape='ovr',
                                  degree=3, gamma=0.001, kernel='rbf',
                                  max_iter=-1, probability=True,
                                  random_state=None, shrinking=True, tol=0.001,
                                  verbose=False)),
                             ('lgbm',
                              LGBMClassifier(boosting_type='gbdt',
                                             class_weight=None,
                                             colsample_bytree=1.0,
                                             importance_type='split',
                                             le...
                                                   class_weight=None,
                                                   criterion='gini',
                                                

## Prediction

In [377]:
test = data[train_len:].drop(['Survived'], axis=1)

In [352]:
test_survived = voting_clf.predict(test)
# test_survived = rf_clf_best.predict(test)
result = pd.concat([titanic_test['PassengerId'],
                    pd.Series(test_survived, name='Survived', dtype='int')],
                   axis=1)

result.to_csv("ensemble_voting.csv", index=False)

In [353]:
# voting_clf.score(X_train, Y_train)
# print('SVM:', svm_clf_best.score(X_train, Y_train))
print('Randam Forest:', rf_clf_best.score(X_train, Y_train))
print('LightGBM:', lgbm_best.score(X_train, Y_train))
print('Extra Tree:', ext_clf_best.score(X_train, Y_train))

Randam Forest: 0.8626560726447219
LightGBM: 0.960272417707151
Extra Tree: 0.8535754824063564


In [379]:
test_survived = model.predict(test)
result = pd.concat([titanic_test['PassengerId'],
                    pd.Series(test_survived, name='Survived', dtype='int')],
                   axis=1)
result.to_csv("rf_clf.csv", index=False)

In [380]:
model.score(X_train, Y_train)

0.8630751964085297