The approach in this notebook is deliberately simple and straightforward:

- Fill NaN data with simple values (a `-999` sentinel for Age, the most frequent value for Embarked and Fare)
- No extra features
- Tune three models (LightGBM, Random Forest, Extra Trees) with a cross-validated grid search and simply combine them in a soft-voting ensemble

In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [45]:
# Load the two CSV splits, using the first CSV column as the row index.
train = pd.read_csv('./input/train.csv', index_col=0)  # the index column could also be given by name ('PassengerId')
test = pd.read_csv('./input/test.csv', index_col=0)
# Preview the first four training rows.
train.head(4)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S


## Arrange the Data

In [57]:
from sklearn.preprocessing import LabelEncoder

# Work on a copy so the raw `train` frame stays untouched.
titanic_train = train.copy()

In [58]:
# Build the model-ready training frame from the raw `train` data.
# Deriving it from `train` (instead of mutating `titanic_train` in place) keeps
# this cell idempotent: re-running it no longer raises KeyError on the
# already-dropped columns or re-encodes already-encoded values.

# drop irrelevant data
titanic_train = train.drop(columns=['Ticket', 'Cabin', 'Name'])
# titanic_train[titanic_train.isnull().any(axis=1)]  # show the rows that contain NaN

# fill NaN data
# Assign the filled column back rather than calling fillna(inplace=True) on a
# column selection: that chained form is deprecated in pandas 2.x and leaves
# the frame unchanged once Copy-on-Write is enabled.
titanic_train['Age'] = titanic_train['Age'].fillna(-999)  # -999 marks a missing age
titanic_train['Embarked'] = titanic_train['Embarked'].fillna(train['Embarked'].mode()[0])

# label encoding: replace the string categories with integer codes
for attr in ['Sex', 'Embarked']:
    le = LabelEncoder()
    titanic_train[attr] = le.fit_transform(titanic_train[attr])

titanic_train.head(4)

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,1,22.0,1,0,7.25,2
2,1,1,0,38.0,1,0,71.2833,0
3,1,3,0,26.0,0,0,7.925,2
4,1,1,0,35.0,1,0,53.1,2


## Train a Model

In [95]:
import lightgbm as lgb
from sklearn import ensemble
from sklearn import model_selection

# Random seed handed to the estimators below so that runs are repeatable.
rnd_state = 42

In [88]:
# Separate the target vector from the feature matrix.
y_train = titanic_train['Survived']
X_train = titanic_train.drop(columns='Survived')

### Ensembling
#### Hyperparameter search

In [123]:
# LightGBM
lgbm = lgb.LGBMClassifier(silent=False, random_state=42)

param_grid = {
    'learning_rate': [1e-4, 1e-3, 1e-2, 1e-1], 
    'n_estimators':[10, 100, 1000]
}

grid_search = model_selection.GridSearchCV(lgbm, param_grid, scoring='accuracy', cv=5, n_jobs=8)
grid_search.fit(X_train, y_train)
best_lgbm = grid_search.best_estimator_

In [124]:
# Show the winning LightGBM parameter combination and its cross-validated accuracy.
print(grid_search.best_params_, grid_search.best_score_, sep='\n')
# pd.DataFrame(grid_search.cv_results_)  # uncomment to inspect every candidate

{'learning_rate': 0.1, 'n_estimators': 100}
0.8271604938271605


In [106]:
# Random Forest
rf_clf = ensemble.RandomForestClassifier(random_state=rnd_state)

rf_param_grid = {
    'n_estimators' :[100, 200, 400],  # the number of trees in the forest
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 3, 10, 20],  # The minimum number of samples required to split an internal node
    "min_samples_leaf": [1, 3, 10],  # The minimum number of samples required to be at a leaf node
    # "max_features": [1, 3, 10],
    "bootstrap": [True, False]
}

rf_grid_search = model_selection.GridSearchCV(rf_clf, rf_param_grid, scoring='accuracy', cv=5, n_jobs=8)
rf_grid_search.fit(X_train, y_train)
best_rf_clf = rf_grid_search.best_estimator_

In [107]:
# Show the winning Random Forest parameter combination and its cross-validated accuracy.
print(rf_grid_search.best_params_, rf_grid_search.best_score_, sep='\n')

{'bootstrap': False, 'criterion': 'entropy', 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 200}
0.8271604938271605


In [129]:
# Extra Trees
ext_clf = ensemble.ExtraTreesClassifier()

ext_param_grid = {
    'n_estimators' :[100, 200, 500, 700], 
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 3, 10, 20],
    'min_samples_leaf': [1, 3, 10]
}

ext_grid_search = model_selection.GridSearchCV(ext_clf, ext_param_grid, scoring='accuracy', cv=5, n_jobs=8)
ext_grid_search.fit(X_train, y_train)
best_ext_clf = ext_grid_search.best_estimator_

In [130]:
# Show the winning Extra Trees parameter combination and its cross-validated accuracy.
print(ext_grid_search.best_params_, ext_grid_search.best_score_, sep='\n')

{'criterion': 'entropy', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 500}
0.8215488215488216


In [131]:
# Combine the three tuned models in a soft-voting ensemble and fit it on the
# full training set.
voting_clf = ensemble.VotingClassifier(
    estimators=[
        ('lgbm', best_lgbm),
        ('rf', best_rf_clf),
        ('ext', best_ext_clf),
    ],
    voting='soft',
    n_jobs=8,
)
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lgbm',
                              LGBMClassifier(boosting_type='gbdt',
                                             class_weight=None,
                                             colsample_bytree=1.0,
                                             importance_type='split',
                                             learning_rate=0.1, max_depth=-1,
                                             min_child_samples=20,
                                             min_child_weight=0.001,
                                             min_split_gain=0.0,
                                             n_estimators=100, n_jobs=-1,
                                             num_leaves=31, objective=None,
                                             random_state=42, reg_alpha=0.0,
                                             reg_lambda=0.0, silent=False,
                                             subsam...
                                                   class_w

## Prediction

In [135]:
# Build the model-ready test frame with the same steps used for the training data.

# drop irrelevant data (drop() returns a new frame, so `test` stays untouched)
titanic_test = test.drop(columns=['Ticket', 'Cabin', 'Name'])

# fill NaN data
# Assign the filled columns back rather than calling fillna(inplace=True) on a
# column selection: that chained form is deprecated in pandas 2.x and leaves
# the frame unchanged once Copy-on-Write is enabled.
# Fill values are taken from the training split so the test rows are imputed
# with values derived from the data the models were fitted on.
titanic_test['Age'] = titanic_test['Age'].fillna(-999)  # -999 marks a missing age
titanic_test['Embarked'] = titanic_test['Embarked'].fillna(train['Embarked'].mode()[0])
titanic_test['Fare'] = titanic_test['Fare'].fillna(train['Fare'].mode()[0])

# label encoding
# Fit each encoder on the (mode-filled) training column and only transform the
# test column, so the category -> integer mapping is the one the models were
# trained with. transform() raises ValueError if the test split contains a
# category that never occurs in training, instead of silently shifting codes.
for attr in ['Sex', 'Embarked']:
    le = LabelEncoder()
    le.fit(train[attr].fillna(train[attr].mode()[0]))
    titanic_test[attr] = le.transform(titanic_test[attr])

titanic_test.head(4)

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
892,3,1,34.5,0,0,7.8292,1
893,3,0,47.0,1,0,7.0,2
894,2,1,62.0,0,0,9.6875,1
895,3,1,27.0,0,0,8.6625,2


In [154]:
# Predict survival for the test passengers with the fitted ensemble.
test_survived = voting_clf.predict(titanic_test)

# Two-column submission table: passenger id (the index of `test`) and prediction.
result = pd.DataFrame({
    'PassengerId': test.index.values,
    'Survived': test_survived,
})

# Write the submission file without the positional row index.
result.to_csv("ensemble_voting.csv", index=False)