---

03/2019 <p style="text-align: right">Anton Panchenko</p>


![Titanic](https://cdn-static.denofgeek.com/sites/denofgeek/files/styles/main_wide/public/2015/10/raise-main.jpg?itok=QBxamb0z)

In [None]:
# Environment setup: import the libraries used below and print their versions.
from sys import version
print("python", version)

import os
import re
import numpy as np
print("numpy", np.__version__)

import pandas as pd
print("pandas", pd.__version__)

import seaborn as sns
print("seaborn", sns.__version__)

from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection
from sklearn import linear_model
from sklearn import metrics

import xgboost as xgb
print("xgboost", xgb.__version__)

# Silence FutureWarning messages for the rest of the session.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# List the contents of the input directory.
os.listdir("../input")

## Introduction

Load and merge the train and test datasets, then take a first look at the data.

In [None]:
# Read the train and test CSV files.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

# Stack both frames into one. DataFrame.append was removed in pandas 2.0;
# pd.concat is the supported equivalent. The original row labels are kept
# (no ignore_index), exactly as append did, because the submission cell
# aligns the predictions with `test` by index.
data = pd.concat([train, test], sort=False)

# Show five random rows of the merged frame.
data.sample(5)

Look at types and incomplete features

In [None]:
def count(df: pd.DataFrame):
    """Summarise every column of *df*.

    Returns a frame with one column per column of *df* and three rows:
    'dtypes' (column dtype), 'values' (non-null count) and 'nans' (null
    count).  Columns are ordered by the 'values' row, largest first.
    """
    rows = [df.dtypes, df.count(), df.isna().sum()]
    labels = ['dtypes', 'values', 'nans']
    summary = pd.DataFrame(rows, index=labels)
    return summary.sort_values(by=['values'], axis=1, ascending=False)

# Show dtype, non-null count and NaN count for each column of the merged frame.
count(data)

The merged dataset contains 1309 rows.

Features to make categorical:
- Name
- Sex
- Ticket
- Embarked

Incomplete features are:
- Fare
- Embarked
- Age
- Cabin

We will fill them in, in this order.


## Check tickets

In [None]:
def value_counts(feature: pd.Series):
    """Return a one-row frame with the frequency of each value in *feature*.

    The row is labelled with the series name; an extra 'nan' column holds
    the number of missing entries.
    """
    frequencies = feature.value_counts()
    table = pd.DataFrame([frequencies], index=[feature.name])
    table['nan'] = feature.isna().sum()
    return table

def encode_cat(df: pd.DataFrame, label: str):
    """Label-encode column *label* of *df* into '<label lower-cased>_cat'.

    The encoded column is written into *df* in place, replacing any existing
    column of that name.  Only rows where *label* is present are encoded;
    rows with a missing value are left unset in the new column.  Returns the
    value counts of the encoded column.
    """
    target = label.lower() + '_cat'
    # Start from a clean slate if the encoded column already exists.
    if target in df:
        df.drop(columns=target, inplace=True)

    present = df[label].notna()
    codes = LabelEncoder().fit_transform(df[present].loc[:, label])
    df.loc[present, target] = codes.astype('int32')
    return value_counts(df[target])

# Label-encode Ticket into the new column 'ticket_cat'.
encode_cat(data, 'Ticket')

## Create FamilySize and Alone features

In [None]:
# family_size is SibSp + Parch plus one for the passenger.
data['family_size'] = 1 + data['SibSp'] + data['Parch']

# alone is 1 where family_size equals 1, otherwise 0.
is_single = data['family_size'] == 1
data['alone'] = is_single.astype('int64')

value_counts(data['alone'])

## Encode Sex

In [None]:
# Label-encode Sex into the new column 'sex_cat'.
encode_cat(data, 'Sex')

## Extract Title from Name

In [None]:
# The title is the text between the first ", " in Name and the next ".".
data['title'] = data['Name'].str.extract(r', (.*?)\.', expand=False)

# Frequency of each extracted title.
value_counts(data['title'])

See [English honorifics](https://en.wikipedia.org/wiki/English_honorifics) for reference.

In [None]:
# Collapse rare titles into 'Miss', 'Mrs' and 'Mr'.
title_aliases = {
    'Mlle': 'Miss',
    'Ms': 'Miss',
    'Mme': 'Mrs',
    'Lady': 'Mrs',
    'Countess': 'Mrs',
    'Dona': 'Mrs',
    'the Countess': 'Mrs',
    'Capt': 'Mr',
    'Col': 'Mr',
    'Don': 'Mr',
    'Dr': 'Mr',
    'Major': 'Mr',
    'Rev': 'Mr',
    'Sir': 'Mr',
    'Jonkheer': 'Mr',
}

# Assign the result back instead of calling replace(..., inplace=True) on
# data['title']: under pandas Copy-on-Write that chained in-place call acts on
# a temporary and leaves `data` unchanged.
data['title'] = data['title'].replace(title_aliases)
value_counts(data['title'])

In [None]:
# Label-encode title into the new column 'title_cat'.
encode_cat(data, 'title')

## Fix Fare

In [None]:
def infer(df: pd.DataFrame, params, features: np.array, target: str):
    """Fill missing values of a numeric column with a tuned XGBoost regressor.

    An XGBRegressor is tuned with a 5-fold GridSearchCV over *params*, using
    the rows where *target* is present and the columns in *features* as
    predictors.  The best score and parameters are printed.  A new column
    named ``target + '_'`` is added to *df*: it copies *target* and holds the
    model's predictions where *target* is missing.  A KDE plot of the new
    column is drawn.

    Returns a frame of feature importances, most important first.
    """
    missing = df[target].isna()
    known = ~missing

    # tune and fit the regressor on the rows with a known target
    search = model_selection.GridSearchCV(xgb.XGBRegressor(n_jobs=4), params, cv=5)
    search.fit(df.loc[known, features], df.loc[known, target])
    print('score', search.best_score_)
    print('params', search.best_params_)
    model = search.best_estimator_

    # estimate the target for the rows where it is missing
    estimates = model.predict(df.loc[missing, features])

    # store the completed values next to the original column
    filled = target + '_'
    df[filled] = df[target]
    df.loc[missing, filled] = estimates
    df[filled].plot.kde()

    # rank the predictors by importance
    ranking = pd.DataFrame({'feature': features, 'importance': model.feature_importances_})
    return ranking.sort_values(by='importance', ascending=False)

# Hyper-parameter grid searched by GridSearchCV inside infer().
params = {'max_depth': [2, 3, 4],
          'learning_rate': [0.3, 0.4, 0.5],
          'n_estimators': [150, 170, 190]}

# Columns used as predictors for Fare.
fare_features = ['Pclass', 'SibSp', 'sex_cat', 'title_cat', 'ticket_cat', 'family_size']

# Fill missing Fare values; the completed column is stored as 'Fare_'.
infer(data, params, fare_features, 'Fare')

## Encode and fix Embarked

In [None]:
# Label-encode Embarked into 'embarked_cat'; rows with a missing Embarked are not encoded.
encode_cat(data, 'Embarked')

In [None]:
def infer_cat(df, params, features, target: str):
    """Fill missing values of an encoded categorical column with XGBoost.

    An XGBClassifier is tuned with a 3-fold GridSearchCV over *params*, using
    the rows where *target* is present and the columns in *features* as
    predictors.  The best score and parameters are printed.  A new column
    named ``target + '_'`` is added to *df*: it copies *target*, holds the
    predicted class where *target* is missing, and is cast to int64.

    Returns the value counts of the new column.
    """
    missing = df[target].isna()
    known = ~missing

    # tune and fit the classifier on the rows with a known target
    search = model_selection.GridSearchCV(xgb.XGBClassifier(n_jobs=4), params, cv=3)
    search.fit(df.loc[known, features], df.loc[known, target])
    print('score', search.best_score_)
    print('params', search.best_params_)
    model = search.best_estimator_

    # predict the class for the rows where the target is missing
    predicted = model.predict(df.loc[missing, features])

    # store the completed labels next to the original column
    filled = target + '_'
    df[filled] = df[target]
    df.loc[missing, filled] = predicted
    df[filled] = df[filled].astype('int64')
    return value_counts(df[filled])

# Hyper-parameter grid searched by GridSearchCV inside infer_cat().
params = {'max_depth': [3, 4, 5],
          'learning_rate': [0.4, 0.5, 0.6],
          'n_estimators': [400, 500, 600]}

# Predictors for Embarked: the Fare predictors plus the completed 'Fare_' column.
emb_features = np.append(fare_features, 'Fare_')

# Fill missing embarked_cat values; the completed column is stored as 'embarked_cat_'.
infer_cat(data, params, emb_features, 'embarked_cat')

## Fix Age

In [None]:
# infer() takes (df, params, features, target). The original call left out the
# hyper-parameter grid, so the feature list was bound to `params` and the call
# raised a TypeError. This grid repeats the one used for the Fare regression.
age_params = {'max_depth': [2, 3, 4],
              'learning_rate': [0.3, 0.4, 0.5],
              'n_estimators': [150, 170, 190]}

# Columns used as predictors for Age.
age_features = ['Pclass', 'SibSp', 'sex_cat', 'title_cat', 'ticket_cat', 'family_size', 'Fare_']

# Fill missing Age values; the completed column is stored as 'Age_'.
infer(data, age_params, age_features, 'Age')

## Finally predict Survived

In [None]:
# infer_cat() takes (df, params, features, target). The original call left out
# the hyper-parameter grid and raised a TypeError. This grid repeats the one
# used for the Embarked classifier.
survived_params = {'max_depth': [3, 4, 5],
                   'learning_rate': [0.4, 0.5, 0.6],
                   'n_estimators': [400, 500, 600]}

# Columns used as predictors for Survived.
# NOTE(review): this uses 'embarked_cat' (which still has missing values), not
# the completed 'embarked_cat_' column -- confirm that is intended.
survived_features = ['Pclass', 'SibSp', 'Parch', 'title_cat', 'sex_cat', 'Fare_', 'Age_', 'embarked_cat']

# Predict Survived where it is missing; the result is stored as 'Survived_'.
infer_cat(data, survived_params, survived_features, 'Survived')

In [None]:
# create a Kaggle submission
# Rows with no original Survived value are the ones that were predicted.
na_mask = data['Survived'].isna()
predicted = data.loc[na_mask, 'Survived_']

# The two columns are aligned on their row index when the frame is built.
sub = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': predicted})
sub.to_csv('submission.csv', index=False)

## Thanks!!!