---

03/2019 <p style="text-align: right">Anton Panchenko</p>


![Titanic](https://cdn-static.denofgeek.com/sites/denofgeek/files/styles/main_wide/public/2015/10/raise-main.jpg?itok=QBxamb0z)

In [None]:
# Report the interpreter and library versions used for this run so the
# results can be reproduced.
from sys import version
print("python", version)

import os
import re
import numpy as np
print("numpy", np.__version__)

import pandas as pd
print("pandas", pd.__version__)

import seaborn as sns
print("seaborn", sns.__version__)

from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection
from sklearn import linear_model
from sklearn import metrics

import xgboost as xgb
print("xgboost", xgb.__version__)

# List the files available in the input directory.
os.listdir("../input")

## Introduction

Load and merge the datasets, then take a first look at the data.

In [None]:
# Read both CSV files and stack them into a single frame so that every
# feature-engineering step below is applied to all rows at once.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent.  Row labels are kept as they are
# (no ignore_index), exactly as append did.
data = pd.concat([train, test], sort=False)
# Show a few random rows of the merged frame.
data.sample(5)

Look at types and incomplete features

In [None]:
def count(df: pd.DataFrame):
    """Summarise every column of *df* in a three-row table.

    The rows are ``dtypes`` (column dtype), ``values`` (number of non-null
    entries) and ``nans`` (number of missing entries).  Columns are ordered
    by the ``values`` row, most complete first.
    """
    rows = [df.dtypes, df.count(), df.isna().sum()]
    summary = pd.DataFrame(rows, index=['dtypes', 'values', 'nans'])
    # axis=1 sorts the columns, using the 'values' row as the sort key.
    return summary.sort_values(by=['values'], axis=1, ascending=False)

# Show dtype, non-null count and NaN count for every column of the merged data.
count(data)

The merged data contains 1309 rows. The incomplete features are:
- Fare
- Embarked
- Age
- Cabin


## Check tickets

In [None]:
def value_counts(feature: pd.Series):
    """Return a one-row frame with the frequency of each value in *feature*.

    The row is labelled with the series name, there is one column per
    distinct non-null value, and a trailing ``'nan'`` column holds the
    number of missing entries.
    """
    frequencies = feature.value_counts()
    table = pd.DataFrame([frequencies], index=[feature.name])
    # value_counts() skips missing entries, so they are counted separately.
    table['nan'] = feature.isna().sum()
    return table

def encode_cat(df: pd.DataFrame, label: str):
    """Label-encode column *label* of *df* into ``<label lowercased>_cat``.

    The frame is modified in place.  An existing column with the target
    name is dropped first, so the function can be re-run.  Only rows where
    *label* is present are encoded; rows where it is missing receive no
    value in the new column.

    Returns the value counts of the new column.
    """
    encoded_name = label.lower() + '_cat'
    if encoded_name in df:
        df.drop(columns=encoded_name, inplace=True)

    present = df[label].notna()
    known_values = df.loc[present, label]
    codes = LabelEncoder().fit_transform(known_values).astype('int32')
    df.loc[present, encoded_name] = codes
    return value_counts(df[encoded_name])

# Label-encode 'Ticket' into a new 'ticket_cat' column and show its value counts.
encode_cat(data, 'Ticket')

## Create family_size and alone features

In [None]:
# Family size is the passenger plus the SibSp and Parch counts.
data['family_size'] = data['SibSp'] + data['Parch'] + 1

# 1 when the passenger is the only member of the family aboard, otherwise 0.
data['alone'] = (data['family_size'] == 1).astype('int64')

value_counts(data['alone'])

## Encode Sex

In [None]:
# Label-encode 'Sex' into a new 'sex_cat' column and show its value counts.
encode_cat(data, 'Sex')

## Extract Title from Name

In [None]:
# Pull the honorific out of each name: the lazy group captures whatever sits
# between the first ", " and the next "." (assumes names look like
# "Surname, Title. Given names" -- TODO confirm against the data).
data['title'] = data['Name'].str.extract(r', (.*?)\.', expand=False)

# Show how often each extracted title occurs.
value_counts(data['title'])

See [English honorifics](https://en.wikipedia.org/wiki/English_honorifics) for reference.

In [None]:
# Collapse the rarer titles into the three common ones: Miss, Mrs and Mr.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on data['title']: that form is a chained
# in-place update on a column selection, which newer pandas versions warn
# about and which does not modify `data` under Copy-on-Write.
data['title'] = data['title'].replace(['Mlle', 'Ms'], 'Miss')
data['title'] = data['title'].replace(['Mme', 'Lady', 'Countess', 'Dona', 'the Countess'], 'Mrs')
data['title'] = data['title'].replace(['Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer'], 'Mr')
value_counts(data['title'])

In [None]:
# Label-encode 'title' into a new 'title_cat' column and show its value counts.
encode_cat(data, 'title')

## Fix Fare

In [None]:
def infer(df:pd.DataFrame, features:np.array, target:str):
    """Fill the missing values of a numeric column with a regression model.

    An XGBoost regressor is fitted on the rows where *target* is present,
    using the columns named in *features* as inputs.  The mean absolute
    error and the model's ``score`` on those same training rows are
    printed.  The frame is modified in place: a new column named
    ``target + '_'`` receives the original values where they exist and the
    model's predictions where they were missing.  Nothing is returned.

    Alternatives left disabled in the original were a plain
    ``LinearRegression`` and a 3-fold ``GridSearchCV`` over ``max_depth``
    [2, 3, 4] with ``learning_rate`` 1.2 and ``n_estimators`` 1000.
    """
    # Fit on the rows whose target value is known.
    known = df[target].notna()
    x = df.loc[known, features]
    y = df.loc[known, target]
    model = xgb.XGBRegressor(
        max_depth=2,
        learning_rate=1.5,
        n_estimators=1000,
        n_jobs=4,
        base_score=y.median(),
    )
    model.fit(x, y)

    # Both figures are measured on the training rows themselves.
    fitted = model.predict(x)
    print('mae', metrics.mean_absolute_error(y, fitted))
    print('score', model.score(x, y))

    # Predict the rows whose target value is missing.
    missing = df[target].isna()
    predicted = model.predict(df.loc[missing, features])

    # Keep the original column untouched; the completed copy gets a '_' suffix.
    completed = target + '_'
    df[completed] = df[target]
    df.loc[missing, completed] = predicted
    
# Fill the missing fares; the completed values are stored in data['Fare_'].
infer(data, ['Pclass', 'SibSp', 'sex_cat', 'title_cat', 'ticket_cat'], 'Fare')
# Sex and Parch - increase mae
# NOTE(review): the note above says Sex increases the MAE, yet 'sex_cat' is
# part of the feature list -- confirm which of the two is intended.

#count(data)
# Kernel density estimate of the completed fare column.
data['Fare_'].plot.kde()

## Fix Age

In [None]:
# Fill the missing ages the same way, additionally using the completed fare
# column 'Fare_' as a feature; the result is stored in data['Age_'].
infer(data, ['Pclass', 'SibSp', 'sex_cat', 'title_cat', 'ticket_cat', 'Fare_'], 'Age')

#count(data)
# Kernel density estimate of the completed age column.
data['Age_'].plot.kde()

## Encode and fix Embarked

In [None]:
# Label-encode 'Embarked' into a new 'embarked_cat' column; rows where
# 'Embarked' is missing get no code.
encode_cat(data, 'Embarked')

In [None]:
def infer_cat(df:pd.DataFrame, features:np.array, label:str):
    """Fill the missing values of a categorical column with a classifier.

    An XGBoost classifier is fitted on the rows where *label* is present,
    using the columns named in *features* (callers pass a list of column
    names) as inputs, and its ``score`` on those same training rows is
    printed.  The frame is modified in place: a new column named
    ``label + '_'`` receives the original values where they exist and the
    predicted classes where they were missing, and is cast to ``int64``.

    Returns the value counts of the completed column.
    """
    # Fit on the rows whose label is known.
    known = df[label].notna()
    x = df.loc[known, features]
    y = df.loc[known, label]

    # A 3-fold GridSearchCV over max_depth [3, 5], learning_rate [0.1, 0.5]
    # and n_estimators [500] was left disabled in the original; fixed
    # settings are used instead.  The stray ``cv=3`` keyword has been
    # removed: it belongs to GridSearchCV and is not an XGBClassifier
    # parameter.
    model = xgb.XGBClassifier(max_depth=3, learning_rate=0.5, n_estimators=500, n_jobs=4).fit(x, y)
    # Accuracy is measured on the training rows themselves.
    print('Score', model.score(x, y))

    # Predict the rows whose label is missing.
    isna = df[label].isna()
    x_predict = df.loc[isna, features]

    # Keep the original column untouched; the completed copy gets a '_' suffix.
    target = label + '_'
    df[target] = df[label]
    df.loc[isna, target] = model.predict(x_predict)
    # No NaNs remain at this point, so the column can hold plain integers.
    df[target] = df[target].astype('int64')
    return value_counts(df[target])

# Predict the missing 'embarked_cat' codes; the completed column is stored
# as 'embarked_cat_'.
infer_cat(data, ['Pclass', 'SibSp', 'Parch', 'title_cat', 'Fare_', 'Age_'], 'embarked_cat')

## Finally predict Survived

In [None]:
# Predict 'Survived' for the rows where it is missing; the completed column
# is stored as 'Survived_'.
# NOTE(review): the feature list uses 'embarked_cat', which still contains
# NaNs, rather than the filled 'embarked_cat_' produced earlier -- confirm
# which column is intended.
infer_cat(data, ['Pclass', 'SibSp', 'Parch', 'title_cat', 'sex_cat', 'Fare_', 'Age_', 'embarked_cat'], 'Survived')

In [None]:
# Rows without a known 'Survived' value are the ones to predict for.
na_mask = data['Survived'].isna()
# create a Kaggle submission
# Both columns are taken from `data` itself, so the result no longer depends
# on `test` and the filtered `data` happening to share the same row labels.
sub = data.loc[na_mask, ['PassengerId', 'Survived_']].rename(columns={'Survived_': 'Survived'})
sub.to_csv('submission.csv', index=False)

## Thanks!!!