---

03/2019 <p style="text-align: right">Anton Panchenko</p>


![Titanic](https://cdn-static.denofgeek.com/sites/denofgeek/files/styles/main_wide/public/2015/10/raise-main.jpg?itok=QBxamb0z)

In [2]:
from sys import version_info
print("Python version:", version_info)

import os
import re
import numpy as np
print("numpy version", np.__version__)

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import metrics

import xgboost as xgb
print("xgboost version", xgb.__version__)

os.listdir("../input")

Python version: sys.version_info(major=3, minor=7, micro=2, releaselevel='final', serial=0)
numpy version 1.16.2
XGBoost version 0.82


['gender_submission.csv', 'test.csv', 'train.csv']

## Introduction

Load and merge datasets, meet data

In [3]:
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
data = train.append(test, sort=False)
data.sample(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
37,38,0.0,3,"Cann, Mr. Ernest Charles",male,21.0,0,0,A./5. 2152,8.05,,S
266,267,0.0,3,"Panula, Mr. Ernesti Arvid",male,16.0,4,1,3101295,39.6875,,S
75,76,0.0,3,"Moen, Mr. Sigurd Hansen",male,25.0,0,0,348123,7.65,F G73,S


Look at types and incomplete features

In [4]:
def count(df:pd.DataFrame):
    stat = pd.DataFrame([df.dtypes, df.count()], index=['dtypes', 'count'])
    return stat.sort_values(by=['count'], axis=1, ascending=False)

count(data)

Unnamed: 0,PassengerId,Pclass,Name,Sex,SibSp,Parch,Ticket,Fare,Embarked,Age,Survived,Cabin
dtypes,int64,int64,object,object,int64,int64,object,float64,object,float64,float64,object
count,1309,1309,1309,1309,1309,1309,1309,1308,1307,1046,891,295


It seems we have 1309 objects. Incomplete features are:
- Fare
- Embarked
- Age
- Cabin

## Create FamilySize and Alone features

In [5]:
data['FamilySize'] = data['SibSp'] + data['Parch'] + 1

data['Alone'] = 0
data.loc[data['FamilySize'] == 1, 'Alone'] = 1

def value_counts(feature:pd.Series):
    df = pd.DataFrame([feature.value_counts()], index=[feature.name + ' counts'])
    df['nans'] = feature.isna().sum()
    return df

value_counts(data['Alone'])

Unnamed: 0,1,0,nans
Alone counts,790,519,0


## Encode Sex

In [6]:
data['Sex'] = LabelEncoder().fit_transform(data['Sex'])

value_counts(data['Sex'])

Unnamed: 0,1,0,nans
Sex counts,843,466,0


## Extract Title from Name

In [7]:
data['Title'] = data['Name'].str.extract(r', (.*)\.', expand=False)

value_counts(data['Title'])

Unnamed: 0,Mr,Miss,Mrs,Master,Dr,Rev,Col,Ms,Major,Mlle,Don,Mme,Sir,Capt,Lady,the Countess,Dona,Mrs. Martin (Elizabeth L,Jonkheer,nans
Title counts,757,260,196,61,8,8,4,2,2,2,1,1,1,1,1,1,1,1,1,0


See [english honorifics](https://en.wikipedia.org/wiki/English_honorifics) for reference.

* TODO: fix regexp for `Mrs. Martin (Elizabeth L` and `the Countess`

In [8]:
data['Title'].replace(['Mlle', 'Ms'], 'Miss', inplace=True)
data['Title'].replace(['Mme', 'Lady', 'Countess', 'Dona', 'Mrs. Martin (Elizabeth L', 'the Countess'], 'Mrs', inplace=True)
data['Title'].replace(['Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer'], 'Mr', inplace=True)
data['Title'] = LabelEncoder().fit_transform(data['Title'])
value_counts(data['Title'])

Unnamed: 0,2,1,3,0,nans
Title counts,783,264,201,61,0


## Fix Fare

In [9]:
def infer(df:pd.DataFrame, features:np.array, target:str):
    train = df[df[target].notna()]
    
    # select training data and fit regressor
    x = train.loc[:, features]
    y = train.loc[:, target]
    reg = linear_model.LinearRegression().fit(x, y)
    print('     score', reg.score(x, y))
    
    pred = reg.predict(x)
    print('       mae', metrics.mean_absolute_error(y, pred))
    
    # check prediction is better than mean
    print('  mean mae', metrics.mean_absolute_error(y, np.full((train.shape[0]), y.mean())))
    
    # check prediction is better than median
    print('median mae', metrics.mean_absolute_error(y, np.full((train.shape[0]), y.median())))
    
    # predict missing target values
    na_mask = df[target].isna()
    predict = df[na_mask]
    x_predict = predict.loc[:, features]
    y_predict = reg.predict(x_predict)

    # create new feature
    df[target + '_'] = df[target]
    df.loc[na_mask, target + '_'] = y_predict
    
infer(data, ['Pclass', 'SibSp', 'Title'], 'Fare')
# Sex and Parch - increase mae

count(data)

     score 0.3515680878622677
       mae 21.181295377580543
  mean mae 29.798712688793504
median mae 24.5225121559633


Unnamed: 0,PassengerId,Pclass,Name,Sex,SibSp,Parch,Ticket,FamilySize,Alone,Title,Fare_,Fare,Embarked,Age,Survived,Cabin
dtypes,int64,int64,object,int32,int64,int64,object,int64,int64,int32,float64,float64,object,float64,float64,object
count,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1308,1307,1046,891,295


## Fix Age

In [10]:
infer(data, ['Pclass', 'FamilySize', 'Title', 'Fare_'], 'Age')
# Sex

count(data)

     score 0.3800709225145358
       mae 8.834697089242642
  mean mae 11.262327595977025
median mae 11.157103250478011


Unnamed: 0,PassengerId,Parch,Fare_,Title,Alone,FamilySize,Ticket,SibSp,Sex,Name,Pclass,Age_,Fare,Embarked,Age,Survived,Cabin
dtypes,int64,int64,float64,int32,int64,int64,object,int64,int32,object,int64,float64,float64,object,float64,float64,object
count,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1308,1307,1046,891,295


## Encode and fix Embarked

In [11]:
def encode_labels(df:pd.DataFrame, target: str):
    notna = df[target].notna()
    y = df[notna].loc[:, target]
    df.loc[notna, target] = LabelEncoder().fit_transform(y)

encode_labels(data, 'Embarked')
infer(data, ['Pclass', 'SibSp', 'Parch', 'Title', 'Fare_', 'Age_'], 'Embarked')

value_counts(data['Embarked_'])

     score 0.07613770086173777
       mae 0.6687430591199431
  mean mae 0.70947736541921
median mae 0.5072685539403213


Unnamed: 0,2.0,0.0,1.0,1.1926722433322476,1.2616295227859113,nans
Embarked_ counts,914,270,123,1,1,0


## Finally predict Survived

In [50]:
train = data[data['Survived'].notna()]
test = data[data['Survived'].isna()]

# prepare train for fitting
# other features appears useless
features = ['Age_', 'Sex', 'Pclass', 'Fare_', 'FamilySize', 'Alone']
x = train.loc[:, features]
y = train.loc[:, 'Survived']
XTrain, XValid, YTrain, YValid = train_test_split(x, y, test_size=0.2, random_state=40)

# train model
model = xgb.XGBClassifier(max_depth=4)
model.fit(x, y)

def accuracy(Y: np.array, yPred: np.array) -> float:
  return np.sum(yPred==Y) / len(Y)

# use model to predict on training and validation sets
print('     Train accuracy', accuracy(YTrain, model.predict(XTrain)))
print('Validation accuracy', accuracy(YValid, model.predict(XValid)))

     Train accuracy 0.8820224719101124
Validation accuracy 0.9162011173184358


## Save results

In [51]:
# Predict for test set
# Create a Kaggle submission
XTest = test.loc[:, features]
YTest = model.predict(XTest)
#data['Survivd'] = data['Survived'].astype('int64')
sub = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': YTest.astype('int64')})
sub.to_csv('submission.csv', index=False)

## Thanks!!!