---

03/2019 <div style="text-align: right">Anton Panchenko</div>


![Titanic](https://cdn-static.denofgeek.com/sites/denofgeek/files/styles/main_wide/public/2015/10/raise-main.jpg?itok=QBxamb0z)

In [40]:
import os
import re
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import metrics

os.listdir("../input")

['gender_submission.csv', 'test.csv', 'train.csv']

## Introduction

Load and merge datasets, meet data

In [113]:
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
data = train.append(test, sort=False)
data.sample(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
379,380,0.0,3,"Gustafsson, Mr. Karl Gideon",male,19.0,0,0,347069,7.775,,S
396,397,0.0,3,"Olsson, Miss. Elina",female,31.0,0,0,350407,7.8542,,S
433,434,0.0,3,"Kallio, Mr. Nikolai Erland",male,17.0,0,0,STON/O 2. 3101274,7.125,,S


Look at types and incomplete features

In [174]:
def count(df:pd.DataFrame):
    stat = pd.DataFrame([df.dtypes, df.count()], index=['dtypes', 'count'])
    return stat.sort_values(by=['count'], axis=1, ascending=False)

count(data)

Unnamed: 0,PassengerId,Ticket,FamilySize,Embarked_,Age_,Fare_,Title,Alone,Parch,SibSp,Sex,Name,Pclass,Fare,Embarked,Age,Survived,Cabin
dtypes,int64,object,int64,object,float64,float64,object,int64,int64,int64,int32,object,int64,float64,object,float64,float64,object
count,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1308,1307,1046,891,295


It seems we have 1309 objects. Incomplete features are:
- Fare
- Embarked
- Age
- Cabin

## Create FamilySize and Alone features

In [218]:
data['FamilySize'] = data['SibSp'] + data['Parch'] + 1

data['Alone'] = 0
data.loc[data['FamilySize'] == 1, 'Alone'] = 1

def value_counts(feature:pd.Series):
    df = pd.DataFrame([feature.value_counts()], index=[feature.name + ' counts'])
    df['nans'] = feature.isna().sum()
    return df

value_counts(data['Alone'])

Unnamed: 0,1,0,nans
Alone counts,790,519,0


## Encode Sex

In [219]:
data['Sex'] = LabelEncoder().fit_transform(data['Sex'])

value_counts(data['Sex'])

Unnamed: 0,1,0,nans
Sex counts,843,466,0


## Extract Title from Name

In [220]:
data['Title'] = data['Name'].str.extract(r', (.*)\.', expand=False)

value_counts(data['Title'])

Unnamed: 0,Mr,Miss,Mrs,Master,Dr,Rev,Col,Mlle,Ms,Major,Don,Capt,Dona,Jonkheer,Mrs. Martin (Elizabeth L,Mme,the Countess,Sir,Lady,nans
Title counts,757,260,196,61,8,8,4,2,2,2,1,1,1,1,1,1,1,1,1,0


See [english honorifics](https://en.wikipedia.org/wiki/English_honorifics) for reference.

* TODO: fix regexp for `Mrs. Martin (Elizabeth L` and `the Countess`

In [221]:
data['Title'].replace(['Mlle', 'Ms'], 'Miss', inplace=True)
data['Title'].replace(['Mme', 'Lady', 'Countess', 'Dona', 'Mrs. Martin (Elizabeth L', 'the Countess'], 'Mrs', inplace=True)
data['Title'].replace(['Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer'], 'Mr', inplace=True)
data['Title'] = LabelEncoder().fit_transform(data['Title'])
value_counts(data['Title'])

Unnamed: 0,2,1,3,0,nans
Title counts,783,264,201,61,0


## Fix Fare

In [188]:
def infer(df:pd.DataFrame, features:np.array, target:str):
    train = df[df[target].notna()]
    
    # select training data and fit regressor
    x = train.loc[:, features]
    y = train.loc[:, target]
    reg = linear_model.LinearRegression().fit(x, y)
    print('     score', reg.score(x, y))
    
    pred = reg.predict(x)
    print('       mae', metrics.mean_absolute_error(y, pred))
    
    # check prediction is better than mean
    print('  mean mae', metrics.mean_absolute_error(y, np.full((train.shape[0]), y.mean())))
    
    # check prediction is better than median
    print('median mae', metrics.mean_absolute_error(y, np.full((train.shape[0]), y.median())))
    
    # predict missing target values
    na_mask = df[target].isna()
    predict = df[na_mask]
    x_predict = predict.loc[:, features]
    y_predict = reg.predict(x_predict)

    # create new feature
    df[target + '_'] = df[target]
    df.loc[na_mask, target + '_'] = y_predict
    
    
infer(data, ['Sex', 'Pclass', 'SibSp', 'Parch', 'Title'], 'Fare')

count(data)

     score 0.3851051296193947
       mae 21.65632358667641
  mean mae 29.798712688793504
median mae 24.5225121559633


Unnamed: 0,PassengerId,Ticket,FamilySize,Embarked_,Age_,Fare_,Title,Alone,Parch,SibSp,Sex,Name,Pclass,Fare,Embarked,Age,Survived,Cabin
dtypes,int64,object,int64,object,float64,float64,int32,int64,int64,int64,int32,object,int64,float64,object,float64,float64,object
count,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1308,1307,1046,891,295


## Fix Age

In [189]:
infer(data, ['Sex', 'Pclass', 'SibSp', 'Parch', 'Title', 'Fare_'], 'Age')

count(data)

     score 0.3927764751108549
       mae 8.878938066039254
  mean mae 11.262327595977025
median mae 11.157103250478011


Unnamed: 0,PassengerId,Ticket,FamilySize,Embarked_,Age_,Fare_,Title,Alone,Parch,SibSp,Sex,Name,Pclass,Fare,Embarked,Age,Survived,Cabin
dtypes,int64,object,int64,object,float64,float64,int32,int64,int64,int64,int32,object,int64,float64,object,float64,float64,object
count,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1308,1307,1046,891,295


## Encode and fix Embarked

In [192]:
def encode_labels(df:pd.DataFrame, target: str):
    notna = df[target].notna()
    y = df[notna].loc[:, target]
    df.loc[notna, target] = LabelEncoder().fit_transform(y)

encode_labels(data, 'Embarked')
infer(data, ['Pclass', 'SibSp', 'Parch', 'Title', 'Fare_', 'Age_'], 'Embarked')

value_counts(data['Embarked_'])

     score 0.07644678819687824
       mae 0.6685018167342903
  mean mae 0.70947736541921
median mae 0.5072685539403213


Unnamed: 0,2.0,0.0,1.0,1.1948966586896055,1.2660614766504166
count,914,270,123,1,1


## Finally predict Survived

In [185]:
train = data[data['Survived'].notna()]
test = data[data['Survived'].isna()]

# prepare train for fitting
# other features appears useless
x = train.loc[:, ['Age_', 'Sex', 'Pclass', 'Fare_', 'FamilySize', 'Alone']]
y = train.loc[:, 'Survived']
XTrain, XValid, YTrain, YValid = train_test_split(x, y, test_size=0.2, random_state=40)

# Fit logistic regression using scikit
LR = linear_model.LogisticRegression(C=1000, solver='lbfgs', max_iter=1000)
LR.fit(X=XTrain, y=YTrain)

def accuracy(Y: np.array, yPred: np.array) -> float:
  return np.sum(yPred==Y) / len(Y)

# Use model to predict on training and validation sets
print('     Train accuracy', accuracy(YTrain, LR.predict(XTrain)))
print('Validation accuracy', accuracy(YValid, LR.predict(XValid)))

     Train accuracy 0.7991573033707865
Validation accuracy 0.8324022346368715


## Save results

In [186]:
# Predict for test set
# Create a Kaggle submission
XTest = test.loc[:, features]
YTest = LR.predict(XTest)
#data['Survivd'] = data['Survived'].astype('int64')
sub = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': YTest.astype('int64')})
sub.to_csv('submission.csv', index=False)

## Thanks!!!