---

03/2019


![Titanic](https://cdn-static.denofgeek.com/sites/denofgeek/files/styles/main_wide/public/2015/10/raise-main.jpg?itok=QBxamb0z)

In [253]:
import os
import re
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import metrics

# List the files available in the input directory.
os.listdir("../input")

['gender_submission.csv', 'test.csv', 'train.csv']

## Introduction

Load the train and test datasets, merge them into one frame, and take a first look at the data.

In [254]:
# Read both CSV files and stack them into one frame so that feature
# engineering is applied to train and test rows alike.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent. Row labels are kept as they are (no ignore_index), so the
# labels of train and test rows overlap in the merged frame.
data = pd.concat([train, test], sort=False)
data.sample(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
429,430,1.0,3,"Pickard, Mr. Berk (Berk Trembisky)",male,32.0,0,0,SOTON/O.Q. 392078,8.05,E10,S
68,69,1.0,3,"Andersson, Miss. Erna Alexandra",female,17.0,4,2,3101281,7.925,,S
567,568,0.0,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29.0,0,4,349909,21.075,,S


Look at the column types and find the features with missing values

In [255]:
def count(data: pd.DataFrame):
    """Summarise the dtype and the number of non-null values of every column.

    Args:
        data: Frame to inspect.

    Returns:
        A two-row frame (rows ``'dtypes'`` and ``'count'``) with one column
        per column of ``data``, ordered from the most complete column to the
        least complete one.
    """
    summary = pd.DataFrame(
        [data.dtypes, data.count()],
        index=['dtypes', 'count'],
    )
    # Sort the columns (axis=1) by the values in the 'count' row.
    ordered = summary.sort_values(by=['count'], axis=1, ascending=False)
    return ordered

# Show the dtype and non-null count of every column of the merged frame.
count(data)

Unnamed: 0,PassengerId,Pclass,Name,Sex,SibSp,Parch,Ticket,Fare,Embarked,Age,Survived,Cabin
dtypes,int64,int64,object,object,int64,int64,object,float64,object,float64,float64,object
count,1309,1309,1309,1309,1309,1309,1309,1308,1307,1046,891,295


The merged dataset has 1309 rows. `Survived` is known only for the 891 training rows; apart from that target, the features with missing values are:
- Fare
- Embarked
- Age
- Cabin

## Create Title

In [256]:
# extract titles from passenger names
# Compiled once at definition time. The raw string keeps ``\.`` a regex
# escape instead of an invalid Python string escape.
_TITLE_RE = re.compile(r' ([A-Za-z]+)\.')


def get_title(name: str) -> str:
    """Return the honorific found in a passenger name.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period, e.g. ``'Mr'`` in
    ``'Braund, Mr. Owen Harris'``.

    Args:
        name: Passenger name.

    Returns:
        The title without the trailing period, or ``''`` when the name
        contains no such pattern.
    """
    match = _TITLE_RE.search(name)
    return match.group(1) if match else ''

# Extract the title of every passenger, fold spelling variants into the
# common titles and group the infrequent ones under 'Rare'.
title_map = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
title_map.update(dict.fromkeys(
    ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'],
    'Rare',
))
# Assign the replaced Series back to the frame. Calling
# ``data['Title'].replace(..., inplace=True)`` acts on a temporary object and
# does not update ``data`` when pandas Copy-on-Write is active.
data['Title'] = data['Name'].apply(get_title).replace(title_map)

def encode_labels(label: str):
    """Replace column ``label`` of the global ``data`` frame with integer codes.

    A fresh ``LabelEncoder`` is fitted on the column and the column is
    overwritten with the encoded values.

    Args:
        label: Name of the column of ``data`` to encode.
    """
    # Assign the whole column instead of writing through ``.loc[:, label]``:
    # recent pandas versions write ``.loc`` values into the existing array,
    # which would leave the column with object dtype.
    data[label] = LabelEncoder().fit_transform(data[label])

# Encode Title as integers so it can be used as a regression feature by infer() below.
encode_labels('Title')

## Fix Fare

In [257]:
def infer(data: pd.DataFrame, features: np.array, target: str):
    """Fill the missing values of ``target`` in ``data`` using linear regression.

    A ``LinearRegression`` is fitted on the rows where ``target`` is present,
    with ``features`` as predictors. Its score and mean absolute error on
    those same rows are printed, followed by the mean absolute error of two
    constant baselines (the mean and the median of ``target``). The rows
    where ``target`` is missing are then overwritten in place with the
    model's predictions.

    Args:
        data: Frame to modify in place.
        features: Labels of the predictor columns.
        target: Label of the column whose missing values are filled.

    Returns:
        None. Returns early, without fitting or printing, when ``target``
        has no missing values.
    """
    missing = data[target].isna()
    known = data[~missing]
    n_known = known.shape[0]
    if n_known == data.shape[0]:
        return

    x_known = known.loc[:, features]
    y_known = known.loc[:, target]
    model = linear_model.LinearRegression().fit(x_known, y_known)
    print('   score', model.score(x_known, y_known))

    print('     mae', metrics.mean_absolute_error(y_known, model.predict(x_known)))

    # Constant predictions serve as baselines: the regression is only worth
    # using if its error is lower than both of them.
    baselines = (('mean mae', y_known.mean()), (' med mae', y_known.median()))
    for caption, constant in baselines:
        print(caption, metrics.mean_absolute_error(y_known, np.full(n_known, constant)))

    # Predict only for the rows that lack the target and write them back.
    data.loc[missing, target] = model.predict(data.loc[missing, features])
    
    
# Fill the missing Fare from class, family counts and title, then re-check completeness.
infer(data, features=['Pclass', 'SibSp', 'Parch', 'Title'], target='Fare')
count(data)

   score 0.38258412483459414
     mae 21.555991632653065
mean mae 29.798712688793504
 med mae 24.5225121559633


Unnamed: 0,PassengerId,Pclass,Name,Sex,SibSp,Parch,Ticket,Fare,Title,Embarked,Age,Survived,Cabin
dtypes,int64,int64,object,object,int64,int64,object,float64,int32,object,float64,float64,object
count,1309,1309,1309,1309,1309,1309,1309,1309,1309,1307,1046,891,295


In [258]:
# Age is filled after Fare so that the completed Fare column can be a predictor.
infer(data, features=['Pclass', 'SibSp', 'Parch', 'Title', 'Fare'], target='Age')

count(data)

   score 0.38740313945425375
     mae 8.763525987460046
mean mae 11.262327595977025
 med mae 11.157103250478011


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Title,Embarked,Survived,Cabin
dtypes,int64,int64,object,object,float64,int64,int64,object,float64,int32,object,float64,object
count,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1307,891,295


In [172]:
def fillna_mean(feature: pd.Series) -> None:
    """Replace the missing values of ``feature`` with its mean, in place.

    Args:
        feature: Numeric Series to modify.
    """
    feature.fillna(feature.mean(), inplace=True)
    
#fillna_mean(data['Age'])

In [259]:
# new feature - FamilySize: SibSp + Parch + 1
data['FamilySize'] = data['SibSp'].add(data['Parch']).add(1)

In [260]:
# new feature - Alone: 1 when FamilySize is 1, otherwise 0
data['Alone'] = (data['FamilySize'] == 1).astype('int64')

In [261]:
def encode_labels(label: str):
    """Replace column ``label`` of the global ``data`` frame with integer codes.

    This redefines the earlier ``encode_labels`` so that missing values are
    handled: they are replaced by the empty string before encoding and so
    get a code of their own.

    Args:
        label: Name of the column of ``data`` to encode.
    """
    # Work on the filled Series directly. ``data[label].fillna('', inplace=True)``
    # acts on a temporary object and does not update ``data`` when pandas
    # Copy-on-Write is active.
    filled = data[label].fillna('')
    # Assign the whole column instead of writing through ``.loc[:, label]``,
    # so the column takes the integer dtype of the encoded values.
    data[label] = LabelEncoder().fit_transform(filled)

# Encode the categorical columns as integer codes. Title is passed again
# although it was already encoded earlier.
for column in ('Sex', 'Embarked', 'Title'):
    encode_labels(column)

In [262]:
# Rows with a known Survived value form the training set; the rest are the rows to predict.
train = data.loc[data['Survived'].notna()]
test = data.loc[data['Survived'].isna()]

In [263]:
# Prepare the training data for fitting and hold out 20% of it for validation.
# The other features appeared to be useless, so they are left out.
features = ['Age', 'Sex', 'Pclass', 'Fare', 'Parch', 'Alone']
X = train[features]
Y = train['Survived']
XTrain, XValid, YTrain, YValid = train_test_split(
    X, Y, test_size=0.2, random_state=40,
)

In [264]:
# Fit a logistic regression with scikit-learn on the training split.
LR = linear_model.LogisticRegression(C=1000, solver='lbfgs', max_iter=1000).fit(XTrain, YTrain)

def accuracy(Y: np.array, yPred: np.array) -> float:
    """Return the fraction of predictions that equal the true labels.

    Args:
        Y: True labels.
        yPred: Predicted labels, compared element-wise with ``Y``.

    Returns:
        Number of matching positions divided by ``len(Y)``.
    """
    matches = yPred == Y
    total = len(Y)
    return np.sum(matches) / total

# Use the model to predict on the training and validation sets, comparing
# accuracy on the data it was fitted on with accuracy on the held-out split.
print('     Train accuracy', accuracy(YTrain, LR.predict(XTrain)))
print('Validation accuracy', accuracy(YValid, LR.predict(XValid)))

     Train accuracy 0.7907303370786517
Validation accuracy 0.8156424581005587


In [265]:
# Predict for the test set and create a Kaggle submission file.
XTest = test[features]
YTest = LR.predict(XTest)
# Survived is stored as float in the merged frame, so the predictions are
# cast to int64 before they are written out.
sub = test[['PassengerId']].assign(Survived=YTest.astype('int64'))
sub.to_csv('submission.csv', index=False)