# Titanic - Machine Learning from Disaster

## Features constructs

> Создать новые переменные из Name и Cabin

In [40]:
import re

import numpy as np
import pandas as pd

## Импорт данных

In [41]:
df = pd.read_csv('data/train.csv')
df.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


## Анализ данных

### Описание данных

| Variable | Definition                                  | Key                                            |
|----------|---------------------------------------------|------------------------------------------------|
| survival | Survival	                                   | 0 = No, 1 = Yes                                |
| pclass   | Ticket class                                | 	1 = 1st, 2 = 2nd, 3 = 3rd                     |
| sex	     | Sex                                         |                                                |	
| Age      | 	Age in years                               |                                                |	
| sibsp    | 	# of siblings / spouses aboard the Titanic |                                                |	
| parch    | 	# of parents / children aboard the Titanic |                                                |	
| ticket   | 	Ticket number                              |                                                |	
| fare     | 	Passenger fare                             |                                                |	
| cabin    | 	Cabin number                               |                                                |	
| embarked | 	Port of Embarkation                        | C = Cherbourg, Q = Queenstown, S = Southampton |

### Variable Notes
**pclass**: A proxy for socio-economic status (SES)

1st = Upper
2nd = Middle
3rd = Lower

**age**: Age is fractional if less than 1. If the age is estimated, is it in the form of xx.5

**sibsp**: The dataset defines family relations in this way...

**Sibling** = brother, sister, stepbrother, stepsister

**Spouse** = husband, wife (mistresses and fiancés were ignored)

**parch**: The dataset defines family relations in this way...

**Parent** = mother, father

**Child** = daughter, son, stepdaughter, stepson

**Some** children travelled only with a nanny, therefore parch=0 for them.


In [42]:
df.head(20).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
PassengerId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
Survived,0,1,1,1,0,0,0,0,1,1,1,1,0,0,0,1,0,1,0,1
Pclass,3,1,3,1,3,3,1,3,3,2,3,1,3,3,3,2,3,2,3,3
Name,"Braund, Mr. Owen Harris","Cumings, Mrs. John Bradley (Florence Briggs Th...","Heikkinen, Miss. Laina","Futrelle, Mrs. Jacques Heath (Lily May Peel)","Allen, Mr. William Henry","Moran, Mr. James","McCarthy, Mr. Timothy J","Palsson, Master. Gosta Leonard","Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)","Nasser, Mrs. Nicholas (Adele Achem)","Sandstrom, Miss. Marguerite Rut","Bonnell, Miss. Elizabeth","Saundercock, Mr. William Henry","Andersson, Mr. Anders Johan","Vestrom, Miss. Hulda Amanda Adolfina","Hewlett, Mrs. (Mary D Kingcome)","Rice, Master. Eugene","Williams, Mr. Charles Eugene","Vander Planke, Mrs. Julius (Emelia Maria Vande...","Masselmani, Mrs. Fatima"
Sex,male,female,female,female,male,male,male,male,female,female,female,female,male,male,female,female,male,male,female,female
Age,22.0,38.0,26.0,35.0,35.0,,54.0,2.0,27.0,14.0,4.0,58.0,20.0,39.0,14.0,55.0,2.0,,31.0,
SibSp,1,1,0,1,0,0,0,3,0,1,1,0,0,1,0,0,4,0,1,0
Parch,0,0,0,0,0,0,0,1,2,0,1,0,0,5,0,0,1,0,0,0
Ticket,A/5 21171,PC 17599,STON/O2. 3101282,113803,373450,330877,17463,349909,347742,237736,PP 9549,113783,A/5. 2151,347082,350406,248706,382652,244373,345763,2649
Fare,7.25,71.2833,7.925,53.1,8.05,8.4583,51.8625,21.075,11.1333,30.0708,16.7,26.55,8.05,31.275,7.8542,16.0,29.125,13.0,18.0,7.225


In [43]:
df['Title'] = df.Name.str.extract(r'.*\, ([a-zA-Z]*\.) ')

In [44]:
df[['Name', 'Title']].head(20)

Unnamed: 0,Name,Title
0,"Braund, Mr. Owen Harris",Mr.
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",Mrs.
2,"Heikkinen, Miss. Laina",Miss.
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",Mrs.
4,"Allen, Mr. William Henry",Mr.
5,"Moran, Mr. James",Mr.
6,"McCarthy, Mr. Timothy J",Mr.
7,"Palsson, Master. Gosta Leonard",Master.
8,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",Mrs.
9,"Nasser, Mrs. Nicholas (Adele Achem)",Mrs.


In [45]:
df.Title.value_counts()

Title
Mr.          517
Miss.        182
Mrs.         125
Master.       40
Dr.            7
Rev.           6
Mlle.          2
Major.         2
Col.           2
Don.           1
Mme.           1
Ms.            1
Sir.           1
Lady.          1
Capt.          1
Jonkheer.      1
Name: count, dtype: int64

In [46]:
df[df['Title'].isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
759,760,1,1,"Rothes, the Countess. of (Lucy Noel Martha Dye...",female,33.0,0,0,110152,86.5,B77,S,


## Обработка данных

### Разделение на выборки 

In [47]:
from sklearn.model_selection import train_test_split

In [55]:
target = df['Survived']
# features = df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'Survived'], axis=1)
features = df.drop(['PassengerId',  'Ticket', 'Cabin', 'Survived'], axis=1)

In [56]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.33, random_state=42)
X_train.shape, X_test.shape

((596, 9), (295, 9))

In [57]:
X_train.head(2)

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Title
6,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,51.8625,S,Mr.
718,3,"McEvoy, Mr. Michael",male,,0,0,15.5,Q,Mr.


## Создание конвейера

План:

1. Численные:
 - Age - заполнение пропусков
2. Категорийный:
 - Embarked, Sex - Кодирование

In [58]:
import pandas as pd
from sklearn.compose import ColumnTransformer

from sklearn.impute import KNNImputer
from sklearn.preprocessing import OrdinalEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline

from sklearn.tree import DecisionTreeClassifier


### Итоговый конвейер

In [71]:
def get_title(name):
    return re.match(r'.*\, ([a-zA-Z]*\.) ', name)[1]


pipe_cat = Pipeline([('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan))])

pipe_num = Pipeline([('knn', KNNImputer(n_neighbors=5))])

pipe_encoder = Pipeline([('regex_encoder',FunctionTransformer(func=get_title))])

col_transformer0 = ColumnTransformer([('pipe_encoder', pipe_encoder, ['Name'])],
                                     remainder='passthrough',
                                     force_int_remainder_cols=False) # .set_output(transform='pandas')

col_transformer1 = ColumnTransformer([('cat_preproc', pipe_cat, ['Embarked', 'Sex', 'Name'])],
                                     remainder='passthrough',
                                     force_int_remainder_cols=False)

model = DecisionTreeClassifier(random_state=0)

final_pipe = Pipeline(
    [('preproc0', col_transformer0), ('preproc1', col_transformer1), ('num_inputer', pipe_num), ('model', model)])

In [72]:
final_pipe.set_params(model__max_depth=7, model__max_features=5, num_inputer__knn__n_neighbors=9)

In [73]:
final_pipe.fit(X_train, y_train)

TypeError: expected string or bytes-like object, got 'DataFrame'

In [62]:
from sklearn.metrics import accuracy_score

In [63]:
accuracy_score(y_train, final_pipe.predict(X_train))

0.8808724832214765

In [64]:
accuracy_score(y_test, final_pipe.predict(X_test))

0.8033898305084746

In [19]:

from sklearn.model_selection import GridSearchCV

In [42]:
model_params = {'model__max_depth': range(1, 11), 'model__max_features': range(4, 19),
                'num_inputer__knn__n_neighbors': range(3, 10)}
tree_grid = GridSearchCV(final_pipe, model_params, cv=5, n_jobs=-1, verbose=True)

In [43]:
tree_grid.fit(X_train, y_train)


Fitting 5 folds for each of 1050 candidates, totalling 5250 fits


In [44]:
tree_grid.best_params_

{'model__max_depth': 7,
 'model__max_features': 5,
 'num_inputer__knn__n_neighbors': 9}

### Метрики обучения

> Скор на трейне : 0.979
> Скор на валиде : 0.76
> Вывод достигли переобучение

### Исследование pipeline

In [26]:
X_train.head(2)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
6,1,male,54.0,0,0,51.8625,S
718,3,male,,0,0,15.5,Q


In [27]:
final_pipe.steps

[('preproc1',
  ColumnTransformer(force_int_remainder_cols=False, remainder='passthrough',
                    transformers=[('cat_preproc',
                                   Pipeline(steps=[('encoder',
                                                    OrdinalEncoder(handle_unknown='use_encoded_value',
                                                                   unknown_value=nan))]),
                                   ['Embarked', 'Sex'])])),
 ('num_inputer', Pipeline(steps=[('knn', KNNImputer())])),
 ('model', DecisionTreeClassifier(random_state=0))]

In [28]:
final_pipe.named_steps['preproc1'].named_transformers_['cat_preproc'].named_steps['encoder'].categories_

[array(['C', 'Q', 'S', nan], dtype=object),
 array(['female', 'male'], dtype=object)]

In [50]:
final_pipe.named_steps['model'].feature_importances_

array([0.0301361 , 0.34725498, 0.10693723, 0.18407834, 0.02692127,
       0.02129327, 0.28337881])

In [51]:
X_train.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object')

Максимальное влияние параметров:
1. Sex - 0.29
2. SibSp (наличие родственников) - 0.27
3. Embarked (порт назначения) - 0.20


### Кроссвалидация

In [65]:

from sklearn.model_selection import cross_val_score

cv_results = cross_val_score(final_pipe, X_train, y_train, cv=5,
                             scoring='accuracy')

In [66]:
cv_results

array([0.79166667, 0.84033613, 0.79831933, 0.78991597, 0.81512605])

## Предсказание на реальных данных

In [67]:
df_ground = pd.read_csv('data/test.csv')
df_ground.head(2)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S


In [68]:
df_submission = pd.read_csv('data/gender_submission.csv')
df_submission.head(2)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1


In [69]:
df_ground['Survived'] = final_pipe.predict(df_ground.drop('PassengerId', axis=1))


In [70]:
df_ground

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S,0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,0
...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S,0
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,1
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,0
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S,0


In [71]:
df_ground[['PassengerId', 'Survived']].to_csv('data/test4.csv', index=False)

### Метрика на лидерборде

Score: 0.73923

place: 12189

После оптимизации knn

Score: 0.74880

place: 11928

Можно сказать, что улучшение качества модели существенно улучшило результат на лидерборде.


