# Titanic - Machine Learning from Disaster

## Features constructs

> Создать новые переменные из Cabin

In [1]:


import numpy as np
import pandas as pd

## Импорт данных

In [2]:
df = pd.read_csv('data/train.csv')
df.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


## Анализ данных

### Описание данных

| Variable | Definition                                  | Key                                            |
|----------|---------------------------------------------|------------------------------------------------|
| survival | Survival	                                   | 0 = No, 1 = Yes                                |
| pclass   | Ticket class                                | 	1 = 1st, 2 = 2nd, 3 = 3rd                     |
| sex	     | Sex                                         |                                                |	
| Age      | 	Age in years                               |                                                |	
| sibsp    | 	# of siblings / spouses aboard the Titanic |                                                |	
| parch    | 	# of parents / children aboard the Titanic |                                                |	
| ticket   | 	Ticket number                              |                                                |	
| fare     | 	Passenger fare                             |                                                |	
| cabin    | 	Cabin number                               |                                                |	
| embarked | 	Port of Embarkation                        | C = Cherbourg, Q = Queenstown, S = Southampton |

### Variable Notes
**pclass**: A proxy for socio-economic status (SES)

1st = Upper
2nd = Middle
3rd = Lower

**age**: Age is fractional if less than 1. If the age is estimated, is it in the form of xx.5

**sibsp**: The dataset defines family relations in this way...

**Sibling** = brother, sister, stepbrother, stepsister

**Spouse** = husband, wife (mistresses and fiancés were ignored)

**parch**: The dataset defines family relations in this way...

**Parent** = mother, father

**Child** = daughter, son, stepdaughter, stepson

**Some** children travelled only with a nanny, therefore parch=0 for them.


In [3]:
df.head(20).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
PassengerId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
Survived,0,1,1,1,0,0,0,0,1,1,1,1,0,0,0,1,0,1,0,1
Pclass,3,1,3,1,3,3,1,3,3,2,3,1,3,3,3,2,3,2,3,3
Name,"Braund, Mr. Owen Harris","Cumings, Mrs. John Bradley (Florence Briggs Th...","Heikkinen, Miss. Laina","Futrelle, Mrs. Jacques Heath (Lily May Peel)","Allen, Mr. William Henry","Moran, Mr. James","McCarthy, Mr. Timothy J","Palsson, Master. Gosta Leonard","Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)","Nasser, Mrs. Nicholas (Adele Achem)","Sandstrom, Miss. Marguerite Rut","Bonnell, Miss. Elizabeth","Saundercock, Mr. William Henry","Andersson, Mr. Anders Johan","Vestrom, Miss. Hulda Amanda Adolfina","Hewlett, Mrs. (Mary D Kingcome)","Rice, Master. Eugene","Williams, Mr. Charles Eugene","Vander Planke, Mrs. Julius (Emelia Maria Vande...","Masselmani, Mrs. Fatima"
Sex,male,female,female,female,male,male,male,male,female,female,female,female,male,male,female,female,male,male,female,female
Age,22.0,38.0,26.0,35.0,35.0,,54.0,2.0,27.0,14.0,4.0,58.0,20.0,39.0,14.0,55.0,2.0,,31.0,
SibSp,1,1,0,1,0,0,0,3,0,1,1,0,0,1,0,0,4,0,1,0
Parch,0,0,0,0,0,0,0,1,2,0,1,0,0,5,0,0,1,0,0,0
Ticket,A/5 21171,PC 17599,STON/O2. 3101282,113803,373450,330877,17463,349909,347742,237736,PP 9549,113783,A/5. 2151,347082,350406,248706,382652,244373,345763,2649
Fare,7.25,71.2833,7.925,53.1,8.05,8.4583,51.8625,21.075,11.1333,30.0708,16.7,26.55,8.05,31.275,7.8542,16.0,29.125,13.0,18.0,7.225


In [4]:

df.Cabin.info()

<class 'pandas.core.series.Series'>
RangeIndex: 891 entries, 0 to 890
Series name: Cabin
Non-Null Count  Dtype 
--------------  ----- 
204 non-null    object
dtypes: object(1)
memory usage: 7.1+ KB


## Обработка данных

### Разделение на выборки 

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
target = df['Survived']
# features = df.drop(['PassengerId', 'Ticket', 'Cabin', 'Survived'], axis=1)
features = df.drop(['PassengerId', 'Ticket', 'Survived'], axis=1)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.33, random_state=42)
X_train.shape, X_test.shape

((596, 9), (295, 9))

In [8]:
X_train.head(2)

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
6,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,51.8625,E46,S
718,3,"McEvoy, Mr. Michael",male,,0,0,15.5,,Q


## Создание конвейера

План:

1. Численные:
 - Age - заполнение пропусков
2. Категорийный:
 - Embarked, Sex - Кодирование

In [9]:
import pandas as pd
from sklearn.compose import ColumnTransformer

from sklearn.impute import KNNImputer
from sklearn.preprocessing import OrdinalEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline

from sklearn.tree import DecisionTreeClassifier


### Итоговый конвейер

In [10]:

X_train.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
6,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,51.8625,E46,S
718,3,"McEvoy, Mr. Michael",male,,0,0,15.5,,Q
685,2,"Laroche, Mr. Joseph Philippe Lemercier",male,25.0,1,2,41.5792,,C
73,3,"Chronopoulos, Mr. Apostolos",male,26.0,1,0,14.4542,,C
882,3,"Dahlberg, Miss. Gerda Ulrika",female,22.0,0,0,10.5167,,S


In [16]:
X_train.columns

Index(['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin',
       'Embarked'],
      dtype='object')

In [11]:
def get_title(df, col):
    df[col] = df[col].str.extract(r'.*\, ([a-zA-Z]*\.) ')
    return df


In [12]:
def get_level(df: pd.DataFrame, col):
    df[col] = df[col].str.slice(0, 1)
    df.fillna('.', inplace=True)
    return df

In [223]:
# reference_columns = ['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
reference_columns = ['encoder__Name', 'cat_preproc__Embarked',
                     'cat_preproc__Sex', 'remainder__Pclass', 'remainder__Age',
                     'remainder__SibSp', 'remainder__Parch', 'remainder__Fare']
# 'encoder_cabin__Cabin',
reference_columns2 = ['encoder__Name', 'cat_preproc__Embarked',
                      'cat_preproc__Sex', 'remainder__Pclass', 'remainder__Age',
                      'remainder__SibSp', 'remainder__Parch', 'remainder__Fare']
encoder_ord = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)
pipe_name = Pipeline([('regexp_title', FunctionTransformer(get_title, kw_args={'col': 'Name'})),
                      ('encoder_ord', encoder_ord), ])
pipe_cabin = Pipeline([('regexp_cabin', FunctionTransformer(get_level, kw_args={'col': 'Cabin'})),
                       ('encoder_ord', encoder_ord), ])
pipe_imputer = Pipeline([('knn', KNNImputer())])

pipe_preproc = ColumnTransformer(
    transformers=[('encoder', pipe_name, ['Name']),
                  ('encoder_cabin', pipe_cabin, ['Cabin']),
                  ('cat_preproc', encoder_ord, ['Embarked', 'Sex'])],
    remainder='passthrough',
    force_int_remainder_cols=False).set_output(transform='pandas')

model = DecisionTreeClassifier(random_state=0)

pipe_process_inputer2 = ColumnTransformer(
    transformers=[
        # KNNImputer для целевых столбцов, основываясь только на reference_columns
        ('knn_imputer', Pipeline([
            ('select_references',
             ColumnTransformer([
                 ('references', 'passthrough', reference_columns)  # Выбор только оснований
             ], remainder='drop')),
            ('imputer', KNNImputer(n_neighbors=3))
        ]), ['remainder__Age', 'cat_preproc__Embarked'])
    ],
    remainder='passthrough'  # Оставить необработанные столбцы
)

pipe_process_inputer3 = ColumnTransformer(
    transformers=[
        # KNNImputer для целевых столбцов, основываясь только на reference_columns
        ('knn_imputer', Pipeline([
            ('test', FunctionTransformer(lambda x: print(x.columns))),
        ]), reference_columns2)
    ],
    remainder='passthrough'  # Оставить необработанные столбцы
)
pipe_process_inputer4 = ColumnTransformer(
    transformers=[
        # KNNImputer для целевых столбцов, основываясь только на reference_columns
        ('knn_imputer', Pipeline([
            # ('test', FunctionTransformer(lambda x: print(x.columns))),
            ('select_references',
             ColumnTransformer([
                 ('references', 'passthrough',  ['remainder__Age', 'cat_preproc__Embarked'])  # Выбор только оснований
             ], remainder='drop').set_output(transform='pandas')),
            ('imputer', pipe_imputer)

            # ('imputer', FunctionTransformer(lambda x: print(x)))
        ]), reference_columns)
    ],
    remainder='passthrough'  # Оставить необработанные столбцы
)

# pipe_process_inputer = Pipeline([
#     ('select_references', ColumnTransformer([('references', 'passthrough', reference_columns)], remainder='drop')),
#     ('imputer', KNNImputer(n_neighbors=3))
# ])

# pipe_process = Pipeline([('preproc', pipe_preproc), ('imputer', pipe_imputer)])
pipe_process = Pipeline([('preproc', pipe_preproc), ('imputer_sel', pipe_process_inputer4)])

pipe_model = Pipeline([('process', pipe_process), ('model', model)])


In [224]:
pipe_model

In [225]:
pipe_model.steps

[('process',
  Pipeline(steps=[('preproc',
                   ColumnTransformer(force_int_remainder_cols=False,
                                     remainder='passthrough',
                                     transformers=[('encoder',
                                                    Pipeline(steps=[('regexp_title',
                                                                     FunctionTransformer(func=<function get_title at 0x000001F0ACD0CC20>,
                                                                                         kw_args={'col': 'Name'})),
                                                                    ('encoder_ord',
                                                                     OrdinalEncoder(handle_unknown='use_encoded_value',
                                                                                    unknown_value=nan))]),
                                                    ['Name']),
                                                  

In [226]:
pipe_preproc.fit_transform(X_train).columns

Index(['encoder__Name', 'encoder_cabin__Cabin', 'cat_preproc__Embarked',
       'cat_preproc__Sex', 'remainder__Pclass', 'remainder__Age',
       'remainder__SibSp', 'remainder__Parch', 'remainder__Fare'],
      dtype='object')

In [227]:
# pipe_model.set_params(model__max_depth=6, model__max_features=4, process__imputer__knn__n_neighbors=3)
#{'model__max_depth': 6, 'model__max_features': 5, 'process__imputer__knn__n_neighbors': 7}
#{'model__max_depth': 10, 'model__max_features': 4, 'process__imputer__knn__n_neighbors': 9}
# pipe_model.set_params(model__max_depth=10, model__max_features=4, process__imputer_sel__imputer__n_neighbors=9)

In [228]:
pipe_model.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [229]:
from sklearn.metrics import accuracy_score

In [230]:
accuracy_score(y_train, pipe_model.predict(X_train))

0.8171140939597316

In [231]:
accuracy_score(y_test, pipe_model.predict(X_test))

0.6711864406779661

In [232]:

from sklearn.model_selection import GridSearchCV

In [233]:
model_params = {'model__max_depth': range(5, 15), 'model__max_features': range(4, 19),
                'process__imputer__knn__n_neighbors': range(3, 15)}
tree_grid = GridSearchCV(pipe_model, model_params, cv=5, n_jobs=-1, verbose=3, scoring='accuracy')

In [234]:
tree_grid.fit(X_train, y_train)


Fitting 5 folds for each of 1800 candidates, totalling 9000 fits


ValueError: Invalid parameter 'imputer' for estimator Pipeline(steps=[('preproc',
                 ColumnTransformer(force_int_remainder_cols=False,
                                   remainder='passthrough',
                                   transformers=[('encoder',
                                                  Pipeline(steps=[('regexp_title',
                                                                   FunctionTransformer(func=<function get_title at 0x000001B64EAEBE20>,
                                                                                       kw_args={'col': 'Name'})),
                                                                  ('encoder_ord',
                                                                   OrdinalEncoder(handle_unknown='use_encoded_value',
                                                                                  unknown_value=nan))]),
                                                  ['Name']),
                                                 ('enco...
                                                  Pipeline(steps=[('select_references',
                                                                   ColumnTransformer(transformers=[('references',
                                                                                                    'passthrough',
                                                                                                    ['remainder__Age',
                                                                                                     'cat_preproc__Embarked'])])),
                                                                  ('imputer',
                                                                   Pipeline(steps=[('knn',
                                                                                    KNNImputer())]))]),
                                                  ['encoder__Name',
                                                   'cat_preproc__Embarked',
                                                   'cat_preproc__Sex',
                                                   'remainder__Pclass',
                                                   'remainder__Age',
                                                   'remainder__SibSp',
                                                   'remainder__Parch',
                                                   'remainder__Fare'])]))]). Valid parameters are: ['memory', 'steps', 'transform_input', 'verbose'].

In [89]:
tree_grid.best_params_

{'model__max_depth': 10,
 'model__max_features': 4,
 'process__imputer__knn__n_neighbors': 9}

### Метрики обучения

> Скор на трейне : 0.979
> Скор на валиде : 0.76
> Вывод достигли переобучение

### Исследование pipeline

In [31]:
pipe_model.named_steps['model'].feature_importances_

array([0.05834997, 0.04595095, 0.03375033, 0.34446475, 0.1204313 ,
       0.14963924, 0.03744614, 0.0294479 , 0.18051941])

In [32]:
X_train.columns

Index(['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin',
       'Embarked'],
      dtype='object')

Максимальное влияние параметров:
1. Sex - 0.29
2. SibSp (наличие родственников) - 0.27
3. Embarked (порт назначения) - 0.20


### Кроссвалидация

In [82]:

from sklearn.model_selection import cross_val_score

cv_results = cross_val_score(pipe_model, X_train, y_train, cv=5,
                             scoring='accuracy')

In [83]:
cv_results

array([0.75833333, 0.8907563 , 0.83193277, 0.79831933, 0.82352941])

## Предсказание на реальных данных

In [95]:
df_ground = pd.read_csv('data/test.csv')
df_ground.head(2)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S


In [96]:
df_ground['Survived'] = pipe_model.predict(df_ground.drop(['PassengerId', 'Ticket'], axis=1))


In [97]:
df_ground[['PassengerId', 'Survived']].to_csv('data/test11.csv', index=False)

In [98]:
! kaggle competitions submit titanic -f .\data\test11.csv -m"cabin "

Successfully submitted to Titanic - Machine Learning from Disaster



  0%|          | 0.00/3.18k [00:00<?, ?B/s]
100%|##########| 3.18k/3.18k [00:00<00:00, 6.71kB/s]


### Метрика на лидерборде

После дополнения Cabin

Score: 0.79665
Place: 909