# Titanic - Machine Learning from Disaster

## Features constructs

> Создать новые переменные из Name

In [1]:


import numpy as np
import pandas as pd

## Импорт данных

In [2]:
df = pd.read_csv('data/train.csv')
df.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


## Обработка данных

### Разделение на выборки 

In [3]:
from sklearn.model_selection import train_test_split

In [43]:
target = df['Survived']
features = df.drop(['PassengerId', 'Ticket', 'Cabin', 'Survived'], axis=1)
# features = df.drop(['PassengerId', 'Ticket','Survived'], axis=1)

In [44]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.33, random_state=42)
X_train.shape, X_test.shape

((596, 8), (295, 8))

## Создание конвейера

План:

1. Численные:
 - Age - заполнение пропусков
2. Категорийный:
 - Embarked, Sex - Кодирование

In [45]:
import pandas as pd
from sklearn.compose import ColumnTransformer

from sklearn.impute import KNNImputer
from sklearn.preprocessing import OrdinalEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline

from sklearn.tree import DecisionTreeClassifier


### Итоговый конвейер

In [46]:
def get_title(df, col):
    df[col] = df[col].str.extract(r'.*\, ([a-zA-Z]*\.) ')
    return df


def get_level(df: pd.DataFrame, col):
    df[col] = df[col].str.slice(0, 1)
    df.fillna('.', inplace=True)
    return df

In [106]:
from sklearn.base import BaseEstimator, TransformerMixin


class IncreasePclass(BaseEstimator, TransformerMixin):
    def __init__(self, col1, col2, new_col_name='Field_new'):
        self.col1 = col1
        self.col2 = col2
        self.new_col_name = new_col_name

    def fit(self, X, y=None):
        return self

    def transform(self, X: pd.DataFrame):
        """
        Преобразует данные, добавляя новый столбец с произведением.
        :param X: pandas DataFrame
        :return: DataFrame с добавленным столбцом
        """
        if not isinstance(X, pd.DataFrame):
            raise ValueError("Входные данные должны быть pandas DataFrame")

        # Добавляем новый столбец с произведением
        X = X.copy()
        X[self.new_col_name] = 10 / X[self.col1] * X[self.col2]
        X.drop([self.col1, self.col2], axis='columns', inplace=True)
        return X

    def set_output(self, *, transform=None):
        pass

In [107]:
increase_pclass = Pipeline([('', IncreasePclass('Pclass', 'Fare', 'PFare'))])
trans_increase_pclass = IncreasePclass('Pclass', 'Fare', 'PFare')
encoder_ord = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)
pipe_name = Pipeline([('regexp_title', FunctionTransformer(get_title, kw_args={'col': 'Name'})),
                      ('encoder_ord', encoder_ord), ])
pipe_cabin = Pipeline([('regexp_cabin', FunctionTransformer(get_level, kw_args={'col': 'Cabin'})),
                       ('encoder_ord', encoder_ord), ])
pipe_imputer = Pipeline([('knn', KNNImputer())])
pipe_preproc = ColumnTransformer(
    transformers=[
        # ('increase_pclass', increase_pclass, ['Pclass', 'Fare']),
        ('encoder', pipe_name, ['Name']),
        # ('encoder_cabin', pipe_cabin, ['Cabin']),
        ('cat_preproc', encoder_ord, ['Embarked', 'Sex'])],
    remainder='passthrough',
    force_int_remainder_cols=False).set_output(transform='pandas')

model = DecisionTreeClassifier(random_state=0)

pipe_process = Pipeline(
    [('trans_increase_pclass', trans_increase_pclass), ('preproc', pipe_preproc), ('imputer', pipe_imputer)])
pipe_model = Pipeline([('process', pipe_process), ('model', model)])


In [108]:
# pipe_model.set_params(model__max_depth=7, model__max_features=5, process__imputer__knn__n_neighbors=9)
#{'model__max_depth': 6, 'model__max_features': 5, 'process__imputer__knn__n_neighbors': 7}
# pipe_model.set_params(model__max_depth=6, model__max_features=4, process__imputer__knn__n_neighbors=3)
# pipe_model.set_params(model__max_depth=8, model__max_features=4, process__imputer__knn__n_neighbors=9)
pipe_model.set_params(model__max_depth=6, model__max_features=5, process__imputer__knn__n_neighbors=6)

In [109]:
pipe_model.fit(X_train, y_train)

In [110]:
from sklearn.metrics import accuracy_score

In [111]:
accuracy_score(y_train, pipe_model.predict(X_train))

0.8557046979865772

In [112]:
accuracy_score(y_test, pipe_model.predict(X_test))

0.7966101694915254

## Поиск параметров

In [92]:

from sklearn.model_selection import GridSearchCV

In [113]:
model_params = {'model__max_depth': range(1, 11), 'model__max_features': range(4, 19),
                'process__imputer__knn__n_neighbors': range(3, 10)}
tree_grid = GridSearchCV(pipe_model, model_params, cv=5, n_jobs=-1, verbose=True)

In [114]:
tree_grid.fit(X_train, y_train)


Fitting 5 folds for each of 1050 candidates, totalling 5250 fits


In [115]:
tree_grid.best_params_

{'model__max_depth': 5,
 'model__max_features': 5,
 'process__imputer__knn__n_neighbors': 3}

### Метрики обучения

> Скор на трейне : 0.979
> Скор на валиде : 0.76
> Вывод достигли переобучение

In [116]:
pipe_model.set_params(model__max_depth=5, model__max_features=5, process__imputer__knn__n_neighbors=3)
pipe_model.fit(X_train, y_train)

In [117]:
from sklearn.metrics import accuracy_score

In [118]:
accuracy_score(y_train, pipe_model.predict(X_train))

0.8389261744966443

In [119]:
accuracy_score(y_test, pipe_model.predict(X_test))

0.8101694915254237

### Исследование pipeline

In [104]:
pipe_model.steps

[('process',
  Pipeline(steps=[('trans_increase_pclass',
                   IncreasePclass(col1='Pclass', col2='Fare',
                                  new_col_name='PFare')),
                  ('preproc',
                   ColumnTransformer(force_int_remainder_cols=False,
                                     remainder='passthrough',
                                     transformers=[('encoder',
                                                    Pipeline(steps=[('regexp_title',
                                                                     FunctionTransformer(func=<function get_title at 0x0000027C93D302C0>,
                                                                                         kw_args={'col': 'Name'})),
                                                                    ('encoder_ord',
                                                                     OrdinalEncoder(handle_unknown='use_encoded_value',
                                                        

In [105]:
trans_increase_pclass.transform(X_train)

Unnamed: 0,Name,Sex,Age,SibSp,Parch,Embarked,PFare
6,"McCarthy, Mr. Timothy J",male,54.0,0,0,S,51.8625
718,"McEvoy, Mr. Michael",male,,0,0,Q,46.5000
685,"Laroche, Mr. Joseph Philippe Lemercier",male,25.0,1,2,C,83.1584
73,"Chronopoulos, Mr. Apostolos",male,26.0,1,0,C,43.3626
882,"Dahlberg, Miss. Gerda Ulrika",female,22.0,0,0,S,31.5501
...,...,...,...,...,...,...,...
106,"Salkjelsvik, Miss. Anna Kristine",female,21.0,0,0,S,22.9500
270,"Cairns, Mr. Alexander",male,,0,0,S,31.0000
860,"Hansen, Mr. Claus Peter",male,41.0,2,0,S,42.3249
435,"Carter, Miss. Lucile Polk",female,14.0,1,2,S,120.0000


In [66]:
pipe_model.named_steps['model'].feature_importances_

array([0.25077015, 0.07209911, 0.30282177, 0.10209646, 0.06553953,
       0.04746769, 0.03851653, 0.12068876])

In [67]:
X_train.columns

Index(['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object')

Максимальное влияние параметров:
1. Sex - 0.29
2. SibSp (наличие родственников) - 0.27
3. Embarked (порт назначения) - 0.20


### Кроссвалидация

In [41]:

from sklearn.model_selection import cross_val_score

cv_results = cross_val_score(pipe_model, X_train, y_train, cv=5,
                             scoring='accuracy')

In [42]:
cv_results

array([0.73333333, 0.8487395 , 0.75630252, 0.81512605, 0.86554622])

## Предсказание на реальных данных

In [120]:
df_ground = pd.read_csv('data/test.csv')
df_ground.head(2)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S


In [121]:
df_ground['Survived'] = pipe_model.predict(df_ground.drop(['PassengerId', 'Ticket'], axis=1))
df_ground.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,0


In [122]:
df_ground[['PassengerId', 'Survived']].to_csv('data/test13.csv', index=False)

In [123]:
! kaggle competitions submit titanic -f .\data\test13.csv -m"pclass fare mul"

Successfully submitted to Titanic - Machine Learning from Disaster



  0%|          | 0.00/3.18k [00:00<?, ?B/s]
100%|##########| 3.18k/3.18k [00:00<00:00, 7.44kB/s]


### Метрика на лидерборде

Score: 0.73923

place: 12189

После оптимизации knn

Score: 0.74880

place: 11928

Можно сказать, что улучшение качества модели существенно улучшило результат на лидерборде.




Score: 0.79665
Place: 909