# Titanic - Machine Learning from Disaster

## Features constructs

> Создать новые переменные из Name

In [344]:
import numpy as np
import pandas as pd

## Импорт данных

In [345]:
df = pd.read_csv('data/train.csv')
df.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C


## Обработка данных

### Разделение на выборки 

In [346]:
from sklearn.model_selection import train_test_split

In [347]:
target = df['Survived']
# features = df.drop(['PassengerId', 'Ticket', 'Cabin', 'Survived'], axis=1)
features = df.drop(['PassengerId', 'Ticket','Survived'], axis=1)

In [348]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.33, random_state=42)
X_train.shape, X_test.shape

((596, 9), (295, 9))

## Создание конвейера

План:

1. Численные:
 - Age - заполнение пропусков
2. Категорийный:
 - Embarked, Sex - Кодирование

In [349]:
import pandas as pd
from sklearn.compose import ColumnTransformer

from sklearn.impute import KNNImputer
from sklearn.preprocessing import OrdinalEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline

from sklearn.tree import DecisionTreeClassifier


### Итоговый конвейер

In [360]:
def get_title(ser: pd.Series):
    dic_title = dict(Mr='Mr', Miss='Miss', Mrs='Mrs', Master='Master', Mlle='Miss', Ms='Miss', Mme='Mrs')
    ser = ser.str.extract(r'.*\, ([a-zA-Z]*)\. ')
    ser = ser.map(lambda x: dic_title.get(x, 'rare'))
    # print(type(ser))
    return ser

def get_level(ser: pd.Series):
    ser = ser.str.slice(0, 1).fillna('.')
    # print(type(ser))
    return pd.DataFrame(ser)

In [361]:
from sklearn.base import BaseEstimator, TransformerMixin


class IncreasePclass(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X: pd.DataFrame):
        if not isinstance(X, pd.DataFrame):
            raise ValueError("Входные данные должны быть pandas DataFrame")
        X1 = pd.DataFrame()
        X1['new_column'] = (10 / X.iloc[:, 0]) * X.iloc[:, 1]
        return X1

    def set_output(self, *, transform=None):
        pass

In [362]:
X_train.columns

Index(['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin',
       'Embarked'],
      dtype='object')

In [363]:
from sklearn_pandas import DataFrameMapper

In [364]:
mapper = DataFrameMapper([
    ('Name', [FunctionTransformer(get_title), OrdinalEncoder()]), # name of the function
    ('Cabin', [FunctionTransformer(get_level),OrdinalEncoder()]),
    (['Pclass', 'Fare'], IncreasePclass(), {'alias': 'PFare'}),
    (['Embarked'], OrdinalEncoder()),
    (['Sex'], OrdinalEncoder()),
    (['Age'], None),
    (['SibSp'], None),
    (['Parch'], None),
], input_df=True, df_out=True, )



pipe_model = Pipeline([('map_field', mapper),
                        ('knn', KNNImputer()),
                        ('model', DecisionTreeClassifier(random_state=0))])

In [365]:
# mapper.fit_transform(X_train)
# mapper.transformed_names_

In [377]:
# pipe_model.set_params(model__max_depth=6, model__max_features=5, knn__n_neighbors=6)
pipe_model.set_params(model__max_depth=3, model__max_features=1, knn__n_neighbors=4)

In [378]:
pipe_model.fit(X_train, y_train)



In [379]:
from sklearn.metrics import accuracy_score

In [380]:
accuracy_score(y_train, pipe_model.predict(X_train))

0.7298657718120806

In [381]:
accuracy_score(y_test, pipe_model.predict(X_test))

0.6813559322033899

## Поиск параметров

In [371]:

from sklearn.model_selection import GridSearchCV

In [374]:
model_params = {'model__max_depth': range(1, 11), 'model__max_features': range(4, 19),
                'knn__n_neighbors': range(3, 10)}
tree_grid = GridSearchCV(pipe_model, model_params, cv=5, n_jobs=-1, verbose=True)

In [375]:
tree_grid.fit(X_train, y_train)


Fitting 5 folds for each of 1050 candidates, totalling 5250 fits




In [376]:
tree_grid.best_params_

{'knn__n_neighbors': 3, 'model__max_depth': 1, 'model__max_features': 4}

### Метрики обучения

> Скор на трейне : 0.979
> Скор на валиде : 0.76
> Вывод достигли переобучение

In [30]:
pipe_model.set_params(model__max_depth=4, model__max_features=5, process__imputer__knn__n_neighbors=3)
pipe_model.fit(X_train, y_train)

0.8135593220338984

In [None]:
from sklearn.metrics import accuracy_score

In [35]:
accuracy_score(y_train, pipe_model.predict(X_train))

[('process',
  Pipeline(steps=[('trans_increase_pclass',
                   IncreasePclass(col1='Pclass', col2='Fare',
                                  new_col_name='PFare')),
                  ('preproc',
                   ColumnTransformer(force_int_remainder_cols=False,
                                     remainder='passthrough',
                                     transformers=[('encoder',
                                                    Pipeline(steps=[('regexp_title',
                                                                     FunctionTransformer(func=<function get_title at 0x0000022CE51F9C60>,
                                                                                         kw_args={'col': 'Name'})),
                                                                    ('encoder_ord',
                                                                     OrdinalEncoder(handle_unknown='use_encoded_value',
                                                        

In [37]:
accuracy_score(y_test, pipe_model.predict(X_test))

Unnamed: 0,Name,Sex,Age,SibSp,Parch,Embarked,PFare
6,"McCarthy, Mr. Timothy J",male,54.0,0,0,S,518.625000
718,"McEvoy, Mr. Michael",male,,0,0,Q,51.666667
685,"Laroche, Mr. Joseph Philippe Lemercier",male,25.0,1,2,C,207.896000
73,"Chronopoulos, Mr. Apostolos",male,26.0,1,0,C,48.180667
882,"Dahlberg, Miss. Gerda Ulrika",female,22.0,0,0,S,35.055667
...,...,...,...,...,...,...,...
106,"Salkjelsvik, Miss. Anna Kristine",female,21.0,0,0,S,25.500000
270,"Cairns, Mr. Alexander",male,,0,0,S,310.000000
860,"Hansen, Mr. Claus Peter",male,41.0,2,0,S,47.027667
435,"Carter, Miss. Lucile Polk",female,14.0,1,2,S,1200.000000


### Исследование pipeline

In [67]:
pipe_model.steps

Index(['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object')

In [None]:
trans_increase_pclass.transform(X_train)

In [382]:
pipe_model.named_steps['model'].feature_importances_

array([0.02057799, 0.55582654, 0.14466407, 0.12009506, 0.        ,
       0.13392706, 0.02490928, 0.        ])

In [41]:
X_train.columns

Максимальное влияние параметров:
1. Sex - 0.29
2. SibSp (наличие родственников) - 0.27
3. Embarked (порт назначения) - 0.20


### Кроссвалидация

In [383]:

from sklearn.model_selection import cross_val_score

cv_results = cross_val_score(pipe_model, X_train, y_train, cv=5,
                             scoring='accuracy')

Traceback (most recent call last):
  File "C:\prj\ml\TitanicML\venv\Lib\site-packages\sklearn\metrics\_scorer.py", line 140, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "C:\prj\ml\TitanicML\venv\Lib\site-packages\sklearn\metrics\_scorer.py", line 380, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "C:\prj\ml\TitanicML\venv\Lib\site-packages\sklearn\metrics\_scorer.py", line 90, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "C:\prj\ml\TitanicML\venv\Lib\site-packages\sklearn\utils\_response.py", line 214, in _get_response_values
    y_pred = prediction_method(X)
             ^^^^^^^^^^^^^^^^^^^^
  File "C:\prj\ml\TitanicML\venv\Lib\site-packages\sklearn\pipeline.py", line 785, in predict
    Xt = transform.transform(Xt)
         ^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\prj\ml\TitanicML\venv\Lib\site-packages\sklearn\utils\_set_output.py", line 319, in wrapped
    data_to_wrap = f(self, 

In [32]:
cv_results

## Предсказание на реальных данных

In [34]:
df_ground = pd.read_csv('data/test.csv')
# df_ground.head(2)

Successfully submitted to Titanic - Machine Learning from Disaster



  0%|          | 0.00/3.18k [00:00<?, ?B/s]
100%|##########| 3.18k/3.18k [00:00<00:00, 6.58kB/s]


In [None]:
df_ground['Survived'] = pipe_model.predict(df_ground.drop(['PassengerId', 'Ticket'], axis=1))
# df_ground.head()

In [None]:
df_ground[['PassengerId', 'Survived']].to_csv('data/test14.csv', index=False)

In [None]:
! kaggle competitions submit titanic -f .\data\test14.csv -m"pclass fare mul"

### Метрика на лидерборде

Score: 0.73923

place: 12189

После оптимизации knn

Score: 0.74880

place: 11928

Можно сказать, что улучшение качества модели существенно улучшило результат на лидерборде.




Score: 0.79665
Place: 909