In [20]:
import pandas as pd
import numpy as np
from preprocessors import TitleExtractor, Mapper, ReplaceCatogories, CastType, CastNaOnString
from feature_engine.imputation import MeanMedianImputer, CategoricalImputer
from feature_engine.selection import DropFeatures
from feature_engine.encoding import CountFrequencyEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [21]:
# Read the Titanic CSV, then separate the 'survived' label column from the
# remaining columns, which become the candidate features.
df = pd.read_csv('../data/titanic.csv')
X, y = df.drop(columns='survived'), df['survived']

In [22]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## config

In [23]:
# Columns kept as model inputs; X_train / X_test are restricted to these.
# NOTE(review): 'boat' presumably records the lifeboat a passenger boarded,
# which would leak the target into the features -- confirm before trusting
# the evaluation scores.
VARS = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'name', 'boat']

# Arguments for ReplaceCatogories: the 'boat' column, the category values to
# leave untouched, and the replacement label (exact semantics live in
# preprocessors -- confirm there).
VARS_TO_REPLACE_CATEGORY = ['boat']
CATEGORIES_TO_LEAVE = ['?']
REPLACE_WITH = 'any'

# Arguments for CastNaOnString: the placeholder string and the missing-value
# marker it is cast to. np.nan is the canonical spelling; the np.NaN alias
# was removed in NumPy 2.0 and raises AttributeError there.
STRING_TO_NA = '?'
NA_TYPE = np.nan

# Columns cast to float by CastType.
VARS_TO_FLOAT = ['age', 'fare']

# Arguments for TitleExtractor: source column(s) and the new column name(s).
VAR_TO_EXTRACT_TITLE = ['name']
VAR_NAME_OF_TITLE = ['title']

# Imputation targets: mean for the numeric columns, most frequent category
# for 'embarked'.
VARS_NA_TO_MEAN = ['age', 'fare']
VARS_NA_TO_MFREQ = ['embarked']

# Disabled: none of these columns are in VARS, so they are never passed to
# the pipeline in the first place.
#VARS_TO_DROP = ['body', 'home.dest', 'ticket', 'cabin']

# Value mappings for Mapper; DICTS_TO_MAP[i] is listed in the same order as
# VARS_TO_MAP[i] ('sex' first, then 'boat').
VARS_TO_MAP = ['sex', 'boat']
DICTS_TO_MAP = [
    {'male': 1, 'female': 0},
    {np.nan: 0, 'any': 1},
]

# Columns encoded by CountFrequencyEncoder.
VARS_TO_FREQ_ENCODE = ['title', 'embarked']

In [24]:
# Final variable selection: restrict both splits to the columns listed in VARS.
X_train, X_test = (split[VARS] for split in (X_train, X_test))

In [27]:
# Preprocessing stages, listed in the order the pipeline applies them.
# ('drop_features', DropFeatures(features_to_drop=VARS_TO_DROP)) is disabled;
# VARS_TO_DROP is commented out in the config.
preprocessing_steps = [
    (
        'replace_categories',
        ReplaceCatogories(
            variables=VARS_TO_REPLACE_CATEGORY,
            list_of_category_to_leave=CATEGORIES_TO_LEAVE,
            replace_with=REPLACE_WITH,
        ),
    ),
    (
        'cast_na_on_string',
        CastNaOnString(variables=VARS, string=STRING_TO_NA, na=NA_TYPE),
    ),
    (
        'cast_type',
        CastType(variables=VARS_TO_FLOAT, dtype='float'),
    ),
    (
        'title_extraction',
        TitleExtractor(
            variables=VAR_TO_EXTRACT_TITLE,
            list_of_new_col_names=VAR_NAME_OF_TITLE,
        ),
    ),
    (
        'mean_median_imputer',
        MeanMedianImputer(imputation_method='mean', variables=VARS_NA_TO_MEAN),
    ),
    (
        'mode_imputer',
        CategoricalImputer(imputation_method='frequent', variables=VARS_NA_TO_MFREQ),
    ),
    (
        'mapper',
        Mapper(variables=VARS_TO_MAP, mappings=DICTS_TO_MAP),
    ),
    (
        'frequency_encoder',
        CountFrequencyEncoder(encoding_method='frequency', variables=VARS_TO_FREQ_ENCODE),
    ),
]

# Final estimator: a seeded random forest with 100 trees.
classifier_step = (
    'random forest',
    RandomForestClassifier(n_estimators=100, random_state=42),
)

pipeline_survived = Pipeline(steps=preprocessing_steps + [classifier_step])

In [28]:
# Fit all pipeline steps (preprocessing and classifier) on the training split.
pipeline_survived.fit(X=X_train, y=y_train)

In [34]:
# Predict on both splits with the fitted pipeline (train first, then test).
pred_train, pred_test = (
    pipeline_survived.predict(split) for split in (X_train, X_test)
)

# Print the train report, a 100-dash rule, then the test report.
rule = '-' * 100
print(classification_report(y_train, pred_train))
print('\n', rule, '\n')
print(classification_report(y_test, pred_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       665
           1       1.00      0.99      1.00       382

    accuracy                           1.00      1047
   macro avg       1.00      1.00      1.00      1047
weighted avg       1.00      1.00      1.00      1047


 ---------------------------------------------------------------------------------------------------- 

              precision    recall  f1-score   support

           0       0.95      0.99      0.97       144
           1       0.99      0.94      0.97       118

    accuracy                           0.97       262
   macro avg       0.97      0.97      0.97       262
weighted avg       0.97      0.97      0.97       262



