In [163]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_recall_curve, roc_curve, roc_auc_score, confusion_matrix

import dill

In [118]:
df_train = pd.read_csv('data/train.csv')

In [119]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [120]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [121]:
df_train['Cabin'].unique()
# will use the fact that the passanget has cabin or not

array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64',

In [122]:
df_train['Embarked'].value_counts()
# S is the most common value, will use it later

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [123]:
df_train['Age'].isna()

0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888     True
889    False
890    False
Name: Age, Length: 891, dtype: bool

In [124]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        for col_ in self.columns:
            if col_ not in test_columns:
                X[col_] = 0
        return X[self.columns]
    
class NanImputer(BaseEstimator, TransformerMixin):
    def __init__(self, key, value):
        self.key = key
        self.value = value
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X_copy = X.copy()
        X_copy[self.key] = X_copy[self.key].fillna(self.value)
        return X_copy
    
class CabinHandler(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X_copy = X.copy()
        X_copy[self.key] = X_copy[self.key].apply(lambda x: 0 if type(x) == float else 1)
        return X_copy

In [125]:
def prepare_data(columns):
    categorical_columns = ['Sex', 'Embarked', 'Pclass']
    continuos_cols = ['Age', 'SibSp', 'Parch', 'Fare']
    
    transformers = []
    
    for col in columns:
        if col == 'PassengerId' or col == 'Name' or col == 'Ticket':
            continue
        if col == 'Embarked':
            transformer = Pipeline([
                    ('nun_imp', NanImputer(key=col, value='S')),
                    ('selector', ColumnSelector(key=col)),
                    ('ohe', OHEEncoder(key=col))
                ])
        elif col == 'Age':
            transformer =  Pipeline([
                    ('nun_imp', NanImputer(key=col, value=df_train['Age'].median())),
                    ('selector', NumberSelector(key=col)),
                    ('standard', StandardScaler())
                ])
        elif col == 'Cabin':
            transformer =  Pipeline([
                    ('cab_handler', CabinHandler(key=col)),
                    ('selector', NumberSelector(key=col))
                ])
        elif col in continuos_cols:
            transformer =  Pipeline([
                    ('selector', NumberSelector(key=col)),
                    ('standard', StandardScaler())
                ])
        elif col in categorical_columns:
            transformer = Pipeline([
                    ('selector', ColumnSelector(key=col)),
                    ('ohe', OHEEncoder(key=col))
                ])
        else:
            transformer = Pipeline([
                    ('selector', NumberSelector(key=col))
                ])
        transformers.append((col, transformer))
        
    return transformers

In [143]:
X_train, X_test, y_train, y_test = train_test_split(df_train.drop('Survived', axis=1), df_train['Survived'], test_size=0.33, random_state=42)

In [144]:
feats = FeatureUnion(prepare_data(X_train.columns))
feature_processing = Pipeline([('feats', feats)])

In [145]:
xgboost_pipeline = Pipeline([
    ('features', feature_processing),
    ('classifier', XGBClassifier(random_state=42, verbosity=0)),
])

In [154]:
def perform_cross_validation(classifier, df, target, cv=16, scoring="roc_auc", beta=1):
    
    X_train, X_test, y_train, y_test = train_test_split(df.drop(target, 1),
                                                        df[target], random_state=43)
    cv_scores = cross_val_score(classifier, X_train, y_train, cv=cv, scoring=scoring)
    #запустим кросс-валидацию

    cv_score = np.mean(cv_scores)
    cv_score_std = np.std(cv_scores)
    print('CV score is {}+-{}'.format(cv_score, cv_score_std))

    #обучим пайплайн на всем тренировочном датасете
    classifier.fit(X_train, y_train)
    y_score = classifier.predict_proba(X_test)[:, 1]
    
    
    precision, recall, thresholds = precision_recall_curve(y_test.values, y_score)
    fscore = (1+beta**2)*(precision * recall) / (beta**2*precision + recall)
    ix = np.argmax(fscore)
    print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))
    return cv_score, fscore[ix], precision[ix], recall[ix]

In [158]:
perform_cross_validation(xgboost_pipeline, df_train, "Survived")



CV score is 0.8511798878205128+-0.06872665353059325
Best Threshold=0.185902, F-Score=0.739, Precision=0.708, Recall=0.773




(0.8511798878205128,
 0.7391304347826088,
 0.7083333333333334,
 0.7727272727272727)

In [161]:
X = df_train.drop('Survived', 1)
y = df_train['Survived']

xgboost_pipeline.fit(X, y)



Pipeline(steps=[('features',
                 Pipeline(steps=[('feats',
                                  FeatureUnion(transformer_list=[('Pclass',
                                                                  Pipeline(steps=[('selector',
                                                                                   ColumnSelector(key='Pclass')),
                                                                                  ('ohe',
                                                                                   OHEEncoder(key='Pclass'))])),
                                                                 ('Sex',
                                                                  Pipeline(steps=[('selector',
                                                                                   ColumnSelector(key='Sex')),
                                                                                  ('ohe',
                                                                       

In [166]:
with open('app/models/model.dill', 'wb') as f:
    dill.dump(xgboost_pipeline, f)