In [540]:
#@formatter:off
import re

import numpy as np
import pandas
import pandas as pd
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import  SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder,  MinMaxScaler


pandas.options.display.max_rows = 100
pandas.options.display.max_columns = 100
%matplotlib inline
sns.set_theme(style='whitegrid')
#@formatter:on

In [541]:
def get_tags(column, tags):
    column.fillna('', inplace=True)
    output = pd.Series(index=column.index, dtype=object)
    for tag in tags:
        output[column.str.contains(tag)] = tag
    return output


def extend_feature_names(origin_column, new_features):
    if isinstance(new_features, str):
        return [origin_column + '_' + new_features]
    else:
        return [origin_column + '_' + feature for feature in new_features]


class TagAdder(BaseEstimator):
    def __init__(self, tags, column, tag_column, na_value=None):
        self.tags = tags
        self.column = column
        self.tag_column = tag_column
        self.na_value = na_value

    def fit(self, df, y=None):
        return self

    def transform(self, df, y=None):
        df = df.copy()
        df[self.tag_column] = get_tags(df[self.column], self.tags)
        if self.na_value is not None:
            df[self.tag_column].fillna(self.na_value, inplace=True)
        return df


class TitleReplacer(BaseEstimator):
    def fit(self, df, y=None):
        return self

    def transform(self, df, y=None):
        df = df.copy()
        df['Title'] = df.apply(self.__replace_title, axis=1)
        return df

    def __replace_title(self, row):
        title = row['Title']
        if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
            return 'Mr'
        elif title in ['Countess', 'Mme']:
            return 'Mrs'
        elif title in ['Mlle', 'Ms']:
            return 'Miss'
        elif title == 'Dr':
            if row['Sex'] == 'Male':
                return 'Mr'
            else:
                return 'Mrs'
        else:
            return title


class FeatureFixer(BaseEstimator):
    def __init__(self, remove_list):
        self.remove_list = remove_list

    def fit(self, df, y=None):
        return self

    def transform(self, df, y=None):
        df = df.copy()
        df['Embarked'].fillna(value=train_df['Embarked'].mode().item(), inplace=True)
        df['Age'].fillna(value=train_df['Age'].median(), inplace=True)
        df['Fare'].fillna(value=train_df['Fare'].median(), inplace=True)
        df['FamilySize'] = df['SibSp'] + df['Parch']
        df['AgeClass'] = df['Age'] * df['Pclass']
        df['FarePerPerson'] = df['Fare'] / (df['FamilySize'] + 1)
        df.drop(self.remove_list, axis=1, inplace=True)
        return df


class AutoEncoder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.__clean_attributes()

    def fit(self, df, y=None):
        self.__init_attributes(df)
        self.categories_.extend(self.__numeric.copy())
        for col in self.__binary:
            encoder = OrdinalEncoder()
            encoder.fit(df[col].to_numpy().reshape(-1, 1))
            self.categories_.extend(extend_feature_names(col, encoder.categories_[0][0]))
            self.__encoders.append(encoder)
        for col in self.__categorical:
            encoder = OneHotEncoder(sparse=False)
            encoder.fit(df[col].to_numpy().reshape(-1, 1))
            extended_features = extend_feature_names(col, encoder.categories_[0])
            self.categories_.extend(extended_features)
            self.__encoders.append(encoder)
        return self

    def transform(self, df, y=None):
        features = [df[self.__numeric].to_numpy()]
        for col, enc in zip(self.__binary + self.__categorical, self.__encoders):
            feature = enc.transform(df[col].to_numpy().reshape(-1, 1))
            features.append(feature)
        return np.concatenate([*features], axis=1)

    def __init_attributes(self, df):
        self.__clean_attributes()
        for col in df.columns:
            if np.issubdtype(df[col].dtype, np.number):
                self.__numeric.append(col)
            else:
                if len(df[col].unique()) == 2:
                    self.__binary.append(col)
                else:
                    self.__categorical.append(col)

    def __clean_attributes(self):
        self.__numeric = []
        self.__binary = []
        self.__categorical = []
        self.__encoders = []
        self.categories_ = []

In [542]:
def which_contains(self, regexp):
    if not isinstance(regexp, str):
        regexp = iterable_to_regexp(regexp)
    return self.loc[self.str.contains(regexp)]


def iterable_to_regexp(entries):
    return '|'.join([re.escape(entry) for entry in entries])


pd.Series.which_contains = which_contains

In [543]:
train_df = pd.read_csv('data/titanic_train.csv')
test_df = pd.read_csv('data/titanic_test.csv')
y_train = train_df.Survived.to_numpy()
train_df.drop(columns='Survived', inplace=True)
train_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [544]:
list_of_list_of_names = train_df.Name.str.split().values
words = [word for lst in list_of_list_of_names for word in lst]
titles = pd.Series(words).which_contains('\\.').str.replace('\\.', '').unique().tolist()
for i in ['L', 'Lady', 'Sir']:
    titles.remove(i)
print(titles)

decks = train_df.Cabin.str[0].unique()[1:]
print(decks)

['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms', 'Major', 'Mlle', 'Col', 'Capt', 'Countess', 'Jonkheer']
['C' 'E' 'G' 'D' 'A' 'B' 'F' 'T']


In [545]:
test_pipeline = Pipeline([
    ('title_tagger', TagAdder(titles, 'Name', 'Title')),
    ('title_replacer', TitleReplacer()),
    ('deck_tagger', TagAdder(decks, 'Cabin', 'Deck', 'U')),
    ('feature_fixer', FeatureFixer(['Name', 'Ticket', 'Cabin', 'PassengerId']))
])

full_pipeline = Pipeline([
    ('title_tagger', TagAdder(titles, 'Name', 'Title')),
    ('title_replacer', TitleReplacer()),
    ('deck_tagger', TagAdder(decks, 'Cabin', 'Deck', 'U')),
    ('feature_fixer', FeatureFixer(['Name', 'Ticket', 'Cabin', 'PassengerId'])),
    ('auto_encoder', AutoEncoder()),
    ('minmax_scaler', MinMaxScaler()),
])

X_train = full_pipeline.fit_transform(train_df)
X_test = full_pipeline.transform(test_df)

In [546]:
# forest_clf = RandomForestClassifier(random_state=228)
sgd_clf = SGDClassifier(random_state=228)  # Todo configure hyper params
cross_val_score(sgd_clf, X_train, y_train.ravel(), cv=3, n_jobs=-1, scoring='accuracy')

array([0.77104377, 0.81818182, 0.80808081])

In [547]:
sgd_clf.fit(X_train, y_train.ravel())
prediction = sgd_clf.predict(X_test)
submission = pd.DataFrame({'Survived': prediction}, index=test_df['PassengerId'])
print(submission)
submission.to_csv('data/submission.csv')

             Survived
PassengerId          
892                 0
893                 1
894                 0
895                 0
896                 1
...               ...
1305                0
1306                1
1307                0
1308                0
1309                1

[418 rows x 1 columns]
