In [None]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

In [None]:
# --- Data retrieval ---------------------------------------------------------
# Folder prefix used for the stored datasets (note the trailing slash).
DATASETS_DIR = 'datasets/'
# Remote CSV that data_retrieval() downloads.
URL = 'https://www.openml.org/data/get_csv/16826755/phpMYEkMl'
# Columns removed by data_retrieval() before the data is written to disk.
DROP_COLS = ['boat', 'body', 'home.dest', 'ticket', 'name']
# File name (inside DATASETS_DIR) of the cleaned download.
RETRIEVED_DATA = 'raw-data.csv'

# --- Train / test split -----------------------------------------------------
SEED_SPLIT = 404
TRAIN_DATA_FILE = '{}train.csv'.format(DATASETS_DIR)
TEST_DATA_FILE = '{}test.csv'.format(DATASETS_DIR)

# --- Columns ----------------------------------------------------------------
TARGET = 'survived'
FEATURES = [
    'pclass', 'sex', 'age', 'sibsp', 'parch',
    'fare', 'cabin', 'embarked', 'title',
]
NUMERICAL_VARS = ['pclass', 'age', 'sibsp', 'parch', 'fare']
CATEGORICAL_VARS = ['sex', 'cabin', 'embarked', 'title']

# --- Missing-value bookkeeping ----------------------------------------------
NUMERICAL_VARS_WITH_NA = ['age', 'fare']
CATEGORICAL_VARS_WITH_NA = ['cabin', 'embarked']
# Complements of the two lists above within their respective variable groups.
NUMERICAL_NA_NOT_ALLOWED = [
    col for col in NUMERICAL_VARS if col not in NUMERICAL_VARS_WITH_NA
]
CATEGORICAL_NA_NOT_ALLOWED = [
    col for col in CATEGORICAL_VARS if col not in CATEGORICAL_VARS_WITH_NA
]

# --- Model ------------------------------------------------------------------
SEED_MODEL = 404

In [None]:
def data_retrieval(url):
    """Download the raw dataset, apply light cleaning and store it as CSV.

    Steps: read the CSV at ``url``; turn ``'?'`` placeholders into NaN; cast
    ``age`` and ``fare`` to float; keep only the first cabin listed per row;
    derive a ``title`` column from ``name``; drop ``DROP_COLS``; write the
    result to ``DATASETS_DIR + RETRIEVED_DATA``.

    Parameters
    ----------
    url : str
        Location passed straight to ``pandas.read_csv``.

    Returns
    -------
    None
        The function only prints where the file was stored.
    """
    # Local import: only needed to make sure the output folder exists.
    import os

    # Loading data from specific url
    data = pd.read_csv(url)

    # Uncovering missing data: '?' placeholders become NaN so the numeric
    # casts below succeed
    data = data.replace('?', np.nan)
    data['age'] = data['age'].astype('float')
    data['fare'] = data['fare'].astype('float')

    # helper function 1
    def get_first_cabin(row):
        """Return the first whitespace-separated token of ``row``, else NaN."""
        try:
            return row.split()[0]
        except (AttributeError, IndexError):
            # AttributeError: non-string value (e.g. NaN); IndexError: blank string
            return np.nan

    # helper function 2
    def get_title(passenger):
        """Map a passenger name onto 'Mrs', 'Mr', 'Miss', 'Master' or 'Other'."""
        # 'Mrs' must be tested before 'Mr' because 'Mr' is a prefix of 'Mrs'.
        for title in ('Mrs', 'Mr', 'Miss', 'Master'):
            if re.search(title, passenger):
                return title
        return 'Other'

    # Keep only one cabin | Extract the title from 'name'
    data['cabin'] = data['cabin'].apply(get_first_cabin)
    data['title'] = data['name'].apply(get_title)

    # Dropping irrelevant columns (keyword form: pandas 2.x no longer accepts
    # the axis as a positional argument)
    data = data.drop(columns=DROP_COLS)

    output_path = DATASETS_DIR + RETRIEVED_DATA
    output_dir = os.path.dirname(output_path)
    if output_dir:
        # to_csv does not create missing folders
        os.makedirs(output_dir, exist_ok=True)
    data.to_csv(output_path, index=False)

    print('Data stored in {}'.format(output_path))

In [None]:
# Download the data from URL, clean it and write it to DATASETS_DIR + RETRIEVED_DATA
data_retrieval(URL)

Data stored in datasets/raw-data.csv


In [None]:
# Reload the cleaned data from the CSV written by data_retrieval()
df = pd.read_csv(DATASETS_DIR + RETRIEVED_DATA)

In [None]:
# Hold out 20% of the rows for testing. The seed comes from the configuration
# cell (SEED_SPLIT) instead of a repeated literal, so it is tuned in one place.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(TARGET, axis=1),
    df[TARGET],
    test_size=0.2,
    random_state=SEED_SPLIT,
)

# Persist the splits. NOTE(review): y_test.csv is written to the working
# directory rather than DATASETS_DIR, and y_train is not stored - confirm
# this is intended before relocating the file.
X_train.to_csv(TRAIN_DATA_FILE, index=False)
X_test.to_csv(TEST_DATA_FILE, index=False)
y_test.to_csv('y_test.csv', index=False)

___
## Creating convenient classes

### Transformations without persisting information (stateless: nothing is learned in `fit`)

**Before**

```python
def missing_indicator(data, col_name):
    data[col_name+'_nan'] = data[col_name].isnull().astype(int)
    return None

for var in NUMERICAL_VARS:
    missing_indicator(X_train, var)
    missing_indicator(X_test, var)
```

**Now**

In [None]:
class MissingIndicator(BaseEstimator, TransformerMixin):
    """Append a 0/1 ``<column>_nan`` flag for every selected column.

    Parameters
    ----------
    variables : list, single column label or None, default=None
        Columns to flag. A single label is wrapped into a one-element list.
        ``None`` selects no columns, so the transformer leaves the data
        unchanged.
    """

    def __init__(self, variables=None):
        if variables is None:
            # The default used to become [None], which crashed in transform().
            self.variables = []
        elif isinstance(variables, list):
            self.variables = variables
        else:
            self.variables = [variables]

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` for API compatibility."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with one ``<column>_nan`` indicator per variable."""
        X = X.copy()
        for var in self.variables:
            # format() keeps this working for non-string column labels too
            X['{}_nan'.format(var)] = X[var].isnull().astype(int)

        return X


# Flag missing values in the numerical columns of both splits
create_missing_flag = MissingIndicator(variables=NUMERICAL_VARS)
X_train, X_test = (
    create_missing_flag.transform(split) for split in (X_train, X_test)
)

**Before**

```python
def extract_letter_from_cabin(x):
    if type(x)==str:    
        return ''.join(re.findall("[a-zA-Z]+", x))  
    else:
        return x

X_train['cabin'] = X_train['cabin'].apply(extract_letter_from_cabin)    
X_test['cabin'] = X_test['cabin'].apply(extract_letter_from_cabin)    

X_train[CATEGORICAL_VARS_WITH_NA] = X_train[CATEGORICAL_VARS_WITH_NA].fillna('missing')
X_test[CATEGORICAL_VARS_WITH_NA]  = X_test[CATEGORICAL_VARS_WITH_NA].fillna('missing')
```

**Now**

In [None]:
class ExtractLetters(BaseEstimator, TransformerMixin):
    """Reduce the string values of one column to their ASCII letters.

    Parameters
    ----------
    variable : column label, default='cabin'
        Column to process. The default keeps the original hard-coded target.
    """

    def __init__(self, variable='cabin'):
        self.variable = variable

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` for API compatibility."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with digits and symbols stripped from the column."""
        X = X.copy()
        X[self.variable] = X[self.variable].apply(self._keep_letters)
        return X

    @staticmethod
    def _keep_letters(value):
        """Join all runs of ASCII letters in ``value``; pass non-strings (e.g. NaN) through."""
        if isinstance(value, str):
            return ''.join(re.findall("[a-zA-Z]+", value))
        return value


class CategoricalImputer(BaseEstimator, TransformerMixin):
    """Replace missing values in the selected columns with a constant label.

    Parameters
    ----------
    variables : list, single column label or None, default=None
        Columns to impute. A single label is wrapped into a one-element list.
        ``None`` selects no columns, so the transformer leaves the data
        unchanged.
    fill_value : str, default='Missing'
        Label written in place of missing values.
    """

    def __init__(self, variables=None, fill_value='Missing'):
        if variables is None:
            # The default used to become [None], which crashed in transform().
            self.variables = []
        elif isinstance(variables, list):
            self.variables = variables
        else:
            self.variables = [variables]
        self.fill_value = fill_value

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` for API compatibility."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing values in the columns filled."""
        X = X.copy()
        for var in self.variables:
            X[var] = X[var].fillna(self.fill_value)
        return X


# Build both stateless steps, then apply them in sequence to each split:
# first strip the cabin values down to letters, then label the missing categories.
extract_letters_cabin = ExtractLetters()
categ_imputer = CategoricalImputer(variables=CATEGORICAL_VARS_WITH_NA)

X_train = categ_imputer.transform(extract_letters_cabin.transform(X_train))
X_test = categ_imputer.transform(extract_letters_cabin.transform(X_test))

### Transformations with persisting information (stateful: parameters are learned from `X_train` in `fit`)

In [None]:
# Learn the medians on the training split only, then reuse them for both splits
imp_median = SimpleImputer(strategy='median').fit(X_train[NUMERICAL_VARS_WITH_NA])

X_train[NUMERICAL_VARS_WITH_NA] = imp_median.transform(X_train[NUMERICAL_VARS_WITH_NA])
X_test[NUMERICAL_VARS_WITH_NA] = imp_median.transform(X_test[NUMERICAL_VARS_WITH_NA])

**Before**

```python
def find_rare_labels(data, col, perc):
    data = data.copy()
    tmp = data.groupby(col)[col].count() / data.shape[0]
    return tmp[tmp < perc].index

rare_labels_ = {}
for col in CATEGORICAL_VARS:
    rare_labels_[col] = find_rare_labels(X_train, col, 0.05)
    
for col in CATEGORICAL_VARS:
    X_train[col] = np.where(X_train[col].isin(rare_labels_[col]), 'Rare', X_train[col])
    X_test[col]  = np.where(X_test[col].isin(rare_labels_[col]), 'Rare', X_test[col])
```

**Now**

In [None]:
class RareLabelCategoricalEncoder(BaseEstimator, TransformerMixin):
    """Group infrequent categories under the single label ``'rare'``.

    Parameters
    ----------
    tol : float, default=0.05
        A category is rare when its share of the rows seen in ``fit`` is
        strictly below this value.
    variables : list, single column label or None, default=None
        Columns to encode. A single label is wrapped into a one-element list.
        ``None`` selects no columns, so the transformer leaves the data
        unchanged.

    Attributes
    ----------
    rare_labels_dict : dict
        Set by ``fit``; maps each variable to the list of its rare categories.
    """

    def __init__(self, tol=0.05, variables=None):
        self.tol = tol
        if variables is None:
            # The default used to become [None], which crashed in fit().
            self.variables = []
        elif isinstance(variables, list):
            self.variables = variables
        else:
            self.variables = [variables]

    def fit(self, X, y=None):
        """Record, per variable, the categories whose relative frequency is below ``tol``."""
        self.rare_labels_dict = {}
        # Built-in float replaces the np.float alias, which NumPy 1.24 removed.
        # The denominator is the total row count, so rows with missing values
        # still count towards it.
        n_rows = float(X.shape[0])
        for var in self.variables:
            frequencies = X[var].value_counts() / n_rows
            self.rare_labels_dict[var] = list(frequencies[frequencies < self.tol].index)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with every recorded rare category replaced by ``'rare'``."""
        X = X.copy()
        for var in self.variables:
            X[var] = np.where(X[var].isin(self.rare_labels_dict[var]), 'rare', X[var])
        return X


# Rare categories are identified on the training split and applied to both splits
rare_labels = RareLabelCategoricalEncoder(tol=0.05, variables=CATEGORICAL_VARS).fit(X_train)
X_train, X_test = (rare_labels.transform(split) for split in (X_train, X_test))

**Before**

```python
X_train = pd.concat([X_train, pd.get_dummies(X_train[CATEGORICAL_VARS], drop_first=True)], 1)
X_test  = pd.concat([X_test, pd.get_dummies(X_test[CATEGORICAL_VARS], drop_first=True)], 1)

X_train.drop(CATEGORICAL_VARS, 1, inplace=True)
X_test.drop(CATEGORICAL_VARS, 1, inplace=True)

# Validation step
set(X_train.columns).difference(set(X_test.columns))

for col in list(set(X_train.columns).difference(set(X_test.columns))):
    X_test[col] = 0
```

**Now**

In [None]:
class OneHotEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode the selected columns with a layout fixed at ``fit`` time.

    ``fit`` records the dummy columns produced by ``pd.get_dummies(...,
    drop_first=True)``. ``transform`` always emits exactly those columns, in
    that order: levels missing from the new data become all-zero columns and
    levels not seen in ``fit`` are discarded.

    NOTE(review): this class shares its name with
    ``sklearn.preprocessing.OneHotEncoder``; avoid importing both into the
    same namespace.

    Parameters
    ----------
    variables : list, single column label or None, default=None
        Columns to encode. A single label is wrapped into a one-element list.
        ``None`` selects no columns, so the transformer leaves the data
        unchanged.

    Attributes
    ----------
    dummies : pandas.Index
        Set by ``fit``; names of the dummy columns to produce.
    """

    def __init__(self, variables=None):
        if variables is None:
            # The default used to become [None], which crashed in fit().
            self.variables = []
        elif isinstance(variables, list):
            self.variables = variables
        else:
            self.variables = [variables]

    def _encode(self, X, drop_first):
        """Dummy-encode the selected columns (empty frame when none are selected)."""
        if not self.variables:
            # pd.get_dummies raises on a frame without columns
            return pd.DataFrame(index=X.index)
        return pd.get_dummies(X[self.variables], drop_first=drop_first)

    def fit(self, X, y=None):
        """Record the dummy column names obtained from ``X``."""
        self.dummies = self._encode(X, drop_first=True).columns
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns replaced by the recorded dummies."""
        X = X.copy()

        # Encode every level, then align to the columns recorded in fit().
        # Using drop_first here would drop whichever level sorts first in
        # *this* data, which is the wrong one whenever the first training
        # level is absent. reindex also adds missing dummies as 0.
        encoded = self._encode(X, drop_first=False).reindex(
            columns=self.dummies, fill_value=0
        )

        # Keyword arguments: pandas 2.x no longer accepts a positional axis.
        X = pd.concat([X.drop(columns=self.variables), encoded], axis=1)

        return X


# The dummy-column layout is learned on the training split and applied to both splits
dummy_vars = OneHotEncoder(variables=CATEGORICAL_VARS).fit(X_train)
X_train, X_test = (dummy_vars.transform(split) for split in (X_train, X_test))

**Aligning columns of X_train and X_test**

In [None]:
class OrderingFeatures(BaseEstimator, TransformerMixin):
    """Remember the column order seen in ``fit`` and re-apply it in ``transform``."""

    def __init__(self):
        # No hyper-parameters to store
        pass

    def fit(self, X, y=None):
        """Record ``X.columns`` as the reference ordering and return ``self``."""
        self.ordered_features = X.columns
        return self

    def transform(self, X):
        """Select the recorded columns from ``X``, in the recorded order."""
        reference_order = self.ordered_features
        return X[reference_order]


# Use the training column order as the reference for both splits
sort_feats = OrderingFeatures().fit(X_train)
X_train, X_test = (sort_feats.transform(split) for split in (X_train, X_test))

**Scaling**

In [None]:
# Fit the scaler on the training split only, then scale both splits with it
scaler = MinMaxScaler().fit(X_train)
X_train, X_test = (scaler.transform(split) for split in (X_train, X_test))

___

## 4. Training model

In [None]:
# Configure the classifier; the seed comes from the configuration cell
model = LogisticRegression(
    C=0.0005,
    class_weight='balanced',
    random_state=SEED_MODEL,
)
# Kept as the last bare expression so the fitted estimator is displayed
model.fit(X_train, y_train)

LogisticRegression(C=0.0005, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=404, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

- train roc-auc : 0.8470412710714978
- train accuracy: 0.7831900668576887

- test roc-auc : 0.8163583073823043
- test accuracy: 0.7748091603053435

In [None]:
# Report ROC-AUC (from the second predict_proba column) and accuracy
# (from the hard predictions) for each split
for s, t in {'train': (X_train, y_train), 'test': (X_test, y_test)}.items():
    x, y = t
    class_pred = model.predict(x)
    proba_pred = model.predict_proba(x)[:, 1]
    print('{} roc-auc : {}'.format(s, roc_auc_score(y, proba_pred)))
    print('{} accuracy: {}'.format(s, accuracy_score(y, class_pred)))
    print()

train roc-auc : 0.8470412710714978
train accuracy: 0.7831900668576887

test roc-auc : 0.8163583073823043
test accuracy: 0.7748091603053435



In [None]:
# Rebuild a labelled frame from the scaled test matrix and append the true
# label, the predicted label and the second predict_proba column
tmp = pd.DataFrame(X_test, columns=sort_feats.ordered_features.tolist()).assign(
    y_true=np.array(y_test),
    y_pred=model.predict(X_test),
    proba_pred=model.predict_proba(X_test)[:, 1],
)

tmp.head(10)

Unnamed: 0,pclass,age,sibsp,parch,fare,pclass_nan,age_nan,sibsp_nan,parch_nan,fare_nan,...,cabin_rare,embarked_Q,embarked_S,embarked_rare,title_Mr,title_Mrs,title_rare,y_true,y_pred,proba_pred
0,0.0,0.724426,0.0,0.222222,0.221098,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0,1,0.502177
1,0.5,0.386221,0.125,0.111111,0.051237,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0,0,0.481497
2,1.0,0.223382,0.0,0.0,0.015379,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,1,0.513358
3,0.5,0.423799,0.125,0.0,0.040989,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0,0,0.481422
4,0.5,0.48643,0.0,0.0,0.050749,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0,0,0.481452
5,1.0,0.298538,0.0,0.0,0.01394,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1,0,0.47703
6,0.5,0.160751,0.0,0.111111,0.038061,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1,1,0.514231
7,0.0,0.611691,0.125,0.0,0.111118,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1,1,0.501921
8,0.0,0.398747,0.0,0.0,0.148911,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,0.534687
9,0.0,0.26096,0.25,0.222222,0.512122,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,0.531581
