In [1]:
# We're going to use type hinting
from typing import List, Union, Dict

# Modelling. Warnings will be used to silence various model warnings for tidier output
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import DataConversionWarning
import warnings

# Data handling/display
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display
from sklearn.metrics import auc, roc_auc_score, roc_curve

# IBM's fairness toolbox:
from aif360.datasets import BinaryLabelDataset  # To handle the data
from aif360.metrics import BinaryLabelDatasetMetric, ClassificationMetric  # For calculating metrics
from aif360.explainers import MetricTextExplainer  # For explaining metrics
from aif360.algorithms.preprocessing import Reweighing  # Preprocessing technique

# Plot styling for the whole notebook: apply seaborn's default styling, then
# switch to its "talk" plotting context.
sns.set()
sns.set_context("talk")

In [6]:
# Load the train and test CSVs from ./input
train = pd.read_csv('./input/train.csv')
test = pd.read_csv('./input/test.csv')
# Set 'Survived' to a constant 0 on the test frame so that it has a 'Survived'
# column like the training frame (presumably test.csv ships without labels -
# TODO confirm). The value is a placeholder, not a prediction.
test.loc[:, 'Survived'] = 0

In [7]:
# Preprocessing will be done using a sklearn pipeline. We need these bits to make the transformers and connect them.
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion

# For the logistic regression model
from sklearn.preprocessing import StandardScaler

In [8]:
class SelectCols(TransformerMixin):
    """Select a fixed list of columns from a DataFrame.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    simply indexes the input with the configured column names.

    Parameters
    ----------
    cols : list of str
        Names of the columns to keep, in the order they should appear in the
        output.
    """
    def __init__(self, cols: List[str]) -> None:
        self.cols = cols

    def fit(self, x: pd.DataFrame, y: object = None) -> "SelectCols":
        """Nothing to learn; return ``self`` unchanged.

        ``y`` is accepted and ignored. scikit-learn forwards the target to
        ``fit`` whenever one is supplied (e.g. ``Pipeline.fit(X, y)`` or
        ``fit_transform(X, y)``), so a ``fit`` without a ``y`` parameter would
        raise ``TypeError`` in any supervised pipeline.
        """
        return self

    def transform(self, x: pd.DataFrame) -> pd.DataFrame:
        """Return only the selected columns of ``x``.

        Raises
        ------
        KeyError
            If any of the configured columns is absent from ``x`` (raised by
            the pandas column lookup).
        """
        return x[self.cols]

In [9]:
# Quick check: keep only 'Sex' and 'Survived' from five randomly sampled
# training rows. transform is called directly, without a prior fit.
sc = SelectCols(cols=['Sex', 'Survived'])
sc.transform(train.sample(5))

Unnamed: 0,Sex,Survived
67,male,0
882,female,0
235,female,0
28,female,1
24,female,0


In [10]:
class LabelEncoder(TransformerMixin):
    """Convert columns to integer codes using label encoding.

    ``fit`` learns, for every column, a mapping from each distinct non-missing
    value to a code ``0, 1, 2, ...`` in order of first appearance. ``transform``
    applies those mappings; values that are missing or were not seen during
    ``fit`` are encoded as ``-2``.

    Attributes
    ----------
    encoders_ : dict
        ``{column name: {value: code}}``, set by ``fit``.
    """
    def fit(self, x: pd.DataFrame, y: object = None) -> "LabelEncoder":
        """Learn a value -> code mapping for each column of ``x``.

        ``y`` is accepted and ignored so the transformer can be used in a
        supervised pipeline, where scikit-learn forwards the target to ``fit``.
        """
        encoders = {}
        for c in x:
            # Missing values are dropped *before* numbering. pd.factorize
            # leaves NaN out of its uniques but still emits a -1 code for it,
            # so pairing uniques with codes positionally would shift every
            # value that follows a NaN and hand a real category the code -1.
            uniques = x[c].dropna().unique()
            encoders[c] = {value: code for code, value in enumerate(uniques)}

        self.encoders_ = encoders

        return self

    def transform(self, x: pd.DataFrame) -> pd.DataFrame:
        """Encode every column of ``x`` with the mapping learned in ``fit``.

        Returns a new integer DataFrame with the same index and columns as
        ``x``; the input is not modified. Missing values and values that were
        not seen during ``fit`` become ``-2``.

        Raises
        ------
        KeyError
            If ``x`` contains a column for which no encoder was learned.
        """
        unknown_cols = [c for c in x if c not in self.encoders_]
        if unknown_cols:
            raise KeyError(
                f"No encoder learned for column(s): {unknown_cols}")

        # Series.map with a dict yields NaN for keys that are absent, which
        # covers both unseen values and missing values. Building a fresh frame
        # (instead of writing NaN/float codes back into the original columns)
        # avoids dtype-incompatible in-place assignment on integer columns.
        encoded = {c: x[c].map(self.encoders_[c]) for c in x}

        # Return without nans
        return pd.DataFrame(encoded, index=x.index).fillna(-2).astype(int)

In [11]:
# Quick check on five randomly sampled training rows. The encoder is fitted on
# the sample itself, so it only learns the values present in those rows.
le = LabelEncoder()
le.fit_transform(train[['Pclass', 'Sex']].sample(5))

Unnamed: 0,Pclass,Sex
861,0,0
291,1,1
153,2,0
619,0,0
697,2,1


In [12]:
# Show the per-column value -> code mappings learned by `le`.
le.encoders_

{'Pclass': {2: 0, 1: 1, 3: 2}, 'Sex': {'male': 0, 'female': 1}}

In [13]:
class NumericEncoder(TransformerMixin):
    """Remove invalid values from numerical columns, replace with median.

    A value is considered invalid when it is missing, cannot be parsed as a
    number, or is infinite. ``fit`` learns the median of the valid values of
    each column; ``transform`` substitutes that median for invalid values and
    adds a companion flag column marking where a substitution happened.

    Attributes
    ----------
    encoders_ : dict
        ``{column name: median of the valid values}``, set by ``fit``. The
        median is NaN for a column with no valid values at all.
    """
    @staticmethod
    def _to_finite_numeric(col: pd.Series) -> pd.Series:
        """Coerce ``col`` to numeric; unparseable or infinite values become NaN."""
        numeric = pd.to_numeric(col, errors='coerce')
        return numeric.replace([-np.inf, np.inf], np.nan)

    def fit(self, x: pd.DataFrame, y: object = None) -> "NumericEncoder":
        """Learn the median of the valid values for every column in ``x``.

        ``y`` is accepted and ignored so the transformer can be used in a
        supervised pipeline, where scikit-learn forwards the target to ``fit``.
        """
        # Use the same definition of "invalid" as transform: infinities are
        # excluded here too, otherwise the learned median could itself be inf.
        # Where all values are invalid the median will be a NaN.
        self.encoders_ = {
            c: self._to_finite_numeric(x[c]).median(skipna=True) for c in x}

        return self

    def transform(self, x: pd.DataFrame) -> pd.DataFrame:
        """Replace invalid values with the learned median and flag them.

        For each column ``c`` of ``x`` the output holds two columns, in this
        order: ``c`` (numeric, invalid values replaced by the learned median)
        and ``"{c}_invalid_flag"`` (int8, 1 where a replacement was made).
        The input is not modified.

        Raises
        ------
        KeyError
            If ``x`` contains a column that was not present during ``fit``.
        """
        columns = {}
        for c in x:
            # Work on the coerced values so the output column is genuinely
            # numeric (numeric strings are converted rather than left as-is).
            numeric = self._to_finite_numeric(x[c])
            # Invalid = NaN, -inf, inf, or anything that is not a number
            invalid_idx = numeric.isnull()

            # Replace the invalid values with learned median
            columns[c] = numeric.where(~invalid_idx, self.encoders_[c])
            # Mark these replacements in a new column called
            # "[column_name]_invalid_flag"
            columns[f"{c}_invalid_flag"] = invalid_idx.astype(np.int8)

        # Assemble all output columns in a single DataFrame; this also copes
        # with an input that has no columns.
        df = pd.DataFrame(columns, index=x.index)

        # Return with any remaining NaNs removed. These might exist if the
        # median is a NaN because there was no numeric data in the column at all.
        return df.fillna(0)


In [14]:
# Quick check on five randomly sampled training rows. The medians are learned
# from the sample itself, not from the full training set.
ne = NumericEncoder()
ne.fit_transform(train[['Age', 'Fare']].sample(5))

Unnamed: 0,Age,Age_invalid_flag,Fare,Fare_invalid_flag
500,17.0,0,8.6625,0
169,28.0,0,56.4958,0
864,24.0,0,13.0,0
559,36.0,0,17.4,0
370,25.0,0,55.4417,0


In [15]:
# Show the per-column medians learned by `ne`.
ne.encoders_

{'Age': 25.0, 'Fare': 17.4}

In [16]:
## Constructing the pipeline

# Label-encoding branch: pick the columns to be label encoded, then encode them.
pp_object_cols = Pipeline(steps=[
    ('select', SelectCols(cols=['Sex', 'Survived', 'Cabin', 'Ticket', 'SibSp',
                                'Embarked', 'Parch', 'Pclass', 'Name'])),
    ('process', LabelEncoder()),
])

# Numeric branch: pick the continuous columns, then clean and flag them.
pp_numeric_cols = Pipeline(steps=[
    ('select', SelectCols(cols=['Age', 'Fare'])),
    ('process', NumericEncoder()),
])

# The two objects below are not used in the rest of the notebook. They show
# how the branches would typically be joined and continued into a model
# (after dropping 'Survived' from the training data, of course).
pp_pipeline = FeatureUnion(transformer_list=[
    ('object_cols', pp_object_cols),
    ('numeric_cols', pp_numeric_cols),
])

model_pipeline = Pipeline(steps=[
    ('pp', pp_pipeline),
    ('mod', LogisticRegression()),
])
# model_pipeline could then be fitted like any other estimator and passed to a
# grid-search object.

In [17]:
# Hold out 30% of the labelled data for validation. The split is seeded so that
# Restart & Run All reproduces the same train/validation rows (and therefore
# the same learned encoders and downstream results).
train_, valid = train_test_split(train,
                                 test_size=0.3,
                                 random_state=42)

# .fit_transform on train: the encoders are learned from the training rows only
train_pp = pd.concat((pp_numeric_cols.fit_transform(train_), 
                      pp_object_cols.fit_transform(train_)),
                     axis=1)

# .transform on valid: reuse the encoders fitted on train, do not refit
valid_pp = pd.concat((pp_numeric_cols.transform(valid), 
                      pp_object_cols.transform(valid)),
                     axis=1)
# Seeded preview so the displayed rows are stable between runs
valid_pp.sample(5, random_state=42)

Unnamed: 0,Age,Age_invalid_flag,Fare,Fare_invalid_flag,Sex,Survived,Cabin,Ticket,SibSp,Embarked,Parch,Pclass,Name
119,2.0,0,31.275,0,1,0,-2,26,6,0,2,2,-2
73,26.0,0,14.4542,0,0,0,-2,-2,1,1,0,2,-2
117,29.0,0,21.0,0,0,0,-2,494,1,0,0,0,-2
608,22.0,0,41.5792,0,1,1,-2,162,1,1,2,0,-2
110,47.0,0,52.0,0,0,0,-2,-2,0,0,0,1,-2


In [18]:
# Encode the test set with the already-fitted pipelines (transform only, no
# refit), placing the numeric block before the label-encoded block.
test_pp = pd.concat(
    [pp_numeric_cols.transform(test), pp_object_cols.transform(test)],
    axis="columns",
)
test_pp.sample(5)

Unnamed: 0,Age,Age_invalid_flag,Fare,Fare_invalid_flag,Sex,Survived,Cabin,Ticket,SibSp,Embarked,Parch,Pclass,Name
352,18.0,0,73.5,0,0,0,-2,306,0,0,0,0,-2
32,33.0,0,20.575,0,1,0,-2,107,1,0,2,2,-2
87,18.0,0,8.05,0,1,0,-2,-2,0,0,0,2,-2
126,22.0,0,7.7958,0,0,0,-2,-2,0,0,0,2,-2
361,24.0,0,37.0042,0,1,0,-2,2,1,1,1,0,-2
