In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.base import BaseEstimator, TransformerMixin
import subprocess
%matplotlib inline

sns.set(rc={'figure.figsize':(11.7,8.27)})

def load_train_data(split=True):
    target = "Survived"
    data = pd.read_csv("./train.csv", index_col="PassengerId")
    print("load_train_data: done")
    
    if split:
        return split_features_target(data, target)
    else:
        return data
    
def split_features_target(df, target="Survived"):
    features = [column for column in df.columns if not column == target]
    return df[features], df[target]

class DFSimpleImputer(SimpleImputer):
    def transform(self, X, y=None):
        return pd.DataFrame(super().transform(X), columns=X.columns)

class DFOneHotEncoder(OneHotEncoder):
    def transform(self, X, y=None):
        column_names = X.columns
        X_transformed = super().transform(X)
        return pd.DataFrame.sparse.from_spmatrix(X_transformed, columns=self.get_feature_names(column_names))

In [None]:
# Test DFSimpleImputer with a constant
X_train, _ = load_train_data()

encoder = DFSimpleImputer(strategy="constant", fill_value="Unknown")
encoder.fit_transform(X_train["Cabin"].to_frame())

In [None]:
# Sex pipeline

sex_pipeline = Pipeline([
    ('imputer', DFSimpleImputer(strategy="constant", fill_value="Unknown")),
    ('one_hot_encoder', DFOneHotEncoder(handle_unknown="ignore"))
])

X_train, _ = load_train_data()
sex_df = X_train["Sex"].to_frame()
sex_pipeline.fit_transform(sex_df)

In [28]:
# Embarked pipeline
# same as sex_pipeline

embarked_pipeline = Pipeline([
    ('imputer', DFSimpleImputer(strategy="constant", fill_value="Unknown")),
    ('one_hot_encoder', DFOneHotEncoder(handle_unknown="ignore"))
])

X_train, _ = load_train_data()
df = X_train["Embarked"].to_frame()
embarked_pipeline.fit_transform(df)

load_train_data: done


Unnamed: 0,Embarked_C,Embarked_Q,Embarked_S,Embarked_Unknown
0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0
...,...,...,...,...
886,0.0,0.0,1.0,0.0
887,0.0,0.0,1.0,0.0
888,0.0,0.0,1.0,0.0
889,1.0,0.0,0.0,0.0


In [43]:
# Attribute dropper

def attributes_dropper(df):
    result_df = df.drop("Name", axis="columns")
    result_df = result_df.drop("Ticket", axis="columns")
    
    return result_df

attributes_dropper_transformer = FunctionTransformer(attributes_dropper)

In [49]:
# Fare pipeline

fare_pipeline = Pipeline([
    ('impute_missing', DFSimpleImputer(strategy="mean")),
    ("log_transform", FunctionTransformer(np.log1p)),
    ('standard_scaler', StandardScaler())
])

X_train, _ = load_train_data()
fare_df = X_train["Fare"].to_frame()
fare_pipeline.fit_transform(fare_df)

load_train_data: done


array([[-0.87974057],
       [ 1.36121993],
       [-0.79853997],
       [ 1.06203806],
       [-0.78417924],
       [-0.7386163 ],
       [ 1.03814556],
       [ 0.13649914],
       [-0.48145578],
       [ 0.48943791],
       [-0.091565  ],
       [ 0.36526094],
       [-0.78417924],
       [ 0.52869893],
       [-0.80676338],
       [-0.13322852],
       [ 0.45751954],
       [-0.33369847],
       [-0.01838584],
       [-0.88287417],
       [ 0.34443948],
       [-0.33369847],
       [-0.78655506],
       [ 0.65571883],
       [ 0.13649914],
       [ 0.53229169],
       [-0.88287417],
       [ 2.69870078],
       [-0.80385215],
       [-0.80192362],
       [ 0.40823345],
       [ 2.09779492],
       [-0.81898658],
       [-0.53680577],
       [ 1.50608578],
       [ 1.04082775],
       [-0.88234706],
       [-0.78417924],
       [-0.01838584],
       [-0.47227212],
       [-0.63319725],
       [ 0.13298517],
       [-0.80192362],
       [ 0.81478225],
       [-0.80385215],
       [-0

In [59]:
# Cabin pipeline

def generate_cabin_derived_features(df):
    has_cabin = df["Cabin"].apply(lambda x: 0 if pd.isnull(x) else 1).rename("HasCabin")
    cabin_letter = df["Cabin"].apply(lambda string: string[0] if pd.notnull(string) else "U")
    cabin_letter = cabin_letter.rename("CabinLetter")

    return pd.concat([df, has_cabin, cabin_letter], axis="columns")

def drop_cabin(df):
    result = df.drop("Cabin", axis="columns")
    return result

cabin_pipeline = Pipeline([
    ("generate_derived_features", FunctionTransformer(generate_cabin_derived_features)),
    ('impute', DFSimpleImputer(strategy="constant", fill_value="Unknown")),
    ("drop", FunctionTransformer(drop_cabin)),
    ('one_hot_encoder', DFOneHotEncoder(handle_unknown="ignore"))
])

In [None]:
# Age pipeline
    
def generate_age_derived_features(X):
    has_age = X["Age"].apply(lambda x: 0 if pd.isnull(x) else 1).rename("HasAge")
    is_young_child = X["Age"].apply(lambda age: 1 if age < 10 else 0).rename("IsYoungChild")
    is_infant = X["Age"].apply(lambda age: 1 if age < 1 else 0).rename("IsInfant")
    return pd.concat([X, has_age, is_young_child, is_infant], axis="columns")

def discretize_age(X):
    bins = [0.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, np.inf]
    labels = ["0+", "10+", "20+", "30+", "40+", "50+", "60+", "70+"]

    age_group = pd.cut(X["Age"], bins=bins, labels=labels, include_lowest=True)
    age_group = age_group.rename("AgeGroup")
    
    result = pd.concat([X, age_group], axis="columns")    
    return result

def drop_age(X):
    X = X.drop("Age", axis="columns")
    return X

X_train, _ = load_train_data()
age_pipeline = Pipeline([
    ("age_transformer", FunctionTransformer(generate_age_derived_features)),
    ('impute', DFSimpleImputer(strategy="mean")),
    ("discretize", FunctionTransformer(discretize_age)),
    ("drop", FunctionTransformer(drop_age)),
    ('one_hot_encoder', DFOneHotEncoder(handle_unknown="ignore"))
])
col_transformer = ColumnTransformer([
        ("age", age_pipeline, ["Age"])
    ],
    remainder="drop"
)

# col_transformer returns an ndarray
result = age_pipeline.fit_transform(X_train["Age"].to_frame())
# result = col_transformer.fit_transform(X_train)
result

In [60]:
# Test full pipeline
pipe = ColumnTransformer([
        ("sex", sex_pipeline, ["Sex"]),
        ("embarked", embarked_pipeline, ["Embarked"]),
        ("age", age_pipeline, ["Age"]),
        ("drop_features", attributes_dropper_transformer, ["Name", "Ticket"]),
        ("fare", fare_pipeline, ["Fare"]),
        ("cabin", cabin_pipeline, ["Cabin"]),
    ],
    remainder="passthrough"
)

X_train, _ = load_train_data()
result = pipe.fit_transform(X_train)
result[0, :]

load_train_data: done


array([ 0.        ,  1.        ,  0.        ,  0.        ,  1.        ,
        0.        ,  0.        ,  1.        ,  1.        ,  0.        ,
        1.        ,  0.        ,  0.        ,  0.        ,  1.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
       -0.87974057,  1.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  1.        ,  3.        ,  1.        ,  0.        ])