In [14]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.base import BaseEstimator, TransformerMixin
import subprocess
%matplotlib inline

sns.set(rc={'figure.figsize':(11.7,8.27)})

def load_train_data(split=True):
    target = "Survived"
    data = pd.read_csv("./train.csv", index_col="PassengerId")
    print("load_train_data: done")
    
    if split:
        return split_features_target(data, target)
    else:
        return data
    
def split_features_target(df, target="Survived"):
    features = [column for column in df.columns if not column == target]
    return df[features], df[target]

In [None]:
# Pipeline with a FunctionTransformer
# no need to call fit if you only have stateless FunctionTransformers
# returns whatever is passed in

def add_one(X):
    breakpoint()
    return X + 1

pipeline = Pipeline([
    ("add_one", FunctionTransformer(add_one))
])

X_train, _ = load_train_data()
X_train_num = X_train.select_dtypes(include=np.number)
pipeline.transform(X_train_num)

In [None]:
# Pipeline with two FunctionTransformers
# no need to call fit if you only have stateless FunctionTransformers
# returns whatever is passed in

def add_one(X):
    breakpoint()
    return X + 1

def add_ten(X):
    return X + 10

pipeline = Pipeline([
    ("add_one", FunctionTransformer(add_one)),
    ("add_ten", FunctionTransformer(add_ten))
])

X_train, _ = load_train_data()
X_train_num = X_train.select_dtypes(include=np.number)
pipeline.transform(X_train_num)

In [None]:
# Pipeline with multiple FunctionTransformers
# returns whatever is passed in

def add_one(X):
    breakpoint()
    return X + 1

def add_ten(X):
    return X + 10

def add_column(X):
    new_col = pd.Series(range(len(X)), index=X.index)
    return pd.concat([X, new_col], axis="columns")

pipeline = Pipeline([
    ("add_one", FunctionTransformer(add_one)),
    ("add_ten", FunctionTransformer(add_ten)),
    ("add_column", FunctionTransformer(add_column))
])

X_train, _ = load_train_data()
X_train_num = X_train.select_dtypes(include=np.number)
pipeline.transform(X_train_num)

In [29]:
# Test

def generate_age_derived_features(X):
    has_age = X["Age"].apply(lambda x: 0 if pd.isnull(x) else 1).rename("HasAge")
    is_young_child = X["Age"].apply(lambda age: 1 if age < 10 else 0).rename("IsYoungChild")
    is_infant = X["Age"].apply(lambda age: 1 if age < 1 else 0).rename("IsInfant")

    return pd.concat([X, has_age, is_young_child, is_infant], axis="columns")

age_transformer = FunctionTransformer(generate_age_derived_features)

X_train, _ = load_train_data()
age_series = X_train["Age"]
single_column_df = age_series.to_frame()

age_transformer.transform(single_column_df)

load_train_data: done
> [0;32m<ipython-input-29-4fbfb1d52d08>[0m(5)[0;36mgenerate_age_derived_features[0;34m()[0m
[0;32m      3 [0;31m[0;32mdef[0m [0mgenerate_age_derived_features[0m[0;34m([0m[0mX[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      4 [0;31m    [0mbreakpoint[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m----> 5 [0;31m    [0mhas_age[0m [0;34m=[0m [0mX[0m[0;34m[[0m[0;34m"Age"[0m[0;34m][0m[0;34m.[0m[0mapply[0m[0;34m([0m[0;32mlambda[0m [0mx[0m[0;34m:[0m [0;36m0[0m [0;32mif[0m [0mpd[0m[0;34m.[0m[0misnull[0m[0;34m([0m[0mx[0m[0;34m)[0m [0;32melse[0m [0;36m1[0m[0;34m)[0m[0;34m.[0m[0mrename[0m[0;34m([0m[0;34m"HasAge"[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      6 [0;31m    [0mis_young_child[0m [0;34m=[0m [0mX[0m[0;34m[[0m[0;34m"Age"[0m[0;34m][0m[0;34m.[0m[0mapply[0m[0;34m([0m[0;32mlambda[0m [0mage[0m[0;34m:[0m [0;36m1[0m [0;32mif[

Unnamed: 0_level_0,Age,HasAge,IsYoungChild,IsInfant
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,22.0,1,0,0
2,38.0,1,0,0
3,26.0,1,0,0
4,35.0,1,0,0
5,35.0,1,0,0
...,...,...,...,...
887,27.0,1,0,0
888,19.0,1,0,0
889,,0,0,0
890,26.0,1,0,0


In [35]:
# Test

def generate_age_derived_features(X):
#     breakpoint()
    has_age = X["Age"].apply(lambda x: 0 if pd.isnull(x) else 1).rename("HasAge")
    is_young_child = X["Age"].apply(lambda age: 1 if age < 10 else 0).rename("IsYoungChild")
    is_infant = X["Age"].apply(lambda age: 1 if age < 1 else 0).rename("IsInfant")

    return pd.concat([X, has_age, is_young_child, is_infant], axis="columns")


X_train, _ = load_train_data()
age_transformer = FunctionTransformer(generate_age_derived_features)
col_transformer = ColumnTransformer([
        ("age", age_transformer, ["Age"])
    ],
    remainder="drop"
)

# col_transformer returns an ndarray
result = col_transformer.fit_transform(X_train)

# we can transform it back into a pd DataFrame if we want to
pd.DataFrame(data=result, index=X_train.index, columns=["Age", "HasAge", "IsYoungChild", "IsInfant"])

load_train_data: done


Unnamed: 0_level_0,Age,HasAge,IsYoungChild,IsInfant
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,22.0,1.0,0.0,0.0
2,38.0,1.0,0.0,0.0
3,26.0,1.0,0.0,0.0
4,35.0,1.0,0.0,0.0
5,35.0,1.0,0.0,0.0
...,...,...,...,...
887,27.0,1.0,0.0,0.0
888,19.0,1.0,0.0,0.0
889,,0.0,0.0,0.0
890,26.0,1.0,0.0,0.0


In [36]:
# Test

def generate_age_derived_features(X):
#     breakpoint()
    has_age = X["Age"].apply(lambda x: 0 if pd.isnull(x) else 1).rename("HasAge")
    is_young_child = X["Age"].apply(lambda age: 1 if age < 10 else 0).rename("IsYoungChild")
    is_infant = X["Age"].apply(lambda age: 1 if age < 1 else 0).rename("IsInfant")

    return pd.concat([X, has_age, is_young_child, is_infant], axis="columns")


X_train, _ = load_train_data()
age_transformer = FunctionTransformer(generate_age_derived_features)
age_pipeline = Pipeline([
    ("age_transformer", age_transformer),
#   ('impute', SimpleImputer(strategy="mean")),
])
col_transformer = ColumnTransformer([
        ("age", age_pipeline, ["Age"])
    ],
    remainder="drop"
)

# col_transformer returns an ndarray
result = col_transformer.fit_transform(X_train)

# we can transform it back into a pd DataFrame if we want to
pd.DataFrame(data=result, index=X_train.index, columns=["Age", "HasAge", "IsYoungChild", "IsInfant"])

load_train_data: done


Unnamed: 0_level_0,Age,HasAge,IsYoungChild,IsInfant
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,22.0,1.0,0.0,0.0
2,38.0,1.0,0.0,0.0
3,26.0,1.0,0.0,0.0
4,35.0,1.0,0.0,0.0
5,35.0,1.0,0.0,0.0
...,...,...,...,...
887,27.0,1.0,0.0,0.0
888,19.0,1.0,0.0,0.0
889,,0.0,0.0,0.0
890,26.0,1.0,0.0,0.0


In [38]:
# Test

def generate_age_derived_features(X):
    breakpoint()
    has_age = X["Age"].apply(lambda x: 0 if pd.isnull(x) else 1).rename("HasAge")
    is_young_child = X["Age"].apply(lambda age: 1 if age < 10 else 0).rename("IsYoungChild")
    is_infant = X["Age"].apply(lambda age: 1 if age < 1 else 0).rename("IsInfant")

    return pd.concat([X, has_age, is_young_child, is_infant], axis="columns")


X_train, _ = load_train_data()
age_transformer = FunctionTransformer(generate_age_derived_features)
age_pipeline = Pipeline([
    ("age_transformer", age_transformer),
    ('impute', SimpleImputer(strategy="mean")),
])
col_transformer = ColumnTransformer([
        ("age", age_pipeline, ["Age"])
    ],
    remainder="drop"
)

# col_transformer returns an ndarray
result = col_transformer.fit_transform(X_train)

# we can transform it back into a pd DataFrame if we want to
pd.DataFrame(data=result, index=X_train.index, columns=["Age", "HasAge", "IsYoungChild", "IsInfant"])

load_train_data: done
> [0;32m<ipython-input-38-82e160f50af6>[0m(5)[0;36mgenerate_age_derived_features[0;34m()[0m
[0;32m      3 [0;31m[0;32mdef[0m [0mgenerate_age_derived_features[0m[0;34m([0m[0mX[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      4 [0;31m    [0mbreakpoint[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m----> 5 [0;31m    [0mhas_age[0m [0;34m=[0m [0mX[0m[0;34m[[0m[0;34m"Age"[0m[0;34m][0m[0;34m.[0m[0mapply[0m[0;34m([0m[0;32mlambda[0m [0mx[0m[0;34m:[0m [0;36m0[0m [0;32mif[0m [0mpd[0m[0;34m.[0m[0misnull[0m[0;34m([0m[0mx[0m[0;34m)[0m [0;32melse[0m [0;36m1[0m[0;34m)[0m[0;34m.[0m[0mrename[0m[0;34m([0m[0;34m"HasAge"[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      6 [0;31m    [0mis_young_child[0m [0;34m=[0m [0mX[0m[0;34m[[0m[0;34m"Age"[0m[0;34m][0m[0;34m.[0m[0mapply[0m[0;34m([0m[0;32mlambda[0m [0mage[0m[0;34m:[0m [0;36m1[0m [0;32mif[

Unnamed: 0_level_0,Age,HasAge,IsYoungChild,IsInfant
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,22.000000,1.0,0.0,0.0
2,38.000000,1.0,0.0,0.0
3,26.000000,1.0,0.0,0.0
4,35.000000,1.0,0.0,0.0
5,35.000000,1.0,0.0,0.0
...,...,...,...,...
887,27.000000,1.0,0.0,0.0
888,19.000000,1.0,0.0,0.0
889,29.699118,0.0,0.0,0.0
890,26.000000,1.0,0.0,0.0


In [72]:
# Test a SimpleImputer subclass that accepts and returns a DataFrame
class DFSimpleImputer(SimpleImputer):
    # just like SimpleImputer, but retuns a df
    # this approach creates problems with the add_indicator=True, since more columns are returned
    # so don't set add_indicator to True
    def transform(self, X, y=None):
        return pd.DataFrame(super().transform(X), columns=X.columns) 
    
ages = np.array([[17, 17], [29, 34], [np.nan, np.nan]])
ages_df = pd.DataFrame(data=ages, columns=["ColA", "ColB"])
    
pipe = Pipeline([
    ('impute', DFSimpleImputer(strategy="mean")),
])
pipe.fit_transform(ages_df)

Unnamed: 0,ColA,ColB
0,17.0,17.0
1,29.0,34.0
2,23.0,25.5


In [89]:
# Test a Pipeline where every step accepts and returns a DataFrame, not an ndarray
# Notice that if you plug this pipeline into a ColumnTransformer, you _will_ get an ndarray back

def generate_age_derived_features(X):
#     breakpoint()
    has_age = X["Age"].apply(lambda x: 0 if pd.isnull(x) else 1).rename("HasAge")
    is_young_child = X["Age"].apply(lambda age: 1 if age < 10 else 0).rename("IsYoungChild")
    is_infant = X["Age"].apply(lambda age: 1 if age < 1 else 0).rename("IsInfant")

    return pd.concat([X, has_age, is_young_child, is_infant], axis="columns")

def discretize_age(X):
    bins = [0.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, np.inf]
    labels = ["0+", "10+", "20+", "30+", "40+", "50+", "60+", "70+"]

    age_group = pd.cut(X["Age"], bins=bins, labels=labels, include_lowest=True)
    age_group = age_group.rename("AgeGroup")
    
    result = pd.concat([X, age_group], axis="columns")    
    return result

def drop_age(X):
    X = X.drop("Age", axis="columns")

    return X


X_train, _ = load_train_data()
age_transformer = FunctionTransformer(generate_age_derived_features)
age_discretizer = FunctionTransformer(discretize_age)
age_pipeline = Pipeline([
    ("age_transformer", age_transformer),
    ('impute', DFSimpleImputer(strategy="mean")),
    ("discretize", age_discretizer),
    ("drop", FunctionTransformer(drop_age)),
])
col_transformer = ColumnTransformer([
        ("age", age_pipeline, ["Age"])
    ],
    remainder="drop"
)

# col_transformer returns an ndarray
result = age_pipeline.fit_transform(X_train["Age"].to_frame())
result

load_train_data: done


Unnamed: 0,HasAge,IsYoungChild,IsInfant,AgeGroup
0,1.0,0.0,0.0,20+
1,1.0,0.0,0.0,30+
2,1.0,0.0,0.0,20+
3,1.0,0.0,0.0,30+
4,1.0,0.0,0.0,30+
...,...,...,...,...
886,1.0,0.0,0.0,20+
887,1.0,0.0,0.0,10+
888,0.0,0.0,0.0,20+
889,1.0,0.0,0.0,20+


In [92]:
result = age_pipeline.fit_transform(X_train["Age"].to_frame())

encoder = OneHotEncoder(handle_unknown="ignore")
encoder_result = encoder.fit_transform(result)

columns = ["HasAge", "IsYoungChild", "IsInfant", "AgeGroup"]
feature_names = encoder.get_feature_names(columns)
pd.DataFrame.sparse.from_spmatrix(encoder_result, columns=feature_names)

Unnamed: 0,HasAge_0.0,HasAge_1.0,IsYoungChild_0.0,IsYoungChild_1.0,IsInfant_0.0,IsInfant_1.0,AgeGroup_0+,AgeGroup_10+,AgeGroup_20+,AgeGroup_30+,AgeGroup_40+,AgeGroup_50+,AgeGroup_60+,AgeGroup_70+
0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
887,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
888,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
889,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [101]:
# A OneHotEncoder subclass that accepts and returns a DataFrame
class DFOneHotEncoder(OneHotEncoder):
    def transform(self, X, y=None):
        column_names = X.columns
        X_transformed = super().transform(X)
        return pd.DataFrame.sparse.from_spmatrix(X_transformed, columns=self.get_feature_names(column_names))
    
X_train, _ = load_train_data()
result = age_pipeline.fit_transform(X_train["Age"].to_frame())
encoder = DFOneHotEncoder(handle_unknown="ignore")
encoder_result = encoder.fit_transform(result)
encoder_result

load_train_data: done


Unnamed: 0,HasAge_0.0,HasAge_1.0,IsYoungChild_0.0,IsYoungChild_1.0,IsInfant_0.0,IsInfant_1.0,AgeGroup_0+,AgeGroup_10+,AgeGroup_20+,AgeGroup_30+,AgeGroup_40+,AgeGroup_50+,AgeGroup_60+,AgeGroup_70+
0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
887,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
888,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
889,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [107]:
# Test full Age pipeline

class DFSimpleImputer(SimpleImputer):
    def transform(self, X, y=None):
        return pd.DataFrame(super().transform(X), columns=X.columns)

class DFOneHotEncoder(OneHotEncoder):
    def transform(self, X, y=None):
        column_names = X.columns
        X_transformed = super().transform(X)
        return pd.DataFrame.sparse.from_spmatrix(X_transformed, columns=self.get_feature_names(column_names))
    
def generate_age_derived_features(X):
    has_age = X["Age"].apply(lambda x: 0 if pd.isnull(x) else 1).rename("HasAge")
    is_young_child = X["Age"].apply(lambda age: 1 if age < 10 else 0).rename("IsYoungChild")
    is_infant = X["Age"].apply(lambda age: 1 if age < 1 else 0).rename("IsInfant")
    return pd.concat([X, has_age, is_young_child, is_infant], axis="columns")

def discretize_age(X):
    bins = [0.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, np.inf]
    labels = ["0+", "10+", "20+", "30+", "40+", "50+", "60+", "70+"]

    age_group = pd.cut(X["Age"], bins=bins, labels=labels, include_lowest=True)
    age_group = age_group.rename("AgeGroup")
    
    result = pd.concat([X, age_group], axis="columns")    
    return result

def drop_age(X):
    X = X.drop("Age", axis="columns")
    return X


X_train, _ = load_train_data()
age_pipeline = Pipeline([
    ("age_transformer", FunctionTransformer(generate_age_derived_features)),
    ('impute', DFSimpleImputer(strategy="mean")),
    ("discretize", FunctionTransformer(discretize_age)),
    ("drop", FunctionTransformer(drop_age)),
    ('one_hot_encoder', DFOneHotEncoder(handle_unknown="ignore"))
])
col_transformer = ColumnTransformer([
        ("age", age_pipeline, ["Age"])
    ],
    remainder="drop"
)

# col_transformer returns an ndarray
result = age_pipeline.fit_transform(X_train["Age"].to_frame())
# result = col_transformer.fit_transform(X_train)
result

load_train_data: done


Unnamed: 0,HasAge_0.0,HasAge_1.0,IsYoungChild_0.0,IsYoungChild_1.0,IsInfant_0.0,IsInfant_1.0,AgeGroup_0+,AgeGroup_10+,AgeGroup_20+,AgeGroup_30+,AgeGroup_40+,AgeGroup_50+,AgeGroup_60+,AgeGroup_70+
0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
887,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
888,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
889,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
