In [13]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.base import BaseEstimator, TransformerMixin
import subprocess
%matplotlib inline

sns.set(rc={'figure.figsize':(11.7,8.27)})

def load_train_data(split=True):
    target = "Survived"
    data = pd.read_csv("./train.csv", index_col="PassengerId")
    print("load_train_data: done")
    
    if split:
        return split_features_target(data, target)
    else:
        return data
    
def split_features_target(df, target="Survived"):
    features = [column for column in df.columns if not column == target]
    return df[features], df[target]

In [37]:
# FunctionTransformer with an ndarray
# returns an ndarray

def add_one(X):
    breakpoint()
    return X + 1;

ages = np.array([[10, 17], [29, 34]])

add_one_transformer = FunctionTransformer(add_one)

add_one_transformer.transform(ages)

> [0;32m<ipython-input-37-966ce4f64309>[0m(5)[0;36madd_one[0;34m()[0m
[0;32m      3 [0;31m[0;32mdef[0m [0madd_one[0m[0;34m([0m[0mX[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      4 [0;31m    [0mbreakpoint[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m----> 5 [0;31m    [0;32mreturn[0m [0mX[0m [0;34m+[0m [0;36m1[0m[0;34m;[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      6 [0;31m[0;34m[0m[0m
[0m[0;32m      7 [0;31m[0mages[0m [0;34m=[0m [0mnp[0m[0;34m.[0m[0marray[0m[0;34m([0m[0;34m[[0m[0;34m[[0m[0;36m10[0m[0;34m,[0m [0;36m17[0m[0;34m][0m[0;34m,[0m [0;34m[[0m[0;36m29[0m[0;34m,[0m [0;36m34[0m[0;34m][0m[0;34m][0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> c


array([[11, 18],
       [30, 35]])

In [39]:
# FunctionTransformer with a Pandas DataFrame
# returns a DataFrame

def add_one(X):
    breakpoint()
    return X + 1;

add_one_transformer = FunctionTransformer(add_one)
X_train, y_train = load_train_data()

add_one_transformer.transform(X_train.select_dtypes(include=np.number))

load_train_data: done
> [0;32m<ipython-input-39-2dd3e1fe6df6>[0m(5)[0;36madd_one[0;34m()[0m
[0;32m      3 [0;31m[0;32mdef[0m [0madd_one[0m[0;34m([0m[0mX[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      4 [0;31m    [0mbreakpoint[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m----> 5 [0;31m    [0;32mreturn[0m [0mX[0m [0;34m+[0m [0;36m1[0m[0;34m;[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      6 [0;31m[0;34m[0m[0m
[0m[0;32m      7 [0;31m[0madd_one_transformer[0m [0;34m=[0m [0mFunctionTransformer[0m[0;34m([0m[0madd_one[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> c


Unnamed: 0_level_0,Pclass,Age,SibSp,Parch,Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,4,23.0,2,1,8.2500
2,2,39.0,2,1,72.2833
3,4,27.0,1,1,8.9250
4,2,36.0,2,1,54.1000
5,4,36.0,1,1,9.0500
...,...,...,...,...,...
887,3,28.0,1,1,14.0000
888,2,20.0,1,1,31.0000
889,4,,2,3,24.4500
890,2,27.0,1,1,31.0000


In [45]:
# FunctionTransformer inside a ColumnTransformer
# passing in an ndarray
# returns an ndarray
# ColumnTransformer might change the order of columns when it concates them

def add_one(X):
    breakpoint()
    return X + 1;

ages = np.array([
    [10, 17],
    [29, 34]
])

add_one_transformer = FunctionTransformer(add_one)

column_transformer = ColumnTransformer([
        ("add_one", add_one_transformer, [1])
    ],
    remainder="passthrough"
)

column_transformer.fit_transform(ages)

> [0;32m<ipython-input-45-dee6e091923e>[0m(8)[0;36madd_one[0;34m()[0m
[0;32m      6 [0;31m[0;32mdef[0m [0madd_one[0m[0;34m([0m[0mX[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      7 [0;31m    [0mbreakpoint[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m----> 8 [0;31m    [0;32mreturn[0m [0mX[0m [0;34m+[0m [0;36m1[0m[0;34m;[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      9 [0;31m[0;34m[0m[0m
[0m[0;32m     10 [0;31mages = np.array([
[0m
ipdb> c


array([[18, 10],
       [35, 29]])

In [64]:
# FunctionTransformer gets a pd Series
# returns a pd Series

def add_one(X):
    breakpoint()
    return X + 1;

add_one_transformer = FunctionTransformer(add_one)
X_train, _ = load_train_data()

add_one_transformer.transform(X_train["Pclass"])

load_train_data: done
> [0;32m<ipython-input-64-51e2f698cd9d>[0m(6)[0;36madd_one[0;34m()[0m
[0;32m      4 [0;31m[0;32mdef[0m [0madd_one[0m[0;34m([0m[0mX[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      5 [0;31m    [0mbreakpoint[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m----> 6 [0;31m    [0;32mreturn[0m [0mX[0m [0;34m+[0m [0;36m1[0m[0;34m;[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      7 [0;31m[0;34m[0m[0m
[0m[0;32m      8 [0;31m[0madd_one_transformer[0m [0;34m=[0m [0mFunctionTransformer[0m[0;34m([0m[0madd_one[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> c


PassengerId
1      4
2      2
3      4
4      2
5      4
      ..
887    3
888    2
889    4
890    2
891    4
Name: Pclass, Length: 891, dtype: int64

In [63]:
# FunctionTransformer inside a ColumnTransformer
# specifying one column using a list of strings
# the FunctionTransformer gets passed a pandas DataFrame with one column (not a pd Series)
# the ColumnTransformer returns an ndarray
# ColumnTransformer might change the order of columns when it concates them

def add_one(X):
    breakpoint()
    return X + 1;

add_one_transformer = FunctionTransformer(add_one)
column_transformer = ColumnTransformer([
        ("add_one", add_one_transformer, ["Pclass"])
    ],
    remainder="drop"
)

X_train, y_train = load_train_data()

result = column_transformer.fit_transform(X_train)
print(type(result))
print(result.shape)

load_train_data: done
> [0;32m<ipython-input-63-c7a9558e907d>[0m(9)[0;36madd_one[0;34m()[0m
[0;32m      7 [0;31m[0;32mdef[0m [0madd_one[0m[0;34m([0m[0mX[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      8 [0;31m    [0mbreakpoint[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m----> 9 [0;31m    [0;32mreturn[0m [0mX[0m [0;34m+[0m [0;36m1[0m[0;34m;[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     10 [0;31m[0;34m[0m[0m
[0m[0;32m     11 [0;31m[0madd_one_transformer[0m [0;34m=[0m [0mFunctionTransformer[0m[0;34m([0m[0madd_one[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> c
<class 'numpy.ndarray'>
(891, 1)


In [59]:
# FunctionTransformer inside a ColumnTransformer
# specifying two columns using a list of strings
# the FunctionTransformer gets passed a pd DataFrame with those two columns
# the ColumnTransformer returns an ndarray
# ColumnTransformer might change the order of columns when it concates them

def add_one(X):
    breakpoint()
    return X + 1;

add_one_transformer = FunctionTransformer(add_one)
column_transformer = ColumnTransformer([
        ("add_one", add_one_transformer, ["Pclass", "Parch"])
    ],
    remainder="drop"
)

X_train, y_train = load_train_data()

result = column_transformer.fit_transform(X_train)
result

load_train_data: done
> [0;32m<ipython-input-59-a88b62376053>[0m(9)[0;36madd_one[0;34m()[0m
[0;32m      7 [0;31m[0;32mdef[0m [0madd_one[0m[0;34m([0m[0mX[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      8 [0;31m    [0mbreakpoint[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m----> 9 [0;31m    [0;32mreturn[0m [0mX[0m [0;34m+[0m [0;36m1[0m[0;34m;[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     10 [0;31m[0;34m[0m[0m
[0m[0;32m     11 [0;31m[0madd_one_transformer[0m [0;34m=[0m [0mFunctionTransformer[0m[0;34m([0m[0madd_one[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> c


array([[4, 1],
       [2, 1],
       [4, 1],
       ...,
       [4, 3],
       [2, 1],
       [4, 1]])

In [51]:
# SimpleImputer with an ndarray
# returns an ndarray

simple_imputer = SimpleImputer(strategy="mean")

ages = np.array([[10, 17], [29, 34], [np.nan, np.nan]])

result = simple_imputer.fit_transform(ages)
print(type(result))
result

<class 'numpy.ndarray'>


array([[10. , 17. ],
       [29. , 34. ],
       [19.5, 25.5]])

In [67]:
# SimpleImputer with a pd DataFrame
# returns ndarray

simple_imputer = SimpleImputer(strategy="mean")

X_train, _ = load_train_data()

result = simple_imputer.fit_transform(X_train.select_dtypes(include=np.number))
print(type(result))
result

load_train_data: done
<class 'numpy.ndarray'>


array([[ 3.        , 22.        ,  1.        ,  0.        ,  7.25      ],
       [ 1.        , 38.        ,  1.        ,  0.        , 71.2833    ],
       [ 3.        , 26.        ,  0.        ,  0.        ,  7.925     ],
       ...,
       [ 3.        , 29.69911765,  1.        ,  2.        , 23.45      ],
       [ 1.        , 26.        ,  0.        ,  0.        , 30.        ],
       [ 3.        , 32.        ,  0.        ,  0.        ,  7.75      ]])

In [58]:
# OneHotEncoder with a pd DataFrame
# returns a sparse matrix

encoder = OneHotEncoder()

X_train, _ = load_train_data()

# Drop missing values so the encoder can work
X_train = X_train.dropna()

result = encoder.fit_transform(X_train.select_dtypes(include=object))
print(type(result))
result

load_train_data: done
<class 'scipy.sparse.csr.csr_matrix'>


<183x448 sparse matrix of type '<class 'numpy.float64'>'
	with 915 stored elements in Compressed Sparse Row format>