# Custom Pipelines

In [1]:
import sklearn
import scipy
import pandas as pd
import numpy as np

In [4]:
sklearn.__version__, pd.__version__, np.__version__,  scipy.__version__,

('0.24.2', '1.3.1', '1.20.3', '1.6.2')

### Custom Dataframe Transormer

In [6]:
from sklearn.pipeline import Pipeline

class DataFrameFunctionTransformer():
    def __init__(self, func):
        self.func = func
        
    def transform(self, input_df, **transform_params):
        return self.func(input_df)
    
    def fit(self, X, y=None, **fit_params):
        return self

In [7]:
def process_dataframe(input_df):
    input_df["text"] = input_df["text"].map(lambda t: t.upper())
    return input_df

In [8]:
df = pd.DataFrame({
    "id":[1,2,3,4],
    "text":["foo","Bar","BAz","quux"]
})

In [9]:
df

Unnamed: 0,id,text
0,1,foo
1,2,Bar
2,3,BAz
3,4,quux


In [11]:
pipeline = Pipeline([("lowercase", DataFrameFunctionTransformer(process_dataframe))])

In [12]:
pipeline.fit_transform(df)

Unnamed: 0,id,text
0,1,FOO
1,2,BAR
2,3,BAZ
3,4,QUUX


### Custom Transformer Example

In [13]:
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import TransformerMixin, BaseEstimator

In [14]:
data = scipy.sparse.csr_matrix([
    [1.,0.,0.,0.,0.,0.],
    [0.,1.,0.,0.,0.,0.],
    [1.,0.,0.,0.,0.,0.],
    [0.,0.,0.,0.,1.,0.],
    [0.,0.,0.,1.,0.,0.],
    [1.,0.,0.,0.,0.,0.],
    [1.,1.,0.,0.,0.,0.],
    [1.,1.,0.,0.,0.,0.],
])

target = np.array([1,1,1,0,0,0,1,1])

In [15]:
class ToDenseTransformer():
    def transform(self, X, y=None, **fit_params):
        return X.todense()
        
    def fit(self, X, y=None, **fit_params):
        return self

In [17]:
pipeline = Pipeline([
    ("to_dense", ToDenseTransformer()),
    ("pca", PCA()),
    ("clf", DecisionTreeClassifier())
])

In [18]:
pipeline.fit(data, target)
pipeline.predict(data)

array([1, 1, 1, 0, 0, 1, 1, 1])

### Imputation Column Transform

In [19]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [20]:
df = pd.DataFrame({
    'name':['alice','bob','charlie','david','edward'],
    'age':[24,32,np.nan,38,20]
})

df.head()

Unnamed: 0,name,age
0,alice,24.0
1,bob,32.0
2,charlie,
3,david,38.0
4,edward,20.0


In [22]:
transformer_step = ColumnTransformer([("impute_mean", SimpleImputer(strategy="mean"), ["age"])], remainder="passthrough")

In [23]:
pipe = Pipeline([("transformer", transformer_step)])

In [26]:
data = pipe.fit_transform(df)  # sklearn returns numpy arrays
df = pd.DataFrame(data, columns=["name", "age"])

In [27]:
df

Unnamed: 0,name,age
0,24.0,alice
1,32.0,bob
2,28.5,charlie
3,38.0,david
4,20.0,edward


### Column Transform One hot encode

In [29]:
from sklearn.preprocessing import OneHotEncoder

In [30]:
df = pd.DataFrame({
    'favorite_color':['blue','green','red','green','blue'],
    'age': [10,15,10,np.nan,10],
    'target':[1,0,1,0,1]
})

In [31]:
df

Unnamed: 0,favorite_color,age,target
0,blue,10.0,1
1,green,15.0,0
2,red,10.0,1
3,green,,0
4,blue,10.0,1


In [33]:
categorical_preprocessing = Pipeline([("ohe", OneHotEncoder())])
numerical_preprocessing = Pipeline([("imputation", SimpleImputer())])

preprocess = ColumnTransformer([
    ("categorical_preprocessing", categorical_preprocessing, ["favorite_color"]),
    ("numerical_preprocessing", numerical_preprocessing, ["age"]),
])

pipeline = Pipeline([
    ("preprocess", preprocess),
    ("clf", DecisionTreeClassifier()),
])

X = df[["favorite_color", "age"]]
y = df["target"]

pipeline.fit(X, y)

Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('categorical_preprocessing',
                                                  Pipeline(steps=[('ohe',
                                                                   OneHotEncoder())]),
                                                  ['favorite_color']),
                                                 ('numerical_preprocessing',
                                                  Pipeline(steps=[('imputation',
                                                                   SimpleImputer())]),
                                                  ['age'])])),
                ('clf', DecisionTreeClassifier())])

In [35]:
pipeline.predict(X)

array([1, 0, 1, 0, 1])

In [36]:
y

0    1
1    0
2    1
3    0
4    1
Name: target, dtype: int64

### Select Columns Transformer

In [38]:
class SelectColumnsTransformer():
    def __init__(self, columns=None):
        self.columns = columns
        
    def transform(self, X, **transform_params):
        cpy_df = X[self.columns].copy()
        return cpy_df
    
    def fit(self, X, y=None, **fit_params):
        return self

In [39]:
df = pd.DataFrame({
    'name':['alice','bob','charlie','david','edward'],
    'age':[24,32,np.nan,38,20]
})

df.head()

Unnamed: 0,name,age
0,alice,24.0
1,bob,32.0
2,charlie,
3,david,38.0
4,edward,20.0


In [46]:
pipe = Pipeline([("select", SelectColumnsTransformer(["age"]))])

In [47]:
pipe.fit_transform(df)

Unnamed: 0,age
0,24.0
1,32.0
2,
3,38.0
4,20.0


### Function Transformer with parameters

In [48]:
from nltk.stem import RSLPStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import FunctionTransformer

In [49]:
def stem_str(input_series, stemmer):
    
    def stem(input_str):
        return " ".join([stemmer.stem(t) for t in input_str.split(" ")]).strip()
    
    return input_series.apply(stem)

pipeline = Pipeline([
    ("stemmer", FunctionTransformer(func=stem_str, kw_args={"stemmer": RSLPStemmer()})),
    ("vect", TfidfVectorizer()),
    ("clf", LogisticRegression()),
])

In [50]:
df = pd.DataFrame({
    'text':[
        'Lorem ipsum dolor sit amet, consectetur adipiscing elit.',
        'Sed accumsan congue enim non pretium.',
        'In hac habitasse platea dictumst.',
        'Sed tincidunt ipsum nec urna vulputate luctus.'
    ],
    'target':[0, 1, 0, 1]
})

In [51]:
df

Unnamed: 0,text,target
0,"Lorem ipsum dolor sit amet, consectetur adipis...",0
1,Sed accumsan congue enim non pretium.,1
2,In hac habitasse platea dictumst.,0
3,Sed tincidunt ipsum nec urna vulputate luctus.,1


In [52]:
pipeline.fit(df["text"], df["target"])

Pipeline(steps=[('stemmer',
                 FunctionTransformer(func=<function stem_str at 0x7f9578b0a040>,
                                     kw_args={'stemmer': <nltk.stem.rslp.RSLPStemmer object at 0x7f9578938d90>})),
                ('vect', TfidfVectorizer()), ('clf', LogisticRegression())])

In [53]:
pipeline.predict(df["text"])

array([0, 1, 0, 1])