# 01. Librerías y objetos necesarios

In [0]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
import sklearn
import mlflow.sklearn

from sklearn.pipeline import Pipeline



# 02. Train dataset base

In [0]:
# Toy training data: three numeric columns with ten observations each.
C1_lst = [-21.57, 1.24, -0.84, 32.25, 0.82, -3.11, 0.46, -18.68, 0.04, 30.87]
C2_lst = [2.49, 2.27, 0.25, -2.33, -2.91, -3.61, 0.58, -2.59, -3.99, 1.54]
C3_lst = [1.14, 4.76, 14.23, -2.65, -3.53, -0.03, 17.50, -0.21, 2.96, 2.31]

# Build the frame column-wise from a {column name: values} mapping.
df_train = pd.DataFrame({'C1': C1_lst, 'C2': C2_lst, 'C3': C3_lst})
df_train

Unnamed: 0,C1,C2,C3
0,-21.57,2.49,1.14
1,1.24,2.27,4.76
2,-0.84,0.25,14.23
3,32.25,-2.33,-2.65
4,0.82,-2.91,-3.53
5,-3.11,-3.61,-0.03
6,0.46,0.58,17.5
7,-18.68,-2.59,-0.21
8,0.04,-3.99,2.96
9,30.87,1.54,2.31


# 03. SKlearn Estimators

## 03.01. SKlearnOutliersEstimator

In [0]:
class SKlearnOutliersEstimator(BaseEstimator, TransformerMixin):
  """Cap the values of selected columns at the 1.5*IQR fences learned in ``fit``.

  Parameters
  ----------
  input_columns : list, default=None
      Labels of the DataFrame columns to cap. Must be set before ``fit``.

  Attributes
  ----------
  dict_outliers_thresholds_ : dict
      Set by ``fit``. Maps each input column to ``[lower_fence, upper_fence]``.
  """

  def __init__(self, input_columns=None):
    self.input_columns = input_columns

  def fit(self, X=None, y=None):
    """Learn the lower/upper fences of every input column.

    Parameters
    ----------
    X : pandas.DataFrame
        Data containing ``input_columns``. The ``None`` default is only part
        of the signature; the method indexes ``X`` and needs a real frame.
    y : ignored

    Returns
    -------
    self
    """
    selected = X[self.input_columns]
    # 'nearest' makes each quartile an actual observed value.
    q1 = selected.quantile(0.25, interpolation='nearest')
    q3 = selected.quantile(0.75, interpolation='nearest')
    iqr = q3 - q1
    lower_fences = q1 - 1.5*iqr
    upper_fences = q3 + 1.5*iqr

    # Look the fences up by column label. Using the enumerate() position as the
    # key relies on pandas' deprecated positional fallback and selects the
    # wrong entry (or raises) when the column labels are themselves integers.
    self.dict_outliers_thresholds_ = {col: [lower_fences[col], upper_fences[col]]
                                      for col in self.input_columns}
    return self

  def transform(self, X):
    """Return a copy of ``X`` with the fitted columns capped at their fences.

    Values below the lower fence are replaced by it, values above the upper
    fence are replaced by it, and everything else (including missing values)
    is left unchanged. ``X`` itself is not modified.
    """
    X_output = X.copy()
    for col, (lower, upper) in self.dict_outliers_thresholds_.items():
      X_output[col] = X_output[col].clip(lower=lower, upper=upper)
    return X_output

## 03.02. SKlearnMinmaxEstimator

In [0]:
class SKlearnMinmaxEstimator(BaseEstimator, TransformerMixin):
  """Scale up the entries that equal a column's fitted minimum or maximum.

  On ``transform``, entries equal to the fitted minimum are multiplied by 100,
  entries equal to the fitted maximum are multiplied by 10000, and every other
  entry is left unchanged.

  Parameters
  ----------
  input_columns : list, default=None
      Labels of the DataFrame columns to process. Must be set before ``fit``.

  Attributes
  ----------
  dict_min_max_ : dict
      Set by ``fit``. Maps each input column to ``[min, max]``.
  """

  def __init__(self, input_columns=None):
    self.input_columns = input_columns

  def fit(self, X=None, y=None):
    """Learn the minimum and maximum of every input column.

    Parameters
    ----------
    X : pandas.DataFrame
        Data containing ``input_columns``. The ``None`` default is only part
        of the signature; the method indexes ``X`` and needs a real frame.
    y : ignored

    Returns
    -------
    self
    """
    selected = X[self.input_columns]
    col_mins = selected.min()
    col_maxs = selected.max()

    # Look the extremes up by column label. Using the enumerate() position as
    # the key relies on pandas' deprecated positional fallback and selects the
    # wrong entry (or raises) when the column labels are themselves integers.
    self.dict_min_max_ = {col: [col_mins[col], col_maxs[col]]
                          for col in self.input_columns}
    return self

  @staticmethod
  def _scale_extreme(x, col_min, col_max):
    """Return ``x`` scaled if it equals ``col_min`` or ``col_max``, else ``x``."""
    # The minimum is tested first, so it takes precedence when min == max.
    if x == col_min:
      return x*100
    if x == col_max:
      return x*10000
    return x

  def transform(self, X):
    """Return a copy of ``X`` with the fitted columns' extremes scaled up.

    ``X`` itself is not modified.
    """
    X_output = X.copy()
    for col, (col_min, col_max) in self.dict_min_max_.items():
      X_output[col] = X_output[col].apply(
          lambda x: self._scale_extreme(x, col_min, col_max))
    return X_output

# 04. Manual pipeline

## 04.01. Step 01

In [0]:
# Outlier-capping step configured for columns C1 and C3 (thresholds are learned on fit).
Object_01_SKlearnOutliersEstimator = SKlearnOutliersEstimator(input_columns=['C1', 'C3'])

In [0]:
# Learn the thresholds from df_train and apply them to that same frame.
df_output_01 = Object_01_SKlearnOutliersEstimator.fit_transform(X=df_train)
df_output_01

Unnamed: 0,C1,C2,C3
0,-9.635,2.49,1.14
1,1.24,2.27,4.76
2,-0.84,0.25,12.215
3,7.765,-2.33,-2.65
4,0.82,-2.91,-3.53
5,-3.11,-3.61,-0.03
6,0.46,0.58,12.215
7,-9.635,-2.59,-0.21
8,0.04,-3.99,2.96
9,7.765,1.54,2.31


In [0]:
# Inspect the fitted thresholds: {column: [lower, upper]}.
Object_01_SKlearnOutliersEstimator.dict_outliers_thresholds_

Out[7]: {'C1': [-9.635, 7.765], 'C3': [-7.665, 12.215]}

## 04.02. Step 02

In [0]:
# Step that scales each column's min and max entries, configured for C1 and C2.
Object_02_SKlearnMinmaxEstimator = SKlearnMinmaxEstimator(input_columns=['C1', 'C2'])

In [0]:
# Fit on the output of step 01 (not on df_train) so the two steps are chained by hand.
df_output_02 = Object_02_SKlearnMinmaxEstimator.fit_transform(X=df_output_01)
df_output_02

Unnamed: 0,C1,C2,C3
0,-963.5,24900.0,1.14
1,1.24,2.27,4.76
2,-0.84,0.25,12.215
3,77650.0,-2.33,-2.65
4,0.82,-2.91,-3.53
5,-3.11,-3.61,-0.03
6,0.46,0.58,12.215
7,-963.5,-2.59,-0.21
8,0.04,-399.0,2.96
9,77650.0,1.54,2.31


In [0]:
# Inspect the fitted extremes: {column: [min, max]}.
Object_02_SKlearnMinmaxEstimator.dict_min_max_

Out[10]: {'C1': [-9.635, 7.765], 'C2': [-3.99, 2.49]}

# 05. SKlearn Pipelines

## 05.01. Automatic pipeline

In [0]:
# Chain both custom steps: outlier capping first, then min/max scaling.
pipeline_01 = Pipeline(
    steps=[
        ('estimator_01', SKlearnOutliersEstimator(input_columns=['C1', 'C3'])),
        ('estimator_02', SKlearnMinmaxEstimator(input_columns=['C1', 'C2'])),
    ]
)

In [0]:
# Fit the steps in order; each step after the first is fitted on the previous step's output.
pipeline_01.fit(X=df_train)

Out[12]: Pipeline(steps=[('estimator_01',
                 SKlearnOutliersEstimator(input_columns=['C1', 'C3'])),
                ('estimator_02',
                 SKlearnMinmaxEstimator(input_columns=['C1', 'C2']))])

In [0]:
# Apply the already-fitted steps to df_train without refitting.
pipeline_01.transform(X=df_train)

Unnamed: 0,C1,C2,C3
0,-963.5,24900.0,1.14
1,1.24,2.27,4.76
2,-0.84,0.25,12.215
3,77650.0,-2.33,-2.65
4,0.82,-2.91,-3.53
5,-3.11,-3.61,-0.03
6,0.46,0.58,12.215
7,-963.5,-2.59,-0.21
8,0.04,-399.0,2.96
9,77650.0,1.54,2.31


In [0]:
# Fit and transform in a single call; the result is kept for comparison with the manual pipeline.
df_output_pipeline = pipeline_01.fit_transform(X=df_train)
df_output_pipeline

Unnamed: 0,C1,C2,C3
0,-963.5,24900.0,1.14
1,1.24,2.27,4.76
2,-0.84,0.25,12.215
3,77650.0,-2.33,-2.65
4,0.82,-2.91,-3.53
5,-3.11,-3.61,-0.03
6,0.46,0.58,12.215
7,-963.5,-2.59,-0.21
8,0.04,-399.0,2.96
9,77650.0,1.54,2.31


In [0]:
# Show the pipeline's steps and their constructor parameters.
pipeline_01

Out[15]: Pipeline(steps=[('estimator_01',
                 SKlearnOutliersEstimator(input_columns=['C1', 'C3'])),
                ('estimator_02',
                 SKlearnMinmaxEstimator(input_columns=['C1', 'C2']))])

In [0]:
# Thresholds learned by the first step (position 0 in the steps list).
pipeline_01.steps[0][1].dict_outliers_thresholds_

Out[16]: {'C1': [-9.635, 7.765], 'C3': [-7.665, 12.215]}

In [0]:
# Min/max learned by the second step (position 1 in the steps list).
pipeline_01.steps[1][1].dict_min_max_

Out[17]: {'C1': [-9.635, 7.765], 'C2': [-3.99, 2.49]}

## 05.02. Manual Vs Automatic pipelines outputs

In [0]:
# Element-wise comparison: True wherever the automatic and manual outputs match.
df_output_pipeline==df_output_02

Unnamed: 0,C1,C2,C3
0,True,True,True
1,True,True,True
2,True,True,True
3,True,True,True
4,True,True,True
5,True,True,True
6,True,True,True
7,True,True,True
8,True,True,True
9,True,True,True


# 06. Pipeline: etapas y parámetros (visualización)

## 06.01. Parámetros generales (steps)

In [0]:
# Show the pipeline's steps and their constructor parameters.
pipeline_01

Out[19]: Pipeline(steps=[('estimator_01',
                 SKlearnOutliersEstimator(input_columns=['C1', 'C3'])),
                ('estimator_02',
                 SKlearnMinmaxEstimator(input_columns=['C1', 'C2']))])

In [0]:
# List of (name, estimator) tuples in execution order.
pipeline_01.steps

Out[20]: [('estimator_01', SKlearnOutliersEstimator(input_columns=['C1', 'C3'])),
 ('estimator_02', SKlearnMinmaxEstimator(input_columns=['C1', 'C2']))]

In [0]:
# The same steps, keyed by step name.
pipeline_01.named_steps

Out[21]: {'estimator_01': SKlearnOutliersEstimator(input_columns=['C1', 'C3']),
 'estimator_02': SKlearnMinmaxEstimator(input_columns=['C1', 'C2'])}

## 06.02. Parámetros específicos de cada step

- Tres maneras diferentes de acceder a los mismos atributos:

In [0]:
# Access 1: look the step up by name in named_steps.
pipeline_01.named_steps['estimator_01'].dict_outliers_thresholds_

Out[22]: {'C1': [-9.635, 7.765], 'C3': [-7.665, 12.215]}

In [0]:
# Access 2: index into the steps list, then take the estimator from the (name, estimator) tuple.
pipeline_01.steps[0][1].dict_outliers_thresholds_

Out[23]: {'C1': [-9.635, 7.765], 'C3': [-7.665, 12.215]}

In [0]:
# Access 3: index the pipeline directly by step name.
pipeline_01['estimator_01'].dict_outliers_thresholds_

Out[24]: {'C1': [-9.635, 7.765], 'C3': [-7.665, 12.215]}

# XX. Work in progress