In [1]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
# Read the tab-separated dataset; the first row of the file supplies the column names.
dados = pd.read_table(
    "../datasets/Potencial_Novos_Clientes.txt",
    sep="\t",
    header=0,
)
# Preview the first rows as the cell output.
dados.head()

Unnamed: 0,COD_CARTAO,IDADE_CLIENTE,RENDA_MENSAL_CLIENTE,BEHAVIOUR_SCORE_CLIENTE,QTD_TRANSACOES_3M,QTD_ITENS_3M,VALOR_GASTO_3M,TICKET_MEDIO_3M,FLAG_ELETRONICOS_3M,SATISFACAO_ULTIMA_COMPRA,VALOR_GASTO_PROX_12M
0,212394,22,1500,52,2,4,802,401.0,0,01_Muito_Satisfeito,226
1,279177,29,1800,94,5,5,1384,276.8,1,03_Neutro,2786
2,291430,36,1500,89,5,6,1610,322.0,1,Nao_Respondeu,3737
3,176612,43,1500,48,3,4,836,278.67,0,01_Muito_Satisfeito,1162
4,223092,28,12090,89,4,5,1318,329.5,0,Nao_Respondeu,2699


#### Transformer

In [None]:
# We can create our own estimators and transformers.
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Select a subset of a given DataFrame from a list of columns.

    Parameters
    ----------
    cols_list : list
        Column labels to keep; passed unchanged to ``X.loc[:, cols_list]``.
    """

    def __init__(self, cols_list):
        # Stored under the same name as the constructor argument.
        self.cols_list = cols_list

    def fit(self, X, y=None):
        """Learn nothing from ``X``; return the selector itself."""
        return self

    def transform(self, X, y=None):
        """Return every row of ``X`` restricted to the columns in ``cols_list``."""
        selected = X.loc[:, self.cols_list]
        return selected

In [None]:
# Quantitative (numeric) feature columns.
cols_list_quanti = [
    'IDADE_CLIENTE',
    'RENDA_MENSAL_CLIENTE',
    'BEHAVIOUR_SCORE_CLIENTE',
    'QTD_TRANSACOES_3M',
    'QTD_ITENS_3M',
    'VALOR_GASTO_3M',
    'TICKET_MEDIO_3M',
]

# ColumnSelector.fit is a no-op, so transform can be called without fitting first.
dados_subset = ColumnSelector(cols_list=cols_list_quanti)
dados_subset.transform(dados).head()


#### Pipeline

In [None]:
# Numeric pipeline: keep the quantitative columns, fill NaN with the column
# median, then rescale with MinMaxScaler.
num_pipe = Pipeline(
    steps=[
        ('get_num_cols', ColumnSelector(cols_list_quanti)),
        ('fix_nan', SimpleImputer(missing_values=np.nan, strategy='median')),
        ('scale_data', MinMaxScaler()),
    ]
)

In [None]:
# Fit the numeric pipeline on the full dataset and wrap the resulting array in
# a DataFrame (columns are positional because no labels are passed).
dados_transformados = pd.DataFrame(data=num_pipe.fit_transform(dados))
dados_transformados.head()

#### ColumnTransformer

In [None]:
# Qualitative (categorical) feature columns.
cols_list_quali = [
    'FLAG_ELETRONICOS_3M',
    'SATISFACAO_ULTIMA_COMPRA',
]

In [None]:
# Send the categorical columns through one-hot encoding and the numeric
# columns through num_pipe; the two outputs are combined side by side.
FeatureEng = ColumnTransformer(
    transformers=[
        ('cat_ohe', OneHotEncoder(), cols_list_quali),
        ('num_pipe', num_pipe, cols_list_quanti),
    ],
)
pd.DataFrame(data=FeatureEng.fit_transform(dados))

#### Como aplicar

In [3]:
from sklearn.model_selection import RandomizedSearchCV                                                
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from scipy.stats import loguniform 
import warnings
warnings.filterwarnings('ignore')


In [4]:
# Quantitative (numeric) predictors used by the models.
lista_X_quanti = [
    'IDADE_CLIENTE',
    'RENDA_MENSAL_CLIENTE',
    'BEHAVIOUR_SCORE_CLIENTE',
    'QTD_TRANSACOES_3M',
    'QTD_ITENS_3M',
    'VALOR_GASTO_3M',
    'TICKET_MEDIO_3M',
]

# Qualitative (categorical) predictors used by the models.
lista_X_quali = [
    'FLAG_ELETRONICOS_3M',
    'SATISFACAO_ULTIMA_COMPRA',
]

In [5]:
# Predictors are the numeric columns followed by the categorical ones;
# the target is the VALOR_GASTO_PROX_12M column.
X = dados[lista_X_quanti + lista_X_quali]
y = dados['VALOR_GASTO_PROX_12M']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_treino, X_teste, y_treino, y_teste = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [None]:
#X_teste.to_csv("X_teste.csv", index=False)

In [6]:
class Wrapper(BaseEstimator):
    """Adapter that exposes a model's ``predict`` under the name ``transform``.

    Parameters
    ----------
    intermediate_model : object
        Model whose ``predict`` method is called by ``transform``. It must
        already be fitted, because ``fit`` here does not train it.
    """

    def __init__(self, intermediate_model):
        self.intermediate_model = intermediate_model

    def fit(self, X, y=None):
        """Do nothing with ``X``/``y`` and return this wrapper."""
        return self

    def transform(self, X_teste):
        """Return ``intermediate_model.predict(X_teste)``."""
        predictions = self.intermediate_model.predict(X_teste)
        return predictions

In [None]:
# Standardise the numeric predictors and one-hot encode the categorical ones
# (dense output, first level of each feature dropped).
# NOTE(review): with drop="first" plus handle_unknown='ignore', a category not
# seen during fit is presumably encoded as all zeros, i.e. the same encoding as
# the dropped reference level -- confirm this is acceptable.
preprocessador = ColumnTransformer(
    transformers=[
        ("quanti", StandardScaler(), lista_X_quanti),
        (
            "quali",
            OneHotEncoder(sparse_output=False, drop="first", handle_unknown='ignore'),
            lista_X_quali,
        ),
    ]
)

In [8]:
# Linear-regression pipeline: preprocessing followed by a cross-validated
# search wrapper around LinearRegression.
#
# The pipeline gets its own copy of the preprocessor. Pipeline fits its steps
# in place, so holding the shared `preprocessador` object would let any later
# refit of that object silently change this pipeline's fitted state.
#
# param_distributions is empty, so there is a single candidate (the default
# LinearRegression); the search only adds 10-fold cross-validation around it.
# NOTE(review): the CV scorer is RMSLE while the hold-out evaluation further
# down prints RMSE, and RMSLE is undefined for negative predictions -- confirm
# this scorer is intended.
RL = Pipeline([
        ("preprocess", clone(preprocessador)),
        ("linear regression", RandomizedSearchCV(estimator=LinearRegression(),
                  param_distributions={},
                  scoring='neg_root_mean_squared_log_error',
                  cv=10) )
])

In [9]:
# Ridge pipeline: preprocessing followed by a randomized search over the
# regularisation strength `alpha`, sampled log-uniformly from [1e-5, 1e1].
#
# The pipeline gets its own copy of the preprocessor. Pipeline fits its steps
# in place, so holding the shared `preprocessador` object would let any later
# refit of that object silently change this pipeline's fitted state.
#
# random_state pins the sampled alphas so the selected model, and the metrics
# reported from it, are reproducible across runs (same seed as the data split).
#
# The step keeps the name "linear regression" (although the estimator is
# Ridge) so that existing look-ups by step name keep working.
# NOTE(review): the CV scorer is RMSLE while the hold-out evaluation further
# down prints RMSE, and RMSLE is undefined for negative predictions -- confirm
# this scorer is intended.
RD =  Pipeline([
        ("preprocess", clone(preprocessador)),
        ("linear regression", RandomizedSearchCV(estimator=Ridge(),
                  param_distributions={'alpha': loguniform(1e-5, 1e1) },
                  scoring='neg_root_mean_squared_log_error',
                  cv=10,
                  random_state=123) )
])

In [10]:
# Train both search pipelines on the training split, linear first and ridge
# second; fit() returns the pipeline itself, so the fitted objects can be
# wrapped directly afterwards.
RL.fit(X_treino, y_treino)
RD.fit(X_treino, y_treino)

# Expose each fitted pipeline's predictions as one branch of a FeatureUnion.
treino_predict_pipe = FeatureUnion(
    transformer_list=[
        ('linear', Wrapper(intermediate_model=RL)),
        ('ridge', Wrapper(intermediate_model=RD)),
    ]
)

In [11]:
# Collect the predictions of both wrapped models for the test split.
# Wrapper.fit is a no-op, so fit_transform trains nothing on the test data;
# each branch just returns its model's predict() output.
# NOTE(review): the result appears to be one flat array with the 'linear'
# predictions followed by the 'ridge' predictions (later cells split y_hat in
# half) -- confirm.
y_hat = treino_predict_pipe.fit_transform(X_teste)
y_hat[:10]

array([ 584.75055789, 5481.49951283, 4338.86026119, 2023.84551492,
       3137.28878541, 5922.15906779, 1843.64941438, 4680.58782678,
       1788.20394403, 1585.34864919])

In [12]:
# Midpoint of y_hat, used to split it into a first and a second half.
model_select = len(y_hat) // 2

In [13]:
from sklearn.metrics import root_mean_squared_error

# Print the hold-out RMSE for the first half of y_hat, then for the second half.
for predictions in (y_hat[:model_select], y_hat[model_select:]):
    print(root_mean_squared_error(y_teste, predictions))

696.79226412171
696.7808560580661


In [16]:
# Preview the preprocessed feature matrix for the whole dataset.
# Fit a throwaway clone rather than `preprocessador` itself, so that this
# preview never refits (on all rows, test rows included) a preprocessor
# instance that an already-fitted pipeline may be holding.
pd.DataFrame(clone(preprocessador).fit_transform(dados))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,-1.279764,-0.985605,-1.618096,-0.679265,-0.757361,-0.609046,-0.066929,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.773183,-0.940964,1.217685,1.097740,-0.257908,0.542961,-0.590740,1.0,0.0,1.0,0.0,0.0,0.0
2,-0.266603,-0.985605,0.880092,1.097740,0.241544,0.990304,-0.400110,1.0,0.0,0.0,0.0,0.0,1.0
3,0.239977,-0.985605,-1.888170,-0.086930,-0.757361,-0.541746,-0.582853,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.845552,0.590226,0.880092,0.505405,-0.257908,0.412321,-0.368479,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2925,-0.483709,-0.195457,-0.740354,0.505405,-0.257908,0.907169,-0.104886,1.0,0.0,0.0,0.0,0.0,0.0
2926,-0.556078,0.237561,-1.483059,-0.679265,-0.257908,0.639951,1.263688,0.0,1.0,0.0,0.0,0.0,0.0
2927,-0.049497,0.476391,0.812574,1.097740,0.241544,1.386183,-0.231410,0.0,0.0,0.0,0.0,0.0,1.0
2928,1.542613,1.286627,-0.335243,-0.679265,0.740996,0.396486,1.004312,0.0,0.0,0.0,0.0,0.0,1.0
