In [1]:
import pandas as pd                                                                                                                                                               
from sklearn.compose import ColumnTransformer                                     
from sklearn.preprocessing import StandardScaler, OneHotEncoder                                                  
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error 
from sklearn.linear_model import Lasso
from scipy.stats import loguniform                               

In [2]:
dados = pd.read_table("../datasets/Potencial_Novos_Clientes.txt",sep="\t",header=0)

#### Modelo

In [3]:
# Variáveis explicativas quantitativas (deixar vazio [] caso não haja nenhuma)
lista_X_quanti = ['IDADE_CLIENTE',
                  'RENDA_MENSAL_CLIENTE',
                  'BEHAVIOUR_SCORE_CLIENTE',
                  'QTD_TRANSACOES_3M',
                  'QTD_ITENS_3M',
                  'VALOR_GASTO_3M',
                  'TICKET_MEDIO_3M']

# Variáveis explicativas qualitativas (deixar vazio [] caso não haja nenhuma)
lista_X_quali = ['FLAG_ELETRONICOS_3M',
                 'SATISFACAO_ULTIMA_COMPRA']

In [4]:
y = dados['VALOR_GASTO_PROX_12M']
X = dados[lista_X_quanti + lista_X_quali]

In [8]:
X_treino, X_teste, y_treino, y_teste = train_test_split(X, y,
                                                        test_size=0.2,
                                                        random_state=123)

In [9]:
preprocessador = ColumnTransformer(transformers=[
    ("quanti", StandardScaler(), lista_X_quanti),
    ("quali", OneHotEncoder(sparse_output=False, drop="first", handle_unknown='ignore'), lista_X_quali)
])

In [10]:
X_treino_tratada = preprocessador.fit_transform(X_treino)  # O pré-processamento não deve envolver o conjunto de teste externo, que deve ficar reservado apenas para aplicações de resultados já obtidos, e nunca construções
if lista_X_quali:
    nomes_quali = list(preprocessador.named_transformers_['quali'].get_feature_names_out(lista_X_quali))
else:
    nomes_quali = []
nomes_variaveis = list(lista_X_quanti) + nomes_quali

In [11]:
X_treino_tratada = pd.DataFrame(X_treino_tratada, columns=nomes_variaveis)
X_treino_tratada.head()

Unnamed: 0,IDADE_CLIENTE,RENDA_MENSAL_CLIENTE,BEHAVIOUR_SCORE_CLIENTE,QTD_TRANSACOES_3M,QTD_ITENS_3M,VALOR_GASTO_3M,TICKET_MEDIO_3M,FLAG_ELETRONICOS_3M_1,SATISFACAO_ULTIMA_COMPRA_02_Satisfeito,SATISFACAO_ULTIMA_COMPRA_03_Neutro,SATISFACAO_ULTIMA_COMPRA_04_Insatisfeito,SATISFACAO_ULTIMA_COMPRA_05_Muito_Insatisfeito,SATISFACAO_ULTIMA_COMPRA_Nao_Respondeu
0,0.615857,0.631733,-0.327214,-0.082005,-0.249477,-0.272628,-0.400971,1.0,1.0,0.0,0.0,0.0,0.0
1,-0.544218,0.492125,0.939325,1.086501,-0.249477,1.141972,-0.337616,1.0,0.0,0.0,0.0,0.0,1.0
2,0.180829,1.154709,0.072746,-0.666258,0.243172,-0.547689,-0.01769,0.0,0.0,0.0,0.0,0.0,1.0
3,-0.399209,0.959701,-0.260554,0.502248,-0.249477,-0.96421,-1.109685,1.0,0.0,0.0,0.0,0.0,0.0
4,-1.486779,-0.168243,0.072746,-0.666258,-1.234774,-1.017258,-0.520071,0.0,0.0,0.0,0.0,1.0,0.0


In [12]:
X_teste_tratada = preprocessador.transform(X_teste)  # Note que o mesmo objeto de pré-processamento deve ser utilizado, agora com 'transform' em vez de 'fit_transform'
X_teste_tratada = pd.DataFrame(X_teste_tratada, columns=nomes_variaveis)
X_teste_tratada.head()

Unnamed: 0,IDADE_CLIENTE,RENDA_MENSAL_CLIENTE,BEHAVIOUR_SCORE_CLIENTE,QTD_TRANSACOES_3M,QTD_ITENS_3M,VALOR_GASTO_3M,TICKET_MEDIO_3M,FLAG_ELETRONICOS_3M_1,SATISFACAO_ULTIMA_COMPRA_02_Satisfeito,SATISFACAO_ULTIMA_COMPRA_03_Neutro,SATISFACAO_ULTIMA_COMPRA_04_Insatisfeito,SATISFACAO_ULTIMA_COMPRA_05_Muito_Insatisfeito,SATISFACAO_ULTIMA_COMPRA_Nao_Respondeu
0,-0.689227,0.365813,-0.060574,-0.666258,-0.249477,-1.156753,-0.669313,0.0,1.0,0.0,0.0,0.0,0.0
1,0.97838,1.325341,0.606025,1.670754,0.73582,2.22257,-0.188667,0.0,0.0,0.0,0.0,0.0,1.0
2,1.195894,1.427277,1.405945,1.086501,1.228469,0.13407,-0.768949,1.0,0.0,1.0,0.0,0.0,0.0
3,-0.616723,0.381325,0.672685,-0.082005,-0.249477,0.540767,0.179185,0.0,0.0,1.0,0.0,0.0,0.0
4,-0.979246,-0.061875,-1.327113,1.086501,-0.742126,0.045657,-0.806785,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
model = Lasso(random_state=123, max_iter=1000)
model.fit(X_treino_tratada, y_treino)
y_hat_teste = model.predict(X_teste_tratada)
erro = root_mean_squared_error(y_teste, y_hat_teste)

#### Serialização

In [14]:
import os
import pickle
import glob
from datetime import datetime

In [15]:
model_folder = '../assets/models/trained/'
filename = f'model_{datetime.now().date()}.pkl'
pickle.dump(model, open(model_folder+filename, 'wb'))

In [16]:
models = sorted(
    glob.iglob(os.path.join(os.path.dirname(model_folder), '*')),
      key=os.path.getctime, reverse=True) 
print(models)

['../assets/models/trained/model_2025-10-21.pkl', '../assets/models/trained/txt']


In [17]:
load_model = pickle.load(open(models[0], 'rb'))
predictions = load_model.predict(X_teste_tratada)
predictions

array([ 5.85026992e+02,  5.48618853e+03,  4.33537428e+03,  2.02550290e+03,
        3.07441924e+03,  5.92173347e+03,  1.84808308e+03,  4.68395301e+03,
        1.80235132e+03,  1.58643156e+03,  5.88861554e+03,  2.37845758e+03,
        4.60707195e+03,  3.68729704e+03,  1.91670415e+03,  2.35901292e+03,
       -3.06173581e+02,  1.76912437e+03,  5.35036014e+02,  1.22018636e+03,
        3.49052675e+03,  2.26912018e+03,  2.37665390e+03,  1.35877524e+03,
        5.42417301e+02,  3.59172913e+02,  2.62467066e+03,  2.52272247e+03,
        8.52949568e+02,  2.02998490e+03, -2.22483242e+02, -3.40091734e+02,
        2.33731405e+03,  5.31327039e+02, -6.68851623e+02,  2.18165768e+03,
       -8.14156760e+02,  7.62804635e+02,  6.34007723e+03,  1.58303525e+03,
        2.38255140e+03, -4.82203250e+02,  4.61169870e+03,  6.83078165e+02,
       -8.48346646e+01,  2.57370358e+03,  3.56798391e+03,  2.79240428e+03,
        7.93912878e+02,  7.17085545e+02, -5.25140857e+02,  5.55410595e+03,
        5.87075889e+02,  