In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from scipy.stats import chi2_contingency
from scipy.stats import zscore
from scipy.stats import linregress
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

import warnings
# Silence every warning category for the rest of the session. This also hides
# deprecation and numerical warnings from any library, so disable it when debugging.
warnings.filterwarnings('ignore')

In [4]:
# PRE-PROCESSING

In [5]:
# Load the customer features through a path relative to the notebook, matching the
# '../data/' directory that the export cells write to, so the notebook is not tied
# to one machine's absolute Windows path.
dados_clientes = pd.read_csv("../data/dados_clientes_para_pre_processing.csv")

In [6]:
# Display the loaded frame (the rendered output is truncated to the first and last rows)
dados_clientes

Unnamed: 0,Cod_cliente,Ultima_Compra,Total_transacoes,Valor_total_gasto,Avg_transacao,Dia_fav_compra,UK,Gastos_medios_mensais,Desvio_padrao_gastos_mensais,Tendencia_gastos
0,12347,2,182,4310.00,23.68,1,0,615.714286,341.070789,4.486071
1,12348,75,27,1437.24,53.23,3,0,359.310000,203.875689,-100.884000
2,12349,18,72,1457.55,20.24,0,0,1457.550000,0.000000,0.000000
3,12350,310,16,294.40,18.40,2,0,294.400000,0.000000,0.000000
4,12352,36,77,1385.74,18.00,1,0,346.435000,120.042154,-2.682000
...,...,...,...,...,...,...,...,...,...,...
5505,21821,1,2,3.35,1.68,3,1,3.350000,0.000000,0.000000
5506,21822,1,634,5699.00,8.99,3,1,5699.000000,0.000000,0.000000
5507,21823,0,730,6756.06,9.25,4,1,6756.060000,0.000000,0.000000
5508,21824,0,59,3217.20,54.53,4,1,3217.200000,0.000000,0.000000


# Padronização das variáveis

##### Não irei padronizar as seguintes variáveis:

- Cod_cliente: Pois se refere ao código de cada cliente, sem valor expressivo ou relevante;

- UK: Pois já está no formato booleano, 0 ou 1;

- Dia_fav_compra: Pois é variável categórica e já está em uma escala reduzida (valores de 0 a 6 nos dados exibidos).


In [7]:
# Columns left out of the standardization step
var_exclude = [
    'Cod_cliente',
    'Dia_fav_compra',
    'UK',
]

In [8]:
# Work on a copy so the raw frame `dados_clientes` keeps its original values
dados = dados_clientes.copy(deep=True)

# Every column except the ones listed in var_exclude will be standardized
colunas_padronizadas = dados.columns.difference(var_exclude)

# Scaler object; it is fitted in a later cell
scaler = StandardScaler()

colunas_padronizadas

Index(['Avg_transacao', 'Desvio_padrao_gastos_mensais',
       'Gastos_medios_mensais', 'Tendencia_gastos', 'Total_transacoes',
       'Ultima_Compra', 'Valor_total_gasto'],
      dtype='object')

In [9]:
# Fit the scaler on the selected columns and overwrite them with their standardized
# values; the columns listed in var_exclude are left untouched
dados[colunas_padronizadas] = scaler.fit(dados[colunas_padronizadas]).transform(
    dados[colunas_padronizadas]
)

In [10]:
# Sample of the already standardized dataframe; the fixed random_state keeps the
# displayed rows identical across re-runs of the notebook
dados.sample(8, random_state=42)

Unnamed: 0,Cod_cliente,Ultima_Compra,Total_transacoes,Valor_total_gasto,Avg_transacao,Dia_fav_compra,UK,Gastos_medios_mensais,Desvio_padrao_gastos_mensais,Tendencia_gastos
2431,15700,0.510037,-0.010758,-0.310057,-0.194037,6,1,0.126381,-0.511282,0.038867
5398,21534,-0.777293,-0.663604,-0.6496,-0.174033,3,1,-0.608819,-0.511282,0.038867
1906,15004,0.28498,0.53456,-0.038469,-0.195721,4,1,0.71444,-0.511282,0.038867
4829,20167,0.888134,-0.494632,-0.577704,-0.208145,0,1,-0.453145,-0.511282,0.038867
2455,15732,0.87013,-0.563757,-0.509111,-0.058116,2,1,-0.462074,-0.445122,0.147331
3831,17663,-0.813302,-0.010758,0.236942,-0.058642,4,1,-0.297808,0.127665,0.205048
1195,13995,-0.903326,-0.010758,0.034594,-0.108652,3,1,-0.321092,-0.000711,-0.137629
3213,16790,-1.011353,0.043005,0.109719,-0.104335,2,1,0.20789,3.869659,-7.143369


In [11]:
# Index the frame by customer code. The guard keeps the cell re-runnable: once
# 'Cod_cliente' is the index it is no longer a column, and calling set_index a
# second time would raise KeyError.
if 'Cod_cliente' in dados.columns:
    dados = dados.set_index('Cod_cliente')

In [22]:
dados

Unnamed: 0_level_0,Ultima_Compra,Total_transacoes,Valor_total_gasto,Avg_transacao,Dia_fav_compra,UK,Gastos_medios_mensais,Desvio_padrao_gastos_mensais,Tendencia_gastos
Cod_cliente,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
12347,-1.020356,0.726574,1.476625,-0.030005,1,0,0.039694,1.269518,0.066020
12348,-0.363187,-0.463910,0.056129,0.281107,3,0,-0.234827,0.553195,-0.571752
12349,-0.876319,-0.118286,0.066171,-0.066223,0,0,0.941013,-0.511282,0.038867
12350,1.752356,-0.548396,-0.508972,-0.085595,2,0,-0.304323,-0.511282,0.038867
12352,-0.714277,-0.079883,0.030663,-0.089806,1,0,-0.248612,0.115483,0.022634
...,...,...,...,...,...,...,...,...,...
21821,-1.029358,-0.655924,-0.652888,-0.261628,3,1,-0.615939,-0.511282,0.038867
21822,-1.029358,4.198178,2.163445,-0.184666,3,1,5.482159,-0.511282,0.038867
21823,-1.038360,4.935510,2.686130,-0.181929,4,1,6.613909,-0.511282,0.038867
21824,-1.038360,-0.218133,0.936267,0.294794,4,1,2.824998,-0.511282,0.038867


Unnamed: 0,Cod_cliente,Ultima_Compra,Total_transacoes,Valor_total_gasto,Avg_transacao,Dia_fav_compra,UK,Gastos_medios_mensais,Desvio_padrao_gastos_mensais,Tendencia_gastos
0,12347,2,182,4310.00,23.68,1,0,615.714286,341.070789,4.486071
1,12348,75,27,1437.24,53.23,3,0,359.310000,203.875689,-100.884000
2,12349,18,72,1457.55,20.24,0,0,1457.550000,0.000000,0.000000
3,12350,310,16,294.40,18.40,2,0,294.400000,0.000000,0.000000
4,12352,36,77,1385.74,18.00,1,0,346.435000,120.042154,-2.682000
...,...,...,...,...,...,...,...,...,...,...
5505,21821,1,2,3.35,1.68,3,1,3.350000,0.000000,0.000000
5506,21822,1,634,5699.00,8.99,3,1,5699.000000,0.000000,0.000000
5507,21823,0,730,6756.06,9.25,4,1,6756.060000,0.000000,0.000000
5508,21824,0,59,3217.20,54.53,4,1,3217.200000,0.000000,0.000000


In [12]:
# Save the standardized data for clustering. The scaled frame is `dados`
# (`dados_clientes` still holds the raw values); index=True keeps 'Cod_cliente',
# which is the index of `dados`, as the first column of the CSV.
dados.to_csv('../data/dados_processados.csv', sep=',', index=True, encoding='utf-8')

In [25]:
# Persist the unscaled frame so the final evaluation can use the original values
dados_clientes.to_csv(
    '../data/dados_para_avaliacao_final.csv',
    sep=',',
    index=False,
    encoding='utf-8',
)

In [21]:
dados_clientes

Unnamed: 0,Cod_cliente,Ultima_Compra,Total_transacoes,Valor_total_gasto,Avg_transacao,Dia_fav_compra,UK,Gastos_medios_mensais,Desvio_padrao_gastos_mensais,Tendencia_gastos
0,12347,2,182,4310.00,23.68,1,0,615.714286,341.070789,4.486071
1,12348,75,27,1437.24,53.23,3,0,359.310000,203.875689,-100.884000
2,12349,18,72,1457.55,20.24,0,0,1457.550000,0.000000,0.000000
3,12350,310,16,294.40,18.40,2,0,294.400000,0.000000,0.000000
4,12352,36,77,1385.74,18.00,1,0,346.435000,120.042154,-2.682000
...,...,...,...,...,...,...,...,...,...,...
5505,21821,1,2,3.35,1.68,3,1,3.350000,0.000000,0.000000
5506,21822,1,634,5699.00,8.99,3,1,5699.000000,0.000000,0.000000
5507,21823,0,730,6756.06,9.25,4,1,6756.060000,0.000000,0.000000
5508,21824,0,59,3217.20,54.53,4,1,3217.200000,0.000000,0.000000
