In [14]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
from scipy.stats import chi2_contingency
from scipy.stats import linregress
from scipy.stats import zscore
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings('ignore')

In [15]:
# PRÉ-PROCESSAMENTO

In [16]:
# Load the customer dataset CSV into a DataFrame.
# NOTE(review): the path is absolute and machine-specific, so this cell only runs
# where this exact folder exists — consider a configurable data directory.
caminho_csv = "C:/Projetos Pessoais/DataScience/segmentacao_Clientes_Ecomerce/data/dados_clientes_para_pre_processing.csv"
dados_clientes = pd.read_csv(caminho_csv)

In [17]:
# Display the loaded customer table as the cell output
dados_clientes

Unnamed: 0,Cod_cliente,Ultima_Compra,Total_transacoes,Valor_total_gasto,Avg_transacao,Dia_fav_compra,UK,Gastos_medios_mensais,Desvio_padrao_gastos_mensais,Tendencia_gastos
0,12347,2,182,4310.00,23.68,1,0,615.714286,341.070789,4.486071
1,12348,75,27,1437.24,53.23,3,0,359.310000,203.875689,-100.884000
2,12349,18,72,1457.55,20.24,0,0,1457.550000,0.000000,0.000000
3,12350,310,16,294.40,18.40,2,0,294.400000,0.000000,0.000000
4,12352,36,77,1385.74,18.00,1,0,346.435000,120.042154,-2.682000
...,...,...,...,...,...,...,...,...,...,...
5505,21821,1,2,3.35,1.68,3,1,3.350000,0.000000,0.000000
5506,21822,1,634,5699.00,8.99,3,1,5699.000000,0.000000,0.000000
5507,21823,0,730,6756.06,9.25,4,1,6756.060000,0.000000,0.000000
5508,21824,0,59,3217.20,54.53,4,1,3217.200000,0.000000,0.000000


# Padronização das variáveis

##### Não irei padronizar as seguintes variáveis:

- Cod_cliente: Pois se refere ao código de cada cliente, sem valor expressivo ou relevante;

- UK: Pois já está no formato booleano, 0 ou 1;

- Dia_fav_compra: Pois é uma variável categórica e já está codificada como inteiro em uma escala pequena (0 a 6).


In [18]:
# Columns that are left out of the standardization step
var_exclude = [
    'Cod_cliente',
    'Dia_fav_compra',
    'UK',
]

In [25]:
# Work on a copy so the frame loaded earlier stays untouched
dados = dados_clientes.copy()

# Every column except those in var_exclude will be standardized
# (the resulting Index comes back sorted by label)
colunas_padronizadas = dados.columns.difference(var_exclude)

# Scaler instance; it is fitted in a later cell
scaler = StandardScaler()

colunas_padronizadas

Index(['Avg_transacao', 'Desvio_padrao_gastos_mensais',
       'Gastos_medios_mensais', 'Tendencia_gastos', 'Total_transacoes',
       'Ultima_Compra', 'Valor_total_gasto'],
      dtype='object')

In [26]:
# Fit the scaler on the selected columns and overwrite them with their standardized
# values; the columns listed in var_exclude are left as they are
valores_padronizados = scaler.fit_transform(dados[colunas_padronizadas])
dados[colunas_padronizadas] = valores_padronizados

In [28]:
# Preview a few rows of the standardized frame; the fixed random_state keeps the
# displayed sample identical on every Restart & Run All
dados.sample(8, random_state=42)

Unnamed: 0,Cod_cliente,Ultima_Compra,Total_transacoes,Valor_total_gasto,Avg_transacao,Dia_fav_compra,UK,Gastos_medios_mensais,Desvio_padrao_gastos_mensais,Tendencia_gastos
269,12688,-0.021099,0.642088,1.755412,0.020742,3,0,4.598662,-0.511282,0.038867
4888,20276,0.762102,-0.294938,-0.442619,-0.187193,0,1,-0.160652,-0.511282,0.038867
3167,16732,-0.759289,-0.45623,-0.356008,-0.052325,1,1,0.026885,-0.511282,0.038867
4590,19533,1.57231,0.949309,0.149945,-0.198143,1,1,1.122404,-0.511282,0.038867
5269,21146,-0.408198,-0.617521,-0.633975,-0.216778,4,1,-0.574986,-0.511282,0.038867
1801,14849,-0.849312,2.308765,3.266876,-0.064117,3,1,0.152375,0.97472,0.444888
3586,17325,1.482287,-0.625202,-0.60668,-0.109494,4,1,-0.515886,-0.511282,0.038867
1348,14215,-0.939335,0.158214,0.23042,-0.104862,0,1,-0.23629,0.790769,0.068616


In [30]:
# Use the customer code as the row index, which keeps it out of the feature columns.
# Guarded and without inplace=True so that re-running this cell is a no-op instead of
# raising KeyError once 'Cod_cliente' has already become the index.
if 'Cod_cliente' in dados.columns:
    dados = dados.set_index('Cod_cliente')

In [31]:
dados.head()

Unnamed: 0_level_0,Ultima_Compra,Total_transacoes,Valor_total_gasto,Avg_transacao,Dia_fav_compra,UK,Gastos_medios_mensais,Desvio_padrao_gastos_mensais,Tendencia_gastos
Cod_cliente,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
12347,-1.020356,0.726574,1.476625,-0.030005,1,0,0.039694,1.269518,0.06602
12348,-0.363187,-0.46391,0.056129,0.281107,3,0,-0.234827,0.553195,-0.571752
12349,-0.876319,-0.118286,0.066171,-0.066223,0,0,0.941013,-0.511282,0.038867
12350,1.752356,-0.548396,-0.508972,-0.085595,2,0,-0.304323,-0.511282,0.038867
12352,-0.714277,-0.079883,0.030663,-0.089806,1,0,-0.248612,0.115483,0.022634


In [32]:
# Fit a PCA with default settings (no limit on the number of components) on the
# customer-indexed frame and keep the projected scores.
# NOTE(review): `dados` still holds the unscaled Dia_fav_compra and UK columns next to
# the standardized ones — confirm that mixing scales in the PCA input is intended.
pca = PCA()
principal_components = pca.fit_transform(dados)

In [35]:
# Display the array of component scores returned by pca.fit_transform
principal_components

array([[-1.43205556e+00,  2.03345518e+00, -1.04929574e+00, ...,
        -1.36024335e-02,  3.50455360e-01,  8.85907666e-01],
       [ 5.70914961e-01,  9.73881987e-02, -6.77039728e-01, ...,
        -3.18490331e-01,  1.73251917e-02,  8.96101191e-01],
       [-2.46360148e+00,  2.87446992e-01,  2.22778959e-02, ...,
        -7.09452682e-01,  4.38336238e-03,  8.94525900e-01],
       ...,
       [ 1.17270776e+00,  6.60175965e+00,  4.48371364e+00, ...,
        -4.57308947e-01, -1.07519489e+00, -8.13326259e-03],
       [ 1.40048229e+00,  1.62038694e+00,  1.40804986e+00, ...,
        -1.89216340e+00,  8.13264802e-02, -1.20863395e-01],
       [ 1.36041204e+00,  2.70670009e+00,  2.02252128e+00, ...,
        -1.56085735e+00, -1.87372712e-01, -9.39757682e-02]])

In [36]:
# Share of variance explained by each principal component, and its running total
# across components in the order the fitted PCA reports them
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_explained_variance = explained_variance_ratio.cumsum()