In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import pickle

# Load the feature-engineered sales transaction file into `df`.
# The path is relative to the notebook's working directory.
CAMINHO_VENDAS = 'olist_processado.csv'
df = pd.read_csv(CAMINHO_VENDAS)

## Construir Matriz de Correlação para as relações Cliente-Produto (usando recomendação baseada em Usuário-Usuário) 

In [2]:
# Preview the first two rows to check which columns were loaded.
df.head(n=2)

Unnamed: 0,customer_id,customer_city,customer_state,order_id,order_purchase_timestamp,order_item_id,product_id,price,product_category_name,customer_lat,customer_long
0,06b8999e2fba1a1fbc88172c00ba8bc7,franca,SP,00e7ee1b050b8499577073aeb2a297a1,2017-05-16 15:05:35,1,a9516a079e37a9c9c36b9b78b10169e8,124.99,moveis_escritorio,-20.500253,-47.400367
1,8912fc0c3bbf1e2fbf35819e21706718,santarem,PA,c1d2b34febe9cd269e378117d6681172,2017-11-09 00:50:13,1,a9516a079e37a9c9c36b9b78b10169e8,112.99,moveis_escritorio,-2.441535,-54.720801


In [3]:
# Total amount spent (sum of `price`) by each customer in each product category.
# The group keys are turned back into ordinary columns.
prod_cliente_qty_df = (
    df.groupby(['product_category_name', 'customer_id'])['price']
    .sum()
    .reset_index()
)

# Number of distinct customers that bought from each product category,
# stored in a column named `no_clientes`.
prod_cliente_count_df = (
    df.groupby(['product_category_name'])['customer_id']
    .nunique()
    .rename('no_clientes')
    .reset_index()
)

# Attach the per-category customer count to the per-customer spend rows.
# Note: `no_clientes` is carried along but is not used by the pivot below.
prod_cliente_df = prod_cliente_qty_df.merge(
    prod_cliente_count_df,
    how='inner',
    on='product_category_name',
)

# Wide table: one row per customer, one column per product category, values are
# the summed price. Customer/category pairs with no purchase are filled with 0.
prod_cliente_pivot_df = (
    prod_cliente_df
    .pivot(index='customer_id', columns='product_category_name', values='price')
    .fillna(0)
)

# Spearman rank correlation. Per the original author's note, Pearson did not give
# better results and Kendall took too long to run.
# NOTE(review): DataFrame.corr correlates *columns*, so with categories as columns
# the result is a category-by-category matrix, despite the `cliente_` variable name.
# `min_periods=5` is kept as in the original; since the pivot has no NaN after
# fillna(0) it should not exclude any pair.
cliente_correlacao_df = prod_cliente_pivot_df.corr(method='spearman', min_periods=5)
cliente_correlacao_df

product_category_name,agro_industria_e_comercio,alimentos,alimentos_bebidas,artes,artes_e_artesanato,artigos_de_festas,artigos_de_natal,audio,automotivo,bebes,...,pet_shop,portateis_casa_forno_e_cafe,portateis_cozinha_e_preparadores_de_alimentos,relogios_presentes,seguros_e_servicos,sinalizacao_e_seguranca,tablets_impressao_imagem,telefonia,telefonia_fixa,utilidades_domesticas
product_category_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
agro_industria_e_comercio,1.000000,-0.002952,-0.002094,-0.001975,-0.000666,-0.000867,-0.001572,-0.002602,-0.008842,-0.007568,...,-0.005791,-0.001203,-0.000519,-0.010718,-0.000196,-0.001644,-0.001234,-0.009193,-0.002047,-0.010978
alimentos,-0.002952,1.000000,-0.003297,-0.003110,-0.001048,-0.001365,-0.002475,-0.004097,-0.013923,-0.011916,...,-0.007970,-0.001894,-0.000818,-0.016877,-0.000309,-0.002588,-0.001944,-0.014475,-0.000006,-0.017286
alimentos_bebidas,-0.002094,-0.003297,1.000000,-0.002206,-0.000744,-0.000969,-0.001755,-0.002906,-0.009877,-0.008454,...,-0.006469,-0.001343,-0.000580,-0.011973,-0.000219,-0.001836,-0.001379,-0.010269,-0.002287,-0.010448
artes,-0.001975,-0.003110,-0.002206,1.000000,-0.000702,-0.000914,-0.001656,-0.002741,-0.009316,-0.007974,...,-0.006102,-0.001267,-0.000547,-0.011293,-0.000207,-0.001732,-0.001300,-0.009686,-0.002157,-0.011567
artes_e_artesanato,-0.000666,-0.001048,-0.000744,-0.000702,1.000000,-0.000308,-0.000558,-0.000924,-0.003141,-0.002688,...,-0.002057,-0.000427,-0.000185,-0.003807,-0.000070,-0.000584,-0.000438,-0.003265,-0.000727,-0.003899
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
sinalizacao_e_seguranca,-0.001644,-0.002588,-0.001836,-0.001732,-0.000584,-0.000760,-0.001378,-0.002281,-0.007753,-0.005039,...,-0.005078,-0.001055,-0.000455,-0.009399,-0.000172,1.000000,-0.001082,-0.008061,-0.001795,-0.009627
tablets_impressao_imagem,-0.001234,-0.001944,-0.001379,-0.001300,-0.000438,-0.000571,-0.001035,-0.001713,-0.005822,-0.004983,...,-0.003813,-0.000792,-0.000342,-0.007058,-0.000129,-0.001082,1.000000,-0.006053,-0.001348,-0.007229
telefonia,-0.009193,-0.014475,-0.010269,-0.009686,-0.003265,-0.004252,-0.007707,-0.012759,-0.042596,-0.037115,...,-0.028401,-0.005898,-0.002547,-0.052149,-0.000963,-0.008061,-0.006053,1.000000,-0.009007,-0.053840
telefonia_fixa,-0.002047,-0.000006,-0.002287,-0.002157,-0.000727,-0.000947,-0.001716,-0.002841,-0.009657,-0.008265,...,-0.006325,-0.001313,-0.000567,-0.011706,-0.000214,-0.001795,-0.001348,-0.009007,1.000000,-0.011990


### Salva a Matriz de Correlação Cliente-Cliente em um arquivo .csv

In [4]:
# Persist the correlation matrix as a CSV file in the working directory
# (with pandas defaults, the row labels are written as the first column).
cliente_correlacao_df.to_csv(path_or_buf='cliente-cliente-matriz-correlacao.csv')

### Cria um arquivo Pickle (.pkl) com o dataframe da Matriz de Correlação

In [5]:
# Serialize the correlation matrix DataFrame to a pickle file.
# The `with` block guarantees the file handle is flushed and closed even if
# pickling raises; the original bare `open(...)` was never closed explicitly.
with open('cliente_correlacao_modelo.pkl', 'wb') as arquivo_pkl:
    pickle.dump(cliente_correlacao_df, arquivo_pkl)