In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import pickle

# Load the feature-engineered sales transaction file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='olist_processado.csv')

## Construir Matriz de Correlação para as relações Cliente-Produto (usando recomendação baseada em Usuário-Usuário) 

In [2]:
# Preview the first two rows to check the columns that were loaded.
df.head(2)

Unnamed: 0,customer_id,customer_city,customer_state,order_id,order_purchase_timestamp,order_item_id,product_id,price,review_id,review_score,customer_lat,customer_long,product_category_name,quantidade_item
0,06b8999e2fba1a1fbc88172c00ba8bc7,franca,SP,00e7ee1b050b8499577073aeb2a297a1,2017-05-16 15:05:35,1,a9516a079e37a9c9c36b9b78b10169e8,124.99,88b8b52d46df026a9d1ad2136a59b30b,4,-20.500253,-47.400367,moveis_escritorio,1
1,8912fc0c3bbf1e2fbf35819e21706718,santarem,PA,c1d2b34febe9cd269e378117d6681172,2017-11-09 00:50:13,1,a9516a079e37a9c9c36b9b78b10169e8,112.99,7fc63200f12eebb5f387856afdd63db8,1,-2.441535,-54.720801,moveis_escritorio,2


In [6]:
# Total quantity bought by each customer within each product category.
# reset_index() turns the two group keys back into ordinary columns.
prod_cliente_qty_df = (
    df
    .groupby(['product_category_name', 'customer_id'])
    .agg({'quantidade_item': 'sum'})
    .reset_index()
)

# Number of distinct customers that bought from each product category,
# stored in a column named 'no_clientes'.
prod_cliente_count_df = (
    df
    .groupby(['product_category_name'])
    .agg({'customer_id': 'nunique'})
    .rename(columns={'customer_id': 'no_clientes'})
    .reset_index()
)

# Attach the per-category customer count to every (category, customer) row.
prod_cliente_df = prod_cliente_qty_df.merge(
    prod_cliente_count_df,
    how='inner',
    on='product_category_name',
)

In [8]:
# Preview the merged (category, customer, quantity, customer-count) table.
prod_cliente_df.head()

Unnamed: 0,product_category_name,customer_id,quantidade_item,no_clientes
0,agro_industria_e_comercio,0074a15febc9459ab2f999ea0e22c8d6,1,182
1,agro_industria_e_comercio,01122215dd21ac872ae567ec4e351e01,1,182
2,agro_industria_e_comercio,01c79a7d3c3d25b655514206df838701,1,182
3,agro_industria_e_comercio,04d5cb41e92cebd0f1051d38c10f5ec4,1,182
4,agro_industria_e_comercio,05455dfa7cd02f13d132aa7a6a9729c6,36,182


In [21]:
# Work on a random subset of at most 3000 (category, customer) rows
# (presumably to keep the customer-by-customer correlation matrix small
# enough to compute — confirm the intended size).
# random_state pins the draw so the sample, and every matrix derived from it,
# is identical after a kernel restart; without it each run picks different
# customers. n is clamped because sample() raises when n exceeds the number
# of rows and replace is False.
df_sample = prod_cliente_df.sample(n=min(3000, len(prod_cliente_df)), random_state=42)

In [22]:
# Reshape the sample into a wide table: one row per product category, one
# column per customer, each cell holding the summed quantity for that pair.
# Pairs that do not occur in the sample become 0 instead of NaN.
prod_cliente_pivot_df = (
    df_sample
    .pivot(index='product_category_name', columns='customer_id', values='quantidade_item')
    .fillna(0)
)

In [23]:
# Correlate every pair of customer columns of the pivot to build the
# customer-to-customer correlation matrix.
# Spearman is the method kept: per the author's note, Pearson did not give
# better results and Kendall took too long to run.
cliente_correlacao_df_s = prod_cliente_pivot_df.corr(
    method='spearman',
    min_periods=5,
)
cliente_correlacao_df_s

customer_id,001f35d9f262c558fd065346fbf5801d,002c625393dfa194828b462a6ef8cb0b,002f067b028a3643ad3a0969c7a0f3dc,002f90a6eb386bc43bc9ba200db31a89,0030fbb95fbe7e61f798a80edb01a8e9,00331de1659c7f4fb660c8810e6de3f5,0040a8417928d0d5abd5169cd7877181,0049e8442c2a3e4a8d1ff5a9549abd53,0057f8b7c332d630b992d6a8ca79748c,006431d77c665533ac99b501d8a6ab06,...,ff217adc6431eb19f8af637ddd66a2d8,ff582f34a30b0646c52541de86107865,ff58662c328f81d3ee549c9caa942f39,ff6ed6bbecff417c54e415deed2caa67,ff9f37a06f82b3db748599da8b08eb66,ffa5d7160302ea1cc18a275a81930a42,ffa8532faa3bdc63c11d3124b78022ca,ffbbb95cb0540341b9900df515ba893e,ffc57553e2b7342c1f572edb68e0651f,fff4f0c85a4d710abe06599e1ed08a42
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
001f35d9f262c558fd065346fbf5801d,1.000000,-0.015385,-0.015385,-0.015385,-0.015385,1.000000,-0.015385,-0.015385,-0.015385,-0.015385,...,-0.015385,1.000000,-0.015385,-0.015385,-0.015385,1.000000,-0.015385,-0.015385,-0.015385,-0.015385
002c625393dfa194828b462a6ef8cb0b,-0.015385,1.000000,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,...,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385
002f067b028a3643ad3a0969c7a0f3dc,-0.015385,-0.015385,1.000000,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,...,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385
002f90a6eb386bc43bc9ba200db31a89,-0.015385,-0.015385,-0.015385,1.000000,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,...,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385
0030fbb95fbe7e61f798a80edb01a8e9,-0.015385,-0.015385,-0.015385,-0.015385,1.000000,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,...,1.000000,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ffa5d7160302ea1cc18a275a81930a42,1.000000,-0.015385,-0.015385,-0.015385,-0.015385,1.000000,-0.015385,-0.015385,-0.015385,-0.015385,...,-0.015385,1.000000,-0.015385,-0.015385,-0.015385,1.000000,-0.015385,-0.015385,-0.015385,-0.015385
ffa8532faa3bdc63c11d3124b78022ca,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,1.000000,-0.015385,-0.015385,-0.015385,...,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,1.000000,-0.015385,-0.015385,-0.015385
ffbbb95cb0540341b9900df515ba893e,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,...,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,1.000000,-0.015385,-0.015385
ffc57553e2b7342c1f572edb68e0651f,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,...,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,-0.015385,1.000000,-0.015385


### Salva a Matriz de Correlação Cliente-Cliente em um arquivo .csv

In [25]:
# Persist the Spearman customer-to-customer matrix as CSV (index included).
cliente_correlacao_df_s.to_csv(path_or_buf='cliente-cliente-matriz-correlacao.csv')

### Cria um arquivo Pickle (.pkl) com o dataframe da Matriz de Correlação

In [27]:
# Serialize the Spearman correlation DataFrame to a pickle file.
# The context manager flushes and closes the file handle even if dump()
# raises; a bare open() passed inline is never explicitly closed.
# NOTE: only unpickle this file from a trusted location — pickle.load can
# execute arbitrary code.
with open('cliente_correlacao_modelo.pkl', 'wb') as arquivo_pkl:
    pickle.dump(cliente_correlacao_df_s, arquivo_pkl)

In [14]:
# Pearson variant of the customer-to-customer correlation matrix, computed
# over the same pivot for comparison with the Spearman result.
cliente_correlacao_df_pe = prod_cliente_pivot_df.corr(
    method='pearson',
    min_periods=5,
)

In [15]:
# Display the Pearson customer-to-customer correlation matrix.
cliente_correlacao_df_pe

customer_id,002450e9ea44cb90b8a07ba0b197e149,006b5498d9494c061f8c2f80a6c2f343,008657b86f500495539ffa6d275351b7,0118758814dbb115d346a1e8efa705e9,01187871d33eda6ba14aea590bb50d0e,0127d98d4a7e205081ddac271d6b8e52,0130ec6b73b4f7fb58e4a4a78774b47b,017f022018ad01373771ddfd46eb5dc6,018d655fe703cb7b56ee34f117429789,02277211c9842fee492b8e37a17cf0ff,...,fb10dfd11e28e35c964ac8ba36c7690d,fb231f793a63556c123e4c2e8c23536d,fb8d76cc917e3fc3c6ac3a9beb0d6831,fbebdb808932c68bf8286924bebd2f35,fc983c513a5b593a724e389bf39f98a0,fd02e2278bbad54bce38c92d69935f41,fea3ed32e9d3eea5878fb54a399461a8,febea3481938a4b17570c46cb188996b,fef6ee8cb0a99f0fa68808e0fadca078,ffde64401f7cb62d2e8696d9467c781d
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
002450e9ea44cb90b8a07ba0b197e149,1.00,-0.02,-0.02,-0.02,1.00,-0.02,-0.02,-0.02,-0.02,-0.02,...,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,1.00,-0.02,-0.02
006b5498d9494c061f8c2f80a6c2f343,-0.02,1.00,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,...,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02
008657b86f500495539ffa6d275351b7,-0.02,-0.02,1.00,1.00,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,...,1.00,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02
0118758814dbb115d346a1e8efa705e9,-0.02,-0.02,1.00,1.00,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,...,1.00,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02
01187871d33eda6ba14aea590bb50d0e,1.00,-0.02,-0.02,-0.02,1.00,-0.02,-0.02,-0.02,-0.02,-0.02,...,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,1.00,-0.02,-0.02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fd02e2278bbad54bce38c92d69935f41,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,...,-0.02,-0.02,-0.02,-0.02,-0.02,1.00,-0.02,-0.02,-0.02,-0.02
fea3ed32e9d3eea5878fb54a399461a8,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,1.00,-0.02,...,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,1.00,-0.02,-0.02,-0.02
febea3481938a4b17570c46cb188996b,1.00,-0.02,-0.02,-0.02,1.00,-0.02,-0.02,-0.02,-0.02,-0.02,...,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,1.00,-0.02,-0.02
fef6ee8cb0a99f0fa68808e0fadca078,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,...,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,-0.02,1.00,-0.02


In [16]:
# Kendall variant of the customer-to-customer correlation matrix.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so
# cliente_correlacao_df_k is only assigned if this call runs to completion.
cliente_correlacao_df_k = prod_cliente_pivot_df.corr(method='kendall',min_periods=5)

KeyboardInterrupt: 

In [None]:
# Display the Kendall matrix.
# NOTE(review): raises NameError unless the cell that assigns
# cliente_correlacao_df_k has finished running.
cliente_correlacao_df_k

In [None]:
# Preview the per-(category, customer) summed quantities.
prod_cliente_qty_df.head()

In [None]:
# Preview the distinct-customer count per product category.
prod_cliente_count_df.head()