In [1]:
import pandas as pd
import pickle

# Load the feature-engineered sales transaction file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='olist_processado.csv')

In [2]:
# Preview the first two rows to sanity-check the loaded columns.
df.head(2)

Unnamed: 0,customer_id,customer_city,customer_state,order_id,order_purchase_timestamp,order_item_id,product_id,price,review_id,review_score,customer_lat,customer_long,product_category_name,quantidade_item
0,06b8999e2fba1a1fbc88172c00ba8bc7,franca,SP,00e7ee1b050b8499577073aeb2a297a1,2017-05-16 15:05:35,1,a9516a079e37a9c9c36b9b78b10169e8,124.99,88b8b52d46df026a9d1ad2136a59b30b,4,-20.500253,-47.400367,moveis_escritorio,1
1,8912fc0c3bbf1e2fbf35819e21706718,santarem,PA,c1d2b34febe9cd269e378117d6681172,2017-11-09 00:50:13,1,a9516a079e37a9c9c36b9b78b10169e8,112.99,7fc63200f12eebb5f387856afdd63db8,1,-2.441535,-54.720801,moveis_escritorio,2


In [3]:
# Total quantity purchased by each customer within each product category.
# The result is indexed by (product_category_name, customer_id) and has a
# single column, quantidade_item, holding the summed quantity.
prod_cliente_qty_df = (
    df.groupby(['product_category_name', 'customer_id'])[['quantidade_item']]
    .sum()
)

In [4]:
# Move the index levels back into regular columns, leaving a default
# integer index on the frame.
prod_cliente_qty_df = prod_cliente_qty_df.reset_index()

In [5]:
# Number of distinct customers who purchased from each product category.
# The result is indexed by product_category_name with one column, customer_id,
# holding the distinct-customer count.
prod_cliente_count_df = (
    df.groupby('product_category_name')[['customer_id']]
    .nunique()
)

In [6]:
# Name the single customer-count column 'no_clientes'.
# The replacement label list must match the current number of columns.
prod_cliente_count_df = prod_cliente_count_df.set_axis(['no_clientes'], axis=1)

In [7]:
# Move the index back into a regular column, leaving a default integer index
# on the frame.
prod_cliente_count_df = prod_cliente_count_df.reset_index()

In [8]:
# Combine the per-customer quantities with the per-category customer counts.
# Inner join on product_category_name: only categories present in both frames
# are kept.
prod_cliente_df = prod_cliente_qty_df.merge(
    prod_cliente_count_df,
    how='inner',
    on='product_category_name',
)

In [9]:
# Build a customer x product-category matrix: one row per customer_id, one
# column per product_category_name, and the purchased quantity
# (quantidade_item) as the cell value. Combinations with no value are
# filled with 0.
prod_cliente_pivot_df = (
    prod_cliente_df
    .pivot(
        index='customer_id',
        columns='product_category_name',
        values='quantidade_item',
    )
    .fillna(0)
)

In [10]:
# Pairwise Spearman rank correlation between the columns of the pivot table.
# NOTE(review): the pivot is zero-filled, so no column pair has missing
# observations; min_periods=5 therefore only matters when the table has fewer
# than 5 rows -- confirm this is the intended behaviour.
prod_correlacao_df = prod_cliente_pivot_df.corr(
    method='spearman',
    min_periods=5,
)

In [11]:
# Display the correlation matrix computed above.
prod_correlacao_df

product_category_name,agro_industria_e_comercio,alimentos,alimentos_bebidas,artes,artes_e_artesanato,artigos_de_festas,artigos_de_natal,audio,automotivo,bebes,...,pet_shop,portateis_casa_forno_e_cafe,portateis_cozinha_e_preparadores_de_alimentos,relogios_presentes,seguros_e_servicos,sinalizacao_e_seguranca,tablets_impressao_imagem,telefonia,telefonia_fixa,utilidades_domesticas
product_category_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
agro_industria_e_comercio,1.000000,-0.002952,-0.002094,-0.001975,-0.000666,-0.000867,-0.001572,-0.002602,-0.008844,-0.007569,...,-0.005791,-0.001203,-0.000519,-0.010724,-0.000196,-0.001644,-0.001234,-0.009195,-0.002047,-0.010983
alimentos,-0.002952,1.000000,-0.003297,-0.003110,-0.001048,-0.001365,-0.002475,-0.004097,-0.013926,-0.011918,...,-0.007942,-0.001894,-0.000818,-0.016886,-0.000309,-0.002588,-0.001944,-0.014479,0.000005,-0.017294
alimentos_bebidas,-0.002094,-0.003297,1.000000,-0.002206,-0.000744,-0.000969,-0.001755,-0.002906,-0.009879,-0.008455,...,-0.006469,-0.001343,-0.000580,-0.011979,-0.000219,-0.001836,-0.001379,-0.010272,-0.002287,-0.010388
artes,-0.001975,-0.003110,-0.002206,1.000000,-0.000702,-0.000914,-0.001656,-0.002741,-0.009318,-0.007975,...,-0.006102,-0.001267,-0.000547,-0.011299,-0.000207,-0.001732,-0.001300,-0.009688,-0.002157,-0.011572
artes_e_artesanato,-0.000666,-0.001048,-0.000744,-0.000702,1.000000,-0.000308,-0.000558,-0.000924,-0.003141,-0.002688,...,-0.002057,-0.000427,-0.000185,-0.003809,-0.000070,-0.000584,-0.000438,-0.003266,-0.000727,-0.003901
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
sinalizacao_e_seguranca,-0.001644,-0.002588,-0.001836,-0.001732,-0.000584,-0.000760,-0.001378,-0.002281,-0.007755,-0.004996,...,-0.005078,-0.001055,-0.000455,-0.009403,-0.000172,1.000000,-0.001082,-0.008063,-0.001795,-0.009631
tablets_impressao_imagem,-0.001234,-0.001944,-0.001379,-0.001300,-0.000438,-0.000571,-0.001035,-0.001713,-0.005824,-0.004984,...,-0.003813,-0.000792,-0.000342,-0.007061,-0.000129,-0.001082,1.000000,-0.006055,-0.001348,-0.007232
telefonia,-0.009195,-0.014479,-0.010272,-0.009688,-0.003266,-0.004253,-0.007709,-0.012762,-0.042551,-0.037129,...,-0.028409,-0.005899,-0.002548,-0.052133,-0.000963,-0.008063,-0.006055,1.000000,-0.008928,-0.053877
telefonia_fixa,-0.002047,0.000005,-0.002287,-0.002157,-0.000727,-0.000947,-0.001716,-0.002841,-0.009659,-0.008266,...,-0.006325,-0.001313,-0.000567,-0.011712,-0.000214,-0.001795,-0.001348,-0.008928,1.000000,-0.011995


In [12]:
# Export the correlation matrix to CSV (index labels are written as the
# first column by default).
prod_correlacao_df.to_csv(path_or_buf='produto-produto-matriz-correlacao.csv')

In [13]:
# Persist the correlation matrix as a pickle file.
# The context manager guarantees the file handle is flushed and closed, even
# if pickle.dump raises, so the file on disk is complete before anything
# else reads it.
with open('produto_correlacao_modelo.pkl', 'wb') as modelo_arquivo:
    pickle.dump(prod_correlacao_df, modelo_arquivo)