##### Carregando pacotes e tabelas

In [0]:
# Libraries used by the notebook (kept at the top of the cell so dependencies are visible first)
import datetime as dt
import re

import numpy as np
import pandas as pd

# Base directory holding the input parquet tables
root_dir = '/dbfs/FileStore/ifood'
dbutils.fs.ls(root_dir)

# Load the two source tables as Spark DataFrames
df_orders = spark.read.parquet(f'{root_dir}/orders')
df_orders_with_cost_revenue = spark.read.parquet(f'{root_dir}/orders_with_cost_revenue')

##### Feature engineering

- Criação da tabela de clientes com base nas informações de pedidos

- Foram utilizados os 6 tipos de prato mais pedidos, que representam 80% da tabela de orders; os demais tipos são agrupados na categoria 'outros'

In [0]:
# Left-join every order with its cost/revenue record, then drop the duplicated
# right-hand key columns created by the merge.
orders = (
    df_orders.toPandas()
    .merge(df_orders_with_cost_revenue.toPandas(), on='order_number', how='left')
    .drop(columns=['customer_id_y', 'frn_id_y', 'order_date_local_y'])
)

# Keep the six most frequent dish types as-is and bucket every other value as 'outros'
# (rows are relabelled, not filtered out).
is_top_dish_type = orders['merchant_dish_type'].isin(
    ['Lanches', 'Comida Brasileira', 'Pizza', 'Comida Japonesa', 'Comida Saudável', 'Açaí']
)
orders['merchant_dish_type'] = orders['merchant_dish_type'].where(is_top_dish_type, 'outros')

# Label each order as placed mid-week or on the weekend, based on whether
# `order_shift` contains the text "weekday".
is_weekday_order = orders['order_shift'].str.contains("weekday").astype(int)
orders['fim_de_semana_pedido'] = np.where(is_weekday_order == 1, 'Meio da Semana', 'Fim da Semana')

# Time-of-day label for each shift; the mapping collapses the weekday/weekend
# variants of a shift onto the same label.
shift_labels = {
    'dinner': 'dinner (17 - 23:59h)',
    'lunch': 'lunch (10 - 14:59h)',
    'snack': 'snack (15 - 16:59h)',
    'dawn': 'dawn (0 - 4:59h)',
    'breakfast': 'breakfast (5 - 9:59h)',
}
faixa_hora = {
    f'{day_kind} {shift}': label
    for shift, label in shift_labels.items()
    for day_kind in ('weekday', 'weekend')
}

# Customer promotion-sensitivity segment -> description
cliente_promocao = {
    'Alta': '> 66% das compras utilizando promoção',
    'Media': '33-66% das compras utilizando promoção',
    'Baixa': '<33% das compras utilizando promoção',
}

# Customer monthly order-frequency bucket -> description
freq_mes_cliente = {
    1: 'Menos que 1 pedido/mês',
    2: 'Entre 1 e 2 pedidos/mês',
    3: 'Entre 2 e 4 pedidos/mês',
    4: 'Entre 4 e 10 pedidos/mês',
    5: 'Mais que 10 pedidos/mês',
}

# Estimated (IBGE) income bucket by location -> income range
renda_estimada_ibge = {
    1: '> 19960',
    2: '9998 - 19960',
    3: '3992 - 9998',
    4: '1996 - 3992',
    5: '< 1996',
    6: 'Sem informação',
}

# Recency bucket -> days since the last purchase
intervalo_dias = {
    1: '> 91 dias',
    2: '28 e 91 dias atrás',
    3: '14 e 28 dias atrás',
    4: '7 e 14 dias atrás',
    5: '< 7 dias',
}

# Restaurant coverage (merchant offer) bucket -> range
cobertura = {
    1: '0 - 30',
    2: '30 - 90',
    3: '90 - 150',
    4: '150 - 500',
    5: '> 500',
}

# Replace each coded column with its label; values missing from a mapping become NaN.
bucket_labels_by_column = {
    'order_shift': faixa_hora,
    'customer_seg_benefits_sensitivity_bucket': cliente_promocao,
    'customer_seg_frequency_bucket': freq_mes_cliente,
    'customer_seg_gross_income_bucket': renda_estimada_ibge,
    'customer_seg_recency_bucket': intervalo_dias,
    'customer_seg_merchant_offer_bucket': cobertura,
}
for bucket_column, bucket_labels in bucket_labels_by_column.items():
    orders[bucket_column] = orders[bucket_column].map(bucket_labels)

# Promotion flag -> 'Sim' when the value equals 1.0, 'Não' for anything else
orders['promo_is_promotion'] = np.where(orders['promo_is_promotion'] == 1.0, 'Sim', 'Não')

# Parse the order timestamp a single time and derive every calendar feature from it;
# unparseable values are coerced to NaT and therefore yield missing features.
order_ts = pd.to_datetime(orders.order_timestamp_local, unit='ns', errors='coerce')

orders['mes_pedido'] = order_ts.dt.month              # month of the order
orders['dia_pedido'] = order_ts.dt.day                # day of month of the order
orders['hora_pedido'] = order_ts.dt.hour              # hour of the order
orders['dia_semana_pedido'] = order_ts.dt.day_name()  # weekday name of the order

# `cohort_month` is overwritten in place with just its month number
orders['cohort_month'] = pd.to_datetime(orders.cohort_month, unit='ns', errors='coerce').dt.month

# `first_order_date` is overwritten in place with just its day of month
orders['first_order_date'] = pd.to_datetime(orders.first_order_date, unit='ns', errors='coerce').dt.day

# Split each centroid id into latitude and longitude.
# NOTE(review): assumes ids are two signed decimals written back to back,
# e.g. '-23.54-46.65' — confirm against the source data.
# The decimal point is escaped so that only a literal '.' is accepted; an unescaped
# '.' would match any character and could silently parse malformed ids into wrong
# coordinates. Ids that do not match produce NaN in both columns.
coord_pattern = r'(-?\d?\d\.\d\d?)(-?\d?\d\.\d\d?)'

orders[['lat_usuario', 'log_usuario']] = (
    orders['customer_centroid_id'].str.extract(coord_pattern).astype(float)
)
orders[['lat_restaurante', 'log_restaurante']] = (
    orders['merchant_centroid_id'].str.extract(coord_pattern).astype(float)
)

# Short names for the indicator columns created by get_dummies.
# NOTE(review): 'merchant_dish_type' is not one-hot encoded below, so get_dummies
# produces no 'merchant_dish_type_*' columns and those rename keys currently have no effect.
dummy_column_names = {
    'merchant_dish_type_Açaí': 'acai',
    'merchant_dish_type_Comida Japonesa': 'comida_japonesa',
    'merchant_dish_type_Comida Saudável': 'comida_saudavel',
    'merchant_dish_type_Lanches': 'lanches',
    'merchant_dish_type_Pizza': 'pizza',
    'merchant_dish_type_Comida Brasileira': 'comida_brasileira',
    'merchant_dish_type_outros': 'outros_pratos',
    'order_shift_breakfast (5 - 9:59h)': 'breakfast',
    'order_shift_dawn (0 - 4:59h)': 'dawn',
    'order_shift_dinner (17 - 23:59h)': 'dinner',
    'order_shift_lunch (10 - 14:59h)': 'lunch',
    'order_shift_snack (15 - 16:59h)': 'snack',
    'dia_semana_pedido_Friday': 'friday',
    'dia_semana_pedido_Monday': 'monday',
    'dia_semana_pedido_Saturday': 'saturday',
    'dia_semana_pedido_Sunday': 'sunday',
    'dia_semana_pedido_Thursday': 'thurday',
    'dia_semana_pedido_Tuesday': 'tuesday',
    'dia_semana_pedido_Wednesday': 'wednesday',
    'fim_de_semana_pedido_Fim da Semana': 'fim_semana',
    'fim_de_semana_pedido_Meio da Semana': 'meio_semana',
}

# One-hot encode the categorical order attributes, then apply the short names.
PdOrders = pd.get_dummies(
    orders,
    columns=['dia_semana_pedido', 'order_shift', 'fim_de_semana_pedido', 'promo_is_promotion'],
).rename(columns=dummy_column_names)

# Columns kept for the downstream tables; the order below is the output column order.
colunas = (
    # order-level metrics and calendar features
    ['distance_merchant_customer',
     'normal_items_quantity',
     'promo_items_quantity',
     'order_lag_at_login',
     'order_lead_at_login',
     'cohort_month',
     'first_order_date',
     'months_after_first_purchase',
     'general_net_profit',
     'mes_pedido',
     'dia_pedido',
     'order_total',
     'credit',
     'paid_amount',
     'hora_pedido']
    # customer / restaurant coordinates
    + ['lat_usuario', 'log_usuario', 'lat_restaurante', 'log_restaurante']
    # weekday indicators
    + ['friday', 'monday', 'saturday', 'sunday', 'thurday', 'tuesday', 'wednesday']
    # shift indicators
    + ['breakfast', 'dawn', 'dinner', 'lunch', 'snack']
    # weekend / mid-week indicators
    + ['fim_semana', 'meio_semana']
    # identifiers, validity flag and raw dish type
    + ['order_number', 'customer_id_x', 'valid_order', 'merchant_dish_type']
)

PdOrders_filtro = PdOrders[colunas]

##### Agrupando pedidos por cliente

In [0]:
# Convert the per-order feature table to a Spark DataFrame and register it
# as the temporary view `orders` so it can be queried from SQL cells.
parquetOrders = spark.createDataFrame(PdOrders_filtro)
parquetOrders.createOrReplaceTempView('orders')

In [0]:
%sql
-- Customer-level aggregates over valid orders: one row per
-- (customer, order month, cohort month, months after first purchase).
CREATE OR REPLACE TEMPORARY VIEW customer AS
SELECT
    customer_id_x,
    mes_pedido,
    cohort_month,
    months_after_first_purchase,
    -- monetary aggregates
    AVG(order_total) AS avg_order_total,
    SUM(order_total) AS sum_order_total,
    AVG(credit) AS avg_credit,
    SUM(credit) AS sum_credit,
    AVG(paid_amount) AS avg_paid_amont,
    SUM(paid_amount) AS sum_paid_amount,
    -- basket, distance and login lag/lead aggregates
    SUM(promo_items_quantity) AS sum_promo_items_quantity,
    AVG(distance_merchant_customer) AS avg_distance_merchant_customer,
    AVG(normal_items_quantity) AS avg_normal_items_quantity,
    SUM(normal_items_quantity) AS sum_normal_items_quantity,
    AVG(order_lag_at_login) AS avg_order_lag_at_login,
    SUM(order_lag_at_login) AS sum_order_lag_at_login,
    AVG(order_lead_at_login) AS avg_order_lead_at_login,
    SUM(order_lead_at_login) AS sum_order_lead_at_login,
    AVG(general_net_profit) AS avg_general_net_profit,
    SUM(general_net_profit) AS sum_general_net_profit,
    -- average order hour and customer coordinates
    AVG(hora_pedido) AS avg_hora_pedido,
    AVG(lat_usuario) AS avg_lat_customer,
    AVG(log_usuario) AS avg_log_customer,
    -- order counts per weekday
    SUM(sunday) AS sunday,
    SUM(monday) AS monday,
    SUM(tuesday) AS tuesday,
    SUM(wednesday) AS wednesday,
    SUM(thurday) AS thurday,
    SUM(friday) AS friday,
    SUM(saturday) AS saturday,
    -- order counts per shift
    SUM(breakfast) AS breakfast,
    SUM(dawn) AS dawn,
    SUM(dinner) AS dinner,
    SUM(lunch) AS lunch,
    SUM(snack) AS snack,
    -- dish-type counts are disabled: the `orders` view has no dish-type indicator columns
    -- SUM(acai) AS acai,
    -- SUM(outros_pratos) AS outros_pratos,
    -- SUM(comida_brasileira) AS comida_brasileira,
    -- SUM(comida_japonesa) AS comida_japonesa,
    -- SUM(comida_saudavel) AS comida_saudavel,
    -- SUM(lanches) AS lanches,
    -- SUM(pizza) AS pizza,
    -- order counts on weekends vs. mid-week, and total orders
    SUM(fim_semana) AS weekend,
    SUM(meio_semana) AS midweek,
    COUNT(order_number) AS count_orders
FROM orders
WHERE valid_order = 1
GROUP BY
    customer_id_x,
    mes_pedido,
    cohort_month,
    months_after_first_purchase

In [0]:
# Load the `customer` temporary view as a Spark DataFrame and preview it
abt_customer = spark.table("customer")
display(abt_customer)

customer_id_x,mes_pedido,cohort_month,months_after_first_purchase,avg_order_total,sum_order_total,avg_credit,sum_credit,avg_paid_amont,sum_paid_amount,sum_promo_items_quantity,avg_distance_merchant_customer,avg_normal_items_quantity,sum_normal_items_quantity,avg_order_lag_at_login,sum_order_lag_at_login,avg_order_lead_at_login,sum_order_lead_at_login,avg_general_net_profit,sum_general_net_profit,avg_hora_pedido,avg_lat_customer,avg_log_customer,sunday,monday,tuesday,wednesday,thurday,friday,saturday,breakfast,dawn,dinner,lunch,snack,weekend,midweek,count_orders
1211fd5b964cf7bda9b6fd263ececda4dfbb47920b1bdfe40d6c8c7c442dfe25,12,1,35.0,49.05,98.1,4.0,8.0,45.05,90.1,0.0,3679.734113911241,1.0,2.0,18.0,36.0,17.5,35.0,7.503925980359458,15.007851960718916,16.5,-23.22,-45.92,0,0,0,1,0,1,0,0,0,1,1,0,0,2,2
078e6a1d374f046eae1ead7d7f81dd0eef999aee927200b9dc9952fb25efeac3,9,4,65.0,83.9,167.8,4.0,8.0,79.9,159.8,0.0,2115.73170018726,2.0,4.0,28.0,56.0,6.5,13.0,12.21850625,24.4370125,18.5,-23.54,-46.65,0,0,0,0,0,0,2,0,0,2,0,0,2,0,2
38d4f6fd2eb80fc12b0ce374a34e04191fe769fd7bba89fe56bf46c328c09ada,12,6,18.0,45.7375,182.95,6.5,26.0,41.7375,166.95,1.0,1755.9306605190748,2.0,8.0,4.25,17.0,13.25,53.0,8.91417734375,35.656709375,20.5,-22.88,-43.3,1,0,1,0,2,0,0,0,0,4,0,0,1,3,4
f6095712f5cc4025e2f859887abb6b9d15ad16531e08465a5699223f780a2cd7,8,12,8.0,46.86,187.44,8.5,34.0,40.86,163.44,1.0,465.0897484483777,2.5,10.0,15.0,60.0,6.25,25.0,5.305261874999999,21.2210475,18.75,-16.705,-49.28,1,0,0,1,0,1,1,0,0,3,0,1,3,1,4
61bffb8b4032843affc0b0dce6c1362ac8854a02ee1a370325c5b41a8abf2c0d,11,7,52.0,61.9,61.9,4.0,4.0,57.9,57.9,0.0,4697.131359141407,2.0,2.0,78.0,78.0,22.0,22.0,8.40965,8.40965,20.0,-23.5,-46.55,0,1,0,0,0,0,0,0,0,1,0,0,0,1,1
8f733f41455213e06064f9a63147f083c665c29edd6304a31d659c9e12f47e3f,7,4,3.0,69.08571428571429,483.6,5.428571428571429,38.0,65.08571428571427,455.6,7.0,4931.439160215751,3.4285714285714284,24.0,8.142857142857142,57.0,5.285714285714286,37.0,9.42113482142857,65.94794375,19.0,-25.5,-49.21,2,0,1,0,2,0,2,0,0,6,1,0,4,3,7
2371c8e133d8b46fdb0947a39ae0c421a25da778079f6632fe9ef5fc98132eb6,10,11,23.0,57.38571428571429,401.7000000000001,9.714285714285714,68.0,53.38571428571429,373.7000000000001,1.0,2128.5061101722795,1.2857142857142858,9.0,4.0,28.0,4.142857142857143,29.0,10.114659821428573,70.80261875000001,18.0,-7.102857142857142,-34.84428571428572,1,1,1,1,1,1,1,0,0,6,1,0,3,4,7
dbb22fbd3aaf0b0946792fa09e079419eb69b7cda3184897240eee70387b50d2,12,8,28.0,43.3,129.9,7.333333333333333,22.0,39.3,117.9,2.0,3647.1618897137705,0.6666666666666666,2.0,10.333333333333334,31.0,11.666666666666666,35.0,7.871025509001821,23.613076527005465,17.666666666666668,-18.916666666666668,-48.29333333333333,0,1,0,0,0,1,1,0,0,2,1,0,2,1,3
743b72438fe41f1ba5e280bdfa3b822b193bbc4a69fd3653c7051d2ea4d733d1,7,12,31.0,72.86590909090908,1603.05,6.181818181818182,136.0,66.68409090909091,1467.05,0.0,2941.582300490144,2.590909090909091,57.0,1.2727272727272727,28.0,1.5909090909090908,35.0,9.896538072785097,217.7238376012721,16.363636363636363,-22.366818181818186,-49.06045454545454,0,5,7,3,3,2,2,0,0,10,8,4,2,20,22
9b9bfcebea6a1ffe18d75e2590a65d2dca7bb641270f3962070bb18c45b33a68,12,5,55.0,46.07454545454545,506.82,4.0,44.0,42.07454545454545,462.82,6.0,2741.840137121155,1.818181818181818,20.0,2.363636363636364,26.0,2.8181818181818183,31.0,7.102888518636008,78.13177370499609,20.09090909090909,-8.06,-34.93,2,3,1,0,2,2,1,0,0,10,1,0,5,6,11


In [0]:
# Persist the customer-level table as parquet, replacing any previous output at this path
abt_customer.write.mode('overwrite').parquet(f'{root_dir}/enrich/orders')

#### Criando ABT de recomendação

In [0]:
# Persist the per-order feature table as parquet, replacing any previous output at this path
parquetOrders.write.mode('overwrite').parquet(f'{root_dir}/enrich/recomendacao')