In [0]:
from pyspark.sql.functions import count, when, isnull, col

# Base directory for every dataset this notebook reads and writes.
# NOTE(review): Spark and dbutils.fs resolve this against the DBFS root
# (i.e. dbfs:/dbfs/FileStore/ifood), not through the /dbfs local mount of
# dbfs:/FileStore/ifood -- confirm this is the intended location.
root_dir = '/dbfs/FileStore/ifood'

# Last expression of the cell: list the directory so its contents show up as output.
dbutils.fs.ls(root_dir)

#### Carregando tabelas processadas

In [0]:
# Enriched orders table, reduced to 2 partitions and cached because it is reused
# by the SQL views and by the ABT built from it.
pedidos = (
    spark.read
    .parquet(f'{root_dir}/enrich/recomendacao')
    .repartition(2)
    .cache()
)

# Push-notification table per user and month (loaded here; not used by the cells shown below).
df_push_user_month = spark.read.parquet(f'{root_dir}/enrich/push_user_month')

# Columns carried into the analytical base table (ABT), plus the customer key
# needed for the join with the per-customer aggregates.
colunas_abt = [
    'merchant_dish_type',
    'promo_items_quantity',
    'hora_pedido',
    'mes_pedido',
    'paid_amount',
    'lunch',
    'customer_id_x',
]
pedidosabt = pedidos.select(*colunas_abt)

#### Verificando missing

In [0]:
def contar_missing(df):
  """Count the null values of every column of ``df``.

  Args:
    df: Spark DataFrame to inspect.

  Returns:
    A DataFrame with one row and the same column names as ``df``, where each
    value is the number of rows in which that column is null.
  """
  # when(...) without otherwise() yields null for non-null inputs, and count()
  # skips nulls, so each aggregate counts only the rows where the column is null.
  contagens = [
    count(when(isnull(nome), nome)).alias(nome)
    for nome in df.columns
  ]
  return df.select(contagens)

In [0]:
# Show the number of null values per column of the ABT input columns.
display(contar_missing(pedidosabt))

merchant_dish_type,promo_items_quantity,hora_pedido,mes_pedido,paid_amount,lunch,customer_id_x
0,0,0,0,0,0,0


#### Criando tabela auxiliar para pegar o primeiro e o segundo pedido mais frequente por cliente

In [0]:
# Register the orders DataFrame as the temporary view "orders" so the %sql cells can query it.
pedidos.createOrReplaceTempView("orders")

In [0]:
%sql
-- View `teste`: number of orders per (customer, merchant dish type) pair.
CREATE OR REPLACE TEMPORARY VIEW teste AS
SELECT
  customer_id_x AS customer_id,
  merchant_dish_type,
  COUNT(*) AS total
FROM orders
GROUP BY
  customer_id_x,
  merchant_dish_type


In [0]:
%sql

-- View `customer_1`: one row per customer with the merchant_dish_type that has the
-- highest order count in `teste`, exposed as `prato_mais_frequente`.
--
-- row_number() is used instead of dense_rank(): dense_rank() assigns rank 1 to every
-- dish type tied for the highest total, so a customer with a tie yielded several rows
-- and each of that customer's orders was duplicated by the left join on customer_id.
-- The secondary sort on merchant_dish_type makes the pick among tied dish types
-- deterministic across runs.
CREATE OR REPLACE TEMPORARY VIEW customer_1 AS

select customer_id, merchant_dish_type as prato_mais_frequente from (
  SELECT
    *,
    row_number() over (
      partition by customer_id
      order by total desc, merchant_dish_type
    ) as rank

  FROM teste
) final

where rank = 1

In [0]:
# Load the customer_1 temporary view (customer_id, prato_mais_frequente) as a DataFrame.
customer = spark.table('customer_1')

In [0]:
# Attach each customer's most frequent dish type to that customer's orders.
# A left join keeps every order, even when the customer has no row in `customer`.
mesmo_cliente = pedidosabt.customer_id_x == customer.customer_id
aux = pedidosabt.join(customer, on=mesmo_cliente, how='left')

# The key from the right-hand side duplicates customer_id_x, so it is discarded.
pedidosabt = aux.drop('customer_id')

In [0]:
# One 0/1 indicator column per dish type, set to 1 when it is the customer's most
# frequent dish type. The list order defines the order of the new columns.
indicadores_prato = [
    ("lanche_1_freq", "Lanches"),
    ("pizza_1_freq", "Pizza"),
    ("comida_brasileira_1_freq", "Comida Brasileira"),
    ("outros_1_freq", "outros"),
]
for nome_coluna, prato in indicadores_prato:
    eh_mais_frequente = pedidosabt['prato_mais_frequente'] == prato
    pedidosabt = pedidosabt.withColumn(nome_coluna, when(eh_mais_frequente, 1).otherwise(0))

# Remove the join key and the helper column now that the indicators exist.
# 'segundo_prato_mais_frequente' and 'event_date' are not among the columns selected
# for pedidosabt in this notebook; DataFrame.drop skips names absent from the schema.
colunas_descartadas = ('customer_id_x', 'prato_mais_frequente', 'segundo_prato_mais_frequente',
                       'event_date')
pedidosabt = pedidosabt.drop(*colunas_descartadas)

display(pedidosabt)

merchant_dish_type,promo_items_quantity,hora_pedido,mes_pedido,paid_amount,lunch,lanche_1_freq,pizza_1_freq,comida_brasileira_1_freq,outros_1_freq
outros,0.0,19,11,34.0,0,0,0,0,1
Lanches,0.0,11,8,64.5,1,0,0,1,0
outros,0.0,19,11,55.9,0,0,0,0,1
Comida Brasileira,0.0,14,12,29.4,1,0,0,1,0
outros,0.0,7,8,53.0,0,0,0,0,1
Lanches,0.0,19,9,25.5,0,0,0,0,1
Pizza,0.0,21,9,67.0,0,0,1,0,0
Pizza,0.0,19,6,52.85,0,0,0,0,1
Comida Brasileira,1.0,11,9,27.0,1,0,0,1,0
Comida Brasileira,1.0,11,6,20.5,1,0,0,1,0


In [0]:
# Random 70% / 30% train/test split of the ABT; the fixed seed keeps it repeatable.
pesos_split = [0.7, 0.3]
semente_split = 42
abt_treino, abt_teste = pedidosabt.randomSplit(pesos_split, seed=semente_split)

In [0]:
# Persist both partitions of the ABT as parquet, replacing any previous output.
destinos_abt = [
    (abt_treino, f'{root_dir}/abt/abt_treino'),
    (abt_teste, f'{root_dir}/abt/abt_teste'),
]
for particao, caminho in destinos_abt:
    particao.write.parquet(caminho, mode='overwrite')