---
**Turma:** 15

**Grupo:** 11 - Bruno Correia, Gilson Costa, Vivian Liu

**Contato**: `brunovpm@hotmail.com, gilson.costa@gmail.com, liuoliveira.vivian@gmail.com`

**Licença deste notebook**:
![CC BY](https://licensebuttons.net/l/by/4.0/88x31.png)

[Clique aqui para saber mais sobre a licença CC BY v4.0](https://creativecommons.org/licenses/by/4.0/legalcode.pt)

# ETL de dados de push e session
- Geração de dados agrupados com os primeiros 15 dias para cada mês
- Usa as tabelas enriquecidas na primeira entrega de push e session

In [0]:
# Base directory used as the prefix of every path this notebook reads or writes.
# NOTE(review): Spark and dbutils presumably resolve this against the DBFS root
# (i.e. dbfs:/dbfs/FileStore/ifood) rather than the /dbfs local mount of
# dbfs:/FileStore/ifood - confirm this is the intended location.
root_dir = '/dbfs/FileStore/ifood'

# List the base directory; as the last expression of the cell, the listing is
# what the notebook displays.
dbutils.fs.ls(root_dir)

# marketing_push_full

In [0]:
# Load the `push_user_day` parquet dataset from the enrich area, shrink it to
# two partitions and cache it, then expose it to SQL cells as `push_user_day`.
df_push_user_day = (
    spark.read
    .parquet(f'{root_dir}/enrich/push_user_day')
    .repartition(2)
    .cache()
)
df_push_user_day.createOrReplaceTempView("push_user_day")

# Sumário mensal de eventos do cliente (15 dias iniciais)

In [0]:
%sql
-- Monthly per-user push summary restricted to the first 15 days of each month.
-- Output: one row per (month bucket, external_user_id). The month bucket is the
-- string produced by date_format(..., 'yyyy-MM-01') and is exposed as event_date.
CREATE OR REPLACE TEMPORARY VIEW push_user_month_half_view AS

SELECT
  date_format(event_date, 'yyyy-MM-01') AS event_date,
  external_user_id,

  -- Latest daily event_date inside the window on which the given counter was
  -- positive; NULL when the counter was never positive.
  MAX(IF(total_send > 0, event_date, NULL)) AS last_send_date,
  MAX(IF(total_received > 0, event_date, NULL)) AS last_received_date,
  MAX(IF(total_bounce > 0, event_date, NULL)) AS last_bounce_date,
  MAX(IF(total_click > 0, event_date, NULL)) AS last_click_date,

  -- Per-period totals (dawn, breakfast, lunch, snack, dinner) for all events.
  SUM(total_event_dawn) AS total_event_dawn,
  SUM(total_event_breakfast) AS total_event_breakfast,
  SUM(total_event_lunch) AS total_event_lunch,
  SUM(total_event_snack) AS total_event_snack,
  SUM(total_event_dinner) AS total_event_dinner,

  -- Per-period totals for "send".
  SUM(total_send_dawn) AS total_send_dawn,
  SUM(total_send_breakfast) AS total_send_breakfast,
  SUM(total_send_lunch) AS total_send_lunch,
  SUM(total_send_snack) AS total_send_snack,
  SUM(total_send_dinner) AS total_send_dinner,

  -- Per-period totals for "received".
  SUM(total_received_dawn) AS total_received_dawn,
  SUM(total_received_breakfast) AS total_received_breakfast,
  SUM(total_received_lunch) AS total_received_lunch,
  SUM(total_received_snack) AS total_received_snack,
  SUM(total_received_dinner) AS total_received_dinner,

  -- Per-period totals for "bounce".
  SUM(total_bounce_dawn) AS total_bounce_dawn,
  SUM(total_bounce_breakfast) AS total_bounce_breakfast,
  SUM(total_bounce_lunch) AS total_bounce_lunch,
  SUM(total_bounce_snack) AS total_bounce_snack,
  SUM(total_bounce_dinner) AS total_bounce_dinner,

  -- Per-period totals for "click".
  SUM(total_click_dawn) AS total_click_dawn,
  SUM(total_click_breakfast) AS total_click_breakfast,
  SUM(total_click_lunch) AS total_click_lunch,
  SUM(total_click_snack) AS total_click_snack,
  SUM(total_click_dinner) AS total_click_dinner,

  -- Overall totals across the window.
  SUM(total_event) AS total_event,
  SUM(total_send) AS total_send,
  SUM(total_received) AS total_received,
  SUM(total_bounce) AS total_bounce,
  SUM(total_click) AS total_click

FROM push_user_day

-- Keep only days 1-15 of each month. dayofmonth() yields an integer, so the
-- comparison is numeric instead of comparing the 'dd' string with a number,
-- and it matches the filter used by the session summary view.
WHERE dayofmonth(event_date) < 16

-- Group by the expressions themselves rather than by position. The name
-- event_date cannot be used here: Spark resolves GROUP BY names against the
-- source columns first, so it would group by the daily date, not the month.
GROUP BY
  date_format(event_date, 'yyyy-MM-01'),
  external_user_id

In [0]:
# Materialise the half-month push summary view as parquet in the enrich area,
# replacing any previous output at that path.
df_push_user_month_view = spark.table('push_user_month_half_view')
(
    df_push_user_month_view.write
    .mode('overwrite')
    .parquet(f'{root_dir}/enrich/push_user_month_half')
)

In [0]:
# Read the half-month push summary back from parquet, reduce it to two
# partitions, cache it and register it for SQL as `push_user_month_half`.
df_push_user_month_half = (
    spark.read
    .parquet(f'{root_dir}/enrich/push_user_month_half')
    .repartition(2)
    .cache()
)
df_push_user_month_half.createOrReplaceTempView("push_user_month_half")

# Session

In [0]:
# Load the `session_user_day` parquet dataset from the enrich area, shrink it
# to two partitions and cache it, then expose it to SQL as `session_user_day`.
df_session_user_day = (
    spark.read
    .parquet(f'{root_dir}/enrich/session_user_day')
    .repartition(2)
    .cache()
)
df_session_user_day.createOrReplaceTempView('session_user_day')

# Sumário mensal de sessão do cliente (15 dias iniciais)

In [0]:
%sql
-- Monthly per-user session summary restricted to the first 15 days of each month.
-- Output: one row per (month bucket, user_identifier). The month bucket is the
-- 'yyyy-MM-01' string derived from session_date and is exposed as session_date.
CREATE OR REPLACE TEMPORARY VIEW session_user_month_half_view AS

SELECT
  date_format(to_timestamp(session_date), 'yyyy-MM-01') AS session_date,
  user_identifier,
  -- NULL daily values are counted as 0, so each sum is a number even when
  -- every daily value in the group is NULL.
  SUM(COALESCE(sum_event_open, 0)) AS sum_event_open,
  SUM(COALESCE(sum_view_restaurant_screen, 0)) AS sum_view_restaurant_screen,
  SUM(COALESCE(sum_view_dish_screen, 0)) AS sum_view_dish_screen,
  SUM(COALESCE(sum_click_add_item, 0)) AS sum_click_add_item,
  SUM(COALESCE(sum_view_checkout, 0)) AS sum_view_checkout,
  SUM(COALESCE(sum_callback_purchase, 0)) AS sum_callback_purchase,
  SUM(COALESCE(order_session_quantity, 0)) AS order_session_quantity

FROM session_user_day

-- Only days 1 through 15 of each month contribute.
WHERE dayofmonth(session_date) <= 15

-- Positions 1 and 2 are the month bucket and user_identifier.
GROUP BY 1, 2

In [0]:
# Materialise the half-month session summary view as parquet in the enrich
# area, replacing any previous output at that path.
df_session_user_month_half_view = spark.table('session_user_month_half_view')
(
    df_session_user_month_half_view.write
    .mode('overwrite')
    .parquet(f'{root_dir}/enrich/session_user_month_half')
)

In [0]:
# Read the half-month session summary back from parquet, reduce it to two
# partitions, cache it and register it for SQL as `session_user_month_half`.
df_session_user_month_half = (
    spark.read
    .parquet(f'{root_dir}/enrich/session_user_month_half')
    .repartition(2)
    .cache()
)
df_session_user_month_half.createOrReplaceTempView('session_user_month_half')

In [0]:
# Load the `customer_segmentation` parquet dataset, shrink it to two
# partitions and cache it, then expose it to SQL cells as `customer`.
df_customer = (
    spark.read
    .parquet(f'{root_dir}/customer_segmentation')
    .repartition(2)
    .cache()
)
df_customer.createOrReplaceTempView('customer')

In [0]:
%sql
-- Attach the half-month session and push summaries to each customer row.
-- Both joins are LEFT JOINs, so every customer row is kept; the session and
-- push columns are NULL when no summary matches that customer and month.
-- The star expansions also carry the join columns of each summary
-- (user_identifier/session_date and external_user_id/event_date).
CREATE OR REPLACE TEMPORARY VIEW push_session_half_view AS

SELECT
  c.customer_id,
  c.segmentation_month,
  s.*,
  p.*

FROM customer AS c

-- Session summary for the same user and the same month.
LEFT JOIN session_user_month_half AS s
  ON s.user_identifier = c.customer_id
  AND s.session_date = c.segmentation_month

-- Push summary for the same user and the same month.
LEFT JOIN push_user_month_half AS p
  ON p.external_user_id = c.customer_id
  AND p.event_date = c.segmentation_month

In [0]:
# Write the 2019-11 rows of `push_session_half_view` to the abt area as
# parquet, without the two join-key columns and the month column.
df_push_session_november_half = spark.table('push_session_half_view')
(
    df_push_session_november_half
    .filter('segmentation_month = "2019-11-01"')
    .drop('external_user_id', 'user_identifier', 'segmentation_month')
    .write
    .mode('overwrite')
    .parquet(f'{root_dir}/abt/push_session_nov_half')
)

In [0]:
# Write the 2019-06 rows of `push_session_half_view` to the abt area as
# parquet, without the two join-key columns and the month column.
df_push_session_jun_half = spark.table('push_session_half_view')
(
    df_push_session_jun_half
    .filter('segmentation_month = "2019-06-01"')
    .drop('external_user_id', 'user_identifier', 'segmentation_month')
    .write
    .mode('overwrite')
    .parquet(f'{root_dir}/abt/push_session_jun_half')
)

In [0]:
# Write the 2019-07 rows of `push_session_half_view` to the abt area as
# parquet, without the two join-key columns and the month column.
df_push_session_jul_half = spark.table('push_session_half_view')
(
    df_push_session_jul_half
    .filter('segmentation_month = "2019-07-01"')
    .drop('external_user_id', 'user_identifier', 'segmentation_month')
    .write
    .mode('overwrite')
    .parquet(f'{root_dir}/abt/push_session_jul_half')
)

In [0]:
# Write the 2019-08 rows of `push_session_half_view` to the abt area as
# parquet, without the two join-key columns and the month column.
df_push_session_aug_half = spark.table('push_session_half_view')
(
    df_push_session_aug_half
    .filter('segmentation_month = "2019-08-01"')
    .drop('external_user_id', 'user_identifier', 'segmentation_month')
    .write
    .mode('overwrite')
    .parquet(f'{root_dir}/abt/push_session_aug_half')
)

In [0]:
# Write the 2019-09 rows of `push_session_half_view` to the abt area as
# parquet, without the two join-key columns and the month column.
df_push_session_sep_half = spark.table('push_session_half_view')
(
    df_push_session_sep_half
    .filter('segmentation_month = "2019-09-01"')
    .drop('external_user_id', 'user_identifier', 'segmentation_month')
    .write
    .mode('overwrite')
    .parquet(f'{root_dir}/abt/push_session_sep_half')
)

In [0]:
# Write the 2019-10 rows of `push_session_half_view` to the abt area as
# parquet, without the two join-key columns and the month column.
df_push_session_oct_half = spark.table('push_session_half_view')
(
    df_push_session_oct_half
    .filter('segmentation_month = "2019-10-01"')
    .drop('external_user_id', 'user_identifier', 'segmentation_month')
    .write
    .mode('overwrite')
    .parquet(f'{root_dir}/abt/push_session_oct_half')
)

In [0]:
# Write the 2019-12 rows of `push_session_half_view` to the abt area as
# parquet, without the two join-key columns and the month column.
df_push_session_dec_half = spark.table('push_session_half_view')
(
    df_push_session_dec_half
    .filter('segmentation_month = "2019-12-01"')
    .drop('external_user_id', 'user_identifier', 'segmentation_month')
    .write
    .mode('overwrite')
    .parquet(f'{root_dir}/abt/push_session_dec_half')
)

In [0]:
# Read the November export back from the abt area, reduce it to two
# partitions, cache it and register it for SQL as `push_session_nov_half`.
df_session_user_nov_half = (
    spark.read
    .parquet(f'{root_dir}/abt/push_session_nov_half')
    .repartition(2)
    .cache()
)
df_session_user_nov_half.createOrReplaceTempView('push_session_nov_half')

In [0]:
%sql
-- Sanity check on the November export: overall totals of one session metric
-- and one push metric.
SELECT
  SUM(sum_event_open),
  SUM(total_click)
FROM push_session_nov_half

sum(sum_event_open),sum(total_click)
428270.0,33838
