---
**Turma**: 15

**Grupo**: 11 - Bruno Correia, Gilson Costa, Vivian Liu

**Contato**: `brunovpm@hotmail.com, gilson.costa@gmail.com, liuoliveira.vivian@gmail.com`

**Licença deste notebook**:
![CC BY](https://licensebuttons.net/l/by/3.0/88x31.png)

[Clique aqui para saber mais sobre a licença CC BY v4.0](https://creativecommons.org/licenses/by/4.0/legalcode.pt)

---
### Carregando os dados armazendos localmente para serem processados

Utilize o comando do `dbutils.fs.rm` para remover arquivos e diretórios dentro do DBFS.

In [0]:
# dbutils.fs.rm('/dbfs/FileStore/ifood/orders', True)

In [0]:
root_dir = '/dbfs/FileStore/ifood'
dbutils.fs.ls(f'{root_dir}')

In [0]:
df_marketing_push_full = spark.read.parquet(f'{root_dir}/marketing_push_full')
print((df_marketing_push_full.count(), len(df_marketing_push_full.columns)))

df_sessions_visits = spark.read.parquet(f'{root_dir}/sessions_visits')
print((df_sessions_visits.count(), len(df_sessions_visits.columns)))

In [0]:
from pyspark.sql.functions import isnull, when, count, col
import pandas as pd

pd.options.display.precision = 4

def summaryZeroNull(df, lista):
  df_null = df.select([count(when(isnull(c), 1)).alias(c) for c in lista])
  df_zero = df.select([count(when(df[c] == 0, 1)).alias(c) for c in lista])

  final = df_null.union(df_zero)

  dfPandas = final.toPandas()
  
  dfTranspose = dfPandas.transpose(copy=True)
  dfTranspose.reset_index(inplace=True);
  dfTranspose.rename(columns={'index':'summary', 0:'null',1:'zero'}, inplace=True)
  dfTranspose
  
  return dfTranspose

# marketing_push_full

_26 % de registros duplicados_

In [0]:
df_marketing_unique = df_marketing_push_full.drop_duplicates()
df_marketing_unique.write.parquet(f'{root_dir}/t11/preprocess/marketing_push_full')

In [0]:
df_marketing_push_full = spark.read.parquet(f'{root_dir}/t11/preprocess/marketing_push_full')
print((df_marketing_push_full.count(), len(df_marketing_push_full.columns)))

In [0]:
df_marketing_push_full.createOrReplaceTempView("marketing")

# sessions_visits
_Sem Dados duplicados (463420 registros distintos)_

In [0]:
df_sessions_visits_unique = df_sessions_visits.drop_duplicates()
df_sessions_visits_unique.count()

In [0]:
df_sessions_visits.createOrReplaceTempView("session")

## Avaliando os bounces de campanhas

In [0]:
%sql
SELECT
  user_id,
  campaign_id,
  campaign_name,
  COUNT(*) total
  
FROM marketing

WHERE event_name = 'bounce'
AND campaign_id IS NOT NULL

GROUP BY campaign_id, campaign_name, user_id

HAVING total > 1

user_id,campaign_id,campaign_name,total
5ac37650f34eb30f5cb2e101,f9fc9aed-210f-4c28-a33a-8e7024bf0056,2019-08-24 / ANDROID / Super-Restaurantes,2
5c1c2e60b9b582f339f65e5b,5b741f8b-84c9-4415-ad67-1068b5797591,2019-08-26 / iOS / Ativos / Jantar - Light / CUPOM,2
5b00ace80dd604e0e4bf20fb,2a56a20e-65dd-41b5-a17a-fd0fc7510671,2019-09-14 / IOS / To Go,2
5cf06ab14c0c724bf208a39c,06437a25-5966-4219-b512-d8f24f05a9c6,2019-06-26 / IOS / Ativos Expansão,2
5b27cc7dbbf503950325b7a0,e23a8e18-bdba-46df-a98b-14261483abed,2019-08-06 / IOS / To Go,2
5afc777c42b973f667fdea65,8ede92df-fbb0-4c2c-81b9-b6bb641593f4,2019-06-29 / IOS / Almoço - Gourmet,2
5ac36b8b0e560d93c57eb758,57cf82b3-88c2-4ffc-bf60-3cf2cb53fc79,2019-12-19 / IOS / Almoço / Loop (New Loop),2
5b930b1408567bfd57191fb8,2dace3cf-5447-4125-8ef9-8509a61cf289,2019-07-23 / ANDROID / Lanche - Light / CUPOM,2
5ac37b98832595335e0856c5,15c58d4e-7cff-4319-b340-a494d5b97988,2019-07-01 / IOS / Loop,2
5b80925b495e75340e2d5453,9de3517a-bcff-4127-83d5-55a1545b4e15,2019-10-21 / IOS / Jantar / Cidades DFN Geral,2


In [0]:
%sql
SELECT *

FROM marketing

WHERE campaign_id = '57cf82b3-88c2-4ffc-bf60-3cf2cb53fc79'
AND user_id = '5ac36b8b0e560d93c57eb758'

event_channel,event_name,brand,sample_type,user_id,external_user_id,event_time_utc3,platform,campaign_id,campaign_name,message_variation_channel,canvas_name,canvas_step_id,canvas_is_first_step,canvas_first_step_name,canvas_step_name,canvas_real_step_name,canvas_step_index,canvas_tags,send_id,event_date
push,send,iFood Brasil,sample,5ac36b8b0e560d93c57eb758,87c9e03b3d9b2b792ad457948ae9c2724cee20e3f4170e3283427c1881c8732c,2019-12-19T09:25:36.000Z,ios,57cf82b3-88c2-4ffc-bf60-3cf2cb53fc79,2019-12-19 / IOS / Almoço / Loop (New Loop),ios_push,,,0.0,,,,,,,2019-12-19T00:00:00.000Z
push,bounce,iFood Brasil,sample,5ac36b8b0e560d93c57eb758,87c9e03b3d9b2b792ad457948ae9c2724cee20e3f4170e3283427c1881c8732c,2019-12-19T09:25:36.000Z,ios,57cf82b3-88c2-4ffc-bf60-3cf2cb53fc79,2019-12-19 / IOS / Almoço / Loop (New Loop),ios_push,,,0.0,,,,,,,2019-12-19T00:00:00.000Z
push,bounce,iFood Brasil,sample,5ac36b8b0e560d93c57eb758,87c9e03b3d9b2b792ad457948ae9c2724cee20e3f4170e3283427c1881c8732c,2019-12-19T09:25:37.000Z,ios,57cf82b3-88c2-4ffc-bf60-3cf2cb53fc79,2019-12-19 / IOS / Almoço / Loop (New Loop),ios_push,,,0.0,,,,,,,2019-12-19T00:00:00.000Z


## Problema : Sem anonimização [sessions_visits.user_id]

In [0]:
%sql
SELECT DISTINCT dau, user_identifier, user_account_uuid

FROM session

WHERE user_identifier LIKE '%@%'

dau,user_identifier,user_account_uuid
2019-06-16_203fc1c9-bbb1-4a25-8196-7e790b5965bb,RODRIGOEISINGER@GMAIL.COM,4a4eb380-ac0e-4e78-a607-6323fb7813bf
2019-06-23_431812a5-586f-4ccc-b12f-3a5acca039ec,NADINELEMES@LIVE.COM,7ea425db-81ae-40f0-bbfc-6438bac013fe
2019-07-11_e10f8dbc-a195-456c-8bd0-858259cac57c,LEANDRO-G-R@BOL.COM.BR,c517898d-0c19-402d-b09f-c8cbe9d2e7d3
2019-06-05_a1238251-2ec9-48e6-8167-7c73c1c56be6,BRUNNOATUALIZAHITS@GMAIL.COM,5a44f4fa-dcd4-4d4c-82d7-70950ea58fe9
2019-07-11_53d0a50c-774a-4ed5-b2ca-0fc572e5740d,RENATABRANDAOR@GMAIL.COM,398d4dca-7790-4238-b997-0a00887fa7de
2019-06-30_9453f8d6-00e4-455c-bd35-4be5d9eefb2a,ALYSSAFEELICIANO@HOTMAIL.COM,9bb06fff-5a15-4f1f-9b97-0bc19b5c937a
2019-06-22_1544b62a-fc39-4bf7-a361-b50d2eb66e23,NATH.TERUEL@GMAIL.COM,ee246cd7-f49c-448b-8974-ba145b2c51e0
2019-07-08_d3b298b9-f48f-4d06-89c6-788ed64000f4,DARTHMEI7@GMAIL.COM,f8a4796c-b2ab-4ed3-a2df-d68fbf44cc56
2019-08-09_9453f8d6-00e4-455c-bd35-4be5d9eefb2a,ALYSSAFEELICIANO@HOTMAIL.COM,9bb06fff-5a15-4f1f-9b97-0bc19b5c937a
2019-06-01_60facf65-7976-4eb1-8e38-9132bdc29df1,KAROLINABARBOSA1993@GMAIL.COM,b0b65223-7bcf-4dca-9422-cbff3cb2d777


In [0]:
%sql
SELECT DISTINCT user_identifier

FROM session

WHERE user_identifier LIKE '%@%'

#### Dois registro sem o campaign_name e com campaign_id

In [0]:
%sql
SELECT
  * FROM marketing WHERE campaign_name IS Not NULL AND campaign_id = 'dd6faaaa-714c-4a6a-a511-e1259ae74d92' limit 1

event_channel,event_name,brand,sample_type,user_id,external_user_id,event_time_utc3,platform,campaign_id,campaign_name,message_variation_channel,canvas_name,canvas_step_id,canvas_is_first_step,canvas_first_step_name,canvas_step_name,canvas_real_step_name,canvas_step_index,canvas_tags,send_id,event_date
push,received,iFood Brasil,sample,5b7eb90e9ec77b7d9276a422,e3036be8c90ffef3ddd7e06f85a3192b67e4efffcc85b64fe77f12d3b2f184b1,2019-07-31T11:21:52.000Z,android,dd6faaaa-714c-4a6a-a511-e1259ae74d92,2019-07-31 / ANDROID / Ativos / Jantar / CUPOM,,,,,,,,,,,2019-07-31T00:00:00.000Z


In [0]:
%sql
SELECT *
  
FROM marketing
  
WHERE campaign_name IS NULL
AND campaign_id = 'dd6faaaa-714c-4a6a-a511-e1259ae74d92'

event_channel,event_name,brand,sample_type,user_id,external_user_id,event_time_utc3,platform,campaign_id,campaign_name,message_variation_channel,canvas_name,canvas_step_id,canvas_is_first_step,canvas_first_step_name,canvas_step_name,canvas_real_step_name,canvas_step_index,canvas_tags,send_id,event_date
push,received,iFood Brasil,sample,5ac35d310e560d5d7d1aa9cf,552756d460425c6ddad686b64ff1898354d7bc34db06765f46a525d6a108c44e,2019-07-31T11:11:02.000Z,android,dd6faaaa-714c-4a6a-a511-e1259ae74d92,,,,,,,,,,,,2019-07-31T00:00:00.000Z
push,send,iFood Brasil,sample,5ac35d310e560d5d7d1aa9cf,552756d460425c6ddad686b64ff1898354d7bc34db06765f46a525d6a108c44e,2019-07-31T11:11:02.000Z,android,dd6faaaa-714c-4a6a-a511-e1259ae74d92,,android_push,,,0.0,,,,,,,2019-07-31T00:00:00.000Z


### Verificar se há mais de um click para a mesma campanha e user

In [0]:
%sql
SELECT campaign_id, external_user_id, COUNT(*) total

FROM marketing

WHERE event_name = "click"
AND campaign_id IS NOT NULL

GROUP BY campaign_id, external_user_id

HAVING total > 1

order by total DESC

campaign_id,external_user_id,total
3029d9d4-280e-4a8c-a57c-68e795b22e4c,c1836d77803e3a29efbfdab7038b49e260dffb2d5353157201604d121d80f64a,47
dc0ff0e3-e6a7-4df7-b088-78c58cc11611,f1047c6348674907fd6a670273df55c16625c984472d831ed990f4c92bfd5ea3,6
dbb9dfb4-6860-48ac-b45d-2e27427956d4,bee8ecb45a1b6190638580fc9221ee08d34c5e852f2dd38d482c1c29ccdf69ea,6
c3ea8ca3-81f6-404d-b85c-bf04f252058b,1bb01047428ea9f4e49db77d703d9f98d91639e6ba293234be0886c27749f99b,6
17f1b9eb-0524-454b-ace8-f292029294c8,27cd388b3be55a8d495750c1657b1a02f5c29622a79ad06a89c60ba37759aa0b,6
272067b7-c34c-4375-8354-530dfc141944,0ec52034c019d2b17070e94d84a624318f6b943301bd55882c5be67ba86d6fee,5
abb95553-2e30-434b-83b0-4d54165e76a3,a5acdf2a323ea0e67eac7676c8a56b95be5a9514bfca5ab187ff8d7a66ffc545,5
7ec54a50-ac53-4dd6-b97f-3161acc1cd78,77fab07047e2f48ca60eb0631ff589c310e7ce9ece9513f45b2d7602137260f2,5
13f40701-b53d-40d2-bf3d-b569fe22828c,745498bd676315f2d6c1d25b9924e4bf24ce1855ab1ed9c653db8fc817f579b4,5
64cdeb3d-ed6f-4c92-8820-3f5d0ff926a8,e235028f413a223117a67d568d588dd6bd5c748ccb622507c72ff13f41489abb,5


In [0]:
%sql
SELECT *
FROM marketing
WHERE campaign_id = '3029d9d4-280e-4a8c-a57c-68e795b22e4c'
AND external_user_id = 'c1836d77803e3a29efbfdab7038b49e260dffb2d5353157201604d121d80f64a'
AND event_name = 'click'


event_channel,event_name,brand,sample_type,user_id,external_user_id,event_time_utc3,platform,campaign_id,campaign_name,message_variation_channel,canvas_name,canvas_step_id,canvas_is_first_step,canvas_first_step_name,canvas_step_name,canvas_real_step_name,canvas_step_index,canvas_tags,send_id,event_date
push,click,iFood Brasil,sample,5c439c566d659837cae721a5,c1836d77803e3a29efbfdab7038b49e260dffb2d5353157201604d121d80f64a,2019-06-20T12:29:59.000Z,android,3029d9d4-280e-4a8c-a57c-68e795b22e4c,2019-06-20 / ANDROID / Ativos / Almoço / CUPOM,android_push,,,0.0,,,,,,,2019-06-20T00:00:00.000Z
push,click,iFood Brasil,sample,5c439c566d659837cae721a5,c1836d77803e3a29efbfdab7038b49e260dffb2d5353157201604d121d80f64a,2019-06-20T12:29:59.000Z,android,3029d9d4-280e-4a8c-a57c-68e795b22e4c,2019-06-20 / ANDROID / Ativos / Almoço / CUPOM,android_push,,,0.0,,,,,,,2019-06-20T00:00:00.000Z
push,click,iFood Brasil,sample,5c439c566d659837cae721a5,c1836d77803e3a29efbfdab7038b49e260dffb2d5353157201604d121d80f64a,2019-06-20T12:29:59.000Z,android,3029d9d4-280e-4a8c-a57c-68e795b22e4c,2019-06-20 / ANDROID / Ativos / Almoço / CUPOM,android_push,,,0.0,,,,,,,2019-06-20T00:00:00.000Z
push,click,iFood Brasil,sample,5c439c566d659837cae721a5,c1836d77803e3a29efbfdab7038b49e260dffb2d5353157201604d121d80f64a,2019-06-20T12:29:59.000Z,android,3029d9d4-280e-4a8c-a57c-68e795b22e4c,2019-06-20 / ANDROID / Ativos / Almoço / CUPOM,android_push,,,0.0,,,,,,,2019-06-20T00:00:00.000Z
push,click,iFood Brasil,sample,5c439c566d659837cae721a5,c1836d77803e3a29efbfdab7038b49e260dffb2d5353157201604d121d80f64a,2019-06-20T12:30:00.000Z,android,3029d9d4-280e-4a8c-a57c-68e795b22e4c,2019-06-20 / ANDROID / Ativos / Almoço / CUPOM,android_push,,,0.0,,,,,,,2019-06-20T00:00:00.000Z
push,click,iFood Brasil,sample,5c439c566d659837cae721a5,c1836d77803e3a29efbfdab7038b49e260dffb2d5353157201604d121d80f64a,2019-06-20T12:29:59.000Z,android,3029d9d4-280e-4a8c-a57c-68e795b22e4c,2019-06-20 / ANDROID / Ativos / Almoço / CUPOM,android_push,,,0.0,,,,,,,2019-06-20T00:00:00.000Z
push,click,iFood Brasil,sample,5c439c566d659837cae721a5,c1836d77803e3a29efbfdab7038b49e260dffb2d5353157201604d121d80f64a,2019-06-20T12:30:00.000Z,android,3029d9d4-280e-4a8c-a57c-68e795b22e4c,2019-06-20 / ANDROID / Ativos / Almoço / CUPOM,android_push,,,0.0,,,,,,,2019-06-20T00:00:00.000Z
push,click,iFood Brasil,sample,5c439c566d659837cae721a5,c1836d77803e3a29efbfdab7038b49e260dffb2d5353157201604d121d80f64a,2019-06-20T12:29:59.000Z,android,3029d9d4-280e-4a8c-a57c-68e795b22e4c,2019-06-20 / ANDROID / Ativos / Almoço / CUPOM,android_push,,,0.0,,,,,,,2019-06-20T00:00:00.000Z
push,click,iFood Brasil,sample,5c439c566d659837cae721a5,c1836d77803e3a29efbfdab7038b49e260dffb2d5353157201604d121d80f64a,2019-06-20T12:30:00.000Z,android,3029d9d4-280e-4a8c-a57c-68e795b22e4c,2019-06-20 / ANDROID / Ativos / Almoço / CUPOM,android_push,,,0.0,,,,,,,2019-06-20T00:00:00.000Z
push,click,iFood Brasil,sample,5c439c566d659837cae721a5,c1836d77803e3a29efbfdab7038b49e260dffb2d5353157201604d121d80f64a,2019-06-20T12:30:00.000Z,android,3029d9d4-280e-4a8c-a57c-68e795b22e4c,2019-06-20 / ANDROID / Ativos / Almoço / CUPOM,android_push,,,0.0,,,,,,,2019-06-20T00:00:00.000Z


# Tabela : marketing_push_full

- _tabela poderia ser transformada em uma tabela wide_
- _com as flags send, received, click e bounce_
- _com os tempos event_send_date, event_received_date, event_click_date, event_bounce_date_

# Quantitativas
### Timestamp
- event_time_utc3 (datahora completa)
- event_date (apenas data)

In [0]:
from pyspark.sql.functions import col

dfEventDate = (
  df_marketing_push_full.select(
    col('event_date')
  ).groupby(
    'event_date'
  ).count().orderBy(
    'event_date'
  )
)

# Evento diários no período

In [0]:

dfEventCast = dfEventDate.select(dfEventDate['event_date'].cast('timestamp'), dfEventDate['count'])
display(dfEventCast)

event_date,count
2019-06-01T00:00:00.000+0000,61439
2019-06-02T00:00:00.000+0000,49900
2019-06-03T00:00:00.000+0000,93701
2019-06-04T00:00:00.000+0000,75133
2019-06-05T00:00:00.000+0000,100105
2019-06-06T00:00:00.000+0000,106810
2019-06-07T00:00:00.000+0000,99556
2019-06-08T00:00:00.000+0000,70314
2019-06-09T00:00:00.000+0000,49815
2019-06-10T00:00:00.000+0000,78778


# Eventos por mês

In [0]:

from pyspark.sql.functions import month, date_format
display(dfEventCast.groupby(date_format('event_date', 'MM - MMM').alias('month')).sum().orderBy('month'))

month,sum(count)
06 - Jun,2621189
07 - Jul,3590220
08 - Aug,2303136
09 - Sep,2428327
10 - Oct,3289334
11 - Nov,2989326
12 - Dec,3653835


# Envios por dia da semana
## Pico de envios
-  9h (Seg - Sex)
- 11h (Seg - Dom)
- 15h (Seg - Sex)
- 18h (Seg - Dom)
- 20h (Qui e Dom)

_Quinta possui os cinco pico bem definidos_

_Sábado os dois principais apenas 11h e 18h_

_Domingo três pico 11h, 18h e 20h_

In [0]:
from pyspark.sql.functions import date_format, dayofweek, hour

display(
  df_marketing_push_full.where('event_name = "send"').groupby(
    dayofweek('event_time_utc3').alias('dow'),
    date_format('event_time_utc3', 'E').alias('day'),
    hour('event_time_utc3').alias('hour')
  ).count().orderBy(['dow', 'hour'])
)

dow,day,hour,count
1,Sun,0,4607
1,Sun,1,2406
1,Sun,2,1273
1,Sun,3,825
1,Sun,4,486
1,Sun,5,161
1,Sun,6,95
1,Sun,7,228
1,Sun,8,1525
1,Sun,9,21508


In [0]:
from pyspark.sql.functions import date_format, dayofweek, hour

display(
  df_marketing_push_full.where('event_name = "click"').groupby(
    dayofweek('event_time_utc3').alias('dow'),
    date_format('event_time_utc3', 'E').alias('day'),
    hour('event_time_utc3').alias('hour')
  ).count().orderBy(['dow', 'hour'])
)

dow,day,hour,count
1,Sun,0,379
1,Sun,1,193
1,Sun,2,106
1,Sun,3,61
1,Sun,4,31
1,Sun,5,34
1,Sun,6,37
1,Sun,7,56
1,Sun,8,107
1,Sun,9,153


In [0]:
from pyspark.sql.functions import date_format, dayofweek, hour

display(
  df_marketing_push_full.where('event_name = "bounce"').groupby(
    dayofweek('event_time_utc3').alias('dow'),
    date_format('event_time_utc3', 'E').alias('day'),
    hour('event_time_utc3').alias('hour')
  ).count().orderBy(['dow', 'hour'])
)

dow,day,hour,count
1,Sun,0,4
1,Sun,1,3
1,Sun,2,1
1,Sun,3,2
1,Sun,4,3
1,Sun,5,1
1,Sun,8,7
1,Sun,9,29
1,Sun,10,65
1,Sun,11,687


# Qualitativas

### Identificadores
- user_id
- external_user_id
- campaign_id
- canvas_step_id
- send_id

### Categóricas
- brand
- event_channel
- event_name
- platform 
- sample_type
- campaign_name
- message_variation_channel
- canvas_name
- canvas_is_first_step
- canvas_first_step_name
- canvas_step_name
- canvas_real_step_name
- canvas_step_index
- canvas_tags

# Valores Nulos
- 97,06%  canvas_name
- 97,06%  canvas_step_id
- 99,93%  canvas_is_first_step
- 99,93%  canvas_first_step_name
- 97,06%  canvas_step_name
- 97,06%  canvas_real_step_name
- 97,06%  canvas_step_index
- 99,93%  canvas_tags
- 100,00% send_id

_Campos não devem ser usados_

In [0]:
# Função summaryZeroNull definida acima
dfMarketingTranspose = summaryZeroNull(df_marketing_push_full, df_marketing_push_full.columns)

Unnamed: 0,summary,null,zero
0,event_channel,0,0
1,event_name,0,0
2,brand,0,0
3,sample_type,0,0
4,user_id,0,0
5,external_user_id,0,0
6,event_time_utc3,0,0
7,platform,157137,0
8,campaign_id,3074483,0
9,campaign_name,3074485,0


In [0]:
countMarketing = 20875367
dfMarketingTranspose['%null'] = dfMarketingTranspose['null']/countMarketing*100
dfMarketingTranspose['%zero'] = dfMarketingTranspose['zero']/countMarketing*100
dfMarketingTranspose['%final'] = (dfMarketingTranspose['zero'] + dfMarketingTranspose['null'])/countMarketing*100
dfMarketingTranspose

Unnamed: 0,summary,null,zero,%null,%zero,%final
0,event_channel,0,0,0.0,0.0,0.0
1,event_name,0,0,0.0,0.0,0.0
2,brand,0,0,0.0,0.0,0.0
3,sample_type,0,0,0.0,0.0,0.0
4,user_id,0,0,0.0,0.0,0.0
5,external_user_id,0,0,0.0,0.0,0.0
6,event_time_utc3,0,0,0.0,0.0,0.0
7,platform,157137,0,0.7527,0.0,0.7527
8,campaign_id,3074483,0,14.7278,0.0,14.7278
9,campaign_name,3074485,0,14.7278,0.0,14.7278


# Campo: brand
_100% do valores "iFood Brasil"_

_Campo deve ser descartado_

In [0]:
df_marketing_push_full.where('brand <> "iFood Brasil"').count()

# Campo: event_channel
_100 % dos valores "push"_

_Campo deve ser descartado_

In [0]:
df_marketing_push_full.where('event_channel <> "push"').count()

# Campo: event_name
_send, received, click, bounce_

_Fluxo de estado provável : send -> received -> [click ou bounce]_

In [0]:
display(
  df_marketing_push_full.groupby(
    'event_name'
  ).count().orderBy('count', ascending=False)
)

event_name,count
send,10356003
received,10182463
click,322629
bounce,14272


In [0]:
display(
  df_marketing_push_full.where(
    'event_name IN("click", "bounce")'
  ).groupby(
    'event_name'
  ).count().orderBy('count', ascending=False)
)

event_name,count
click,322629
bounce,14272


In [0]:
dbClick = df_marketing_push_full.where('event_name = "click"').head(10)

*Busca de um evento de click para análise*

In [0]:
pdClick = pd.DataFrame(dbClick)
pdClick.columns = df_marketing_push_full.columns
pdClick.astype(
  {
    'event_time_utc3':'datetime64[ns]',
    'event_date':'datetime64[ns]',
  }
).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
event_channel,push,push,push,push,push,push,push,push,push,push
event_name,click,click,click,click,click,click,click,click,click,click
brand,iFood Brasil,iFood Brasil,iFood Brasil,iFood Brasil,iFood Brasil,iFood Brasil,iFood Brasil,iFood Brasil,iFood Brasil,iFood Brasil
sample_type,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample
user_id,5d095eee9799566ab68b676f,5ac3817a9e1aaa93cdce4750,5ac37f7d55406de396f0e28a,5c6761a5392abb857b7b8945,5bca8d7b510a69c98efc2638,5ac2ff1c0b1e36d071e043db,5bf8835ba9c41dd9be27d4ac,5aedd714f26e04fbaef7127b,5ac2f4644d6e451c464254c3,5b02eb0c320698abd033a163
external_user_id,b5bd5ba4c04bc46058e0e26efd84e4b0e50f4ff0b1a59a...,e019c735b0c12a212711adaa8832ffcadb55f48c18401d...,d6d990e114f094218a3eb5c63ff62d9c3017fd5d7b2024...,72eb052cf057e3b6cf760ac77cb99be8c225c0def954fa...,f9c107a140f9b8242b448144a277252e6b8d4e86436b00...,152eda7a3fc397011f9ed9e5325f55018484353b17eb1a...,40096fbcd7c06a22aee5d0b317e6282aa5e46a80cd29b4...,0ceeb244e3721abc6272053467a311fbe63b29670c3ca8...,0e8dd3d1b124df4646a120aa224fb3fa191b800eba0209...,79f2eb0e4108416c70c12739c1ea0dd83d0013dbc998de...
event_time_utc3,2019-10-16 14:40:23,2019-10-02 15:17:02,2019-11-15 18:35:00,2019-07-28 11:38:56,2019-11-26 11:55:42,2019-11-19 22:53:16,2019-07-30 18:26:39,2019-11-12 10:50:21,2019-06-02 11:07:27,2019-11-22 11:25:16
platform,android,android,android,android,android,android,android,ios,android,ios
campaign_id,25544cbc-7dc6-4eca-9dc6-aab1bd47b269,776ed3e7-24c7-4279-9548-57429b7c38bf,d6943765-84ec-408b-ae53-821f597b71cb,0aadb78e-240d-489c-966f-4cf9f1b3f8fc,,cecc4c2b-d865-44c5-b8d9-5061fb9000c5,2db129b2-cd7b-4503-92dd-4568562b0c16,9209c9ed-1b91-4488-99b7-4b6a8e02f955,e23ff901-2d19-4851-a7d5-d37549b71034,3a8ac488-59d9-4f0a-9ba2-f792a9a5205b
campaign_name,2019-10-16 / ANDROID / Ativos / Almoço / CUPOM,2019-10-02 / ANDROID / Ativos / Lanche da Tard...,2019-11-15 / ANDROID / Ativos / Jantar - Light...,2019-07-28 / Android / Almoço - Entrega Grátis,,2019-11-19 / ANDROID / Ativos / Jantar - Light...,2019-07-30 / ANDROID / Ativos / Jantar / CUPOM,2019-11-12 / IOS / Almoço / To Go,2019-06-02 / ANDROID / Almoço - Gourmet,2019-11-22 / iOS / Ativos / Almoço / CUPOM


*Cliente de um click para análise*

In [0]:
dbCliente = df_marketing_push_full.where(
  "user_id = '5ac387e559e5ad7ee62471c3' and event_date = '2019-09-29T00:00:00.000Z' and campaign_id = '903fe138-592c-461c-be4d-1c1f596af223'"
).orderBy(
  'campaign_name'
).toPandas()

In [0]:
pdCliente = dbCliente
pdCliente.columns = df_marketing_push_full.columns
pdCliente.drop(
  columns=['user_id', 'external_user_id']
).astype(
  {
    'event_date': 'datetime64[ns]',
    'event_time_utc3': 'datetime64[ns]'
  }
).sort_values('event_time_utc3', ascending=False).transpose()

Unnamed: 0,2,0,1
event_channel,push,push,push
event_name,click,send,received
brand,iFood Brasil,iFood Brasil,iFood Brasil
sample_type,sample,sample,sample
event_time_utc3,2019-09-29 20:55:53,2019-09-29 20:55:41,2019-09-29 20:55:41
platform,ios,ios,ios
campaign_id,903fe138-592c-461c-be4d-1c1f596af223,903fe138-592c-461c-be4d-1c1f596af223,903fe138-592c-461c-be4d-1c1f596af223
campaign_name,2019-09-29 / iOS / Ativos / Ceia - KA / CUPOM,2019-09-29 / iOS / Ativos / Ceia - KA / CUPOM,2019-09-29 / iOS / Ativos / Ceia - KA / CUPOM
message_variation_channel,ios_push,ios_push,ios_push
canvas_name,,,


# Há mais clicks nos push de campanhas ?

_Avaliar se podemos analisar apenas os dados de campanhas_

In [0]:
%sql
SELECT
if(campaign_id is null, "Sem Campanha", "Com Campanha") AS Campanha, COUNT(*) total

FROM marketing

WHERE event_name = 'click'

GROUP BY 1

Campanha,total
Sem Campanha,46978
Com Campanha,275651


In [0]:
pd.DataFrame(df_marketing_push_full.dtypes)

Unnamed: 0,0,1
0,event_channel,string
1,event_name,string
2,brand,string
3,sample_type,string
4,user_id,string
5,external_user_id,string
6,event_time_utc3,string
7,platform,string
8,campaign_id,string
9,campaign_name,string


In [0]:
dfMarketingPandas = pd.DataFrame(df_marketing_push_full.head(5))
dfMarketingPandas.columns = df_marketing_push_full.columns

In [0]:
dfMarketingPandas.transpose()

Unnamed: 0,0,1,2,3,4
event_channel,push,push,push,push,push
event_name,received,send,send,received,send
brand,iFood Brasil,iFood Brasil,iFood Brasil,iFood Brasil,iFood Brasil
sample_type,sample,sample,sample,sample,sample
user_id,5ac365c96ee329b1951ac7e9,5ac3121c8e5b4b5696ac49dc,5ac2c99c059e9ee7e5eed43b,5ac36acd409f7734a97df5b9,5ac3654ad75e5a3c29fb256f
external_user_id,6f040026ecb33d63875ca8340a55db433fe185d8e40a01...,290070089ba4bc832a39ffdb7f3bc7e70cb4e7746bc511...,0753a6e30f7b58265e00becd06423d17181aa22d43f51e...,8468bb60ea7888a33a3506e4f3d2b541afca6651bde242...,6cbda064947aec380ac7c459de0902fc0182b23dcbaa59...
event_time_utc3,2019-08-29T18:31:10.000Z,2019-11-13T12:38:00.000Z,2019-10-10T11:31:02.000Z,2019-10-27T21:05:31.000Z,2019-10-28T11:30:49.000Z
platform,android,ios,ios,android,android
campaign_id,23e2e9bf-61ae-48aa-8e24-85d8f2348b21,,c3858a3b-1eea-4ab8-99ca-5b0166a9fb14,e5b7abaf-eafa-4a2e-bde5-62965fefa0f5,2398c143-4f12-4541-afd9-06bb94cb7b8a
campaign_name,2019-08-29 / ANDROID / 20190829MIDIAKITKAS,,2019-10-10 / IOS / Almoço / Cidades DFN Geral,2019-10-27 / ANDROID / Ativos / Ceia - KA / CUPOM,2019-10-28 / ANDROID / Ativos / Almoço - Light...


In [0]:
dbBounce = df_marketing_push_full.where('event_name = "bounce"').head(10)

In [0]:
pdBounce = pd.DataFrame(dbBounce)
pdBounce.columns = df_marketing_push_full.columns
pdBounce.astype(
  {
    'event_time_utc3':'datetime64[ns]',
    'event_date':'datetime64[ns]',
  }
).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
event_channel,push,push,push,push,push,push,push,push,push,push
event_name,bounce,bounce,bounce,bounce,bounce,bounce,bounce,bounce,bounce,bounce
brand,iFood Brasil,iFood Brasil,iFood Brasil,iFood Brasil,iFood Brasil,iFood Brasil,iFood Brasil,iFood Brasil,iFood Brasil,iFood Brasil
sample_type,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample
user_id,5ac35888752e2a98422f4ee1,5ac382a029c1649ca3575d14,5ac30925aea3e938260b51cb,5ce09d592502b770f2cf057a,5ac35e268329206c85a58380,5ac30cefce7caed1602177bf,5ad3cfe7a2ffcb25329546b3,5ac36f871570203ef8a584f9,5ac31065157020d1a971a4cc,5ce5bc3fe8684104edb11963
external_user_id,4c03309d24bd5dfd0ff63476db734e5566142438b25d13...,e552c01ddc11a3d562add510ef0370e4fc31c0897685b6...,1f35ff210d11cc25b8ae2ef04bd8e2160ada87f60c0057...,a318b96ce452e9966fe3845302cf5e895dfe18b350410e...,59bfd1e3a061ce03fe3bbd5133a27050bdbf8edcc157c5...,24100204da0042ff47591eda72a3f4ab2f727b07bcedcb...,9875172e423a71f84e8b3a64ee1ba51439c2b2ca0571b8...,996385df8ad5079ca69068b4581a2fddec9aa71c864960...,26cb22182928f0ab13696a73ae7cf3e4b9dd0676037314...,9a136c6e4f4cda107d3ef62dbfe711c232b1077b5df56b...
event_time_utc3,2019-10-23 11:16:21,2019-11-05 11:28:54,2019-06-29 18:19:55,2019-07-18 11:02:57,2019-10-01 10:46:43,2019-08-07 15:39:44,2019-09-28 17:50:31,2019-10-13 11:24:17,2019-08-14 12:08:44,2019-09-12 11:30:04
platform,android,ios,android,android,android,android,android,ios,android,android
campaign_id,,ea791a16-3970-4377-a123-bda02b1e60e0,cad3d644-d328-4cb1-a5d8-bb99645b801f,8187d873-910c-4091-b3dd-11fd9d2689d9,f5222b23-a7bc-42c9-be84-d24d44cfd59c,501ae547-f6d3-41c4-abee-89dbb63e9c3d,3d9da0d0-d371-4743-bca9-be651f812051,1c896bd3-690d-4a6d-a907-5ec3b68de5ee,d3448f3e-1b9f-4b63-b89a-4b8bfb0c4eeb,13cfa37a-ce9e-4f98-a59c-323f09dffed8
campaign_name,,2019-11-05 / iOS / Ativos / Almoço - Light / C...,2019-06-29 / ANDROID / Ativos / Jantar / CUPOM,2019-07-18 / ANDROID / CRM_Restaurantes_Expans...,2019-10-01 / ANDROID / Almoço / To Go,2019-08-07 / ANDROID / Ativos / Lanche da Tard...,2019-09-28 / ANDROID / Jantar / Ativos Campinas,2019-10-13 / iOS / Ativos / Almoço - Light / C...,2019-08-14 / ANDROID / Almoço - Light / CUPOM,2019-09-12 / ANDROID / Ativos / Almoço - Light...


In [0]:
dbClienteBounce = df_marketing_push_full.where("user_id = '5ac35888752e2a98422f4ee1' and event_date = '2019-10-23T00:00:00.000Z'").orderBy('campaign_name').toPandas()

In [0]:
pdClienteBounce = dbClienteBounce.copy()
pdClienteBounce.columns = df_marketing_push_full.columns
pdClienteBounce.drop(
  columns=['user_id', 'external_user_id']
).astype(
  {
    'event_date': 'datetime64[ns]',
    'event_time_utc3': 'datetime64[ns]'
  }
).sort_values('event_time_utc3', ascending=False).transpose()

Unnamed: 0,3,4,6,8,2,12,5,11,0,1,7,9,10,13,14
event_channel,push,push,push,push,push,push,push,push,push,push,push,push,push,push,push
event_name,received,send,received,send,received,send,received,send,bounce,send,received,received,received,send,send
brand,iFood Brasil,iFood Brasil,iFood Brasil,iFood Brasil,iFood Brasil,iFood Brasil,iFood Brasil,iFood Brasil,iFood Brasil,iFood Brasil,iFood Brasil,iFood Brasil,iFood Brasil,iFood Brasil,iFood Brasil
sample_type,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample
event_time_utc3,2019-10-23 19:06:36,2019-10-23 19:06:36,2019-10-23 19:06:36,2019-10-23 19:06:36,2019-10-23 11:41:35,2019-10-23 11:41:35,2019-10-23 11:41:34,2019-10-23 11:41:34,2019-10-23 11:16:21,2019-10-23 11:16:21,2019-10-23 11:16:21,2019-10-23 11:16:21,2019-10-23 11:16:21,2019-10-23 11:16:21,2019-10-23 11:16:21
platform,android,android,android,android,android,android,android,android,android,android,android,android,android,android,android
campaign_id,,,,,,,,,,,,,,,
campaign_name,,,,,,,,,,,,,,,
message_variation_channel,,,,,,,,,,,,,,,
canvas_name,,,,,,,,,,,,,,,


# Campo: platform
_157137 valores nulos_

In [0]:
display(df_marketing_push_full.groupby('platform').count().orderBy('count', ascending=False))

platform,count
android,12693963
ios,8024267
,157137


# Campo: message_variation_channel
_4277147 valores nulos_

_Parece bastante correlacionado com o campo de plataforma_

_Usar campo para preencher 111033 valores nulos do campo platform_

In [0]:
display(df_marketing_push_full.groupby('message_variation_channel').count().orderBy('count', ascending=False))


message_variation_channel,count
android_push,10218745
ios_push,6379475
,4277147


In [0]:
display(
  df_marketing_push_full.groupby(
    'platform', 'message_variation_channel'
  ).count().orderBy('count', ascending=False)
)

platform,message_variation_channel,count
android,android_push,10154789
ios,ios_push,6332398
android,,2539174
ios,,1691869
,android_push,63956
,ios_push,47077
,,46104


# Campo: sample_type
_control e sample_

_relativamente pouco registros para o valor control_

_provavelmente usado para acompanhamento de campanhas especificas_

In [0]:
display(df_marketing_push_full.groupby('sample_type').count())

sample_type,count
control,157137
sample,20718230


In [0]:
pdControl = pd.DataFrame(
  df_marketing_push_full.where(
    'sample_type = "control"'
  ).groupby(
    'campaign_id'
  ).count().collect()
)

In [0]:
pdControl.columns = ['campaign_id', 'count']
pdControl.sort_values('campaign_id')

Unnamed: 0,campaign_id,count
151,00048e73-07ff-40da-a95b-6d226d82c230,1
1790,004e5b1d-aea7-4829-b053-ac2d3c2ca1b5,179
2266,004f627a-ee2f-4f37-b828-6a1019c6ccc6,88
2198,00e57dca-4284-4796-9b61-99e2a8992a3f,92
1482,00ec5f6a-f10e-474e-89e9-e84fc24f3226,62
1281,00f76027-c8ad-433c-8ce2-fad26ab11640,243
1815,01091e62-7502-47b5-a66b-8c186f295378,66
1625,0113d45a-3176-4062-ad97-ed0e3acf29c3,6
725,0140bac9-ba0d-45ea-917a-05ed0de5ad5a,86
1487,0182f0ba-4494-49c8-97a8-ff8a40389946,3


# Campo: campaign_name
_3074485 registros nulos (14,73%)_

_talvez usar o campos para extrair informações de cupons, desconto e refeição relacionada ao push_

In [0]:
df_marketing_push_full.where('campaign_name is not null').count()

In [0]:
dbCampaignName = df_marketing_push_full.select('campaign_name').distinct()

In [0]:
pdCampaignName = pd.DataFrame(dbCampaignName.collect())
pdCampaignName.columns = ['name']

In [0]:
pdCampaignName['name'] = pdCampaignName['name'].str.replace(r'\d\d\d\d-\d\d-\d\d', '')
pdCampaignName['name'] = pdCampaignName['name'].str.replace(r'\d\d\d\d\d\d\d\d', '')
pdCampaignName['name'] = pdCampaignName['name'].str.replace(r'^\s*\/\s*', '')
pdCampaignName['name'] = pdCampaignName['name'].str.replace(r'^\s*-\s*', '')
pdCampaignName['name'] = pdCampaignName['name'].str.replace(r'^ANDROID\s*\/\s*', '')
pdCampaignName['name'] = pdCampaignName['name'].str.replace(r'^Android\s*\/\s*', '')
pdCampaignName['name'] = pdCampaignName['name'].str.replace(r'^iOS\s*\/\s*', '')
pdCampaignName['name'] = pdCampaignName['name'].str.replace(r'^IOS\s*\/\s*', '')
pdCampaignName['total'] = 1
resultado = pd.DataFrame(pdCampaignName.groupby('name').sum())
resultado = resultado.reset_index()
display(spark.createDataFrame(resultado))

name,total
1RECCUPOM70OFF,4
2RECCUPOMDIADOSNAMOS,2
5RECCUPOM70OFF,2
70% OFF,9
9RECCUPOM70OFF,2
ACAOPONTUALCES,4
ACAOPONTUALKA,4
ACAOPONTUALKAS,8
ALMOCO / RECSPROMOBOMB_HBB_RGZ_11h00_novos,1
ALMOÇO / RECSPROMOBOMB_HBB_RGZ_11h00_novos,3


# Campo: Segmentação de click por campanha de cupom

In [0]:
import re 

def filterCampaignName(name):
  if name == None:
    return None
  
  name = re.sub(r'\d\d\d\d-\d\d-\d\d', '', name)
  name = re.sub(r'\d\d\d\d\d\d\d\d', '', name)
  name = re.sub(r'^\s*\/\s*', '', name)
  name = re.sub(r'^\s*-\s*', '', name)
  name = re.sub(r'^ANDROID\s*\/\s*', '', name, flags=re.IGNORECASE)
  name = re.sub(r'^iOS\s*\/\s*', '', name, flags=re.IGNORECASE)
  
  return name

filterCampaignName('2019-04-16 / IOS / Almoço / CUPOM')

In [0]:
from pyspark.sql.functions import col, when

df_marketing_push_full.createOrReplaceTempView("cupom_click")

# campaignClick = spark.sql(
#   'SELECT lower(campaign_name) like "%cupom%" cupom,  lower(campaign_name) like "%voucher%" voucher, count(*) total from cupom_click where event_name = "click" group by cupom, voucher'
# )
campaignClick = spark.sql(
   'SELECT campaign_name, count(*) total, sum(if(event_name = "send", 1, 0)) send, sum(if(event_name = "received", 1, 0)) received, sum(if(event_name = "bounce", 1, 0)) bounce, sum(if(event_name = "click", 1, 0)) click from cupom_click group by campaign_name order by click / total desc'
)

In [0]:
pdCampaignClick = campaignClick.toPandas()

In [0]:
pdCampaignClick['campaign_name'] = pdCampaignClick['campaign_name'].apply(filterCampaignName)
pdCampaignClickGroup = pdCampaignClick.groupby('campaign_name').sum()
pdCampaignClickGroup = pdCampaignClickGroup.reset_index()
pdCampaignClickGroup

Unnamed: 0,campaign_name,total,send,received,bounce,click
0,1RECCUPOM70OFF,1798,879,829,2,88
1,2RECCUPOMDIADOSNAMOS,4813,2418,2273,1,121
2,5RECCUPOM70OFF,1825,908,843,4,70
3,70% OFF,23726,11872,11557,15,282
4,9RECCUPOM70OFF,2331,1144,1088,3,96
5,ACAOPONTUALCES,416,217,195,1,3
6,ACAOPONTUALKA,7427,3780,3543,12,92
7,ACAOPONTUALKAS,31646,15985,15100,34,527
8,ALMOCO / RECSPROMOBOMB_HBB_RGZ_11h00_novos,4,2,2,0,0
9,ALMOÇO / RECSPROMOBOMB_HBB_RGZ_11h00_novos,12,6,6,0,0


In [0]:
display(spark.createDataFrame(pdCampaignClickGroup))

campaign_name,total,send,received,bounce,click
1RECCUPOM70OFF,2610,1285,1234,2,89
2RECCUPOMDIADOSNAMOS,6795,3407,3262,1,125
5RECCUPOM70OFF,2603,1300,1225,6,72
70% OFF,30097,15056,14732,19,290
9RECCUPOM70OFF,3382,1671,1610,3,98
ACAOPONTUALCES,538,278,256,1,3
ACAOPONTUALKA,10367,5248,5004,16,99
ACAOPONTUALKAS,44571,22449,21546,35,541
ALMOCO / RECSPROMOBOMB_HBB_RGZ_11h00_novos,4,2,2,0,0
ALMOÇO / RECSPROMOBOMB_HBB_RGZ_11h00_novos,12,6,6,0,0


# Evento de click

In [0]:
dbTotalClick = df_marketing_push_full.where('event_name = "click"')

In [0]:
display(dbTotalClick.groupby('platform').count())

platform,count
android,238947
ios,83682


# Tabela : sessions_visits

In [0]:
df_sessions_visits = spark.read.parquet(f'{root_dir}/sessions_visits')
print((df_sessions_visits.count(), len(df_sessions_visits.columns)))

# Variáveis Qualitativas

### Identificadores

- session_id
- dau
- user_identifier
- user_account_uuid

### Categóricas

- platform
- device_model
- device_manufacturer
- media_network
- first_order_origin_feature

# Campo: platform
_Duas opções balanceadas (Android x IOS)_

In [0]:
# duas opções balanceadas (Android x IOS)
display(df_sessions_visits.groupby('platform').count().orderBy('count', ascending=False).head(10))

platform,count
ANDROID,262643
IOS,200977


# Campo: device_manufacturer
_70 valores distintos (99 % nas 10 primeiras)_

In [0]:
# 70 Maioria entre apple, samsung, motorola, Xiaomi, [juntar como outros]
display(df_sessions_visits.groupby('device_manufacturer').count().orderBy('count', ascending=False).head(6))

device_manufacturer,count
Apple,200977
samsung,135191
motorola,73582
Xiaomi,22817
asus,13361
LGE,8958


# Campo: device_model
_810 valores distintos_

_(50% dos registros em 20 categorias)_

_(73% dos registros em 50 categorias)_

_Muitos valores distintos e com grande dispersão_

_Avaliar se vale a pena reprocessar e agrupar as categorias (Ex. [iPhone9,1], [iPhone9,2], [iPhone9,3], [iPhone9,4])_

In [0]:
# OS 20/810 maiores tem 50 % (juntar os modelos)
display(df_sessions_visits.groupby('device_model').count().orderBy('count', ascending=False).head(20))

device_model,count
"iPhone9,3",30041
"iPhone8,1",23018
"iPhone10,4",17924
"iPhone9,4",17427
"iPhone10,5",16873
"iPhone11,8",13972
"iPhone10,6",12425
"iPhone7,2",11318
SM-G610M,10319
"iPhone11,6",8842


# Campo: media_network
_Muitos valores nulls (98%)_

In [0]:
# Muitos valores nulls 453445 / 463620 (98%)
display(
  df_sessions_visits.where(
    'media_network is not null'
  ).groupby(
    'media_network'
  ).count().orderBy(
    'count', ascending=False
  ).head(6)
)

media_network,count
googleadwords_int,4127
Email Avaliacao,3111
Facebook Ads,1075
twitter,795
MGM,473
criteo_int,103


# Campo: first_order_origin_feature
_406 valores distintos (87% nas 20 maiores categorias)_

In [0]:
# Os 20/406 possuem 87%
display(df_sessions_visits.groupby('first_order_origin_feature').count().orderBy('count', ascending=False).head(7))

first_order_origin_feature,count
Featured Restaurant,122435
Last Restaurants,71207
Ranking,67407
Filter,20137
Lanches,18182
Pizza,15115
CRM Restaurant,12636


# Variávies Quantitativas

### timestamp

- session_started_at_amsp
- session_ended_at_amsp
- session_started_at_utc0
- session_ended_at_utc0

### Inteiros

- session_duration_seconds
- sum_event_open
- sum_view_restaurant_screen
- sum_view_dish_screen
- sum_click_add_item
- sum_view_checkout
- sum_callback_purchase
- order_session_quantity

# Campos de timestamp
_2077 Registros sem a marcação de final da sessão nos campos session_ended_at_amsp e session_ended_at_utc0_

In [0]:
print(
df_sessions_visits.where('session_started_at_amsp is null').count(),
df_sessions_visits.where('session_ended_at_amsp is null').count(),
df_sessions_visits.where('session_started_at_utc0 is null').count(),
df_sessions_visits.where('session_ended_at_utc0 is null').count()
)

#Abertura de sessão por dia da semana e hora

In [0]:
from pyspark.sql.functions import date_format, dayofweek, hour

display(
  df_sessions_visits.groupby(
    dayofweek('session_started_at_amsp').alias('dow'),
    date_format('session_started_at_amsp', 'E').alias('day'),
    hour('session_started_at_amsp').alias('hour')
  ).count().orderBy(['dow', 'hour'])
)

dow,day,hour,count
1,Sun,0,1366
1,Sun,1,749
1,Sun,2,361
1,Sun,3,188
1,Sun,4,87
1,Sun,5,18
1,Sun,6,18
1,Sun,7,34
1,Sun,8,66
1,Sun,9,137


# Campo: session_duration_seconds
_11018 outlier (valores muito altos)_

_13 valores negativo (não faz sentido)_

_1150 valores para mas de 6 horas de sessão_

In [0]:
df_sessions_visits.where('session_duration_seconds > (60 * 60 * 6)') .count(), df_sessions_visits.where('session_duration_seconds < 0') .count()



In [0]:
display(df_sessions_visits.select('session_duration_seconds').summary())

summary,session_duration_seconds
count,461543.0
mean,2658.8750712284664
stddev,20585.254195389636
min,-93.0
25%,667.0
50%,1792.0
75%,3152.0
max,2575751.0


In [0]:
# Verificar outliers (valores negativos e muito altos)
display(
  df_sessions_visits.select(
    'session_duration_seconds'
  ).where('session_duration_seconds < (24 * 60 * 60 * 6)').groupby(
    'session_duration_seconds'
  ).count().orderBy(
    'session_duration_seconds', ascending=False
  )
)

In [0]:
df_sessions_visits.where('session_duration_seconds > 1000000').limit(3).toPandas().transpose()

Unnamed: 0,0,1,2
session_id,a3a57dfe-0c34-4e32-8d32-5c61958f2d3b,a7cbe85c-e4e8-459b-9538-d651953def66,48b976bd-1537-4e10-a923-8b7eda443c7e
dau,2019-12-07_003eacec-005e-4c5c-80ff-b7ffe0467d14,2019-08-22_50a96df6-554d-49d7-99fb-ad39d60e178e,2019-06-29_7bf1d02b-6f26-48ac-bf53-11cbf94a67a8
platform,ANDROID,ANDROID,ANDROID
user_identifier,d66b053d0c4d35a8c061089396175275928340f44abd4b...,2115dc8d454458c94ed8a663cb08c1b3c29b978a86d75f...,013d5458-0ef3-49ef-8ddc-3bc90526bac9
user_account_uuid,1bf907dc-0f6f-457e-af6c-3c0c828888dc,e74c78ff2079c716479b0429e50f378d37e24ad3a494a9...,013d5458-0ef3-49ef-8ddc-3bc90526bac9
session_started_at_amsp,2019-12-07T20:33:48.690Z,2019-08-22T17:34:04.004Z,2019-06-29T19:01:40.040Z
session_ended_at_amsp,2019-12-28T20:34:05.575Z,2019-09-05T16:27:37.627Z,2019-07-20T15:17:28.374Z
session_started_at_utc0,2019-12-07T22:33:48.690Z,2019-08-22T20:34:04.004Z,2019-06-29T22:01:40.040Z
session_ended_at_utc0,2019-12-28T22:34:05.575Z,2019-09-05T19:27:37.627Z,2019-07-20T18:17:28.374Z
session_duration_seconds,1.814e+06,1.206e+06,1.801e+06


# Campo: sum_event_open
_128 valores null (??)_

_7 valores zerados_

_97% abaixo de 20 eventos_

# Campo: sum_view_restaurant_screen

_15264 valores nulos_

_1828 valores zerados_

_87% até 20 eventos_

# Campo: sum_view_dish_screen

_5730 valores nulos_

_714 valores zerados_

_95% até 20 eventos_

# Campo: sum_click_add_item

_7069 valores nulos_

_890 valores zerados_

_98% até 20 eventos_

d
 # Campo: sum_view_checkout
 _1925 valores nulos_
 
 _0 valores zerados_
 
 _98% até 20 eventos_

# Distribuição dos eventos nas sessões

In [0]:
from pyspark.sql.functions import lit

lista = {
    'sum_event_open': 'open',
    'sum_view_restaurant_screen': 'view restaurant',
    'sum_view_dish_screen': 'view dish',
    'sum_click_add_item': 'click add',
    'sum_view_checkout': 'checkout'
}

final = None

for item, label in lista.items():
  print(item)
  dataset = df_sessions_visits.select(item).where(df_sessions_visits[item] > 0).groupby(item).count().orderBy(item).limit(20)
  dataset = dataset.withColumn("tabela", lit(label))
  dataset = dataset.withColumnRenamed(item, 'eventos por sessão')
  dataset = dataset.withColumnRenamed('count', 'total de sessões')
  
  if final is None:
    final = dataset
  else:
    final = final.unionAll(dataset)

display(final)

eventos por sessão,total de sessões,tabela
1.0,62400,open
2.0,68356,open
3.0,58757,open
4.0,49342,open
5.0,40024,open
6.0,33117,open
7.0,27057,open
8.0,22074,open
9.0,17720,open
10.0,14475,open


#Campo: sum_callback_purchase

_1352 valores nulos_

_211 valores zerados_

# Campo: order_session_quantity

_1773 valores nulos_

_368 valores zerados_

In [0]:
from pyspark.sql.functions import lit

lista = {
    'sum_callback_purchase': 'purchase',
    'order_session_quantity': 'order'
}

final = None

for item, label in lista.items():
  print(item)
  dataset = df_sessions_visits.select(item).where(df_sessions_visits[item] > 0)
  dataset = dataset.groupby(item).count().orderBy(item).limit(5)
  dataset = dataset.withColumn("tabela", lit(label))
  dataset = dataset.withColumnRenamed(item, 'eventos por sessão')
  dataset = dataset.withColumnRenamed('count', 'total de sessões')
  
  if final is None:
    final = dataset
  else:
    final = final.unionAll(dataset)

display(final)

eventos por sessão,total de sessões,tabela
1.0,409213,purchase
2.0,35459,purchase
3.0,8949,purchase
4.0,3429,purchase
5.0,1767,purchase
1.0,422286,order
2.0,31898,order
3.0,5014,order
4.0,1183,order
5.0,322,order


In [0]:
dfZero = summaryZeroNull(
  df_sessions_visits,
  [
    'sum_event_open',
    'sum_view_restaurant_screen',
    'sum_view_dish_screen',
    'sum_click_add_item',
    'sum_view_checkout',
    'sum_callback_purchase',
    'order_session_quantity'
  ]
)

In [0]:
dfSummary = df_sessions_visits[[
  'sum_event_open',
  'sum_view_restaurant_screen',
  'sum_view_dish_screen',
  'sum_click_add_item',
  'sum_view_checkout',
  'sum_callback_purchase',
  'order_session_quantity'
]].summary().toPandas().transpose(copy=True)

In [0]:
dfFinal = dfSummary.copy()
dfFinal = dfFinal.reset_index()
dfFinal.columns = dfFinal.iloc[0,:].values
dfFinal.drop(index=[0], inplace=True)
dfFinal = dfFinal.astype({
  'mean':'float32', 'stddev':'float32',
  'min':'float32', '25%':'float32', '50%':'float32', '75%':'float32', 'max':'float32'
})

In [0]:
dfResumo = dfZero.merge(dfFinal)
dfResumo

Unnamed: 0,summary,null,zero,count,mean,stddev,min,25%,50%,75%,max
0,sum_event_open,128,7,463492,6.0916,6.2486,0.0,2.0,4.0,8.0,631.0
1,sum_view_restaurant_screen,15264,1828,448356,8.9338,13.58,0.0,2.0,5.0,11.0,1086.0
2,sum_view_dish_screen,5730,714,457890,5.7182,6.8103,0.0,2.0,4.0,7.0,409.0
3,sum_click_add_item,7069,890,456551,2.9305,3.9398,0.0,1.0,2.0,4.0,708.0
4,sum_view_checkout,1925,0,461695,4.2081,6.5453,1.0,1.0,3.0,5.0,800.0
5,sum_callback_purchase,1352,211,462268,1.2486,2.2442,0.0,1.0,1.0,1.0,359.0
6,order_session_quantity,1773,368,461847,1.1534,2.0747,0.0,1.0,1.0,1.0,311.0
