In [1]:
import os
import pandas as pd
from pathlib import Path
from google.cloud import bigquery

from targetandmarket.config import data_folder

In [2]:
# BigQuery client

# GOOGLE_API_KEY is read from the environment (raises KeyError if unset) and is
# passed to from_service_account_json, so despite its name it presumably holds
# the path to a service-account JSON key file rather than a raw API key
# -- TODO confirm against the deployment environment.
SERVICE_ACCOUNT = os.environ['GOOGLE_API_KEY']
bqclient = bigquery.Client.from_service_account_json(SERVICE_ACCOUNT)

**Preliminary look at the raw data stored in BigQuery**

In [10]:
# For user behaviour I am interested in the user_engagement events.
# Sample query against the events table of May 01, 2020.
# NOTE: LIMIT without ORDER BY returns an arbitrary 100 matching rows, so the
# sample shown below may differ between runs.
query = """
SELECT user_id, event_timestamp, event_name, event_params
FROM `analytics_157832975.events_20200501`
WHERE event_name = 'user_engagement'
LIMIT 100
"""

# to_dataframe() already returns a pandas DataFrame, so no extra
# pd.DataFrame(...) wrapper is needed.
beh_data = bqclient.query(query).result().to_dataframe()

In [17]:
# Preview the first five rows of the sampled engagement events.
beh_data.head(n=5)

Unnamed: 0,user_id,event_timestamp,event_name,event_params
0,145944317,1588338370189003,user_engagement,"[{'key': 'freeride', 'value': {'string_value':..."
1,145944317,1588338370313005,user_engagement,"[{'key': 'freeride', 'value': {'string_value':..."
2,145944317,1588338417014006,user_engagement,"[{'key': 'engaged_session_event', 'value': {'s..."
3,1273103581,1588346644164004,user_engagement,"[{'key': 'freeride', 'value': {'string_value':..."
4,1273103581,1588348301427004,user_engagement,"[{'key': 'freeride', 'value': {'string_value':..."


**Every `user_engagement` event records `event_params`, a list of key/value dictionaries. Among them are the session ID (`ga_session_id`) and the engagement time (`engagement_time_msec`), which are the fields of interest to me.**

In [18]:
# Inspect the event_params payload of the first sampled event. The column is
# selected by label rather than by position so this cell stays correct if the
# SELECT list in the query is reordered or extended.
beh_data['event_params'].iloc[0]

[{'key': 'freeride',
  'value': {'string_value': None,
   'int_value': 1,
   'float_value': None,
   'double_value': None}},
 {'key': 'engaged_session_event',
  'value': {'string_value': None,
   'int_value': 1,
   'float_value': None,
   'double_value': None}},
 {'key': 'firebase_event_origin',
  'value': {'string_value': 'auto',
   'int_value': None,
   'float_value': None,
   'double_value': None}},
 {'key': 'ga_session_number',
  'value': {'string_value': None,
   'int_value': 2135,
   'float_value': None,
   'double_value': None}},
 {'key': 'engagement_time_msec',
  'value': {'string_value': None,
   'int_value': 2081,
   'float_value': None,
   'double_value': None}},
 {'key': 'firebase_screen_id',
  'value': {'string_value': None,
   'int_value': 2984501255267771536,
   'float_value': None,
   'double_value': None}},
 {'key': 'ga_session_id',
  'value': {'string_value': None,
   'int_value': 1588338368,
   'float_value': None,
   'double_value': None}},
 {'key': 'firebase_screen

**Next step:**

The total data size is 125 GB, with over 36 million `user_engagement` events. The next step is to write queries that compute engineered features that could have some predictive power.