In [1]:
%load_ext autoreload
%load_ext google.cloud.bigquery
%autoreload 2

The google.cloud.bigquery extension is already loaded. To reload it, use:
  %reload_ext google.cloud.bigquery


In [3]:
!pip install feast[gcp,redis]
!pip install plotly

[autoreload of google.auth.jwt failed: Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 245, in check
    superreload(m, reload, self.old_objects)
  File "/opt/conda/lib/python3.7/site-packages/IPython/extensions/autoreload.py", line 394, in superreload
    module = reload(module)
  File "/opt/conda/lib/python3.7/imp.py", line 314, in reload
    return importlib.reload(module)
  File "/opt/conda/lib/python3.7/importlib/__init__.py", line 169, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 630, in _exec
  File "<frozen importlib._bootstrap_external>", line 728, in exec_module
  File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
  File "/opt/conda/lib/python3.7/site-packages/google/auth/jwt.py", line 296, in <module>
    google.auth.credentials.Signing, google.auth.credentials.CredentialsWithQuotaProject
AttributeError: module 'google.auth' has no 

Collecting plotly
  Downloading plotly-5.5.0-py2.py3-none-any.whl (26.5 MB)
     |████████████████████████████████| 26.5 MB 6.2 MB/s            
Installing collected packages: plotly
Successfully installed plotly-5.5.0


In [6]:
import logging

from feast import FeatureStore

import plotly.express as px
import tensorflow as tf
from google.cloud.bigquery.job import QueryJobConfig
from feast.infra.offline_stores.bigquery import BigQueryOfflineStoreConfig
from feast import RepoConfig

# Surface INFO-level records from the root logger...
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)

# ...and from TensorFlow's own logger.
tf_logger = tf.get_logger()
tf_logger.setLevel('INFO')


In [27]:
# Google Cloud project and region; both are passed to the BigQuery offline
# store configuration for Feast further down.
GOOGLE_CLOUD_PROJECT: str = "ueat-data-science-staging"
GOOGLE_CLOUD_REGION: str = "northamerica-northeast1"

Configure access to the Feast feature store

In [30]:
# Feast setup: registry location, the BigQuery dataset backing the offline
# store, and the FeatureStore handle used by the cells below.
FEAST_REGISTRY = "gs://ueat-feast-staging/data/registry.db"
FEAST_OFFLINE_DATASET = "ueat_ai"

FEAST_OFFLINE_STORE_CONFIG = BigQueryOfflineStoreConfig(
    dataset=FEAST_OFFLINE_DATASET,
    project_id=GOOGLE_CLOUD_PROJECT,
    location=GOOGLE_CLOUD_REGION,
)

FEAST_REPO_CONFIG = RepoConfig(
    provider="gcp",
    registry=FEAST_REGISTRY,
    project="feature_repo",
    offline_store=FEAST_OFFLINE_STORE_CONFIG,
)

# Fully-qualified BigQuery table that receives the entity rows joined with
# their features. Built from the project/dataset constants instead of
# repeating them as literals, so the three values cannot drift apart; with the
# current constants this is
# "ueat-data-science-staging.ueat_ai.asana_test_dataset".
DATASET_WITH_FEATURE = f"{GOOGLE_CLOUD_PROJECT}.{FEAST_OFFLINE_DATASET}.asana_test_dataset"

store = FeatureStore(config=FEAST_REPO_CONFIG)

In [36]:

# To run against a smaller entity table (faster iteration), uncomment the
# assignment below and comment out the full-table assignment that follows it.
#bigquery_entity_table = "ueat-data-science-staging.ueat_ai.RECSYS_ENTITIES_WITH_SPLIT_TEST"

# Full entity table.
bigquery_entity_table = "ueat-data-science-staging.ueat_ai.RECSYS_ENTITIES_WITH_SPLIT"

# SQL selecting the entity rows; this string is passed as `entity_df` to
# store.get_historical_features() in a later cell.
# CREATED_AT is aliased to `event_timestamp` -- presumably the timestamp column
# Feast uses for its point-in-time join (confirm against the Feast docs).
# NOTE(review): the table alias `re` is not referenced anywhere in this query.
raw_data_query = f"""
SELECT
     RANK
    ,ORDER_ITEM_ID
    ,ORDER_ID
    ,ITEM_ID
    ,ROOT_ITEM_ID
    ,GLOBAL_USER_PROFILE_ID
    ,SERVICE_ID
    ,RESTAURANT_PROFILE_ID
    ,HEADQUARTER_PROFILE_ID
    ,QUANTITY
    ,CREATED_AT as event_timestamp
    ,CREATED_AT_LOCAL
    ,PROCESSED_DATE
    ,PROCESSED_DATE_LOCAL
    ,READY_DATE_LOCAL
    ,NEXT_ITEM_ID
    ,NEXT_ROOT_ITEM_ID
    ,SPLIT_STRATEGY_GLOBAL
    ,SPLIT_STRATEGY_HEADQUARTER
    ,SPLIT_STRATEGY_HYBRID
FROM `{bigquery_entity_table}` re
"""

Create the joined dataset (entity rows plus their features) in BigQuery

In [37]:
# Feature references to join onto the entity rows, written as
# "<feature view>:<feature name>".
features = [
    # Service-type flags
    "feature_menu_services:TYPE_DELIVERY",
    "feature_menu_services:TYPE_TAKEOUT",
    "feature_menu_services:TYPE_IN_RESTAURANT",
    "feature_menu_services:TYPE_OTHER",
    # Restaurant location
    "feature_restaurants:POSTAL_CODE",
    "feature_restaurants:PROVINCE",
    # Item attributes
    "feature_items:DISPLAY_NAME",
    "feature_items:IS_ITEM",
    "feature_items:IS_VARIATION",
    "feature_items:IS_OPTION",
    "feature_items:IS_SUGGESTION"
]

# Write the result to DATASET_WITH_FEATURE, replacing whatever the table
# already contains (WRITE_TRUNCATE).
job_config = QueryJobConfig(
    destination=DATASET_WITH_FEATURE,
    write_disposition="WRITE_TRUNCATE",
)

# Build the historical-features retrieval from the entity query...
retrieval_job = store.get_historical_features(
    entity_df=raw_data_query,
    features=features,
)

# ...and materialise it into the destination BigQuery table.
dataset_table = retrieval_job.to_bigquery(job_config=job_config)

Done writing to 'ueat-data-science-staging.ueat_ai.asana_test_dataset'.


Once this is done, we can access the dataset directly from BigQuery.

We load a sample (roughly 10%) of the data into a pandas DataFrame with `%%bigquery`.

In [39]:
%%bigquery sample_data
-- Load a 10 percent TABLESAMPLE of the joined dataset; the result is stored
-- in the notebook variable sample_data.
SELECT *
FROM `ueat-data-science-staging.ueat_ai.asana_test_dataset`
    TABLESAMPLE SYSTEM (10 PERCENT)

Query complete after 0.01s: 100%|██████████| 2/2 [00:00<00:00, 1008.00query/s]                        
Downloading: 100%|██████████| 3646375/3646375 [00:06<00:00, 576410.67rows/s]


In [40]:
# Preview the first five rows (head() defaults to n=5).
sample_data.head()

Unnamed: 0,RANK,ORDER_ITEM_ID,ORDER_ID,ITEM_ID,ROOT_ITEM_ID,GLOBAL_USER_PROFILE_ID,SERVICE_ID,RESTAURANT_PROFILE_ID,HEADQUARTER_PROFILE_ID,QUANTITY,...,TYPE_TAKEOUT,TYPE_IN_RESTAURANT,TYPE_OTHER,POSTAL_CODE,PROVINCE,DISPLAY_NAME,IS_ITEM,IS_VARIATION,IS_OPTION,IS_SUGGESTION
0,31,48112752,36004684,85280,85280,419306,4729,682,328,1,...,True,False,False,08019,Catalunya,espárragos,False,False,True,False
1,34,26078019,18428116,67036,67036,675294,4783,689,237,1,...,True,False,False,G8Y,Québec,sauce tzatziki,False,False,True,False
2,18,49537198,37036845,171280,171280,1706504,11606,1509,695,1,...,False,False,False,J9T 1T7,Québec,poutine bbq,False,False,True,False
3,26,36640607,27265234,166683,166683,750531,11975,1557,722,1,...,True,False,False,G6W 8H1,Québec,gâteau chômeur,True,False,True,False
4,19,57338944,42770542,454880,454880,1861068,90454,5178,1774,1,...,True,False,False,R2V 4G4,Manitoba,cucumbers,False,False,True,False


The relevant features tables in BigQuery are:
    
    1. FEATURE_MENU_SERVICES: Features about the type of service
    
    2. FEATURE_RESTAURANTS: Features about the restaurant, mainly geographic
    
    3. formatted_feature_items: Formatted item features
    
    