In [None]:
!pip install duckdb --pre --upgrade && pip install boto3

In [None]:
import os
import time

import boto3
import duckdb
import pandas as pd
from google.cloud import aiplatform

duckdb.__version__

In [None]:
pd.__version__

In [None]:
# Show the project currently configured in the gcloud CLI.
!gcloud config get project

In [None]:
PROJECT_ID = "

if PROJECT_ID == "" or PROJECT_ID is None or PROJECT_ID == "[your-project-id]":
    # Get your GCP project id from gcloud
    shell_output = ! gcloud config list --format 'value(core.project)' 2>/dev/null
    PROJECT_ID = shell_output[0]
    print("Project ID:", PROJECT_ID)

In [None]:
# Region passed to the Vertex AI SDK (also reused later as the BigQuery dataset location).
REGION = "us-west1"  # @param {type:"string"}
# ID of the Vertex AI featurestore this notebook works with.
FEATURESTORE_ID = "taxidata_fs"

# Set the default project and location for all subsequent aiplatform calls.
aiplatform.init(project=PROJECT_ID, location=REGION)

In [None]:
# Show system memory in GiB; use it to choose MAX_MEMORY in the next cell.
!free -g

In [None]:
MAX_MEMORY = "35GB"  # DuckDB memory limit; raise to about 75% of the memory available to Python
TMP_DIR = "fg-data-v8"  # Scratch directory (DuckDB file, DuckDB temp files, data folder)
DUCKDB_FILE = f"{TMP_DIR}/taxi.duckdb"
DATA_FOLDER = f"{TMP_DIR}/taxidata"

# S3 access.
# Credentials come from the standard AWS environment variables so that secrets
# are never typed into (or saved with) the notebook. When the variables are
# unset the values are empty strings, exactly as before.
AWS_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY_ID", "")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "")
AWS_REGION = 'us-east-2'
BUCKET = "hopsworks-bench-datasets"
session = boto3.Session(
    aws_access_key_id=AWS_ACCESS_KEY,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
)
s3 = session.resource('s3')


In [None]:
# Create the scratch directory and the data folder inside it
# (`-p`: no error if they already exist).
!mkdir -p {TMP_DIR}
!mkdir -p {DATA_FOLDER}

In [None]:
# Open (or create) the on-disk DuckDB database with a capped memory budget
# and TMP_DIR as its temp directory.
duckdb_settings = {'memory_limit': MAX_MEMORY, 'temp_directory': TMP_DIR}
con = duckdb.connect(DUCKDB_FILE, config=duckdb_settings)

# Install both extensions first, then load them (same order as before).
required_extensions = ("httpfs", "parquet")
for extension in required_extensions:
    con.execute(f"INSTALL {extension};")
for extension in required_extensions:
    con.execute(f"LOAD {extension};")

# Configure the S3 region and credentials for this connection.
con.execute(f"""
    SET s3_region='{AWS_REGION}';
    SET s3_access_key_id='{AWS_ACCESS_KEY}';
    SET s3_secret_access_key='{AWS_SECRET_ACCESS_KEY}';
    """)

In [None]:
# Tune the DuckDB connection: use 16 threads and do not require results to
# preserve insertion order.
# NOTE(review): with insertion order not preserved, LIMIT/OFFSET queries that
# have no ORDER BY may not return the same rows on every run — confirm this
# is acceptable for the paging done in read_feature_data.
con.execute("PRAGMA threads=16")
con.execute("SET preserve_insertion_order=false")


In [None]:
def read_feature_data(limit, offset):
    """Read one page of pickup features from the cleaned taxi Parquet files on S3.

    (Re)creates the ``taxidata`` view over the 2011-2016 files, then selects
    ``limit`` rows starting at ``offset``.

    Args:
        limit: Maximum number of rows to return.
        offset: Number of rows to skip before the page starts.

    Returns:
        pandas.DataFrame whose first column is ``row_id`` followed by the
        view's columns. ``row_id`` is the row's global position
        (``offset`` + position within the page), so ids from successive pages
        do not collide. For ``offset == 0`` this is the same 0-based id as before.
    """
    # Coerce to int so only plain integers are interpolated into the SQL text.
    limit = int(limit)
    offset = int(offset)
    query = f'''
        CREATE
        OR REPLACE VIEW taxidata
        AS
        SELECT
            tpep_pickup_datetime,
            pu_location_id,
            pu_borough,
            pu_svc_zone,
            pu_zone
        FROM
            read_parquet([
                's3://{BUCKET}/taxidata_cleaned/2011.parquet',
                's3://{BUCKET}/taxidata_cleaned/2012.parquet',
                's3://{BUCKET}/taxidata_cleaned/2013.parquet',
                's3://{BUCKET}/taxidata_cleaned/2014.parquet',
                's3://{BUCKET}/taxidata_cleaned/2015.parquet',
                's3://{BUCKET}/taxidata_cleaned/2016.parquet'
            ])
    '''
    con.execute(query)
    raw_data = con.execute(f"SELECT * FROM taxidata LIMIT {limit} OFFSET {offset}").df()
    # Prepend a globally unique row_id: previously every page was numbered
    # from 0, so each page reused the entity ids of the page before it.
    raw_data.insert(0, 'row_id', pd.RangeIndex(offset, offset + len(raw_data)))
    return raw_data

In [None]:
# Get the featurestore, creating it when the lookup fails.
try:
    taxidata_feature_store = aiplatform.Featurestore(
        featurestore_name=FEATURESTORE_ID
    )
    print("Featurestore already exists...")
except Exception as lookup_error:
    # `except Exception` (not a bare `except`) so Ctrl-C / kernel interrupts
    # are not mistaken for a missing featurestore; the reason is shown so a
    # permissions or network error is not silently reported as "not found".
    print(f"Featurestore not found ({lookup_error}), creating it instead...")
    taxidata_feature_store = aiplatform.Featurestore.create(
        featurestore_id=FEATURESTORE_ID,
        online_store_fixed_node_count=0
    )

taxidata_feature_store

In [None]:
def get_or_create_fg_entity_type(sf):
    """Return the pickup entity type for scale factor ``sf``, creating it if needed.

    The entity type id is ``pickup_read_entity_type_<sf>``. When the lookup
    fails, the entity type is created and its features are defined through
    ``create_fg_config``.

    Args:
        sf: Scale factor (millions of rows) used to name the entity type.

    Returns:
        The existing or newly created entity type object.
    """
    entity_type_name = f"pickup_read_entity_type_{sf}"
    try:
        pickup_fg_entity_type = taxidata_feature_store.get_entity_type(
            entity_type_id=entity_type_name,
        )
        print("pickup feature entity already exists...")
    except Exception as lookup_error:
        # `except Exception` keeps kernel interrupts from being treated as
        # "not found"; the reason is printed so other failures stay visible.
        print(f"pickup feature entity type not found ({lookup_error}), creating it instead...")
        pickup_fg_entity_type = taxidata_feature_store.create_entity_type(
            entity_type_id=entity_type_name,
            description="Pickup features entity type",
        )
        print("Created FG entity type", entity_type_name)
        # Create feature group config for newly created FG entity type
        create_fg_config(pickup_fg_entity_type)

    return pickup_fg_entity_type

In [None]:
# Feature definitions for the pickup entity type, keyed by feature id.
# Each entry carries the feature's value type and a human-readable description.
pickup_fg_config = {
    feature_id: {"value_type": value_type, "description": description}
    for feature_id, value_type, description in (
        ("pu_location_id", "INT64", "Pickup location ID"),
        ("pu_borough", "STRING", "Pickup borough"),
        ("pu_svc_zone", "STRING", "Pickup service zone"),
        ("pu_zone", "STRING", "Pickup zone"),
    )
}

def create_fg_config(pickup_fg_entity_type):
    """Create the features described by ``pickup_fg_config`` on an entity type.

    Best effort: a failure is reported on stdout instead of being raised, so
    re-running against an entity type that already has the features does not
    stop the notebook.

    Args:
        pickup_fg_entity_type: Entity type object on which to create the features.
    """
    try:
        pickup_fg_entity_type.batch_create_features(
            feature_configs=pickup_fg_config,
            sync=True,
        )
    except Exception as create_error:
        # Show the real reason: not every failure means "already exists".
        print(f"Entity feature group definition not created (it may already exist): {create_error}")
    else:
        print("Entity feature group definition created")

## Ingest features

In [None]:
scale_factor = [50]  # Number of millions of rows to scale
limit = 5000000  # Get 5M at a time so it's faster

for sf in scale_factor:
    total_rows = sf * 1000000  # Millions
    # The entity type and its feature ids are the same for every page,
    # so resolve them once per scale factor instead of once per page.
    pickup_fg_entity_type = get_or_create_fg_entity_type(sf)
    PICKUP_FEAT_IDS = [feature.name for feature in pickup_fg_entity_type.list_features()]

    offset = 0
    while offset < total_rows:
        print(f"Total rows: {total_rows}; Offset: {offset}")
        # Never read past total_rows on the last page.
        page_size = min(limit, total_rows - offset)
        # Compute the pickup features from raw data
        pickup_features = read_feature_data(page_size, offset)
        if pickup_features.empty:
            print("No more source rows; stopping early.")
            break
        pickup_features = pickup_features.astype({"row_id": "string"})
        # utc=True makes the values timezone-aware before the final cast:
        # casting tz-naive datetimes straight to a tz-aware dtype with
        # .astype() raises on pandas >= 2.
        pickup_features['tpep_pickup_datetime'] = pd.to_datetime(
            pickup_features['tpep_pickup_datetime'],
            format='%Y-%m-%d %H:%M:%S',
            utc=True,
        ).astype('datetime64[ns, UTC]')
        print(f"Inserting into FG entity type: {pickup_fg_entity_type} - {sf}")
        pickup_fg_entity_type.ingest_from_df(
            feature_ids=PICKUP_FEAT_IDS,
            feature_time="tpep_pickup_datetime",
            entity_id_field="row_id",
            df_source=pickup_features
        )
        offset += limit

# Benchmarks

In [None]:
def get_raw_data(sf):
    """Fetch the first ``sf`` million trip rows from the cleaned taxi Parquet files.

    Args:
        sf: Scale factor; the query is limited to ``sf * 1000000`` rows.

    Returns:
        pandas.DataFrame with a 0-based ``row_id`` as its first column,
        followed by the pickup timestamp, trip distance, tip amount and
        fare amount columns selected by the query.
    """
    query = f'''
      SELECT 
          tpep_pickup_datetime,
          trip_distance,
          tip_amount,
          fare_amount
      FROM 
          read_parquet([
            's3://{BUCKET}/taxidata_cleaned/2011.parquet', 
            's3://{BUCKET}/taxidata_cleaned/2012.parquet', 
            's3://{BUCKET}/taxidata_cleaned/2013.parquet',
            's3://{BUCKET}/taxidata_cleaned/2014.parquet',
            's3://{BUCKET}/taxidata_cleaned/2015.parquet',
            's3://{BUCKET}/taxidata_cleaned/2016.parquet'
          ])
      LIMIT {sf*1000000}
    '''
    trips = con.execute(query).df()
    # Put a positional row_id (0..n-1) in front of the queried columns.
    trips.insert(0, 'row_id', pd.RangeIndex(len(trips)))
    return trips

def transform_raw_data(sf, fg):
    """Build the read-instances DataFrame for a batch-serve request.

    Loads ``sf`` million raw rows, converts ``row_id`` to string and renames
    it to the entity type's name, and moves the pickup time to a trailing
    ``timestamp`` column typed ``datetime64[ns, UTC]``.

    Args:
        sf: Scale factor, in millions of rows.
        fg: Entity type object; its ``name`` becomes the id column's name.

    Returns:
        pandas.DataFrame with the entity id column, the pass-through columns
        and ``timestamp`` as the last column.
    """
    raw_data = get_raw_data(sf).astype({"row_id": "string"})
    raw_data = raw_data.rename(columns={'row_id': f'{fg.name}'})
    ts = raw_data.pop('tpep_pickup_datetime')
    print("before: ", ts.dtype)
    # utc=True makes the values timezone-aware before the final cast: casting
    # tz-naive datetimes straight to a tz-aware dtype with .astype() raises
    # on pandas >= 2.
    # NOTE(review): the ingestion cell parses this same column with
    # '%Y-%m-%d %H:%M:%S'; confirm which format applies if the column ever
    # arrives as strings.
    raw_data['timestamp'] = pd.to_datetime(
        ts, format='%Y-%m-%dT%H:%M:%SZ', utc=True
    ).astype('datetime64[ns, UTC]')
    print("after:  ", raw_data['timestamp'].dtype)
    return raw_data

In [None]:
def run_benchmark_with_data_in_memory(sf):
    """Time ``batch_serve_to_df`` for ``sf`` million read instances.

    Args:
        sf: Scale factor, in millions of rows.

    Returns:
        The training DataFrame returned by ``batch_serve_to_df``. Previously
        the function ended with a bare ``training_df`` expression, which does
        nothing inside a function, so the result was discarded.
    """
    pickup_fg_entity_type = get_or_create_fg_entity_type(sf)
    SERVING_FEATURE_IDS = {
        f"{pickup_fg_entity_type.name}": ["pu_location_id", "pu_borough", "pu_svc_zone", "pu_zone"]
    }
    raw_data = transform_raw_data(sf, pickup_fg_entity_type)
    # Only the batch-serve call is timed; preparing raw_data is excluded.
    start = time.time()
    training_df = taxidata_feature_store.batch_serve_to_df(
        serving_feature_ids=SERVING_FEATURE_IDS,
        read_instances_df=raw_data,
        pass_through_fields=["trip_distance", "fare_amount", "tip_amount"],
    )
    print(f"Time taken for in-memory create of FG training data: {time.time() - start}")

    return training_df


# Create training data in memory
### Note: In Vertex AI Feature Store, even the "in-memory" path (`batch_serve_to_df`) first writes the data to a BigQuery table and then serves it from there.

## SF=5M

In [None]:
sf=5
run_benchmark_with_data_in_memory(sf)

## SF=10M

In [None]:
sf=10
run_benchmark_with_data_in_memory(sf)

## SF=20M

In [None]:
sf=20
run_benchmark_with_data_in_memory(sf)

## SF=50M

In [None]:
sf=50
run_benchmark_with_data_in_memory(sf)

# Create training data in BigQuery

### Helper Functions

In [None]:
def create_read_instances_csv(sf, raw_data=None):
    """Write the read-instances DataFrame to ``ri-<sf>m.csv`` in the working directory.

    Args:
        sf: Scale factor, used only to build the file name.
        raw_data: DataFrame to write. When omitted, the notebook-global
            ``raw_data`` is used, which keeps existing one-argument calls working.

    Returns:
        The name of the CSV file that was written.

    Raises:
        NameError: If ``raw_data`` is omitted and no global ``raw_data`` exists.
    """
    if raw_data is None:
        # Backward-compatible fallback to the global the original code relied on.
        try:
            raw_data = globals()["raw_data"]
        except KeyError:
            raise NameError("name 'raw_data' is not defined") from None

    READ_INSTANCES_CSV = f'ri-{sf}m.csv'
    raw_data.to_csv(READ_INSTANCES_CSV, header=True, index=False)
    return READ_INSTANCES_CSV

In [None]:
# Write one read-instances CSV (ri-<sf>m.csv) per benchmark scale factor.
scale_factor = [5, 10, 20, 50]
for sf in scale_factor:
    pickup_fg_entity_type = get_or_create_fg_entity_type(sf)
    # `raw_data` must stay a notebook-level name: create_read_instances_csv(sf)
    # reads it from the global namespace.
    raw_data = transform_raw_data(sf, pickup_fg_entity_type)
    create_read_instances_csv(sf)
    # A bare `raw_data.count()` inside a loop is never displayed; print instead.
    print(f"SF {sf}M: wrote {len(raw_data)} read instances to ri-{sf}m.csv")

In [None]:
# Copy every ri-<sf>m.csv file to the GCS bucket; run_bq_table_benchmark reads
# the read instances back from this same bucket (gs://ayush-bench/).
! gsutil cp ri-*m.csv 'gs://ayush-bench/'

In [None]:
# Sanity-check the read-instances CSV by printing its first 5 lines (header
# included). This reads the local file, not the copy uploaded to GCS.
import csv

sf = 5
READ_INSTANCES_CSV = f'ri-{sf}m.csv'

# newline='' is what the csv module expects for files passed to csv.reader;
# the `with` block closes the file, so no explicit close() is needed.
with open(READ_INSTANCES_CSV, newline='') as file:
    reader = csv.reader(file, delimiter=',')
    # zip against range(5) stops after five rows without reading a sixth.
    for _, row in zip(range(5), reader):
        print(row)

In [None]:
from google.cloud import bigquery

def run_bq_table_benchmark(sf):
    """Time ``batch_serve_to_bq`` for the ``sf``-million-row read-instances CSV.

    Drops and recreates the destination BigQuery dataset (``taxidata_v1``),
    then serves the pickup features for ``gs://ayush-bench/ri-<sf>m.csv`` into
    a fresh table and prints the elapsed time.

    Args:
        sf: Scale factor, in millions of rows; selects both the CSV and the
            entity type ``pickup_read_entity_type_<sf>``.

    Returns:
        Elapsed wall-clock seconds of the ``batch_serve_to_bq`` call.
    """
    # Derive everything from `sf`. The table name used to come from the
    # notebook-global `pickup_fg_entity_type`, i.e. from whichever scale
    # factor happened to be processed last.
    entity_type_id = f"pickup_read_entity_type_{sf}"

    READ_INSTANCES_CSV = f'ri-{sf}m.csv'
    READ_INSTANCES_CSV_URI = f'gs://ayush-bench/{READ_INSTANCES_CSV}'

    SERVING_FEATURE_IDS = {
        entity_type_id: ["pu_location_id", "pu_borough", "pu_svc_zone", "pu_zone"]
    }

    # Output dataset
    DESTINATION_DATA_SET = "taxidata"  # @param {type:"string"}
    VERSION = "v1"
    DESTINATION_DATA_SET = "{prefix}_{version}".format(
        prefix=DESTINATION_DATA_SET, version=VERSION
    )

    # Output table. Make sure that the table does NOT already exist; the BatchReadFeatureValues API cannot overwrite an existing table
    DESTINATION_TABLE_NAME = f"taxidata_{entity_type_id}"  # @param {type:"string"}

    DESTINATION_PATTERN = "bq://{project}.{dataset}.{table}"
    DESTINATION_TABLE_URI = DESTINATION_PATTERN.format(
        project=PROJECT_ID, dataset=DESTINATION_DATA_SET, table=DESTINATION_TABLE_NAME
    )

    # Delete existing BigQuery dataset first as BatchReadFeatureValues API cannot overwrite an existing table
    client = bigquery.Client(project=PROJECT_ID)
    client.delete_dataset(
        DESTINATION_DATA_SET, delete_contents=True, not_found_ok=True
    )
    print("Deleted dataset '{}'.".format(DESTINATION_DATA_SET))

    # Create dataset (the same client is reused)
    dataset_id = "{}.{}".format(client.project, DESTINATION_DATA_SET)
    dataset = bigquery.Dataset(dataset_id)
    dataset.location = REGION
    try:
        dataset = client.create_dataset(dataset)
        print("Created dataset {}.{}".format(client.project, dataset.dataset_id))
    except Exception as create_error:
        # Still best effort, but report the real reason: not every failure
        # means the dataset already exists.
        print("Could not create dataset {}.{}: {}".format(
            client.project, dataset.dataset_id, create_error
        ))

    # Only the batch-serve call is timed.
    start = time.time()
    taxidata_feature_store.batch_serve_to_bq(
        bq_destination_output_uri=DESTINATION_TABLE_URI,
        serving_feature_ids=SERVING_FEATURE_IDS,
        read_instances_uri=READ_INSTANCES_CSV_URI,
        pass_through_fields=["trip_distance", "fare_amount", "tip_amount"],
    )
    elapsed = time.time() - start
    print(f"Time taken for BigQuery write of SF {sf}M FG training data: {elapsed}")
    return elapsed

## SF=5M

In [None]:
sf = 5
run_bq_table_benchmark(sf)

## SF=10M

In [None]:
sf = 10
run_bq_table_benchmark(sf)

## SF=20M

In [None]:
sf = 20
run_bq_table_benchmark(sf)

## SF=50M

In [None]:
sf = 50
run_bq_table_benchmark(sf)