In [34]:
"""
Python testing suite
"""
from ai.chronon.repo.validator import ChrononRepoValidator
from ai.chronon.repo.compile import _write_obj
from ai.chronon.utils import get_underlying_source, get_query
from ai.chronon.api import ttypes

from ai.chronon.repo.run import Runner, download_jar

from datetime import datetime, timedelta

import getpass
import sys
import os

# Global vars.
user = getpass.getuser()
# Resolve the home directory the same way the shell expands `~` (the `%%file`
# cell in this notebook writes under `~/notebooks_home`), instead of assuming
# that every user's home lives at `/home/<user>`.
home = os.path.join(os.path.expanduser("~"), "notebooks_home")
chronon_root = os.path.join(home, "repo", "chronon")
output_folder = "production"
output_root = os.path.join(chronon_root, output_folder)

# Config specific vars
team = "chronon_test"
conf_name = f"{user}__join_test.v1"
metadata_name = ".".join([team, conf_name])
# Path of the compiled join config that write_test_config prints and that
# BaseArgs hands to the runner.
conf = os.path.join(output_root, "joins", team, conf_name)


# Add chronon modules to the path (only once, so re-running this cell does not
# keep growing sys.path).
if chronon_root not in sys.path:
    sys.path.append(chronon_root)


def mutate_obj(obj, modulo):
    """
    Dispatch `obj` to the matching sampling mutator.

    Joins go to `mutate_join` and GroupBys to `mutate_group_by`; any other
    type is returned untouched. The object is modified in place and also
    returned.
    """
    mutators = (
        (ttypes.Join, mutate_join),
        (ttypes.GroupBy, mutate_group_by),
    )
    for conf_type, mutator in mutators:
        if isinstance(obj, conf_type):
            mutator(obj, modulo)
            break
    return obj


def additional_wheres(keys, query, modulo):
    """
    Build the sampling filters `hash(<column>) % <modulo> = 0` for `keys`.

    :param keys: key names to sample on; `None` is treated as no keys.
    :param query: object exposing `selects`. When `selects` is not None every
        key is translated through it (a key missing from `selects` raises
        KeyError); when it is None the key names are used as-is.
    :param modulo: divisor interpolated into each filter.
    :return: list of filter strings, one per distinct column, in first-seen
        order.
    """
    selects = query.selects
    key_names = keys or []
    columns = key_names if selects is None else [selects[key] for key in key_names]
    # dict.fromkeys drops repeated columns (e.g. the same key listed by several
    # join parts) while preserving order, so the output is deterministic.
    return [f"hash({column}) % {modulo} = 0" for column in dict.fromkeys(columns)]


def mutate_group_by(obj, modulo):
    """
    Add key-sampling filters to every source of a GroupBy.

    For each entry in `obj.sources`, the query of the object returned by
    `get_underlying_source` gets the filters built by `additional_wheres`
    for `obj.keyColumns` appended to its existing `wheres`.

    :param obj: GroupBy to modify in place.
    :param modulo: divisor passed through to `additional_wheres`.
    :return: the same `obj`.
    """
    for source in map(get_underlying_source, obj.sources or []):
        query = source.query
        sampled = additional_wheres(obj.keyColumns, query, modulo)
        # Existing filters stay first; dict.fromkeys removes duplicates without
        # the arbitrary reordering that list(set(...)) would introduce.
        query.wheres = list(dict.fromkeys(list(query.wheres or []) + sampled))
    return obj


def mutate_join(obj, modulo):
    """
    Add key-sampling filters to a Join's left source and to every join part.

    The left query is filtered on the keys of all join parts, then each
    join part's GroupBy is filtered through `mutate_group_by`.

    :param obj: Join to modify in place.
    :param modulo: divisor passed through to `additional_wheres`.
    :return: the same `obj`.
    """
    join_parts = obj.joinParts or []
    keys = []
    for jp in join_parts:
        # NOTE(review): assumes keyMapping maps left column name -> group-by key
        # name, as in Chronon's JoinPart; confirm. It is inverted here so the
        # left-side filter uses the left-side name of each key.
        key_mapping = getattr(jp, "keyMapping", None) or {}
        right_to_left = {right: left for left, right in key_mapping.items()}
        keys.extend(right_to_left.get(key, key) for key in (jp.groupBy.keyColumns or []))
    query = get_underlying_source(obj.left).query
    sampled = additional_wheres(keys, query, modulo)
    # Existing filters stay first; dict.fromkeys removes duplicates (several
    # join parts often share a key) while keeping a deterministic order.
    query.wheres = list(dict.fromkeys(list(query.wheres or []) + sampled))
    for jp in join_parts:
        jp.groupBy = mutate_group_by(jp.groupBy, modulo)
    return obj
    

class BaseArgs(object):
    """
    Argument holder passed to `Runner` for a backfill of the test join.

    `ds` is the start partition of the query returned by
    `get_query(obj.left)` shifted forward by `days` days.
    """

    def __init__(self, obj, days = 10):
        partition_format = '%Y-%m-%d'
        start_partition = get_query(obj.left).startPartition
        end_date = datetime.strptime(start_partition, partition_format) + timedelta(days=days)

        self.mode = 'backfill'
        self.ds = end_date.strftime(partition_format)
        self.conf = conf
        self.repo = chronon_root
        self.sub_help = False
        self.online_jar = None
        self.online_class = None
        self.args = ''
        # Dots in the metadata name are replaced with underscores for the app name.
        self.app_name = obj.metaData.name.replace('.', '_')
        self.spark_submit_path = os.path.join(chronon_root, 'scripts/spark_submit.sh')
        self.list_apps = None
                                                                                       
def write_test_config(obj, modulo = 256, verbose = False):
    """
    Compile `obj` into a temporary test config.

    Overwrites the object's metadata name, team and output namespace with the
    notebook-level test values, applies the sampling filters via `mutate_obj`
    and writes the result with `_write_obj`. When the write reports success
    and `verbose` is set, the file at `conf` is printed.
    """
    meta = obj.metaData
    meta.name = metadata_name
    meta.team = team
    meta.outputNamespace = 'tmp'
    mutate_obj(obj, modulo)

    validator = ChrononRepoValidator(
        chronon_root_path=chronon_root,
        output_root=output_folder,
    )
    written = _write_obj(
        output_root,
        validator=validator,
        name=metadata_name,
        obj=obj,
        log_level=None,
        force_compile=False,
        force_overwrite=True,
    )
    if not (written and verbose):
        return
    with open(conf, 'r') as infile:
        print(infile.read())


def run_test_config(obj, days):
    """
    Run the test config for `obj` through the Chronon `Runner`.

    Fetches the jar with `download_jar(None)`, exports the current user as
    the `USER` environment variable and runs a `BaseArgs(obj, days=days)`
    job.
    """
    jar_path = download_jar(None)
    os.environ["USER"] = user
    runner_args = BaseArgs(obj, days=days)
    Runner(runner_args, jar_path).run()


print("Loaded Test Suite")

Loaded Test Suite


In [35]:
!pip install chronon-ai -U

Looking in indexes: https://artifactory.d.musta.ch/artifactory/api/pypi/pypi/simple, https://sssp-bighead-artifacts.d.musta.ch/release/wheels/, https://sssp-bighead-artifacts.d.musta.ch/branch/wheels/
Looking in links: https://download.pytorch.org/whl/torch_stable.html


In [26]:
%%file ~/notebooks_home/repo/chronon/group_bys/chronon_test/test_tmp.py
"""
Sample GroupBy for Join.

Defines `v2`, a snapshot-accuracy GroupBy keyed by `listing`.
"""
from airbnb import test_sources
from ai.chronon.group_by import (
    Aggregation,
    GroupBy,
    Operation,
    TimeUnit,
    Window,
    Accuracy,
)


v2 = GroupBy(
    sources=test_sources.listing__<metric>,
    keys=["listing"],
    aggregations=[
        # Sum of m_guests over a 7-day window.
        Aggregation(input_column="m_guests", operation=Operation.SUM, windows=[Window(7, TimeUnit.DAYS)]),
        # Sum of m_dated_<metric> with no window specified.
        Aggregation(input_column="m_dated_<metric>", operation=Operation.SUM),
    ],
    backfill_start_date="2022-01-14",
    output_namespace="chronon_test",
    # Upstream partition spec; `{{ ds }}` is left as a template placeholder.
    dependencies=["<tablename>/ds={{ ds }}"],
    table_properties={
        'abb_retention_config_json': '{"policy": "delete_by_last_modified", "days": 30}',
    },
    online=True,
    production=True,
    team_override='ml_infra',
    accuracy=Accuracy.SNAPSHOT,
)

Overwriting /home/USER/notebooks_home/repo/chronon/group_bys/chronon_test/test_tmp.py


In [27]:
"""
Sample Online Join on a small dataset.
"""
from airbnb import test_sources
from ai.chronon.join import Join, JoinPart
from group_bys.chronon_test import test_online_group_by_small, test_tmp
from airbnb.data_sources_2 import HiveEventSource
from ai.chronon.query import Query, select


# Test join: a Hive event source on the left, two GroupBys on the right.
v1 = Join(
    left=HiveEventSource(
    namespace='global',
    table="<tablename>",
    query=Query(
        # Output column name -> source expression; the `listing` key is read
        # from `id_product`.
        selects=select(
            listing="id_product",
            m_guests="m_guests",
            m_dated_<metric>="m_dated_<metric>"
        ),
        wheres=["<dimension> = 'VALUE'"],
        time_column="UNIX_TIMESTAMP(ts) * 1000",
        # BaseArgs derives the backfill end date from start_partition.
        start_partition="2022-01-14",
        end_partition="2022-03-14",
    )),
    right_parts=[
        JoinPart(group_by=test_online_group_by_small.v1),
        JoinPart(group_by=test_tmp.v2),
    ],
    online=True,
    
)

In [28]:
# Compile the sampled test join and echo the generated config.
write_test_config(v1, modulo=256, verbose=True)

                Join Team - [34mchronon_test[0m
                Join Name - [34mUSER_polynote_join_test.v1[0m
[33mForce overwrite Join USER_polynote_join_test.v1[0m
          Writing Join to - [34m/home/USER/notebooks_home/repo/chronon/production/joins/chronon_test/USER_polynote_join_test.v1[0m
{
  "metaData": {
    "name": "chronon_test.USER_polynote_join_test.v1",
    "online": 1,
    "production": 0,
    "customJson": "{\"check_consistency\": false, \"lag\": 0}",
    "dependencies": [
      "{\"name\": \"wait_for_<tablename>_ds\", \"spec\": \"<tablename>/ds={{ ds }}\", \"start\": \"2022-01-14\", \"end\": \"2022-03-14\"}",
      "{\"name\": \"wait_for_<tablename>_ds\", \"spec\": \"<tablename>/ds={{ ds }}\", \"start\": \"2022-01-14\", \"end\": null}"
    ],
    "outputNamespace": "tmp",
    "team": "chronon_test"
  },
  "left": {
    "events": {
      "table": "<tablename>",
      "query": {
        "selects": {
          "listing": "id_product",
          "m_guests": "m_guest

In [30]:
start_time = datetime.now()
try:
    run_test_config(v1, 14)
finally:
    # Report the elapsed time even when the job fails, so failed runs can be
    # compared too.
    # NOTE(review): the recorded run ended with exit status 137 (128 + SIGKILL),
    # which usually means the process was killed, often for memory; confirm.
    print(f"Time taken: {(datetime.now() - start_time).total_seconds()} seconds")

Running command: curl -s https://s01.oss.sonatype.org/service/local/repositories/public/content/ai/chronon/spark_uber_2.11/maven-metadata.xml
Running command: curl -sI https://s01.oss.sonatype.org/service/local/repositories/public/content/ai/chronon/spark_uber_2.11/0.0.7/spark_uber_2.11-0.0.7-assembly.jar
Running command: wc -c /tmp/spark_uber_2.11-0.0.7-assembly.jar
Files sizes of https://s01.oss.sonatype.org/service/local/repositories/public/content/ai/chronon/spark_uber_2.11/0.0.7/spark_uber_2.11-0.0.7-assembly.jar vs. /tmp/spark_uber_2.11-0.0.7-assembly.jar
    Remote size: 25456697
    Local size : 25456697
Sizes match. Assuming its already downloaded.
Setting env variables:
Found EMR_CLUSTER=<emr cluster>
Found EMR_QUEUE=backfill
Found EXECUTOR_CORES=1
Found DRIVER_MEMORY=15G
Found EXECUTOR_MEMORY=8G
Found PARALLELISM=4000
Found MAX_EXECUTORS=1000
Found APP_NAME=chronon_test_USER_polynote_join_test_v0
Found CHRONON_CONF_PATH=/home/USER/notebooks_home/repo/chronon/production/joins

+ mkdir -p /tmp/USER
+ export LOG4J_FILE=/tmp/USER/log4j_file
+ LOG4J_FILE=/tmp/USER/log4j_file
+ cat
+ export TEST_NAME=chronon_test_USER_polynote_join_test_v0_USER_test
+ TEST_NAME=chronon_test_USER_polynote_join_test_v0_USER_test
+ unset PYSPARK_DRIVER_PYTHON
+ unset PYSPARK_PYTHON
+ unset SPARK_HOME
+ unset SPARK_CONF_DIR
+ grep -v YarnScheduler:70
+ emr-spark-submit --spark-version 2.4.0 --emr-cluster <emr cluster> --hive-cluster silver --queue backfill --driver-java-options ' -Dlog4j.configuration=file:/tmp/USER/log4j_file' --conf 'spark.executor.extraJavaOptions= -XX:ParallelGCThreads=4 -XX:+UseParallelGC -XX:+UseCompressedOops' --conf spark.reducer.maxReqsInFlight=1024 --conf spark.reducer.maxBlocksInFlightPerAddress=1024 --conf spark.reducer.maxSizeInFlight=256M --conf spark.shuffle.file.buffer=1M --conf spark.shuffle.service.enabled=true --conf spark.shuffle.service.index.cache.entries=2048 --conf spark.shuffle.io.serverThreads=128 --conf spark.shuffle.io.backLog=1024 --conf 

Preparing Spark client dependencies...
Namespace(caller='spark-submit-k8s', cluster_info_directory='s3://sssp/data-infra/emr-client-config/', data_infra_git_sha='', dev_mode=False, emr_cluster='<emr cluster>', host_env='k8s', metastore='silver', name='spark', skip_update=False, version='2.4.0')
Running command: aws s3 cp s3://sssp/data-infra/emr-client-config/cluster-info.yaml /tmp/cluster-info-0d3b3fe1-65a8-4059-8271-c4066f4ff7d8.yaml
Running command: aws s3api head-object --bucket sssp --key hadoop-emr/libs/emr-5.30.2-0.tar.gz
Artifact has not changed, skipping downloading.
prepare_dwi_client_deps_kvp:: env_var: SPARK_HOME, value: /mnt/dwi_client/spark/2.4.0/current/spark_2.4.0_uncompressed/emr-5.30.2-0/spark
The link /mnt/dwi_client/spark/2.4.0/current/spark_2.4.0_uncompressed/emr-5.30.2-0/ to /mnt/opt/hadoop-emr/emr-current already exists, skipping
Running command: aws s3api head-object --bucket sssp --key spark/hive_jars/hive-2.3.4_jars_3a37fbc.tar.gz
Artifact has not changed, ski

CalledProcessError: Command '['bash', '/home/USER/notebooks_home/repo/chronon/scripts/spark_submit.sh', '--class', 'ai.chronon.spark.Driver', '/tmp/spark_uber_2.11-0.0.7-assembly.jar', 'join', '--conf-path=/home/USER/notebooks_home/repo/chronon/production/joins/chronon_test/USER_polynote_join_test.v1', '--end-date=2022-01-28']' returned non-zero exit status 137.