### Configuring credentials
To run the following code, ensure that you have the following environment variables set:
* `SF_USERNAME`
* `SF_PASSWORD`
* `SF_ACCOUNT`
* `DATABASE_NAME`

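This example uses data from TPC-H. In your Snowflake account, ensure that you can access the [TPC-H sample database](https://docs.snowflake.com/en/user-guide/sample-data-tpch). The code below also uses a warehouse named `TEST_WH`; change the `warehouse` variable if your warehouse has a different name.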

In [3]:
import os
# Snowflake connection settings. Credentials are read from the environment so
# that no secrets are stored in the notebook; a missing required variable
# raises KeyError naming that variable.
username = os.environ["SF_USERNAME"]
password = os.environ["SF_PASSWORD"]
account = os.environ["SF_ACCOUNT"]
# The warehouse can be overridden with the optional SF_WAREHOUSE variable;
# when it is unset, the previous hard-coded value "TEST_WH" is used.
warehouse = os.environ.get("SF_WAREHOUSE", "TEST_WH")
database = os.environ["DATABASE_NAME"]
# Schema whose `lineitem` table is queried by tpch_q01_sql.
schema = "TPCH_SF1"

In [7]:
import time
import bodo
import bodosql
@bodo.jit(cache=False)
def tpch_q01_sql(schema, conn_str):
    """Run the Q01 aggregation query over ``<schema>.lineitem`` and time it.

    Args:
        schema: Schema name; interpolated into the table path as
            ``f"{schema}.lineitem"``.
        conn_str: Connection string handed to ``bodosql.TablePath`` for the
            ``"sql"`` source.

    Returns:
        The result of ``BodoSQLContext.sql`` for the query: one row per
        (``l_returnflag``, ``l_linestatus``) group with the sum/avg/count
        aggregates, ordered by those two columns.

    Side effects:
        Prints the elapsed time (via ``time.time()``) measured from just
        before the context is created until the query call has returned.
    """
    started = time.time()

    # Register the table under the name the SQL text refers to (LINEITEM).
    ctx = bodosql.BodoSQLContext(
        {
            "LINEITEM": bodosql.TablePath(
                f"{schema}.lineitem",
                "sql",
                conn_str=conn_str,
                reorder_io=True,
            ),
        }
    )

    # Only rows shipped on or before 1998-12-01 minus 90 days are aggregated.
    result = ctx.sql(
        """select
                l_returnflag,
                l_linestatus,
                sum(l_quantity) as sum_qty,
                sum(l_extendedprice) as sum_base_price,
                sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
                sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
                avg(l_quantity) as avg_qty,
                avg(l_extendedprice) as avg_price,
                avg(l_discount) as avg_disc,
                count(*) as count_order
            from
                lineitem
            where
                l_shipdate <= date '1998-12-01' - interval '90' day
            group by
                l_returnflag,
                l_linestatus
            order by
                l_returnflag,
                l_linestatus"""
    )

    elapsed = time.time() - started
    print("Q01 Execution time (s): ", elapsed)
    return result

# Run Q01 against Snowflake; the connection URL is assembled from the
# credentials and settings defined in the configuration cell.
q1_result = tpch_q01_sql(
    schema,
    f"snowflake://{username}:{password}@{account}/{database}/PUBLIC?warehouse={warehouse}",
)

Q01 Execution time (s):  8.574808000000303


In [8]:
# Display the Q01 result produced by the previous cell.
q1_result

Unnamed: 0,L_RETURNFLAG,L_LINESTATUS,SUM_QTY,SUM_BASE_PRICE,SUM_DISC_PRICE,SUM_CHARGE,AVG_QTY,AVG_PRICE,AVG_DISC,COUNT_ORDER
0,A,F,37734107.0,56586554400.73,53758257134.87,55909065222.827705,25.522006,38273.129735,0.049985,1478493
1,N,F,991417.0,1487504710.38,1413082168.0541,1469649223.194375,25.516472,38284.467761,0.050093,38854
2,N,O,74476040.0,111701729697.74,106118230307.6056,110367043872.497,25.502227,38249.117989,0.049997,2920374
3,R,F,37719753.0,56568041380.9,53741292684.604,55889619119.83192,25.505794,38250.854626,0.050009,1478870


### Configuring credentials
The following code requires AWS credentials; ensure that you have run `aws configure` before executing it.

In [16]:
import time
import bodo
import bodosql
@bodo.jit(cache=True)
def divvy_sql():
    """Count rows per ``member_casual`` value in the Divvy parquet dataset.

    The parquet data at ``s3://bodo-divvy-data/parquet`` is registered as the
    table ``DIVVY`` and queried through BodoSQL.

    Returns:
        The result of ``BodoSQLContext.sql``: one row per ``member_casual``
        value with its row count in the ``count`` column.

    Side effects:
        Prints the elapsed time (via ``time.time()``) measured from just
        before the context is created until the query call has returned.
    """
    started = time.time()

    # Register the S3 parquet dataset under the name used in the SQL text.
    ctx = bodosql.BodoSQLContext(
        {
            "DIVVY": bodosql.TablePath(
                "s3://bodo-divvy-data/parquet",
                "parquet",
            ),
        }
    )

    # The column name is double-quoted inside the SQL text.
    counts = ctx.sql(
        """select
               \"member_casual\",count(*) as count from divvy group by \"member_casual\""""
    )

    elapsed = time.time() - started
    print("Execution time (s): ", elapsed)
    return counts

# Run the Divvy aggregation and keep its result for display.
df = divvy_sql()

Execution time (s):  1.527407999999923


In [17]:
# Display the per-`member_casual` counts returned by divvy_sql().
df

Unnamed: 0,member_casual,COUNT
0,casual,1376114
1,member,1866825


In [18]:
# Point the following Q01 run at the TPCH_SF10 schema; this overrides the
# value assigned to `schema` in the configuration cell.
schema = "TPCH_SF10"

In [19]:
import time
import bodo
import bodosql
@bodo.jit(cache=True)
def tpch_q01_sql(schema, conn_str):
    """Run the Q01 aggregation query over ``<schema>.lineitem`` and time it.

    This definition is decorated with ``cache=True``.

    Args:
        schema: Schema name; interpolated into the table path as
            ``f"{schema}.lineitem"``.
        conn_str: Connection string handed to ``bodosql.TablePath`` for the
            ``"sql"`` source.

    Returns:
        The result of ``BodoSQLContext.sql`` for the query: one row per
        (``l_returnflag``, ``l_linestatus``) group with the sum/avg/count
        aggregates, ordered by those two columns.

    Side effects:
        Prints the elapsed time (via ``time.time()``) measured from just
        before the context is created until the query call has returned.
    """
    started = time.time()

    # Register the table under the name the SQL text refers to (LINEITEM).
    ctx = bodosql.BodoSQLContext(
        {
            "LINEITEM": bodosql.TablePath(
                f"{schema}.lineitem",
                "sql",
                conn_str=conn_str,
                reorder_io=True,
            ),
        }
    )

    # Only rows shipped on or before 1998-12-01 minus 90 days are aggregated.
    result = ctx.sql(
        """select
                l_returnflag,
                l_linestatus,
                sum(l_quantity) as sum_qty,
                sum(l_extendedprice) as sum_base_price,
                sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
                sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
                avg(l_quantity) as avg_qty,
                avg(l_extendedprice) as avg_price,
                avg(l_discount) as avg_disc,
                count(*) as count_order
            from
                lineitem
            where
                l_shipdate <= date '1998-12-01' - interval '90' day
            group by
                l_returnflag,
                l_linestatus
            order by
                l_returnflag,
                l_linestatus"""
    )

    elapsed = time.time() - started
    print("Q01 Execution time (s): ", elapsed)
    return result

# Re-run Q01 with the current `schema` value; the connection URL is assembled
# from the credentials and settings defined in the configuration cell.
q1_result = tpch_q01_sql(
    schema,
    f"snowflake://{username}:{password}@{account}/{database}/PUBLIC?warehouse={warehouse}",
)

Q01 Execution time (s):  46.32196000000022


In [20]:
# Display the Q01 result produced by the previous cell.
q1_result

Unnamed: 0,L_RETURNFLAG,L_LINESTATUS,SUM_QTY,SUM_BASE_PRICE,SUM_DISC_PRICE,SUM_CHARGE,AVG_QTY,AVG_PRICE,AVG_DISC,COUNT_ORDER
0,A,F,377518399.0,566065727797.25,537759104278.0657,559276670892.1167,25.500975,38237.151009,0.050007,14804077
1,N,F,9851614.0,14767438399.170006,14028805792.211391,14590490998.366732,25.522448,38257.81066,0.049973,385998
2,N,O,743124873.0,1114302286901.88,1058580922144.964,1100937000170.5913,25.498076,38233.902923,0.050001,29144351
3,R,F,377732830.0,566431054976.0,538110922664.7676,559634780885.0863,25.508385,38251.219274,0.049997,14808183


In [22]:
import time
import bodo
import bodosql
@bodo.jit(cache=True)
def divvy_sql():
    """Count rows per ``member_casual`` value in the Divvy parquet dataset.

    The parquet data at ``s3://bodo-divvy-data/parquet`` is registered as the
    table ``DIVVY`` and queried through BodoSQL.

    Returns:
        The result of ``BodoSQLContext.sql``: one row per ``member_casual``
        value with its row count in the ``count`` column.

    Side effects:
        Prints the elapsed time (via ``time.time()``) measured from just
        before the context is created until the query call has returned.
    """
    started = time.time()

    # Register the S3 parquet dataset under the name used in the SQL text.
    ctx = bodosql.BodoSQLContext(
        {
            "DIVVY": bodosql.TablePath(
                "s3://bodo-divvy-data/parquet",
                "parquet",
            ),
        }
    )

    # The column name is double-quoted inside the SQL text.
    counts = ctx.sql(
        """select
               \"member_casual\",count(*) as count from divvy group by \"member_casual\""""
    )

    elapsed = time.time() - started
    print("Execution time (s): ", elapsed)
    return counts

# Run the Divvy aggregation and keep its result for display.
df = divvy_sql()

Execution time (s):  1.5023689999998169


In [23]:
# Display the per-`member_casual` counts returned by divvy_sql().
df

Unnamed: 0,member_casual,COUNT
0,casual,1376114
1,member,1866825
