# Setup

This example shows how to use Bodo to write a DataFrame to an Iceberg table located on a local filesystem and then read it back to run a query.


In [1]:
import bodo
import time
import numpy as np
import pandas as pd
import json
import os
import pyarrow.fs as pafs

For this example, we will read TPCH data from S3. Note that this requires configuring your AWS credentials, e.g., by running `aws configure`.

In [2]:
@bodo.jit
def bodo_read_parquet(path):
    """Load the Parquet dataset at ``path`` into a DataFrame and return it."""
    return pd.read_parquet(path)

# Load the TPCH lineitem data from S3 (requires configured AWS credentials).
bodo_df = bodo_read_parquet("s3://bodo-example-data/tpch/SF1/lineitem.pq")
    

    conda install openjdk=11 -c conda-forge
and then reactivate your environment via
    conda deactivate && conda activate /Users/scottroutledge/miniforge3


In [3]:
# Shape of the loaded DataFrame as (rows, columns).
bodo_df.shape

(6001215, 16)

Bodo supports a handful of different catalogs for interacting with Iceberg tables. See Bodo's [iceberg documentation](https://docs.bodo.ai/2024.2/file_io/?h=iceberg#iceberg-section) for more details.

In [4]:
# Iceberg connection string: keep exactly one `conn` assignment active,
# matching the catalog you want to use. `conn` must be a plain string.
# Hadoop catalog on local filesystem (replace with the absolute path to your warehouse):
conn = "iceberg+file:///Absolute/Path/To/Your/Iceberg/Warehouse/"
# Hadoop catalog on S3:
# conn = "iceberg+s3://<S3 PATH TO ICEBERG WAREHOUSE>"
# AWS Glue:
# conn = "iceberg+glue?<params>//<PATH TO WAREHOUSE>"

# Schema (database) and table name used by the write and read cells below.
db_name = "TEST_DB"
table_name = "SF1_LINEITEM_PQ"

@bodo.jit
def write_iceberg_table(df):
    """Write ``df`` to the Iceberg table ``table_name`` in schema ``db_name``.

    The connection string, schema and table name come from the notebook-level
    variables ``conn``, ``db_name`` and ``table_name``. The write is issued
    with ``if_exists="fail"``.
    """
    print("starting write...")
    df.to_sql(
        name=table_name,
        con=conn,
        schema=db_name,
        if_exists="fail",
    )

# Write the DataFrame read from S3 into the Iceberg table.
write_iceberg_table(bodo_df)

starting write...


  res = func(*args, **kwargs)
    conda install openjdk=11 -c conda-forge
and then reactivate your environment via
    conda deactivate && conda activate /Users/scottroutledge/miniforge3
Launching JVM with Java executable: java


In [5]:
@bodo.jit
def read_iceberg_table(conn, table_name, db_name):
    """Read table ``table_name`` from schema ``db_name`` through ``conn``.

    Returns the result of ``pd.read_sql_table`` for that table.
    """
    return pd.read_sql_table(table_name=table_name, con=conn, schema=db_name)

In [6]:
@bodo.jit
def q01(lineitem):
    """Run the Q01 aggregation over the TPCH ``lineitem`` table and print it.

    Rows with ``L_SHIPDATE`` on or before 1998-09-02 are kept, the derived
    columns ``DISC_PRICE`` and ``CHARGE`` are computed, and the data is
    aggregated per (``L_RETURNFLAG``, ``L_LINESTATUS``) pair. The head of the
    sorted result and the elapsed wall-clock time are printed; nothing is
    returned.

    Args:
        lineitem: DataFrame containing at least the columns L_QUANTITY,
            L_EXTENDEDPRICE, L_DISCOUNT, L_TAX, L_RETURNFLAG, L_LINESTATUS,
            L_SHIPDATE and L_ORDERKEY.
    """
    t1 = time.time()
    date = pd.Timestamp("1998-09-02")

    # Keep only the columns the query needs.
    lineitem_filtered = lineitem.loc[
        :,
        [
            "L_QUANTITY",
            "L_EXTENDEDPRICE",
            "L_DISCOUNT",
            "L_TAX",
            "L_RETURNFLAG",
            "L_LINESTATUS",
            "L_SHIPDATE",
            "L_ORDERKEY",
        ],
    ]
    sel = lineitem_filtered.L_SHIPDATE <= date
    lineitem_filtered = lineitem_filtered[sel]

    # AVG_QTY / AVG_PRICE are copies of the raw columns so that the same
    # source column can be aggregated with both "sum" and "mean" below.
    lineitem_filtered["AVG_QTY"] = lineitem_filtered.L_QUANTITY
    lineitem_filtered["AVG_PRICE"] = lineitem_filtered.L_EXTENDEDPRICE
    lineitem_filtered["DISC_PRICE"] = lineitem_filtered.L_EXTENDEDPRICE * (
        1 - lineitem_filtered.L_DISCOUNT
    )
    lineitem_filtered["CHARGE"] = (
        lineitem_filtered.L_EXTENDEDPRICE
        * (1 - lineitem_filtered.L_DISCOUNT)
        * (1 + lineitem_filtered.L_TAX)
    )

    # Select the aggregated columns with a list: subsetting a groupby with a
    # bare tuple of keys is no longer accepted by pandas.
    gb = lineitem_filtered.groupby(["L_RETURNFLAG", "L_LINESTATUS"], as_index=False)[
        [
            "L_QUANTITY",
            "L_EXTENDEDPRICE",
            "DISC_PRICE",
            "CHARGE",
            "AVG_QTY",
            "AVG_PRICE",
            "L_DISCOUNT",
            "L_ORDERKEY",
        ]
    ]
    total = gb.agg(
        {
            "L_QUANTITY": "sum",
            "L_EXTENDEDPRICE": "sum",
            "DISC_PRICE": "sum",
            "CHARGE": "sum",
            "AVG_QTY": "mean",
            "AVG_PRICE": "mean",
            "L_DISCOUNT": "mean",
            "L_ORDERKEY": "count",
        }
    )
    total = total.sort_values(["L_RETURNFLAG", "L_LINESTATUS"])
    print(total.head())
    print("Q01 Execution time (s): ", time.time() - t1)
    

Finally, we can read from our local table and run a query.

In [7]:
@bodo.jit
def run_queries():
    """Read the Iceberg table and run Q01 on it, printing the read time.

    Uses the notebook-level ``conn``, ``table_name`` and ``db_name``.
    """
    banner = "#" * 128
    print(banner)
    print("Started Q01 Execution...")
    print(banner)

    # Time the Iceberg read on its own; q01 prints its own execution time.
    read_start = time.time()
    lineitem = read_iceberg_table(conn, table_name, db_name)
    print("Read time (s)", time.time() - read_start)

    q01(lineitem)
    

In [8]:
# Read the table back from Iceberg and run Q01 on it.
run_queries()

################################################################################################################################
Started Q01 Execution...
################################################################################################################################
Read time (s) 0.8145419999998467
  L_RETURNFLAG L_LINESTATUS  L_QUANTITY  ...     AVG_PRICE  L_DISCOUNT  L_ORDERKEY
3            A            F  37734107.0  ...  38273.129735    0.049985     1478493
1            N            F    991417.0  ...  38284.467761    0.050093       38854
2            N            O  74476040.0  ...  38249.117989    0.049997     2920374
0            R            F  37719753.0  ...  38250.854626    0.050009     1478870

[4 rows x 10 columns]
Q01 Execution time (s):  0.10305499999981294
