# Gravitino Federation Test
Single Spark session connecting to Gravitino to query Iceberg and Hudi tables

## 1. Environment Setup

In [1]:
import os
import subprocess

# Resolve the JDK through the macOS `java_home` helper (requesting version 19).
# Iceberg/Gravitino require Java 11 or newer.
java_home_cmd = ['/usr/libexec/java_home', '-v', '19']
java_home = subprocess.check_output(java_home_cmd, text=True).strip()

# Export the JDK, the local Spark 3.5 distribution and the Python interpreter
# that PySpark should use.
os.environ.update({
    'JAVA_HOME': java_home,
    'SPARK_HOME': os.path.expanduser('~/Documents/spark-3.5.3-bin-hadoop3'),
    'PYSPARK_PYTHON': 'python3',
    'PYSPARK_DRIVER_PYTHON': 'python3',
})

# Echo the resolved locations so the reader can confirm what will be used.
for env_name in ('JAVA_HOME', 'SPARK_HOME'):
    print(f"{env_name}: {os.environ[env_name]}")

JAVA_HOME: /Library/Java/JavaVirtualMachines/temurin-19.jdk/Contents/Home
SPARK_HOME: /Users/andywalner/Documents/spark-3.5.3-bin-hadoop3


## 2. Create Spark Session with Gravitino

In [2]:
import os
import subprocess

# MUST set environment BEFORE importing pyspark: the JVM that PySpark launches
# reads JAVA_HOME / SPARK_HOME from the process environment at start-up.
java_home = subprocess.check_output(['/usr/libexec/java_home', '-v', '19'], text=True).strip()
os.environ['JAVA_HOME'] = java_home
os.environ['SPARK_HOME'] = os.path.expanduser('~/Documents/spark-3.5.3-bin-hadoop3')
os.environ['PYSPARK_PYTHON'] = 'python3'
os.environ['PYSPARK_DRIVER_PYTHON'] = 'python3'

# Add Spark's own pyspark (and its bundled py4j) to the import path BEFORE any
# pip-installed copy. This only has an effect if pyspark has not already been
# imported in this kernel.
import glob
import sys

spark_python = os.path.join(os.environ['SPARK_HOME'], 'python')
spark_lib = os.path.join(spark_python, 'lib')

# Discover the bundled py4j source archive instead of hard-coding its version,
# so a Spark distribution shipping a different py4j still works. If nothing
# matches, fall back to the previously hard-coded file name.
py4j_candidates = sorted(glob.glob(os.path.join(spark_lib, 'py4j-*-src.zip')))
if py4j_candidates:
    spark_py4j = py4j_candidates[-1]
else:
    spark_py4j = os.path.join(spark_lib, 'py4j-0.10.9.7-src.zip')

# Insert at the beginning to override pip pyspark. Existing copies of the same
# entry are removed first so re-running this cell does not grow sys.path.
# Resulting order: [spark_py4j, spark_python, ...].
for path_entry in (spark_python, spark_py4j):
    while path_entry in sys.path:
        sys.path.remove(path_entry)
    sys.path.insert(0, path_entry)

# Now import and create session
from pyspark.sql import SparkSession

# NOTE(review): getOrCreate() returns an already-running session if one exists
# in this kernel; in that case launch-time settings such as spark.plugins and
# spark.jars.packages are presumably not re-applied. Restart the kernel after
# changing them.
# NOTE(review): `spark.sql.gravitino.enableHudiSupport` should be confirmed
# against the Gravitino Spark connector documentation; in the recorded run the
# later `hudi_catalog` query fails to resolve the catalog.
spark = (
    SparkSession.builder
    .appName("Gravitino Federation Test")
    .config("spark.plugins", "org.apache.gravitino.spark.connector.plugin.GravitinoSparkPlugin")
    .config("spark.sql.gravitino.uri", "http://localhost:8090")
    .config("spark.sql.gravitino.metalake", "test_metalake")
    .config("spark.sql.gravitino.enableIcebergSupport", "true")
    .config("spark.sql.gravitino.enableHudiSupport", "true")
    .config("spark.jars.packages",
            "org.apache.gravitino:gravitino-spark-connector-runtime-3.5_2.12:1.1.0,"
            "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.7.1,"
            "org.apache.hudi:hudi-spark3.5-bundle_2.12:0.15.0")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.sql.extensions",
            "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,"
            "org.apache.spark.sql.hudi.HoodieSparkSessionExtension")
    .getOrCreate()
)

spark.sparkContext.setLogLevel("WARN")
print(f"Spark version: {spark.version}")
print(f"Spark home: {os.environ['SPARK_HOME']}")
# In the recorded run this lists only `spark_catalog`, even though
# `iceberg_catalog` resolves in later cells.
spark.sql("SHOW CATALOGS").show()

26/02/03 15:14:59 WARN Utils: Your hostname, Andys-MacBook-Pro-2.local resolves to a loopback address: 127.0.0.1; using 10.255.46.205 instead (on interface en0)
26/02/03 15:14:59 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /Users/andywalner/.ivy2/cache
The jars for the packages stored in: /Users/andywalner/.ivy2/jars
org.apache.gravitino#gravitino-spark-connector-runtime-3.5_2.12 added as a dependency
org.apache.iceberg#iceberg-spark-runtime-3.5_2.12 added as a dependency
org.apache.hudi#hudi-spark3.5-bundle_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-241c364f-3aa2-44f0-b17b-892c4fdd84fd;1.0
	confs: [default]


:: loading settings :: url = jar:file:/Users/andywalner/Documents/spark-3.5.3-bin-hadoop3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found org.apache.gravitino#gravitino-spark-connector-runtime-3.5_2.12;1.1.0 in central
	found org.apache.gravitino#gravitino-client-java-runtime;1.1.0 in central
	found org.apache.gravitino#gravitino-client-java;1.1.0 in central
	found org.apache.gravitino#gravitino-api;1.1.0 in central
	found org.apache.commons#commons-lang3;3.14.0 in local-m2-cache
	found org.apache.commons#commons-collections4;4.4 in local-m2-cache
	found com.google.guava#guava;32.1.3-jre in local-m2-cache
	found com.google.guava#failureaccess;1.0.1 in local-m2-cache
	found com.google.guava#listenablefuture;9999.0-empty-to-avoid-conflict-with-guava in local-m2-cache
	found com.google.code.findbugs#jsr305;3.0.2 in local-m2-cache
	found org.checkerframework#checker-qual;3.37.0 in local-m2-cache
	found com.google.errorprone#error_prone_annotations;2.21.1 in local-m2-cache
	found com.google.j2objc#j2objc-annotations;2.8 in local-m2-cache
	found com.fasterxml.jackson.core#jackson-annotations;2.15.2 in central
	found com

Spark version: 3.5.3
Spark home: /Users/andywalner/Documents/spark-3.5.3-bin-hadoop3
+-------------+
|      catalog|
+-------------+
|spark_catalog|
+-------------+



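## 3. List Available Catalogs

Note: in this run `SHOW CATALOGS` lists only `spark_catalog`. The Gravitino-managed catalogs (for example `iceberg_catalog`) are still queryable in the sections below; they may not appear in this listing until they have been selected with `USE <catalog>` in the session.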

In [4]:
# List the catalogs the Spark session currently knows about.
catalogs_query = "SHOW CATALOGS"
spark.sql(catalogs_query).show()

+-------------+
|      catalog|
+-------------+
|spark_catalog|
+-------------+



## 4. Query Iceberg Table (from Tabular REST Catalog)

In [5]:
# Enumerate the schemas (namespaces) exposed by iceberg_catalog.
iceberg_schemas_query = "SHOW SCHEMAS IN iceberg_catalog"
spark.sql(iceberg_schemas_query).show()

+---------+
|namespace|
+---------+
|  test_db|
+---------+



In [6]:
# Enumerate the tables registered under iceberg_catalog.test_db.
iceberg_tables_query = "SHOW TABLES IN iceberg_catalog.test_db"
spark.sql(iceberg_tables_query).show()

+---------+-------------+-----------+
|namespace|    tableName|isTemporary|
+---------+-------------+-----------+
|  test_db|sales_iceberg|      false|
+---------+-------------+-----------+



In [7]:
# Read every column of the Iceberg sales table, ordered by transaction id.
iceberg_query = """
    SELECT * FROM iceberg_catalog.test_db.sales_iceberg
    ORDER BY transaction_id
"""
iceberg_df = spark.sql(iceberg_query)

print("Iceberg table (sales_iceberg):")
iceberg_df.show()

# The count is evaluated after the preview, as a separate action on the frame.
iceberg_row_count = iceberg_df.count()
print(f"Row count: {iceberg_row_count}")

Iceberg table (sales_iceberg):


                                                                                

+--------------+-------------+------+-------------------+
|transaction_id|customer_name|amount|   transaction_date|
+--------------+-------------+------+-------------------+
|             1|        Alice|   100|2024-01-15 00:00:00|
|             2|          Bob|   200|2024-01-16 00:00:00|
|             3|      Charlie|   150|2024-01-17 00:00:00|
|             4|        Diana|   300|2024-01-18 00:00:00|
|             5|          Eve|   250|2024-01-19 00:00:00|
+--------------+-------------+------+-------------------+

Row count: 5


## 5. Query Hudi Table (from Hive Metastore)

In [8]:
# Enumerate the schemas exposed by hudi_catalog.
# NOTE(review): in the recorded run this raised SCHEMA_NOT_FOUND for
# `hudi_catalog`, i.e. the name was not resolved as a catalog.
# TODO: confirm that `hudi_catalog` is registered in the metalake and that the
# Spark connector exposes it.
hudi_schemas_query = "SHOW SCHEMAS IN hudi_catalog"
spark.sql(hudi_schemas_query).show()

AnalysisException: [SCHEMA_NOT_FOUND] The schema `hudi_catalog` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a catalog, verify the current_schema() output, or qualify the name with the correct catalog.
To tolerate the error on drop use DROP SCHEMA IF EXISTS.

In [None]:
# Enumerate the tables registered under hudi_catalog.test_db.
hudi_tables_query = "SHOW TABLES IN hudi_catalog.test_db"
spark.sql(hudi_tables_query).show()

In [None]:
# Read the tier and discount columns of the Hudi customer table, ordered by
# transaction id.
hudi_query = """
    SELECT transaction_id, customer_tier, discount
    FROM hudi_catalog.test_db.customer_info_hudi
    ORDER BY transaction_id
"""
hudi_df = spark.sql(hudi_query)

print("Hudi table (customer_info_hudi):")
hudi_df.show()

# The count is evaluated after the preview, as a separate action on the frame.
hudi_row_count = hudi_df.count()
print(f"Row count: {hudi_row_count}")

## 6. Cross-Format Join (Iceberg + Hudi)

In [None]:
# Combine Iceberg sales rows with Hudi customer info on transaction_id.
# The INNER JOIN keeps only transactions present in both tables.
# NOTE(review): final_amount subtracts `discount` directly from `amount`, which
# assumes discount is an absolute value in the same unit — confirm.
join_query = """
    SELECT
        i.transaction_id,
        i.customer_name,
        i.amount,
        h.customer_tier,
        h.discount,
        (i.amount - h.discount) as final_amount
    FROM iceberg_catalog.test_db.sales_iceberg i
    INNER JOIN hudi_catalog.test_db.customer_info_hudi h
    ON i.transaction_id = h.transaction_id
    ORDER BY i.transaction_id
"""
joined_df = spark.sql(join_query)

print("Cross-format join result:")
joined_df.show()

# The count is evaluated after the preview, as a separate action on the frame.
joined_row_count = joined_df.count()
print(f"Joined row count: {joined_row_count}")

## 7. Aggregation on Federated Data

In [None]:
# Per customer tier: number of transactions, summed sale amount and mean
# discount, with the largest total first.
tier_agg_query = """
    SELECT
        h.customer_tier,
        COUNT(*) as transaction_count,
        SUM(i.amount) as total_amount,
        AVG(h.discount) as avg_discount
    FROM iceberg_catalog.test_db.sales_iceberg i
    INNER JOIN hudi_catalog.test_db.customer_info_hudi h
    ON i.transaction_id = h.transaction_id
    GROUP BY h.customer_tier
    ORDER BY total_amount DESC
"""
agg_df = spark.sql(tier_agg_query)

print("Aggregation by customer tier:")
agg_df.show()

## 8. Summary

In [None]:
# Summarise the run based on which result objects actually exist in the
# notebook namespace, instead of printing unconditional success messages.
# This is a best-effort check: a name that exists only shows that the cell
# defining it got as far as the assignment.
summary_steps = [
    ("Single Gravitino connection configured", "spark"),
    ("Queried Iceberg table from Tabular REST catalog", "iceberg_df"),
    ("Queried Hudi table from Hive Metastore", "hudi_df"),
    ("Performed cross-format join successfully", "joined_df"),
    ("Executed aggregations on federated data", "agg_df"),
]

# Pair every step message with a flag telling whether its result name is defined.
summary_results = [
    (step_message, result_name in globals())
    for step_message, result_name in summary_steps
]
all_steps_done = all(step_done for _, step_done in summary_results)

print("=== Test Complete ===" if all_steps_done else "=== Test Incomplete ===")
for step_message, step_done in summary_results:
    if step_done:
        print(f"- {step_message}")
    else:
        print(f"- (not completed) {step_message}")

## Cleanup (Optional)

In [None]:
# Uncomment the line below to stop the Spark session once you are finished
# with the notebook.
# spark.stop()