In [0]:
import requests
import pandas as pd
from requests.exceptions import ConnectionError, Timeout
import time

# Endpoint on health.data.ny.gov and the query-string filter sent with the
# request (a single test_date value).
url = "https://health.data.ny.gov/resource/xdss-u53e.json"
params = dict(test_date="2023-08-30T00:00:00.000")

# Function to make the request with retries
def get_data_with_retries(url, params, retries=3, delay=5, timeout=10):
    """Fetch JSON from ``url``, retrying on connection errors and timeouts.

    Args:
        url: Endpoint to GET.
        params: Query-string parameters passed to ``requests.get``.
        retries: Maximum number of attempts.
        delay: Seconds to wait between two consecutive attempts.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without a timeout a stalled connection would block forever and
            the ``Timeout`` handler below could never fire.

    Returns:
        The decoded JSON body of the first successful response.

    Raises:
        requests.exceptions.HTTPError: Propagated immediately (not retried)
            when ``raise_for_status`` rejects the response.
        Exception: When every attempt failed with a connection error or
            timeout; the last such error is attached as ``__cause__``.
    """
    last_error = None
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, params=params, timeout=timeout)
            response.raise_for_status()  # Raise an error for bad status codes
            return response.json()
        except (ConnectionError, Timeout) as e:
            last_error = e
            print(f"Attempt {attempt} failed: {e}")
            # Only wait when another attempt will follow; sleeping after the
            # final failure just delays the error.
            if attempt < retries:
                time.sleep(delay)
    raise Exception("Failed to retrieve data after multiple attempts") from last_error

# Fetch the decoded JSON payload (retrying on transient network failures).
data = get_data_with_retries(url=url, params=params)

# Load the payload into a pandas DataFrame for tabular inspection.
pdf = pd.DataFrame(data=data)

display(pdf)

In [0]:
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType

# 1. Define the JSON payload schema emitted by Debezium:
#    Only the fields read downstream are declared: the post-change row image
#    ("after"), the operation code ("op") and the event timestamp ("ts_ms").
value_schema = StructType([
  StructField("after", StructType([
      StructField("id", IntegerType()),
      StructField("first_name", StringType()),
      StructField("last_name", StringType()),
      StructField("email", StringType())
  ])),
  StructField("op", StringType()),           # c = create, u = update, d = delete
  # ts_ms is milliseconds since the epoch (a 64-bit value), which does not fit
  # in a 32-bit IntegerType, so it must be parsed as LongType.
  # NOTE(review): if the target Delta table already exists with an INT
  # event_ts column, it will need a schema change to accept BIGINT.
  StructField("ts_ms", LongType())           # event timestamp (epoch millis)
])

In [0]:
# 2. Stream from the Kafka topic:
#    Build the streaming reader step by step, then open the source.
kafka_reader = spark.readStream.format("kafka")
kafka_reader = kafka_reader.option("kafka.bootstrap.servers", "localhost:9092")
kafka_reader = kafka_reader.option("subscribe", "dbserver1.inventory.customers")
# Start from the earliest offsets when no checkpoint exists yet.
kafka_reader = kafka_reader.option("startingOffsets", "earliest")
df_raw = kafka_reader.load()


In [0]:
# 3. Parse the JSON payload:
#    a) cast the Kafka `value` column to a string,
#    b) parse it with value_schema into a struct named "data",
#    c) flatten the struct into top-level columns.
df_json = df_raw.selectExpr("CAST(value AS STRING) as json_str")
df_event = df_json.select(from_json(col("json_str"), value_schema).alias("data"))
df_parsed = df_event.select(
    # Columns of the "after" row image keep their own names.
    *[
        col(f"data.after.{field}").alias(field)
        for field in ("id", "first_name", "last_name", "email")
    ],
    col("data.op").alias("operation"),
    col("data.ts_ms").alias("event_ts"),
)

In [0]:
# 4. Write into a Delta table in append mode:

checkpoint = "s3://databricksbijubucketnew/checkpoints/"

# NOTE(review): `checkpoint` already ends with "/", so the f-string below
# yields ".../checkpoints//kafka" — confirm the doubled slash is intended.
delta_writer = df_parsed.writeStream.format("delta")
delta_writer = delta_writer.option("checkpointLocation", f"{checkpoint}/kafka")
delta_writer = delta_writer.outputMode("append")
# availableNow trigger: intended to process the data currently available and
# then stop, which lets awaitTermination() below return.
delta_writer = delta_writer.trigger(availableNow=True)
query = delta_writer.table("bijucatalog.bijubronzeschema.customers_changes")

query.awaitTermination()

In [0]:
%sql
-- Inspect the change events written to the bronze Delta table.
SELECT *
FROM bijucatalog.bijubronzeschema.customers_changes

In [0]:
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType

# 1. Define the JSON payload schema emitted by Debezium:
#    Only the fields read downstream are declared: the post-change row image
#    ("after"), the operation code ("op") and the event timestamp ("ts_ms").
value_schema = StructType([
  StructField("after", StructType([
      StructField("id", IntegerType()),
      StructField("first_name", StringType()),
      StructField("last_name", StringType()),
      StructField("email", StringType())
  ])),
  StructField("op", StringType()),           # c = create, u = update, d = delete
  # ts_ms is milliseconds since the epoch (a 64-bit value), which does not fit
  # in a 32-bit IntegerType, so it must be parsed as LongType.
  # NOTE(review): if the target Delta table already exists with an INT
  # event_ts column, it will need a schema change to accept BIGINT.
  StructField("ts_ms", LongType())           # event timestamp (epoch millis)
])

# 2. Stream from the Kafka topic:
#    The broker is addressed through Docker Desktop's host alias. The previous
#    value had a stray leading "Y" ("Ydocker.for.mac.host.internal"), which is
#    not a resolvable host name.
#    NOTE(review): confirm this alias is the right broker address for the
#    environment this cell runs in.
df_raw = (
  spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "docker.for.mac.host.internal:9092")
    .option("subscribe", "dbserver1.inventory.customers")
    .option("startingOffsets", "earliest")   # start from the earliest offsets when no checkpoint exists
    .load()
)

# 3. Parse the JSON payload:
#    Cast the Kafka `value` column to a string, parse it with value_schema
#    into a struct named "data", then lift the wanted fields to the top level.
after_fields = ("id", "first_name", "last_name", "email")
json_strings = df_raw.selectExpr("CAST(value AS STRING) as json_str")
parsed_events = json_strings.select(
    from_json(col("json_str"), value_schema).alias("data")
)
df_parsed = parsed_events.select(
    *[col(f"data.after.{field}").alias(field) for field in after_fields],
    col("data.op").alias("operation"),
    col("data.ts_ms").alias("event_ts"),
)

# 4. Write into a Delta table in append mode:
#    No trigger is configured here, so awaitTermination() below blocks this
#    cell until the query is stopped or fails.
# NOTE(review): open-source PySpark exposes the table sink as toTable();
# confirm that .table() exists on the runtime this cell targets.
stream_writer = df_parsed.writeStream.format("delta")
stream_writer = stream_writer.option(
    "checkpointLocation", "/tmp/checkpoints/debezium_to_delta"
)
stream_writer = stream_writer.outputMode("append")
query = stream_writer.table("inventory.customers_changes")

query.awaitTermination()
