
# Metadata-Driven Ingestion Flow in Databricks

## Overview
This notebook demonstrates a **metadata-driven approach** to ingest data from multiple sources:
- **Batch Data**
- **Streaming Data**
- **API Data**
- **JDBC**

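The ingestion logic is controlled by a notebook parameter (`connection_type`, one of `batch`, `api`, `streaming`, or `jdbc`) and by metadata stored in Delta tables: one config table per connection type (`main.meta.batch_config`, `main.meta.api_config`, `main.meta.streaming_config`, `main.meta.jdbc_config`) combined with `main.meta.connection_config` on `source_id`.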


In [0]:
# Read the ingestion mode from the `connection_type` notebook widget / job parameter.
# The value is trimmed and lower-cased so that inputs such as "Batch" or " api "
# still match the lowercase literals ("batch", "api", "streaming", "jdbc") that
# the ingestion cell compares against.
selected_connection_type = dbutils.widgets.get("connection_type").strip().lower()
print(selected_connection_type)

In [0]:
%sql
-- Create the volume main.meta.parquet_files if it does not already exist.
-- The streaming branch of the ingestion cell writes non-Delta output under
-- /Volumes/<target_catalog>/<target_schema>/parquet_files/<source_id>.
CREATE VOLUME IF NOT EXISTS main.meta.parquet_files;

In [0]:
%run /Users/sarthak.bhatt@tcs.com/Functions_to_write_data

In [0]:
#%run /Workspace/Users/sarthak.bhatt@tcs.com/Functions_to_write_data_cloned

In [0]:
%python
if selected_connection_type == "batch":
    # Batch ingestion: load every enabled source listed in main.meta.batch_config,
    # clean its column names, and hand the result to write_to_format.
    print("Batch ingestion started...")
    batch_config_df = spark.read.table("main.meta.batch_config")
    connection_config_df = spark.read.table("main.meta.connection_config")

    # Keep only the sources whose `enabled` flag is true.
    file_batches = batch_config_df.filter(batch_config_df.enabled == True)
    display(file_batches)

    # Attach the connection_config row with the same source_id to each source.
    file_batches_with_path = file_batches.join(
        connection_config_df,
        file_batches.source_id == connection_config_df.source_id,
        "inner"
    )
    display(file_batches_with_path)

    for row in file_batches_with_path.collect():
        source_id = row["source_id"]
        source_path = row["source_path"]
        file_format = row["file_format"]
        target_file_format = row["target_file_format"]

        # Only the last dot-separated part of each value is kept, so a value that
        # already carries a catalog/schema prefix does not duplicate that prefix.
        target_catalog = row["target_catalog"].split(".")[-1]
        target_schema = row["target_schema"].split(".")[-1]
        target_table_name = row["target_table"].split(".")[-1]
        target_table = f"{target_catalog}.{target_schema}.{target_table_name}"
        print(f"Processing Batch ID: {source_id} | Source: {source_path} | Target: {target_table}")

        # The multiline/header/inferSchema options are set for every file_format.
        source_df = (
            spark.read
            .format(file_format)
            .option("multiline", "true")
            .option("header", True)
            .option("inferSchema", True)
            .load(source_path)
        )
        display(source_df)

        # Normalise column names: trim, turn spaces, "-" and "/" into "_", drop parentheses.
        # The loop variable is deliberately not named `col`: names assigned at the top
        # level of a notebook cell are notebook-wide, so `col` would overwrite any
        # existing binding of that name (e.g. an imported pyspark.sql.functions.col).
        for column_name in source_df.columns:
            clean_col = (
                column_name.strip()
                .replace(" ", "_")
                .replace("(", "")
                .replace(")", "")
                .replace("-", "_")
                .replace("/", "_")
            )
            if clean_col != column_name:
                source_df = source_df.withColumnRenamed(column_name, clean_col)

        write_to_format(source_df, target_table, target_file_format)

elif selected_connection_type == "api":
    import requests
    import json
    print("API ingestion started...")
    api_config_df = spark.read.table("main.meta.api_config")
    connection_config_df = spark.read.table("main.meta.connection_config")
    api_config_pd = api_config_df.toPandas()
    connection_config_pd = connection_config_df.toPandas()
    for _, api_row in api_config_pd.iterrows():
        source_id = api_row["source_id"]
        endpoint = api_row.get("endpoint") or api_row.get("api_url") or ""
        params = json.loads(api_row.get("params", "{}")) if api_row.get("params") else {}
        connection_id = api_row.get("connection_id") or api_row.get("source_id")
        conn_match = connection_config_pd[connection_config_pd["source_id"] == connection_id]
        if conn_match.empty:
            print(f"No connection config for source_id: {connection_id}. Skipping.")
            continue
        conn_details = conn_match.iloc[0].to_dict()
        base_url = conn_details.get("base_url") or conn_details.get("api_url") or ""
        auth_token = conn_details.get("auth_token", None)
        headers = json.loads(conn_details.get("api_headers", "{}")) if conn_details.get("api_headers") else {}
        if auth_token:
            headers["Authorization"] = f"Bearer {auth_token}"
        if base_url.endswith("/") and endpoint.startswith("/"):
            url = base_url + endpoint[1:]
        elif not base_url.endswith("/") and not endpoint.startswith("/"):
            url = base_url + "/" + endpoint
        else:
            url = base_url + endpoint

     # Dynamically get target catalog, schema, and table

        target_catalog = api_row.get("target_catalog", "main").split(".")[-1]
        target_schema = api_row.get("target_schema", "meta").split(".")[-1]
        target_table_name = api_row.get("target_table", "bronze_api").split(".")[-1]
        target_table = f"{target_catalog}.{target_schema}.{target_table_name}"
        target_file_format = api_row["target_file_format"]
        #print(target_file_format)
        print(f"Fetching data from API: {url}")
        response = requests.get(url, headers=headers, params=params)
        if response.status_code == 200:
            data = response.json()
            if isinstance(data, list):
                source_df = spark.createDataFrame(data)
            elif isinstance(data, dict) and "items" in data:
                source_df = spark.createDataFrame(data["items"])
            else:
                source_df = spark.createDataFrame([data])

        write_to_format(source_df, target_table, target_file_format)
                
        #print(f"Data from API {source_id} ingested into " f"{target_table if target_file_format == 'delta' else f'/Volumes/{target_catalog}/{target_schema}/parquet_files/{source_id}'}")
        
    else:
            print(f"Failed to fetch data from API {source_id}. Status: {response.status_code}")

elif selected_connection_type == "streaming":
    import re
    def sanitize_column_name(name):
        return re.sub(r'[ ,;{}()\n\t=]', '_', name)
    print("Streaming ingestion started...")
    streaming_config_df = spark.read.table("main.meta.streaming_config")
    connection_config_df = spark.read.table("main.meta.connection_config")
    streaming_batches = streaming_config_df.join(
        connection_config_df,
        streaming_config_df.source_id == connection_config_df.source_id,
        "inner"
    )
    allowed_formats = ["csv", "parquet", "json", "avro"]
    for row in streaming_batches.collect():
        source_id = row["source_id"]
        source_path = row["source_path"]
        file_format = row["file_format"]
        target_file_format = row["target_file_format"]
        # Dynamically construct target table from metadata
        target_catalog = row["target_catalog"].split(".")[-1]
        target_schema = row["target_schema"].split(".")[-1]
        target_table_name = row["target_table"].split(".")[-1]
        target_table = f"{target_catalog}.{target_schema}.{target_table_name}"
        checkpoint_location = f"/Volumes/{target_catalog}/{target_schema}/checkpoints/{source_id}/checkpoint_v2"
        schema_location = f"/Volumes/{target_catalog}/{target_schema}/checkpoints/{source_id}/schema_v2"
        df = (
            spark.readStream
            .format("cloudFiles")
            .option("cloudFiles.format", file_format)
            .option("cloudFiles.schemaLocation", schema_location)
            .option("mergeSchema", "true")
            .load(source_path)
        )
        source_df = df.toDF(*[sanitize_column_name(c) for c in df.columns])

        if target_file_format == "delta":
            (
                df.writeStream
                .format("delta")
                .option("checkpointLocation", checkpoint_location)
                .option("mergeSchema", "true")
                .outputMode("append")
                .trigger(availableNow=True)
                .toTable(target_table)
            )
        else:
            (
                df.writeStream
                .format("parquet")
                .option("checkpointLocation", checkpoint_location)
                .outputMode("append")
                .trigger(availableNow=True)
                .start(f"/Volumes/{target_catalog}/{target_schema}/parquet_files/{source_id}")
            )

elif selected_connection_type == "jdbc":
    jdbc_config_df = spark.read.table("main.meta.jdbc_config")
    connection_config_df = spark.read.table("main.meta.connection_config")
    jdbc_batches = jdbc_config_df.join(
        connection_config_df,
        jdbc_config_df.source_id == connection_config_df.source_id,
        "inner"
    )
    for row in jdbc_batches.collect():
        jdbc_url = row["jdbc_url"]
        jdbc_user = row["jdbc_user"]
        jdbc_password = row["jdbc_password"]
        # Dynamically construct target table name
        target_catalog = row["target_catalog"].split(".")[-1]
        target_schema = row["target_schema"].split(".")[-1]
        target_table_name = row["target_table"].split(".")[-1]
        target_file_format = row["target_file_format"]
        target_table = f"{target_catalog}.{target_schema}.{target_table_name}"
        dbtable = row["dbtable"] if "dbtable" in row.asDict() else None
        if not dbtable:
            print(f"‚ùå No dbtable specified for source_id {row['source_id']}. Skipping.")
            continue
        df = (
            spark.read
            .format("jdbc")
            .option("url", jdbc_url)
            .option("dbtable", dbtable)
            .option("user", jdbc_user)
            .option("password", jdbc_password)
            .load()
        )
        write_to_format(source_df, target_table, target_file_format)

else:
    raise ValueError("Invalid source type provided")


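Check the target table to confirm that data was ingested. The query below reads **main.meta.bronze_batch**; change the table name if your metadata points at a different target, such as **main.meta.bronze_target_metadatadriven_ingestion**.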


In [0]:
%sql
-- Show the rows currently stored in main.meta.bronze_batch.
SELECT * FROM main.meta.bronze_batch


In [0]:
# spark.sql("DELETE FROM main.meta.bronze_batch")
# spark.sql("DELETE FROM main.meta.bronze_api")
# spark.sql("DELETE FROM main.meta.bronze_streaming")


In [0]:
# WARNING: deletes every row from main.meta.bronze_batch.
# NOTE(review): this cell is live and placed after the ingestion cell, so running the
# whole notebook top to bottom empties this table at the end — confirm that is intended.
spark.sql("DELETE FROM main.meta.bronze_batch")