<mark>Raw-Bronze → Raw ingested data (no/minimal transformations)</mark>

In [11]:
# ADLS Gen2 locations used by this notebook.
# Nothing is mounted in this cell: it only builds the abfss:// URI of each
# storage container so that later cells can address them directly.

# Container names, one per data tier
tiers = ["raw-bronze", "refined-silver", "business-gold"]

# tier name -> container root URI (every URI ends with a trailing "/")
adls_paths = dict(
    (tier, f"abfss://{tier}@synapsetest1298.dfs.core.windows.net/") for tier in tiers
)

# Convenience aliases, unpacked in the same order as `tiers`
bronze_adls, silver_adls, gold_adls = (adls_paths[tier] for tier in tiers)

# Display the mapping as the cell output
adls_paths

StatementMeta(Synapsedemo, 0, 12, Finished, Available, Finished)

{'raw-bronze': 'abfss://raw-bronze@synapsetest1298.dfs.core.windows.net/',
 'refined-silver': 'abfss://refined-silver@synapsetest1298.dfs.core.windows.net/',
 'business-gold': 'abfss://business-gold@synapsetest1298.dfs.core.windows.net/'}

In [19]:
# importing required libraries
import requests
import json
from datetime import date, timedelta

StatementMeta(Synapsedemo, 0, 20, Finished, Available, Finished)

In [13]:
# Ingestion window: from yesterday (start_date) to today (end_date).
# date.today() is read exactly once and start_date is derived from it, so the
# two bounds are always one day apart even if the cell runs across midnight.
end_date = date.today()
start_date = end_date - timedelta(days=1)

# Display the window as the cell output
start_date, end_date

StatementMeta(Synapsedemo, 0, 14, Finished, Available, Finished)

(datetime.date(2025, 9, 12), datetime.date(2025, 9, 13))

In [14]:
# Fetch the earthquake events for the [start_date, end_date] window from the
# USGS event API (GeoJSON format) and persist them to the bronze container.
url = f"https://earthquake.usgs.gov/fdsnws/event/1/query?format=geojson&starttime={start_date}&endtime={end_date}"

try:
    # Bound the request so a stalled connection cannot hang the notebook run
    response = requests.get(url, timeout=60)

    # Raise HTTPError for bad responses (4xx or 5xx)
    response.raise_for_status()

    # Keep only the list of event features; default to an empty list if absent
    data = response.json().get('features', [])

    if not data:
        print("No data returned for the specified date range.")
    else:
        # bronze_adls already ends with "/", so strip it before joining to
        # avoid a doubled "//" in the target path
        file_path = f"{bronze_adls.rstrip('/')}/{start_date}_earthquake_data.json"

        # Serialize the features compactly; the string is only handed to Spark,
        # so pretty-printing would add size without any benefit
        json_data = json.dumps(data)

        # Wrap the JSON document in a single-element RDD so Spark can parse it
        rdd = spark.sparkContext.parallelize([json_data])
        df = spark.read.json(rdd)

        # No row limit is applied here. DataFrame.limit() returns a new
        # DataFrame, so to cap rows its result must be assigned before writing.
        df.write.mode("overwrite").json(file_path)

        print(f"Data successfully saved to {file_path}")
except requests.exceptions.RequestException as e:
    print(f"Error fetching data from API: {e}")
    # Re-raise so the run fails here instead of continuing (and reporting
    # success to the pipeline) without any ingested data
    raise
     

StatementMeta(Synapsedemo, 0, 15, Finished, Available, Finished)

Data successfully saved to abfss://raw-bronze@synapsetest1298.dfs.core.windows.net//2025-09-12_earthquake_data.json


In [15]:
# Preview the first fetched feature as the cell output.
# `data` may be an empty list when the API returned no events for the window;
# in that case show nothing rather than raising IndexError.
data[0] if data else None

StatementMeta(Synapsedemo, 0, 16, Finished, Available, Finished)

{'type': 'Feature',
 'properties': {'mag': 0.9,
  'place': '8 km NNW of The Geysers, CA',
  'time': 1757721597550,
  'updated': 1757723237978,
  'tz': None,
  'url': 'https://earthquake.usgs.gov/earthquakes/eventpage/nc75237097',
  'detail': 'https://earthquake.usgs.gov/fdsnws/event/1/query?eventid=nc75237097&format=geojson',
  'felt': None,
  'cdi': None,
  'mmi': None,
  'alert': None,
  'status': 'automatic',
  'tsunami': 0,
  'sig': 12,
  'net': 'nc',
  'code': '75237097',
  'ids': ',nc75237097,',
  'sources': ',nc,',
  'types': ',nearby-cities,origin,phase-data,scitech-link,',
  'nst': 16,
  'dmin': 0.007739,
  'rms': 0.02,
  'gap': 82,
  'magType': 'md',
  'type': 'earthquake',
  'title': 'M 0.9 - 8 km NNW of The Geysers, CA'},
 'geometry': {'type': 'Point',
  'coordinates': [-122.817337036133, 38.8286666870117, 1.79999995231628]},
 'id': 'nc75237097'}

In [17]:
# Sanity check: list the entries at the root of the bronze container so the
# freshly written output can be seen.

files = mssparkutils.fs.ls(bronze_adls)
for entry in files:
    # One line per entry: name, directory flag, file flag, full path, size, modification time
    details = (entry.name, entry.isDir, entry.isFile, entry.path, entry.size, entry.modifyTime)
    print(*details)

StatementMeta(Synapsedemo, 0, 18, Finished, Available, Finished)

2025-09-12_earthquake_data.json True False abfss://raw-bronze@synapsetest1298.dfs.core.windows.net/2025-09-12_earthquake_data.json 0 1757752683341


In [18]:
# Hand the run's metadata (ingestion start date and the three container URIs)
# to the next pipeline step via the notebook exit value.

# Values to expose to the pipeline; the date is converted to an ISO-8601 string
output_data = dict(
    start_date=start_date.isoformat(),
    bronze_adls=bronze_adls,
    silver_adls=silver_adls,
    gold_adls=gold_adls,
)

# The exit value is passed as a JSON string
bronze_output = json.dumps(output_data)

# End the notebook run and return the JSON payload to the caller
mssparkutils.notebook.exit(bronze_output)


StatementMeta(Synapsedemo, 0, 19, Finished, Available, Finished)

ExitValue: {"start_date": "2025-09-12", "bronze_adls": "abfss://raw-bronze@synapsetest1298.dfs.core.windows.net/", "silver_adls": "abfss://refined-silver@synapsetest1298.dfs.core.windows.net/", "gold_adls": "abfss://business-gold@synapsetest1298.dfs.core.windows.net/"}