## Libraries and variable settings

In [0]:
# Libraries

import requests
import json
from datetime import datetime
import os

# Handle to the SparkContext of the active session.
sc = spark.sparkContext

# Variables

# Socrata access: the app token is read from the Databricks secret scope.
socrata_token = dbutils.secrets.get(scope="bpina_secrets", key="socrata_api_token")
api_url = 'https://evergreen.data.socrata.com/resource/c53k-p9dd.json'

# Paging: rows requested per call, and the row count the paging must cover
# (presumably an upper bound on the dataset size -- confirm against the source).
chunk_size = 50000
max_dataset_size = 130000

# Ceiling division: number of pages needed to cover max_dataset_size rows.
num_tasks = -(-max_dataset_size // chunk_size)

# DBFS directory that receives the raw JSON chunks.
output_path = "dbfs:/external_data/seattle_911_raw_data/"

## Functions

In [0]:
# Functions

def parallel_api_ingestion(api_url, base_params_list, output_dbfs_dir, request_timeout=120):
    """Fetch several pages of an HTTP JSON API through Spark and store them on DBFS.

    The parameter sets in ``base_params_list`` are distributed with
    ``spark.sparkContext.parallelize`` and each one is used as the query string
    of a GET request against ``api_url``. The outcomes are collected on the
    driver; every successful response is written as ``chunk_<n>.json`` under
    ``output_dbfs_dir``, where ``<n>`` is the 1-based position of the request in
    ``base_params_list``. A failed request therefore never shifts the numbering
    of the other chunks; its file is simply not written (an existing file at
    that position is left untouched).

    Relies on the notebook globals ``spark`` and ``dbutils`` and on the
    module-level ``json`` and ``os`` imports.

    Args:
        api_url: Endpoint queried by every request.
        base_params_list: List of query-parameter dicts, one per request.
        output_dbfs_dir: DBFS directory that receives the chunk files.
        request_timeout: Timeout in seconds handed to ``requests.get`` so a
            stalled connection cannot block a Spark task forever.

    Returns:
        None. Progress and failures are reported with ``print`` on the driver.
    """

    def _redact(text, params):
        """Mask the values of token-like query parameters inside ``text``."""
        if isinstance(params, dict):
            for key, value in params.items():
                if "token" in str(key).lower() and value:
                    text = text.replace(str(value), "[REDACTED]")
        return text

    def fetch_socrata_data(task):
        """Run one GET request; return ``(data, None)`` or ``(None, error_message)``."""
        # Local import: keeps the function self-contained when it is shipped
        # to the Spark workers.
        import requests

        try:
            response = requests.get(api_url, params=task, timeout=request_timeout)
            response.raise_for_status()
            return response.json(), None
        except requests.exceptions.RequestException as e:
            # Both the params and the exception text (which embeds the URL) can
            # contain the app token, so the whole message is redacted.
            return None, _redact(f"Error fetching data for {task}: {e}", task)
        except ValueError:
            return None, _redact(f"Error decoding JSON for {task}", task)

    def write_json_to_dbfs(data, dbfs_path):
        """Serialize ``data`` as JSON and write it to ``dbfs_path`` (best effort)."""
        try:
            dbutils.fs.put(dbfs_path, json.dumps(data), overwrite=True)
            print(f"Successfully wrote data to: {dbfs_path}")
        except Exception as e:
            print(f"An error occurred while writing to DBFS: {e}")

    # Errors travel back to the driver as values: a print() executed inside a
    # Spark task would not show up in the notebook output.
    outcomes = (
        spark.sparkContext
        .parallelize(base_params_list)
        .map(fetch_socrata_data)
        .collect()
    )

    failed = 0
    # Enumerate over ALL outcomes so the file number always matches the
    # position of the request, even when an earlier request failed.
    for position, (data, error) in enumerate(outcomes, start=1):
        if error is not None:
            print(error)
            failed += 1
            continue
        if data is None:
            # A JSON "null" body carries no rows; treated as a failed call.
            failed += 1
            continue
        dbfs_path = os.path.join(output_dbfs_dir, f"chunk_{position}.json")
        write_json_to_dbfs(data, dbfs_path)

    if failed:
        print(f"{failed} of {len(outcomes)} requests failed; their chunk files were not written.")




## Execution

In [0]:
# Build one query-parameter dict per page of the dataset.
# "$order" pins a stable sort order: Socrata does not guarantee any implicit
# ordering, so "$offset" paging without it can repeat or skip rows across pages.
list_of_params = [
    {
        "$limit": chunk_size,
        "$$app_token": socrata_token,
        "$offset": page * chunk_size,
        "$order": ":id",
    }
    for page in range(num_tasks)
]


In [0]:
# Run the ingestion: fetch every page and write the chunks under output_path.

parallel_api_ingestion(
    api_url=api_url,
    base_params_list=list_of_params,
    output_dbfs_dir=output_path,
)
