## Library imports and variable settings

In [0]:
# Libraries

import requests
import json
from datetime import datetime
import os

# Handle to the SparkContext behind the notebook's Spark session.
sc = spark.sparkContext

# --- Ingestion settings ---

# Socrata application token, read from the notebook's secret scope.
socrata_token = dbutils.secrets.get(scope="bpina_secrets", key="socrata_api_token")

# Socrata endpoint that the ingestion queries.
api_url = "https://evergreen.data.socrata.com/resource/c53k-p9dd.json"

# Rows requested per API call (sent as "$limit").
chunk_size = 50000

# Upper bound on the total number of rows requested across all calls.
max_dataset_size = 130000

# Ceiling division: how many chunk_size-sized requests cover max_dataset_size.
num_tasks = -(-max_dataset_size // chunk_size)

# Directory that receives the raw JSON files.
output_path = "dbfs:/external_data/seattle_911_raw_data/"

In [0]:
# Functions

def parallel_api_ingestion(api_url, base_params_list, output_dbfs_dir, *, request_timeout=(10, 300)):
    """Fetch API chunks in parallel Spark tasks and store each one as a JSON file.

    Every parameter dictionary in ``base_params_list`` becomes one GET request
    against ``api_url``, executed inside a Spark task. The tasks only perform
    the HTTP call and hand the serialized payload back; the files themselves
    are written from the driver through the Hadoop ``FileSystem`` API. The
    SparkContext and the py4j JVM handles are driver-only objects and cannot
    be captured by a function that is shipped to executors.

    Because the payloads are collected before being written, the combined
    size of all responses has to fit in driver memory.

    Args:
        api_url: Endpoint queried for every chunk.
        base_params_list: Iterable of dicts, one set of query parameters per
            request.
        output_dbfs_dir: Directory URI that receives one ``api_data_*.json``
            file per successful request. It is created when listing it fails.
        request_timeout: Value forwarded as ``timeout`` to ``requests.get``
            (seconds, or a ``(connect, read)`` tuple) so a stalled connection
            cannot block a task forever.

    Returns:
        None. Failures are printed one per line, followed by a summary of
        attempted versus successful calls.
    """
    spark_context = spark.sparkContext
    jvm = spark_context._jvm
    fs = jvm.org.apache.hadoop.fs.FileSystem.get(spark_context._jsc.hadoopConfiguration())

    def fetch_socrata_chunk(params):
        """Run one GET request; return ``(params, json_text, error_message)``.

        Exactly one of ``json_text`` / ``error_message`` is ``None``. Only
        plain Python values are referenced here so the function can be
        pickled and executed on an executor.
        """
        import json
        import requests

        # NOTE(review): the messages below embed the full params dict, which
        # may contain the API token - consider scrubbing it before logging.
        try:
            response = requests.get(api_url, params=params, timeout=request_timeout)
            response.raise_for_status()
            return params, json.dumps(response.json()), None
        except requests.exceptions.RequestException as e:
            return params, None, f"Error fetching data for {params}: {e}"
        except ValueError:
            return params, None, f"Error decoding JSON for {params}"

    # Check the directory that was actually requested, not a fixed path.
    # dbutils does not expose a specific "not found" exception type here,
    # so any listing failure is treated as "directory missing".
    try:
        dbutils.fs.ls(output_dbfs_dir)
    except Exception:
        dbutils.fs.mkdirs(output_dbfs_dir)
        print(f"Created directory: {output_dbfs_dir}")

    # Tolerate a directory given without its trailing slash.
    target_dir = output_dbfs_dir if output_dbfs_dir.endswith("/") else f"{output_dbfs_dir}/"

    # Materialize once so the input can be both distributed and counted.
    param_sets = list(base_params_list)
    outcomes = spark_context.parallelize(param_sets).map(fetch_socrata_chunk).collect()

    successful_writes = 0
    for params, payload, error in outcomes:
        if error is not None:
            # Printed on the driver so the message shows up in the notebook.
            print(error)
            continue

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        filename = f"api_data_{timestamp}_{hash(tuple(sorted(params.items())))}.json"
        path = jvm.org.apache.hadoop.fs.Path(f"{target_dir}{filename}")

        output_stream = fs.create(path, True)  # True for overwrite
        try:
            output_stream.write(bytearray(payload, 'utf-8'))
        finally:
            # Release the stream even when the write itself fails.
            output_stream.close()
        successful_writes += 1

    total_calls = len(param_sets)
    print(f"Attempted {total_calls} API calls. {successful_writes} were successful.")



In [0]:
# Build one query-parameter dict per request; each request starts
# chunk_size rows after the previous one.
list_of_params = [
    {
        "$limit": chunk_size,
        "$$app_token": socrata_token,
        "$offset": offset,
    }
    for offset in range(0, num_tasks * chunk_size, chunk_size)
]


In [0]:
# Run the ingestion for every prepared parameter set.
parallel_api_ingestion(
    api_url=api_url,
    base_params_list=list_of_params,
    output_dbfs_dir=output_path,
)
