In [2]:
import asyncio
import aiofiles
import glob
import json
import csv
import os
import random
import pandas as pd

In [3]:
semaphore = asyncio.Semaphore(50)
async def read_json_file(file):
    """Read JSON files and extract required fields."""
    async with semaphore:
        async with aiofiles.open(file, "r", encoding="utf-8") as f:
            try:
                data = json.loads(await f.read())
                try:
                    results=data.get("results", [])
                    # processed_data = []
                    # for res in results:
                    #     try:
                    #         station_id = res["id"]
                    #         name = res["name"]
                    #         latitude = res["latitude"]
                    #         longitude = res["longitude"]
                    #         geom = f"SRID=4326;POINT({longitude} {latitude})"
                    #         processed_data.append((station_id, name, geom, latitude, longitude))
                    #     except KeyError:
                    #         print(f"⚠️ Missing fields in {file}: {res}")
                
                    return results

                except KeyError:
                    print(f"⚠️ Missing 'results' field in {file}")
                    return None

            except json.JSONDecodeError:
                print(f"❌ Error parsing {file}")
                return None
async def write_json_file(file, data):
    """Write JSON files asynchronously."""
    async with semaphore:
        async with aiofiles.open(file, "w", encoding="utf-8") as f:
            await f.write(json.dumps(data, indent=4, ensure_ascii=False))
            print(f"✅ {file} written.")
            return True

async def process_all_files(files,output_file):
    """Process all JSON files asynchronously."""
    tasks = [read_json_file(file) for file in files]
    results = await asyncio.gather(*tasks)
    all_results = [item for sublist in results for item in sublist]
    await write_json_file(output_file, all_results)
    return all_results



async def main():
    all_json_files = glob.glob('./api_responses/*.json')
    print("{} JSON files found.".format(len(all_json_files)))
    results=await process_all_files("all_stations.json",all_json_files)
    with open('all_stations.json', 'r') as f:
        data=json.load(f)
        df=pd.DataFrame(data)
        print (df.head())
        print (df.shape())
    



await main()


0 JSON files found.


FileNotFoundError: [Errno 2] No such file or directory: 'a'

GHCND:USC00094674


In [None]:
import aiohttp
from datetime import date

BASE_URL = "https://www.ncei.noaa.gov/cdo-web/api/v2"
TOKEN="fYtUjDsUnAwZJHAUEwNpZMoktpvwRUIZ"
CONCURRENT_REQUESTS = 5  # Set your desired limit
semaphore = asyncio.Semaphore(CONCURRENT_REQUESTS)
async def fetch(session, url):
    headers = {'Token': TOKEN}
    limit=url.split("limit=")[1].split("&")[0]
    offset=url.split("offset=")[1]
    endpoint=url.split("https://www.ncei.noaa.gov/cdo-web/api/v2/")[1].split("?")[0]
    print (f"Fetching {url} with limit {limit} and offset {offset} for endpoint {endpoint}")
    async with semaphore:
        async with session.get(url,headers=headers) as response:
            try: 
                data=await response.json()
                isWritten=await write_json_file(f"./api_responses/{endpoint}/{endpoint}_{limit}_{offset}.json",data)
                print (f"Written {isWritten}")
                return data
            except aiohttp.client_exceptions.ContentTypeError:
                print(f"❌ Error fetching {url}")
                return None
async def fetch_all(urls):
    async with aiohttp.ClientSession() as session:
        tasks = [fetch(session, url) for url in urls]
        return await asyncio.gather(*tasks)


from datetime import datetime
endpoints=["datatypes"]
limit=100
offsets=[i for i in range(0,1566,limit)]
urls = [f"{BASE_URL}/{endpoint}?limit={limit}&offset={offset}" for endpoint in endpoints for offset in offsets]

results=await fetch_all(urls)






Fetching https://www.ncei.noaa.gov/cdo-web/api/v2/datatypes?limit=100&offset=0 with limit 100 and offset 0 for endpoint datatypes
Fetching https://www.ncei.noaa.gov/cdo-web/api/v2/datatypes?limit=100&offset=100 with limit 100 and offset 100 for endpoint datatypes
Fetching https://www.ncei.noaa.gov/cdo-web/api/v2/datatypes?limit=100&offset=200 with limit 100 and offset 200 for endpoint datatypes
Fetching https://www.ncei.noaa.gov/cdo-web/api/v2/datatypes?limit=100&offset=300 with limit 100 and offset 300 for endpoint datatypes
Fetching https://www.ncei.noaa.gov/cdo-web/api/v2/datatypes?limit=100&offset=400 with limit 100 and offset 400 for endpoint datatypes
Fetching https://www.ncei.noaa.gov/cdo-web/api/v2/datatypes?limit=100&offset=500 with limit 100 and offset 500 for endpoint datatypes
Fetching https://www.ncei.noaa.gov/cdo-web/api/v2/datatypes?limit=100&offset=600 with limit 100 and offset 600 for endpoint datatypes
Fetching https://www.ncei.noaa.gov/cdo-web/api/v2/datatypes?limit=

NameError: name 'enndpoint' is not defined

In [None]:
from dateutil.relativedelta import relativedelta
urls=[]
with open('datasets.json', 'r') as f:
    data=json.load(f)
    datasets=[d for d in data['results']]
    for d in datasets:
        enddate=d['maxdate']
        date_obj=datetime.strptime(enddate, "%Y-%m-%d").date()
        startdate=date_obj-relativedelta(years=1)
        urls.append(f"{BASE_URL}/data?datasetid={d['id']}&startdate={startdate}&enddate={enddate}")



results=await fetch_all(urls[:1])
counts=[0:100:]


[{'metadata': {'resultset': {'offset': 1, 'count': 35014905, 'limit': 100}}, 'results': [{'date': '2024-03-12T00:00:00', 'datatype': 'PRCP', 'station': 'GHCND:AE000041196', 'attributes': 'D,,S,', 'value': 0}, {'date': '2024-03-12T00:00:00', 'datatype': 'TAVG', 'station': 'GHCND:AE000041196', 'attributes': 'H,,S,', 'value': 230}, {'date': '2024-03-12T00:00:00', 'datatype': 'TMAX', 'station': 'GHCND:AE000041196', 'attributes': ',,S,', 'value': 264}, {'date': '2024-03-12T00:00:00', 'datatype': 'PRCP', 'station': 'GHCND:AEM00041194', 'attributes': ',,S,', 'value': 0}, {'date': '2024-03-12T00:00:00', 'datatype': 'TAVG', 'station': 'GHCND:AEM00041194', 'attributes': 'H,,S,', 'value': 238}, {'date': '2024-03-12T00:00:00', 'datatype': 'TMAX', 'station': 'GHCND:AEM00041194', 'attributes': ',,S,', 'value': 264}, {'date': '2024-03-12T00:00:00', 'datatype': 'TAVG', 'station': 'GHCND:AEM00041217', 'attributes': 'H,,S,', 'value': 233}, {'date': '2024-03-12T00:00:00', 'datatype': 'TMAX', 'station': '