In [1]:
import asyncio
import aiofiles
import glob
import json
import csv
import os
import random
import pandas as pd

In [34]:
# Shared concurrency limit: at most 50 coroutines at a time may be inside the
# file open/read/write sections of read_json_file and write_json_file.
semaphore = asyncio.Semaphore(50)
async def read_json_file(file):
    """Read one JSON file and return the value stored under its "results" key.

    Args:
        file: Path of the JSON file to read (opened as UTF-8 text).

    Returns:
        The value of the top-level "results" key, or None when the file is
        not valid JSON or has no "results" field. A message is printed in
        both failure cases.
    """
    print(file)

    # Hold the semaphore only while the file is open; it exists to cap the
    # number of simultaneously open files, not to serialize parsing.
    async with semaphore:
        async with aiofiles.open(file, "r", encoding="utf-8") as f:
            raw_text = await f.read()

    try:
        data = json.loads(raw_text)
    except json.JSONDecodeError:
        print(f"❌ Error parsing {file}")
        return None

    try:
        return data["results"]
    except (KeyError, TypeError):
        # TypeError covers a top-level JSON value that is not an object
        # (e.g. a list or null), which cannot carry a "results" field either.
        print(f"⚠️ Missing 'results' field in {file}")
        return None
            
async def write_json_file(file, data):
    """Serialize ``data`` as indented JSON and write it to ``file`` as UTF-8.

    Args:
        file: Destination path; an existing file is overwritten.
        data: Any object accepted by ``json.dumps``.

    Returns:
        True once the file has been written and closed.

    Raises:
        TypeError, ValueError: If ``data`` cannot be serialized. This is
            raised before the destination is opened, so an existing file is
            left untouched.
    """
    # Serialize first: opening with mode "w" truncates the target, so a
    # serialization failure after opening would leave an empty file behind.
    payload = json.dumps(data, indent=4, ensure_ascii=False)

    async with semaphore:
        async with aiofiles.open(file, "w", encoding="utf-8") as f:
            await f.write(payload)

    # Report success only after the file handle has been closed.
    print(f"✅ {file} written.")
    return True



async def main(
    pattern='./data/2023-03-14_2025-03-14/GSOM/TAVG/*.json',
    output="all_tavg_gsom.json",
):
    """Merge the "results" lists of every matching JSON file into one file.

    Args:
        pattern: Glob pattern selecting the input JSON files.
        output: Path of the combined JSON file to write.

    Files for which ``read_json_file`` returns None (unparseable, or without
    a "results" field) are skipped instead of aborting the whole merge.
    """
    # glob returns paths in arbitrary order; sorting keeps the merged output
    # (and any later order-dependent de-duplication) reproducible across runs.
    all_json_files = sorted(glob.glob(pattern))
    print("{} JSON files found.".format(len(all_json_files)))

    results = await asyncio.gather(
        *(read_json_file(path) for path in all_json_files)
    )

    # read_json_file signals failure with None, which is not iterable.
    flattened_list = [
        item
        for sublist in results
        if sublist is not None
        for item in sublist
    ]

    await write_json_file(output, flattened_list)
    
  


# Top-level `await` is valid here because IPython/Jupyter executes cells with
# autoawait enabled; in a plain Python script use asyncio.run(main()) instead.
await main()


222 JSON files found.
./data/2023-03-14_2025-03-14/GSOM/TAVG/136000.json
./data/2023-03-14_2025-03-14/GSOM/TAVG/116000.json
./data/2023-03-14_2025-03-14/GSOM/TAVG/171000.json
./data/2023-03-14_2025-03-14/GSOM/TAVG/173000.json
./data/2023-03-14_2025-03-14/GSOM/TAVG/184000.json
./data/2023-03-14_2025-03-14/GSOM/TAVG/90000.json
./data/2023-03-14_2025-03-14/GSOM/TAVG/78000.json
./data/2023-03-14_2025-03-14/GSOM/TAVG/24000.json
./data/2023-03-14_2025-03-14/GSOM/TAVG/32000.json
./data/2023-03-14_2025-03-14/GSOM/TAVG/112000.json
./data/2023-03-14_2025-03-14/GSOM/TAVG/18000.json
./data/2023-03-14_2025-03-14/GSOM/TAVG/73000.json
./data/2023-03-14_2025-03-14/GSOM/TAVG/44000.json
./data/2023-03-14_2025-03-14/GSOM/TAVG/48000.json
./data/2023-03-14_2025-03-14/GSOM/TAVG/132000.json
./data/2023-03-14_2025-03-14/GSOM/TAVG/72000.json
./data/2023-03-14_2025-03-14/GSOM/TAVG/189000.json
./data/2023-03-14_2025-03-14/GSOM/TAVG/168000.json
./data/2023-03-14_2025-03-14/GSOM/TAVG/42000.json
./data/2023-03-14_2

In [39]:
# Load the merged records, rename `station` to `station_id`, tag every row
# with the dataset name, and put the columns in the export order.
sorted_columns = ["dataset_name", "station_id", "date", "datatype", "attributes", "value"]
df = (
    pd.read_json("all_tavg_gsom.json")
    .rename(columns={"station": "station_id"})  # the only column whose name changes
    .assign(dataset_name="GSOM")
    .loc[:, sorted_columns]
)
print(len(df))  # row count before de-duplication

# Keep the first record for each (station, date, datatype) key. A new frame is
# assigned instead of mutating in place so the cell's data flow stays explicit.
df = df.drop_duplicates(subset=["station_id", "date", "datatype"], keep="first")
print(len(df))  # row count after de-duplication

221981
221980


In [40]:
# Write the de-duplicated frame to CSV; index=False leaves the row index out of the file.
df.to_csv("all_tavg_gsom.csv", index=False)
