In [5]:
import pandas as pd
import os
import glob
from pathlib import Path
from tqdm.auto import tqdm

import numpy as np

In [3]:

def format_gsod_full(gsod_df):
    """Clean the core GSOD measurement columns and add metric-unit versions.

    The frame is modified in place and the same object is returned.

    Every step is applied only to columns that are actually present, so a
    file lacking some measurement (for example no ``GUST`` column) is
    processed without raising ``KeyError``; the derived column for an absent
    source column is simply not created.

    Steps performed:
      * ``TEMP``, ``DEWP``, ``MAX``, ``MIN``, ``PRCP``, ``WDSP``, ``MXSPD``,
        ``GUST``, ``SLP`` are coerced to numeric (unparseable values -> NaN).
      * Temperature columns: 9999.9 / 999.9 are treated as missing, then a
        ``<col>_C`` column is added using the Fahrenheit -> Celsius formula.
      * ``PRCP``: 99.99 is treated as missing, then ``PRCP_mm`` is added
        (value * 25.4, inches -> millimetres).
      * Wind columns: 999.9 is treated as missing, then a ``<col>_ms`` column
        is added (value * 0.514444, knots -> metres per second).
      * ``SLP``: 9999.9 is treated as missing; values are otherwise kept as-is.
      * ``DATE`` is converted to datetime (unparseable values -> NaT).

    Parameters
    ----------
    gsod_df : pandas.DataFrame
        Raw GSOD records for one station file.

    Returns
    -------
    pandas.DataFrame
        The same ``gsod_df`` object, cleaned and with the extra columns.
    """
    temp_cols = ['TEMP', 'DEWP', 'MAX', 'MIN']
    wind_cols = ['WDSP', 'MXSPD', 'GUST']
    cols_to_numeric = temp_cols + ['PRCP'] + wind_cols + ['SLP']

    def _present(cols):
        # Restrict a list of expected column names to those in this frame.
        return [col for col in cols if col in gsod_df.columns]

    # Ensure all measurement columns are numeric before comparing to sentinels
    for col in _present(cols_to_numeric):
        gsod_df[col] = pd.to_numeric(gsod_df[col], errors='coerce')

    # Temperature (°F → °C), missing-value sentinels blanked first
    for col in _present(temp_cols):
        gsod_df.loc[gsod_df[col].isin([9999.9, 999.9]), col] = np.nan
        gsod_df[f"{col}_C"] = (gsod_df[col] - 32) * 5 / 9

    # Rainfall (inches → mm)
    if 'PRCP' in gsod_df.columns:
        gsod_df.loc[gsod_df['PRCP'] == 99.99, 'PRCP'] = np.nan
        gsod_df['PRCP_mm'] = gsod_df['PRCP'] * 25.4

    # Wind speed (knots → m/s)
    for col in _present(wind_cols):
        gsod_df.loc[gsod_df[col] == 999.9, col] = np.nan
        gsod_df[f"{col}_ms"] = gsod_df[col] * 0.514444

    # Pressure - keep as-is but replace the 9999.9 sentinel with NaN
    if 'SLP' in gsod_df.columns:
        gsod_df.loc[gsod_df['SLP'] == 9999.9, 'SLP'] = np.nan

    # DATE to datetime if not already
    if 'DATE' in gsod_df.columns:
        gsod_df['DATE'] = pd.to_datetime(gsod_df['DATE'], errors='coerce')

    return gsod_df


In [16]:


# Philippine station list; every column is read as text
stations_ph = pd.read_csv('isd-history-ph.csv', dtype=str)

# Combined station key: trimmed USAF immediately followed by trimmed WBAN
stations_ph['USAF_WBAN'] = (
    stations_ph['USAF'].str.strip().str.cat(stations_ph['WBAN'].str.strip())
)
station_ids_ph = set(stations_ph['USAF_WBAN'])

# Folder that receives the cleaned files (created if it does not exist yet)
output_dir = Path('./gsod_ph_cleaned')
output_dir.mkdir(exist_ok=True)

# Root folder of the raw GSOD CSVs; the loop below reads one sub-folder per year
base_dir = Path.home().joinpath('data', 'gsod', 'csv')

# Index the station metadata by the combined USAF+WBAN identifier once, rather
# than re-filtering the whole station table for every file. Keeping the first
# row per identifier mirrors taking the first match for a station.
station_lookup = (
    stations_ph
    .dropna(subset=['USAF_WBAN'])
    .drop_duplicates(subset='USAF_WBAN', keep='first')
    .set_index('USAF_WBAN')
)
# Convert coordinates/elevation to floats up front; missing entries become NaN
# so the resulting columns stay numeric instead of holding None objects.
for meta_col in ('LAT', 'LON', 'ELEV(M)'):
    station_lookup[meta_col] = pd.to_numeric(station_lookup[meta_col])

files_written = 0

for year in range(1990, 2025):
    year_folder = base_dir / str(year)
    print(f"Processing year: {year_folder.name}")

    # Globbing a folder that does not exist yields nothing without any error,
    # so say explicitly when a year is skipped for that reason.
    if not year_folder.is_dir():
        print(f"  Folder not found, skipping: {year_folder}")
        continue

    # Keep only Philippine stations; the file stem is the station identifier
    # (example: '98427099999'). Filtering first gives tqdm a known total.
    station_files = sorted(
        path for path in year_folder.glob('*.csv')
        if path.stem in station_lookup.index
    )

    for file_path in tqdm(station_files):
        file_name = file_path.stem

        # Read and clean the raw GSOD records
        df = format_gsod_full(pd.read_csv(file_path))

        # Attach station metadata to every row
        station_meta = station_lookup.loc[file_name]
        df['station_id'] = file_name
        df['station_name'] = station_meta['STATION NAME']
        df['lat'] = station_meta['LAT']
        df['lon'] = station_meta['LON']
        df['elevation_m'] = station_meta['ELEV(M)']

        # Save cleaned version
        out_file = output_dir / f"{year_folder.name}_{file_name}.parquet"
        df.to_parquet(out_file, index=False)
        files_written += 1

if files_written == 0:
    print(f"No station files were processed - check that {base_dir} contains "
          "per-year folders of GSOD CSV files.")
else:
    print(f"Wrote {files_written} cleaned files to {output_dir}")


Processing year: 1990


0it [00:00, ?it/s]

Processing year: 1991


0it [00:00, ?it/s]

Processing year: 1992


0it [00:00, ?it/s]

Processing year: 1993


0it [00:00, ?it/s]

Processing year: 1994


0it [00:00, ?it/s]

Processing year: 1995


0it [00:00, ?it/s]

Processing year: 1996


0it [00:00, ?it/s]

Processing year: 1997


0it [00:00, ?it/s]

Processing year: 1998


0it [00:00, ?it/s]

Processing year: 1999


0it [00:00, ?it/s]

Processing year: 2000


0it [00:00, ?it/s]

Processing year: 2001


0it [00:00, ?it/s]

Processing year: 2002


0it [00:00, ?it/s]

Processing year: 2003


0it [00:00, ?it/s]

Processing year: 2004


0it [00:00, ?it/s]

Processing year: 2005


0it [00:00, ?it/s]

Processing year: 2006


0it [00:00, ?it/s]

Processing year: 2007


0it [00:00, ?it/s]

Processing year: 2008


0it [00:00, ?it/s]

Processing year: 2009


0it [00:00, ?it/s]

Processing year: 2010


0it [00:00, ?it/s]

Processing year: 2011


0it [00:00, ?it/s]

Processing year: 2012


0it [00:00, ?it/s]

Processing year: 2013


0it [00:00, ?it/s]

Processing year: 2014


0it [00:00, ?it/s]

Processing year: 2015


0it [00:00, ?it/s]

Processing year: 2016


0it [00:00, ?it/s]

Processing year: 2017


0it [00:00, ?it/s]

Processing year: 2018


0it [00:00, ?it/s]

Processing year: 2019


0it [00:00, ?it/s]

Processing year: 2020


0it [00:00, ?it/s]

Processing year: 2021


0it [00:00, ?it/s]

Processing year: 2022


0it [00:00, ?it/s]

Processing year: 2023


0it [00:00, ?it/s]

Processing year: 2024


0it [00:00, ?it/s]