In [4]:
import os
import pickle
import numpy as np
import pandas as pd
from datetime import datetime
import xarray as xr
from pyproj import Proj
from scipy.spatial.distance import cdist
from metpy.calc import dewpoint_from_relative_humidity
from metpy.units import units

In [2]:
# Run configuration.
# hourafter: when True, reports are matched to the top of the *following* hour
#            (timestamps are rounded up); when False they are rounded down.
# source:    which observation archive to process ('ASOS' or 'mPING').
hourafter = True
source = 'mPING'

# Output sub-directory name; the "_hourafter" suffix keeps both matching modes apart.
save_dir = f"{source}_hourafter" if hourafter else source

In [5]:
# Input / output locations.
path_precip = "/glade/p/cisl/aiml/ai2es/winter_ptypes/"
path_rap = "/glade/p/cisl/aiml/conv_risk_intel/rap_ncei_nc/"
path_save = "/glade/p/cisl/aiml/ai2es/winter_ptypes/precip_rap/"

# Pick the raw observation files that belong to the configured source.
# ASOS files are CSVs whose name starts with 'ASOS'; mPING files are recognised
# by the substring 'eliot' in their name.
if source == 'ASOS':
    precip_files = [f for f in os.listdir(path_precip)
                    if f.endswith('.csv') and f.startswith('ASOS')]
elif source == 'mPING':
    precip_files = [f for f in os.listdir(path_precip) if 'eliot' in f]
else:
    # Fail here with a clear message instead of a NameError on precip_files below.
    raise ValueError(f"Unknown source {source!r}; expected 'ASOS' or 'mPING'.")
precip_files.sort()

In [7]:
# Open one sample analysis file to inspect which variables / dimensions it holds.
path_rap = "/glade/p/cisl/aiml/conv_risk_intel/rap_ncei_nc/"
date = '20190202'
hour = '14'
sample_file = os.path.join(path_rap, date, f"rap_130_{date}_{hour}00_000.nc")
ds = xr.open_dataset(sample_file)
ds

In [4]:
# Load every observation file, tag it with its precipitation type, build a
# 'datetime' column and round it to the hour used for matching.
precip_types = ['ra', 'sn', 'pl', 'fzra']
frames = []
for file in precip_files:
    df_temp = pd.read_csv(os.path.join(path_precip, file))

    # The precipitation type is one of the dot-separated tokens of the file name.
    name_tokens = file.split('.')
    matched_types = [p for p in precip_types if p in name_tokens]
    if not matched_types:
        raise ValueError(f"Cannot infer precipitation type from file name {file!r}")
    df_temp['precip'] = matched_types[0]

    # Count *rows* containing a NaN (not NaN cells) so the message matches what is dropped.
    n_bad_rows = int(df_temp.isna().any(axis=1).sum())
    if n_bad_rows > 0:
        print(f"Dropping {n_bad_rows} rows from {file} because NaNs are present.")
        df_temp = df_temp.dropna()

    if df_temp.empty:
        print(f"Skipping {file}: no rows left after dropping NaNs.")
        continue

    # Some files are read with the observation date as the index; move it back
    # into an 'obdate' column. '%m' is the month directive ('%M' is minutes).
    try:
        datetime.strptime(df_temp.index[0], '%m/%d/%Y')
        df_temp = df_temp.reset_index().rename(columns={'index': 'obdate'})
    except (TypeError, ValueError):
        # Index is not a date string (e.g. the default integer index): nothing to do.
        pass

    # Positional access: label 0 may no longer exist if the first row was dropped above.
    if 'Z' in df_temp['obtime'].iloc[0]:
        # ISO-style timestamps with a trailing 'Z': strip it and normalise obdate
        # to the mm/dd/YYYY format used by the other files.
        df_temp['obtime'] = df_temp['obtime'].str[:-1]
        df_temp['datetime'] = pd.to_datetime(df_temp['obdate'] + ' ' + df_temp['obtime'],
                                             format="%Y-%m-%d %H:%M:%S")
        df_temp['obdate'] = df_temp['datetime'].dt.strftime('%m/%d/%Y')
    else:
        df_temp['datetime'] = pd.to_datetime(df_temp['obdate'] + ' ' + df_temp['obtime'],
                                             format="%m/%d/%Y %H:%M:%S")

    # Round to the hour used for matching. NOTE: 'obdate' is not updated by the
    # rounding, so a report rounded up past midnight keeps the previous day's obdate.
    if hourafter:
        df_temp['datetime'] = df_temp['datetime'].dt.ceil(freq='h')
    else:
        df_temp['datetime'] = df_temp['datetime'].dt.floor(freq='h')
    frames.append(df_temp)

df = pd.concat(frames, axis=0)
df = df.sort_values(by='datetime')
del df_temp
print("NaNs: ", df.isna().sum().sum())

Dropping 3 rows from ASOS.5.2016.ra.csv because NaNs are present.
Dropping 1 rows from ASOS.5.2019.ra.csv because NaNs are present.
NaNs:  0


In [5]:
# Restrict the observations to the study period. Both bounds are midnight
# timestamps; the lower bound is exclusive and the upper bound inclusive.
start_date = pd.to_datetime('20150101', format='%Y%m%d')
end_date = pd.to_datetime('20220630', format='%Y%m%d')
df = df[df['datetime'].gt(start_date) & df['datetime'].le(end_date)]

In [None]:
def find_coord_indices(lon_array, lat_array, lon_points, lat_points, dist_proj='lcc_RAP'):
    """
    Find the indices of the nearest grid point for each lon/lat pair.

    Grid and points are first projected with pyproj so that nearest-neighbour
    distances are Euclidean distances in the projected plane.

    Args:
        lon_array (np.array): Longitude values of the grid you are matching against
        lat_array (np.array): Latitude values of the grid you are matching against
            (same shape as lon_array)
        lon_points (list): List of longitudes of the points to locate
        lat_points (list): List of latitudes of the points to locate
        dist_proj (str): Name of the projection used to calculate distances,
            either 'lcc_WRF' or 'lcc_RAP'
    Returns (np.array):
        Integer array with one row per point; the columns are the unraveled
        indices along the axes of lon_array, in axis order (for a 2-D grid,
        column 0 is the index on axis 0 and column 1 the index on axis 1).
    Raises:
        ValueError: if dist_proj is not a known projection name.
    """
    projections = {
        'lcc_WRF': dict(proj='lcc', R=6371229, lat_0=38, lon_0=-97.5, lat_1=32, lat_2=46),
        'lcc_RAP': dict(proj='lcc', R=6371229, lat_0=25, lon_0=265, lat_1=25, lat_2=25),
    }
    if dist_proj not in projections:
        # Previously an unknown name left `proj` unbound and failed further down.
        raise ValueError(f"Unknown dist_proj {dist_proj!r}; expected one of {sorted(projections)}")
    proj = Proj(**projections[dist_proj])

    # Project the grid and flatten it to an (n_grid, 2) array of x, y distances.
    proj_lon, proj_lat = np.array(proj(lon_array, lat_array))
    lonlat = np.column_stack((proj_lon.ravel(), proj_lat.ravel()))
    # Project the query points to an (n_points, 2) array.
    ll = np.array(proj(lon_points, lat_points)).T
    # Full grid-to-point distance matrix; argmin over the grid axis gives the
    # flat index of the closest grid point for every query point.
    idx = cdist(lonlat, ll).argmin(0)

    return np.column_stack((np.unravel_index(idx, lon_array.shape)))

In [None]:
# Single-level fields to extract from each model file.
varsSave = ['SNOW_WATER_EQ',
            'HGT_ON_SFC',
            'SNOW_DEPTH',
            'EL_HGT',
            'TROP_PRES',
            'CRAIN',
            'CFRZR',
            'CICEP',
            'CSNOW',
            'TMP_ON_SURFACE',
            'TEMPERATURE_2M',
            'MEAN_SEA_LEVEL',
            'PRES_ON_SURFACE',
            'POT_TEMP_2M',
            'DEWPOINT_2M',
            'DEWPOINT_DEPRES_2M',
            'UGRD_10M',
            'VGRD_10M',
            'PRES_ON_0CISOTHM',
            'HGT_ON_0CISOTHM']
# Fields that are extracted on every pressure level.
varsPressure = ['HGT', 'TMP', 'RH', 'UGRD', 'VGRD', 'VVEL']
# varsSave minus the pressure-level fields, keeping the order of varsSave.
# (A set difference would give a different order from one kernel to the next.)
varsSurface = [v for v in varsSave if v not in varsPressure]

In [None]:
def df_flatten(ds, x, y, varsP, varsS):
    """
    Flatten one grid column of a dataset into a wide DataFrame.

    Pressure-level variables become one column per variable and level, named
    '<VAR>_<level>'; the available single-level variables are joined on as-is.

    Args:
        ds: dataset with 'x' and 'y' dimensions and a 'press' index level
            (presumably an xarray.Dataset of one model file -- confirm with caller)
        x: index along the 'x' dimension
        y: index along the 'y' dimension
        varsP (list): names of the pressure-level variables to extract
        varsS (list): names of single-level variables; the ones not present in
            ds are skipped
    Returns (pd.DataFrame):
        Frame with a fresh 0..n-1 index holding the flattened columns.
    """
    profile = ds.isel(x=x, y=y).to_dataframe()[varsP]
    # Turn the second index level into integer-valued strings so it can be
    # joined into the column names below.
    # NOTE(review): assumes a two-level index whose second level is 'press'.
    idx0 = profile.index.levels[0]
    idx1 = profile.index.levels[1].astype(int).astype(str)
    profile.index = profile.index.set_levels([idx0, idx1])
    profile = profile.unstack(level='press').sort_index()
    profile.columns = profile.columns.map('_'.join)

    # Keep the caller's ordering (and drop repeats) so the output column order is
    # the same on every run; a set intersection would shuffle it.
    varsAvailable = [v for v in dict.fromkeys(varsS) if v in ds.variables]
    surface = ds[varsAvailable].isel(x=x, y=y).to_dataframe()[varsAvailable]

    return profile.join(surface).reset_index(drop=True)

def calc_dewpoint(df):
    """
    Add a T_DEWPOINT_<p> column for every pressure level from RH_<p> and TMP_<p>.

    Rows containing any null value are printed (datetime plus the offending
    columns) and dropped first. Relative-humidity values of exactly 0 are
    replaced by 1 before the conversion, and that replacement is written back
    to the RH_<p> columns.

    Args:
        df (pd.DataFrame): frame with 'datetime', RH_<p> (percent) and TMP_<p>
            (Kelvin) columns for p = 100, 125, ..., 1000
    Returns (pd.DataFrame):
        The frame with the dew-point columns added. It is a new frame when null
        rows had to be dropped, otherwise the input frame modified in place.
    """
    has_null = df.isnull().any(axis=1)
    if has_null.any():
        null_columns = list(df.columns[df.isna().any()])
        print(df[has_null][['datetime'] + null_columns])
        # Explicit copy: the columns assigned below must not be written to a
        # slice of the caller's frame (SettingWithCopyWarning).
        df = df[~has_null].copy()
    for p in range(100, 1025, 25):
        df[f'RH_{p}'] = df[f'RH_{p}'].replace(0.0, 1.0)
        df_RH = units.Quantity(np.array(df[f'RH_{p}']) / 100., "dimensionless")
        df_TMP = units.Quantity(np.array(df[f'TMP_{p}']), "K")
        dewpoint = dewpoint_from_relative_humidity(df_TMP, df_RH)
        # Store plain numbers in degrees Celsius instead of relying on pandas
        # to strip the units from the Quantity implicitly.
        df[f'T_DEWPOINT_{p}'] = dewpoint.to("degC").magnitude
    return df

def convert_KtoC(df, varsUnits_dict):
    """
    Convert every column whose unit is Kelvin to Celsius.

    Args:
        df (pd.DataFrame): frame to convert; converted columns are overwritten
        varsUnits_dict (dict): column name -> unit string. Entries of converted
            columns are updated from 'K' to 'C' in this dict (it is mutated).
    Returns (pd.DataFrame):
        The same frame with the Kelvin columns converted.
    """
    for var, unit in varsUnits_dict.items():
        if unit != 'K':
            continue
        try:
            df[var] = df[var] - 273.15
        except (KeyError, TypeError):
            # Column missing from this frame, or not numeric: leave it and its unit alone.
            continue
        # Only the value of an existing key changes, which is safe while iterating.
        varsUnits_dict[var] = 'C'
    return df

def add_units(df, varsUnits_dict):
    """
    Append the unit to the name of every column that has one.

    A column is renamed to '<column>_<unit>' when it is a key of varsUnits_dict
    and its unit is not '-'. All renames are applied in one pass, so a new name
    that happens to equal another column's name is not renamed a second time.

    Args:
        df (pd.DataFrame): frame whose columns are renamed in place
        varsUnits_dict (dict): column name -> unit string ('-' means no unit)
    Returns (pd.DataFrame):
        The same frame object with renamed columns.
    """
    new_names = {
        column: column + '_' + varsUnits_dict[column]
        for column in df.columns
        if column in varsUnits_dict and varsUnits_dict[column] != '-'
    }
    df.rename(columns=new_names, inplace=True)
    return df

In [None]:
# Match the observations to the model grid, one observation day at a time:
# for every hour, count the reports per grid cell and precipitation type, then
# attach the flattened model column of that grid cell.

# Layout of the per-hour aggregated table (one row per grid cell).
columns = ["datetime", "x", "y", "report_count", "ra_percent", "sn_percent", "pl_percent", "fzra_percent"]


def _open_rap_dataset(date, hour):
    """Open the analysis file for date (YYYYMMDD) / hour (HH); return None if unavailable.

    The 'rap_130' file name is tried first and 'ruc2anl_130' second. A missing
    file moves on to the next name; any other error (e.g. an unreadable file)
    gives up on this hour. The last error is printed when nothing could be opened.
    """
    error = None
    for prefix in ("rap_130", "ruc2anl_130"):
        rap_file = os.path.join(path_rap, date, f"{prefix}_{date}_{hour}00_000.nc")
        try:
            return xr.open_dataset(rap_file)
        except FileNotFoundError as e:
            error = e
        except Exception as e:
            error = e
            break
    print("\t- ERROR: ", date, hour, error)
    return None


# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open(os.path.join(path_save, "varsUnits_dict.pkl"), 'rb') as f:
    varsUnits_base = pickle.load(f)

# NOTE: df[:50] restricts this cell to the first 50 reports (and the parquet
# write below is commented out); lift both for a full run.
date_group = df[:50].groupby('obdate')
for obdate, date_chunk in date_group:
    # convert_KtoC mutates the unit mapping, so every day starts from a fresh copy.
    varsUnits_dict = dict(varsUnits_base)
    df_save = []
    date = datetime.strptime(obdate, '%m/%d/%Y').strftime('%Y%m%d')
    datetime_group = date_chunk.groupby('datetime')
    for valid_time, datetime_chunk in datetime_group:
        # Take the file date from the rounded timestamp itself: a report rounded
        # up past midnight belongs to the next day's 00 file, not to obdate's.
        rap_date = valid_time.strftime('%Y%m%d')
        hour = valid_time.strftime('%H')

        ds = _open_rap_dataset(rap_date, hour)
        if ds is None:
            continue

        with ds:  # closes the file once this hour has been processed
            # calculate projected indices (column 1 -> x, column 0 -> y)
            idx = find_coord_indices(ds['longitude'].values, ds['latitude'].values,
                                     list(datetime_chunk['lon']), list(datetime_chunk['lat']))
            reports = datetime_chunk.assign(x=idx[:, 1], y=idx[:, 0])

            # Reports per grid cell and precipitation type. Counting with a single
            # groupby keeps each count attached to its own (cell, type) key.
            counts = (reports.groupby(['datetime', 'x', 'y', 'precip']).size()
                      .unstack('precip', fill_value=0)
                      .reindex(columns=precip_types, fill_value=0))
            report_count = counts.sum(axis=1)
            # The "*_percent" columns hold the fraction (0-1) of reports of each type.
            df_new = counts.div(report_count, axis=0).add_suffix('_percent')
            df_new.insert(0, 'report_count', report_count)
            df_new = df_new.reset_index()
            df_new.columns.name = None
            df_new = df_new[columns]

            # create new merged dataframe
            for index, row in df_new.iterrows():
                try:
                    ds_temp = df_flatten(ds, int(row['x']), int(row['y']), varsPressure, varsSurface)
                except Exception as e:
                    print("\t- ERROR: flattening not possible: ", rap_date, hour, e)
                    continue
                df_temp = pd.DataFrame(row).T.join(ds_temp.rename(index={0: row.name}))
                df_save.append(df_temp)
    if len(df_save) > 0:
        df_save = pd.concat(df_save, axis=0)
        df_save = calc_dewpoint(df_save)
        df_save = convert_KtoC(df_save, varsUnits_dict)
        df_save = add_units(df_save, varsUnits_dict)
        df_save = df_save.sort_values(by="datetime")
        print(f"For {date}, was able to load {df_save.shape[0]} rows out of {date_chunk.shape[0]}")
        # df_save.to_parquet(os.path.join(path_save, f"{save_dir}/{source}_rap_{date}.parquet"))
    else:
        print(f"Nothing to save for {date}")
df_save

In [8]:
# Compare the observation days against the files already written to find the
# days that still need processing.
days = [datetime.strptime(x, '%m/%d/%Y').strftime('%Y%m%d') for x in df['obdate'].unique()]

# Saved files are named "<source>_rap_<YYYYMMDD>...": the 8 characters after the
# prefix are the date. An absent output directory counts as "no files yet".
saved_dir = os.path.join(path_save, f"{save_dir}")
prefix_len = len(f"{source}_rap_")
_, _, saved_names = next(os.walk(saved_dir), (saved_dir, [], []))
files = [fname[prefix_len:prefix_len + 8] for fname in saved_names]

print(datetime.now())
# Sorted so the list (and any index-based split of it) is the same on every run.
missing = sorted(set(days) - set(files))
n_days = len(set(days))
finished = 1 - len(missing) / n_days if n_days else 1.0
print(f"{len(missing)} {source} files missing out of {n_days} - {finished:.1%} finished")

2022-12-20 12:31:17.626930
327 ASOS files missing out of 1574 - 79.2%% finished


In [7]:
# Persist the list of days that still have to be processed.
missing_path = f"./missing_{save_dir}.pkl"
with open(missing_path, "wb") as handle:
    pickle.dump(missing, handle)

In [8]:
# Print the [start, end) index ranges that divide `missing` into number_splits
# chunks of `fraction` items each (the last non-empty chunk may be shorter).
number_splits = 9
fraction = len(missing) // number_splits + 1
for i in range(number_splits):
    # Clamp both bounds: with few items the later chunks are empty (start == end)
    # rather than running past the end of the list.
    start = min(len(missing), i * fraction)
    end = min(len(missing), start + fraction)
    print(start, end)

0 176
176 352
352 528
528 704
704 880
880 1056
1056 1232
1232 1408
1408 1581
