In [34]:
import os
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import xarray as xr
from pyproj import CRS, Transformer, Proj
from scipy.spatial.distance import cdist
from metpy.calc import dewpoint_from_relative_humidity
from metpy.units import units
from tqdm.notebook import tqdm


In [35]:
path_precip = "/glade/p/cisl/aiml/ai2es/winter_ptypes/"
path_rap = "/glade/p/cisl/aiml/conv_risk_intel/rap_ncei_nc/"
path_save = "/glade/p/cisl/aiml/ai2es/winter_ptypes/precip_rap/"

precip_files = [f for f in os.listdir(path_precip) if f.endswith('.csv')]
precip_files.sort()


In [3]:
def find_coord_indices(lon_array, lat_array, lon_points, lat_points, dist_proj='lcc_RAP'):
    """
    Find indices of nearest lon/lat pair on a grid. Supports rectilinear and curilinear grids.
    lon_points / lat_points must be received as a list.
    Args:
        lon_array (np.array): Longitude values of coarse grid you are matching against
        lat_array (np.array): Latitude values of coarse grid you are matching against
        lon_points (list): List of Longitude points from orginal grid/object
        lat_points (list): List of Latitude points from original grid/object
        dist_proj (str): Name of projection for pyproj to calculate distances
    Returns (list):
        List of i, j (Lon/Lat) indices for coarse grid.
    """
    if dist_proj == 'lcc_WRF':
        proj = Proj(proj='lcc', R=6371229, lat_0=38, lon_0=-97.5, lat_1=32, lat_2=46)  ## from WRF HWT data
    if dist_proj == 'lcc_RAP':
        proj = Proj(proj='lcc', R=6371229, lat_0=25, lon_0=265, lat_1=25, lat_2=25)

    proj_lon, proj_lat = np.array(proj(lon_array, lat_array))  # transform to distances using specified projection
    lonlat = np.column_stack(
        (proj_lon.ravel(), proj_lat.ravel()))  # Stack all coarse x, y distances for array shape (n, 2)
    ll = np.array(proj(lon_points, lat_points)).T  # transform lists of fine grid x, y to match shape (n, 2)
    idx = cdist(lonlat, ll).argmin(0)  # Calculate all distances and get index of minimum

    return np.column_stack((np.unravel_index(idx, lon_array.shape))).tolist()


In [4]:
precip_types = ['ra', 'sn', 'pl', 'fzra']
df_ASOS = pd.DataFrame()
df_mPING = pd.DataFrame()
for precip in tqdm(precip_files):
    df_temp = pd.read_csv(os.path.join(path_precip, precip))
    df_temp['precip'] = list(set(precip.split('.')).intersection(set(precip_types)))[0]
    
    if df_temp.isna().sum().sum() > 0:
        print(f"Dropping {df_temp.isna().sum().sum()} rows from {precip} because NaNs are present.")
        df_temp.dropna(inplace=True)
    
    try:
        datetime.strptime(df_temp.index[0], '%M/%d/%Y')
        df_temp = df_temp.reset_index().rename(columns={'index':'obdate'})
    except:
        pass
    
    df_temp['datetime'] = pd.to_datetime(df_temp['obdate'] + ' ' + df_temp['obtime'], format="%m/%d/%Y %H:%M:%S")
    df_temp['datetime'] = df_temp['datetime'].dt.floor(freq='H')  
    
    if precip.startswith('ASOS'):    
        df_ASOS = df_ASOS.append(df_temp, ignore_index=True)
    else:
        df_mPING = df_mPING.append(df_temp, ignore_index=True)

del df_temp


  0%|          | 0/24 [00:00<?, ?it/s]

Dropping 3 rows from ASOS.5.2016.ra.csv because NaNs are present.
Dropping 1 rows from ASOS.5.2019.ra.csv because NaNs are present.


In [5]:
# delete duplicate ASOS and mPING entries and add a column of counts to the dataframe

duplicate_counts = df_ASOS.groupby(['obdate', 'lat', 'lon', 'precip', 'datetime']).count()
shape_orig_ASOS = df_ASOS.shape
df_ASOS = df_ASOS.drop_duplicates(subset=['obdate', 'lat', 'lon', 'precip', 'datetime'], keep='first', ignore_index=True)
df_ASOS['hourly_count'] = list(duplicate_counts['obtime'])
print(f"Removed {(shape_orig_ASOS[0] - df_ASOS.shape[0]):,} duplicates from ASOS dataset")

duplicate_counts = df_mPING.groupby(['obdate', 'lat', 'lon', 'precip', 'datetime']).count()
shape_orig_mPING = df_mPING.shape
df_mPING = df_mPING.drop_duplicates(subset=['obdate', 'lat', 'lon', 'precip', 'datetime'], keep='first', ignore_index=True)
df_mPING['hourly_count'] = list(duplicate_counts['obtime'])
print(f"Removed {(shape_orig_mPING[0] - df_mPING.shape[0]):,} duplicates from mPING dataset")


Removed 29,926,149 duplicates from ASOS dataset
Removed 143,773 duplicates from mPING dataset


In [38]:
days_ASOS = [datetime.strptime(x, '%m/%d/%Y').strftime('%Y%m%d') for x in list(df_ASOS['obdate'].unique())]
days_mPING = [datetime.strptime(x, '%m/%d/%Y').strftime('%Y%m%d') for x in list(df_mPING['obdate'].unique())]

save_ASOS = "/glade/p/cisl/aiml/ai2es/winter_ptypes/precip_rap/ASOS_RH0/"
save_mPING = "/glade/p/cisl/aiml/ai2es/winter_ptypes/precip_rap/mPING_raw/"


In [39]:
files_ASOS = [x[9:17] for x in [x for x in os.walk(save_ASOS)][0][2]]
files_mPING = [x[10:18] for x in [x for x in os.walk(save_mPING)][0][2]]

print(datetime.now())
missing_ASOS = list(set(days_ASOS) - set(files_ASOS))
missing_mPING = list(set(days_mPING) - set(files_mPING))
print(f"{len(missing_ASOS)} ASOS files missing out of {len(set(days_ASOS))} - {(1-len(missing_ASOS)/len(set(days_ASOS))):.1%} finished")
print(f"{len(missing_mPING)} mPING files missing out of {len(set(days_mPING))}- {(1-len(missing_mPING)/len(set(days_mPING))):.1%}% finished")


2022-03-22 11:47:07.645134
176 ASOS files missing out of 2060 - 91.5% finished
2704 mPING files missing out of 2704- 0.0%% finished


In [40]:
with open("./missing_ASOS.pkl", "wb") as f:
    pickle.dump(missing_ASOS, f)
    
with open("./missing_mPING.pkl", "wb") as f:
    pickle.dump(missing_mPING, f)
    

In [13]:
with open("./missing_ASOS.pkl", "rb") as f:
    missing_ASOS = pickle.load(f)

with open("./missing_mPING.pkl", "rb") as f:
    missing_mPING = pickle.load(f)
    

In [None]:
date = dates[0] #yyyymmdd
hour = 0 #1-23
ds = xr.open_dataset(os.path.join(path_data, date, f"ruc2anl_130_{date}_{hour:02d}00_000.nc"))


In [7]:
varsSave = ['SNOW_WATER_EQ',
            'HGT_ON_SFC',
            'SNOW_DEPTH',
            'EL_HGT',
            'TROP_PRES',
            'CRAIN',
            'CFRZR',
            'CICEP',
            'CSNOW',
            'TMP_ON_SURFACE',
            'MEAN_SEA_LEVEL',
            'PRES_ON_SURFACE',
            'POT_TEMP_2M',
            'DEWPOINT_2M',
            'DEWPOINT_DEPRES_2M',
            'UGRD_10M',
            'VGRD_10M',
            'PRES_ON_0CISOTHM',
            'HGT_ON_0CISOTHM']

varsPressure = ['HGT', 'TMP', 'RH', 'UGRD', 'VGRD', 'VVEL']

varsSurface = list(set(varsSave) - set(varsPressure))


In [8]:
def df_flatten(ds, x, y, varsP, varsS):
    
    df = ds.isel(x=x,y=y).to_dataframe()[varsP]
    idx0 = df.index.levels[0].astype(int).astype(str)
    idx1 = df.index.levels[1]
    df.index = df.index.set_levels([idx0, idx1])
    df = df.unstack(level='press').sort_index()
    df.columns = df.columns.map('_'.join)
    
    varsAvailable = list(set(varsS).intersection(set(ds.variables)))
    dfS = ds[varsAvailable].isel(x=x,y=y).to_dataframe()[varsAvailable]
    
    df = df.join(dfS).reset_index(drop=True)
    
    return df


In [78]:
def calc_dewpoint(df):# Create T_DEWPOINT columns from RH and TMP
    print(df[df.isnull().any(axis=1)][['datetime'] + list(df.columns[df.isna().any()])])
    df = df[~df.isnull().any(axis=1)]
    for p in list(range(100, 1025, 25)):
        df_RH = units.Quantity(np.array(df[f'RH_{p}'])/100., "dimensionless")
        df_TMP =  units.Quantity(np.array(df[f'TMP_{p}']), "K")
        df[f'T_DEWPOINT_{p}'] = dewpoint_from_relative_humidity(df_TMP, df_RH) 
    return df


In [11]:
def add_units(df, varsUnits_dict):
    # Rename columns to include units
    for column in list(df.columns):
        if column in list(varsUnits_dict.keys()):
            df.rename(columns={column: column + '_' + varsUnits_dict[column]}, inplace=True)
    return df


In [31]:
with open(os.path.join(path_save, "varsUnits_dict.pkl"), 'rb') as f:
    varsUnits_dict = pickle.load(f)
    

In [10]:
columns = list(df_ASOS.columns) + [v+'_'+str(i) for v in varsPressure for i in list(range(100, 1025, 25))] + varsSurface
date_group = df_ASOS.groupby('obdate')
for name, date_chunk in date_group:
    with open(os.path.join(path_save, "varsUnits_dict.pkl"), 'rb') as f:
        varsUnits_dict = pickle.load(f)
    df_save = pd.DataFrame(index=np.arange(date_chunk.shape[0]), columns=columns)
    date = datetime.strptime(name, '%m/%d/%Y').strftime('%Y%m%d')    
    datetime_group = date_chunk.groupby('datetime')
    for name, datetime_chunk in datetime_group:
        hour = name.strftime('%H')
        # try to open a dataset if one is available and not corrupted
        try:
            ds = xr.open_dataset(os.path.join(path_rap, date, f"rap_130_{date}_{hour}00_000.nc"))
        except FileNotFoundError:
            try:
                ds= xr.open_dataset(os.path.join(path_rap, date, f"ruc2anl_130_{date}_{hour}00_000.nc"))
            except Exception as e:
                print(date, hour, e)
                continue

        # calculate projected indices
        datetime_chunk['idx'] = find_coord_indices(ds['longitude'].values, ds['latitude'].values,
                                                   list(datetime_chunk['lon']), list(datetime_chunk['lat']))

        # create new merged dataframe
        for index, row in datetime_chunk.iterrows():
            try:
                ds_temp = df_flatten(ds, row['idx'][1], row['idx'][0], varsPressure, varsSurface) 
            except Exception as e:
                print("\t- ", date, hour, e)
                continue
            df_temp = pd.DataFrame(row).T.join(ds_temp.rename(index={0:row.name}))
            df_save = df_save.append(df_temp, ignore_index = True)

    # add dewpoint, convert K to C, rename columns to add units, sort by datetime, and save
    df_save = calc_dewpoint(df_save)
    df_save = convert_KtoC(df_save, varsUnits_dict)
    df_save = add_units(df_save, varsUnits_dict)
    df_save = df_save.sort_values(by="datetime")
    print(f"For {date}, was able to load {df_save.shape[0]} rows out of {date_chunk.shape[0]}")
    if 0 in df_save.shape:
        print(f"Nothing to save for {date}")
    else:
        df_save.to_parquet(os.path.join(path_save, f"ASOS_RH0/ASOS_rap_{date}.parquet"))
    

  0%|          | 0/2060 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/99 [00:00<?, ?it/s]

  result_magnitude = func(*stripped_args, **stripped_kwargs)
  magnitude = magnitude_op(new_self._magnitude, other._magnitude)


(99, 285)


  0%|          | 0/23 [00:00<?, ?it/s]

(23, 285)


  0%|          | 0/12 [00:00<?, ?it/s]

(12, 285)


  0%|          | 0/7 [00:00<?, ?it/s]

(7, 285)


  0%|          | 0/507 [00:00<?, ?it/s]

  result_magnitude = func(*stripped_args, **stripped_kwargs)
  magnitude = magnitude_op(new_self._magnitude, other._magnitude)


(507, 285)


  0%|          | 0/1130 [00:00<?, ?it/s]

  result_magnitude = func(*stripped_args, **stripped_kwargs)
  magnitude = magnitude_op(new_self._magnitude, other._magnitude)


(1130, 285)


  0%|          | 0/1180 [00:00<?, ?it/s]

  result_magnitude = func(*stripped_args, **stripped_kwargs)
  magnitude = magnitude_op(new_self._magnitude, other._magnitude)


(1180, 285)


  0%|          | 0/1233 [00:00<?, ?it/s]

  result_magnitude = func(*stripped_args, **stripped_kwargs)
  magnitude = magnitude_op(new_self._magnitude, other._magnitude)


(1233, 285)


  0%|          | 0/1270 [00:00<?, ?it/s]

  result_magnitude = func(*stripped_args, **stripped_kwargs)
  magnitude = magnitude_op(new_self._magnitude, other._magnitude)


(1270, 285)


  0%|          | 0/1150 [00:00<?, ?it/s]

  result_magnitude = func(*stripped_args, **stripped_kwargs)
  magnitude = magnitude_op(new_self._magnitude, other._magnitude)


(1150, 285)


  0%|          | 0/1088 [00:00<?, ?it/s]

  result_magnitude = func(*stripped_args, **stripped_kwargs)
  magnitude = magnitude_op(new_self._magnitude, other._magnitude)


(1088, 285)


  0%|          | 0/1132 [00:00<?, ?it/s]

  result_magnitude = func(*stripped_args, **stripped_kwargs)
  magnitude = magnitude_op(new_self._magnitude, other._magnitude)


(1132, 285)


  0%|          | 0/1187 [00:00<?, ?it/s]

(1187, 285)


  0%|          | 0/1335 [00:00<?, ?it/s]

  result_magnitude = func(*stripped_args, **stripped_kwargs)
  magnitude = magnitude_op(new_self._magnitude, other._magnitude)


(1335, 285)


  0%|          | 0/1286 [00:00<?, ?it/s]

  result_magnitude = func(*stripped_args, **stripped_kwargs)
  magnitude = magnitude_op(new_self._magnitude, other._magnitude)


(1286, 285)


  0%|          | 0/1043 [00:00<?, ?it/s]

  result_magnitude = func(*stripped_args, **stripped_kwargs)
  magnitude = magnitude_op(new_self._magnitude, other._magnitude)


(1043, 285)


  0%|          | 0/1025 [00:00<?, ?it/s]

(1025, 285)


  0%|          | 0/1044 [00:00<?, ?it/s]

  result_magnitude = func(*stripped_args, **stripped_kwargs)
  magnitude = magnitude_op(new_self._magnitude, other._magnitude)


(1044, 285)


  0%|          | 0/1021 [00:00<?, ?it/s]

  result_magnitude = func(*stripped_args, **stripped_kwargs)
  magnitude = magnitude_op(new_self._magnitude, other._magnitude)


(1021, 285)


  0%|          | 0/1034 [00:00<?, ?it/s]

(1034, 285)


  0%|          | 0/1011 [00:00<?, ?it/s]

(1011, 285)


  0%|          | 0/988 [00:00<?, ?it/s]

(988, 285)


  0%|          | 0/973 [00:00<?, ?it/s]

(973, 285)


  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

(16, 285)
20140101 01 [Errno 2] No such file or directory: b'/glade/p/cisl/aiml/conv_risk_intel/rap_ncei_nc/20140101/ruc2anl_130_20140101_0100_000.nc'
20140101 02 [Errno 2] No such file or directory: b'/glade/p/cisl/aiml/conv_risk_intel/rap_ncei_nc/20140101/ruc2anl_130_20140101_0200_000.nc'
20140101 05 [Errno 2] No such file or directory: b'/glade/p/cisl/aiml/conv_risk_intel/rap_ncei_nc/20140101/ruc2anl_130_20140101_0500_000.nc'
20140101 06 [Errno 2] No such file or directory: b'/glade/p/cisl/aiml/conv_risk_intel/rap_ncei_nc/20140101/ruc2anl_130_20140101_0600_000.nc'
20140101 07 [Errno 2] No such file or directory: b'/glade/p/cisl/aiml/conv_risk_intel/rap_ncei_nc/20140101/ruc2anl_130_20140101_0700_000.nc'
20140101 08 [Errno 2] No such file or directory: b'/glade/p/cisl/aiml/conv_risk_intel/rap_ncei_nc/20140101/ruc2anl_130_20140101_0800_000.nc'
20140101 09 [Errno 2] No such file or directory: b'/glade/p/cisl/aiml/conv_risk_intel/rap_ncei_nc/20140101/ruc2anl_130_20140101_0900_000.nc'
201

  0%|          | 0/24 [00:00<?, ?it/s]

  0%|          | 0/306 [00:00<?, ?it/s]

  result_magnitude = func(*stripped_args, **stripped_kwargs)
  magnitude = magnitude_op(new_self._magnitude, other._magnitude)


(306, 285)


  0%|          | 0/316 [00:00<?, ?it/s]

  result_magnitude = func(*stripped_args, **stripped_kwargs)
  magnitude = magnitude_op(new_self._magnitude, other._magnitude)


(316, 285)


  0%|          | 0/320 [00:00<?, ?it/s]

  result_magnitude = func(*stripped_args, **stripped_kwargs)
  magnitude = magnitude_op(new_self._magnitude, other._magnitude)


(320, 285)


  0%|          | 0/209 [00:00<?, ?it/s]

  result_magnitude = func(*stripped_args, **stripped_kwargs)
  magnitude = magnitude_op(new_self._magnitude, other._magnitude)


(209, 285)


  0%|          | 0/43 [00:00<?, ?it/s]

  result_magnitude = func(*stripped_args, **stripped_kwargs)
  magnitude = magnitude_op(new_self._magnitude, other._magnitude)


(43, 285)


  0%|          | 0/105 [00:00<?, ?it/s]

  result_magnitude = func(*stripped_args, **stripped_kwargs)
  magnitude = magnitude_op(new_self._magnitude, other._magnitude)


(105, 285)


  0%|          | 0/245 [00:00<?, ?it/s]

  result_magnitude = func(*stripped_args, **stripped_kwargs)
  magnitude = magnitude_op(new_self._magnitude, other._magnitude)


(245, 285)


  0%|          | 0/516 [00:00<?, ?it/s]

  result_magnitude = func(*stripped_args, **stripped_kwargs)
  magnitude = magnitude_op(new_self._magnitude, other._magnitude)


(516, 285)


  0%|          | 0/548 [00:00<?, ?it/s]

  result_magnitude = func(*stripped_args, **stripped_kwargs)
  magnitude = magnitude_op(new_self._magnitude, other._magnitude)


(548, 285)


  0%|          | 0/552 [00:00<?, ?it/s]

(552, 285)


  0%|          | 0/669 [00:00<?, ?it/s]

  result_magnitude = func(*stripped_args, **stripped_kwargs)
  magnitude = magnitude_op(new_self._magnitude, other._magnitude)


(669, 285)


  0%|          | 0/694 [00:00<?, ?it/s]

  result_magnitude = func(*stripped_args, **stripped_kwargs)
  magnitude = magnitude_op(new_self._magnitude, other._magnitude)


(694, 285)


  0%|          | 0/775 [00:00<?, ?it/s]

  result_magnitude = func(*stripped_args, **stripped_kwargs)
  magnitude = magnitude_op(new_self._magnitude, other._magnitude)


(775, 285)


  0%|          | 0/820 [00:00<?, ?it/s]

  result_magnitude = func(*stripped_args, **stripped_kwargs)
  magnitude = magnitude_op(new_self._magnitude, other._magnitude)


(820, 285)


  0%|          | 0/801 [00:00<?, ?it/s]

(801, 285)


  0%|          | 0/741 [00:00<?, ?it/s]

(741, 285)


  0%|          | 0/817 [00:00<?, ?it/s]

  result_magnitude = func(*stripped_args, **stripped_kwargs)
  magnitude = magnitude_op(new_self._magnitude, other._magnitude)


(817, 285)


  0%|          | 0/714 [00:00<?, ?it/s]

(714, 285)


  0%|          | 0/795 [00:00<?, ?it/s]

(795, 285)


  0%|          | 0/887 [00:00<?, ?it/s]

(887, 285)


  0%|          | 0/870 [00:00<?, ?it/s]

(870, 285)


  0%|          | 0/840 [00:00<?, ?it/s]

  result_magnitude = func(*stripped_args, **stripped_kwargs)
  magnitude = magnitude_op(new_self._magnitude, other._magnitude)


(840, 285)


  0%|          | 0/827 [00:00<?, ?it/s]

  result_magnitude = func(*stripped_args, **stripped_kwargs)
  magnitude = magnitude_op(new_self._magnitude, other._magnitude)


(827, 285)


  0%|          | 0/826 [00:00<?, ?it/s]

  result_magnitude = func(*stripped_args, **stripped_kwargs)
  magnitude = magnitude_op(new_self._magnitude, other._magnitude)


(826, 285)


In [None]:
columns = list(df_mPING.columns) + [v+'_'+str(i) for v in varsPressure for i in list(range(100, 1025, 25))] + varsSurface
date_group = df_mPING.groupby('obdate')
for name, date_chunk in date_group:
    with open(os.path.join(path_save, "varsUnits_dict.pkl"), 'rb') as f:
        varsUnits_dict = pickle.load(f)
    df_save = pd.DataFrame(columns=columns)
    date = datetime.strptime(name, '%m/%d/%Y').strftime('%Y%m%d')    
    datetime_group = date_chunk.groupby('datetime')
    for name, datetime_chunk in datetime_group:
        hour = name.strftime('%H')
        # try to open a dataset if one is available and not corrupted
        try:
            ds = xr.open_dataset(os.path.join(path_rap, date, f"rap_130_{date}_{hour}00_000.nc"))
        except FileNotFoundError:
            try:
                ds= xr.open_dataset(os.path.join(path_rap, date, f"ruc2anl_130_{date}_{hour}00_000.nc"))
            except Exception as e:
                print(date, hour, e)
                continue

        # calculate projected indices
        datetime_chunk['idx'] = find_coord_indices(ds['longitude'].values, ds['latitude'].values,
                                                   list(datetime_chunk['lon']), list(datetime_chunk['lat']))

        # create new merged dataframe
        for index, row in datetime_chunk.iterrows():
            try:
                ds_temp = df_flatten(ds, row['idx'][1], row['idx'][0], varsPressure, varsSurface) 
            except Exception as e:
                print("\t- flattening not possible: ", date, hour, e)
                continue
            df_temp = pd.DataFrame(row).T.join(ds_temp.rename(index={0:row.name}))
            df_save = df_save.append(df_temp, ignore_index = True)

    # add dewpoint, convert K to C, rename columns to add units, sort by datetime, and save
    df_save = calc_dewpoint(df_save)
    df_save = convert_KtoC(df_save, varsUnits_dict)
    df_save = add_units(df_save, varsUnits_dict)
    df_save = df_save.sort_values(by="datetime")
    print(f"For {date}, was able to load {df_save.shape[0]} rows out of {date_chunk.shape[0]}")
    if 0 in df_save.shape:
        print(f"Nothing to save for {date}")
    else:
        df_save.to_parquet(os.path.join(path_save, f"mPING_RH0/mPING_rap_{date}.parquet"))


In [17]:
date = '20180924'
df_temp = pd.read_parquet(os.path.join(path_save, f"ASOS/ASOS_rap_{date}.parquet"))
print(date, df_temp.shape)
df_temp


20180924 (2219, 286)


Unnamed: 0,obdate,obtime,lat,lon,precip,datetime,precip_count_byhr,HGT_100_m,HGT_125_m,HGT_150_m,...,T_DEWPOINT_775_C,T_DEWPOINT_800_C,T_DEWPOINT_825_C,T_DEWPOINT_850_C,T_DEWPOINT_875_C,T_DEWPOINT_900_C,T_DEWPOINT_925_C,T_DEWPOINT_950_C,T_DEWPOINT_975_C,T_DEWPOINT_1000_C
2219,09/24/2018,00:00:31,45.698,-110.441,ra,2018-09-24 00:00:00,4,16356.068359,14951.678711,13791.625000,...,1.429656,1.388061,2.353550,3.527393,5.006421,6.395425,7.856617,9.218117,10.554970,11.915785
2252,09/24/2018,00:00:31,32.321,-90.078,ra,2018-09-24 00:00:00,5,16621.287109,15303.709961,14212.812500,...,10.273777,12.408267,14.165571,15.507485,17.265322,18.404615,18.809875,19.394560,20.800310,23.262024
2253,09/24/2018,00:10:31,35.831,-90.646,ra,2018-09-24 00:00:00,3,16590.849609,15267.241211,14172.031250,...,10.436361,11.426638,12.126513,12.936627,14.256284,15.590380,16.634279,17.180340,17.426332,18.410957
2254,09/24/2018,00:25:31,38.041,-84.606,ra,2018-09-24 00:00:00,3,16566.193359,15229.303711,14121.000000,...,10.698597,11.680826,12.268491,12.970375,14.120492,15.674821,16.390438,16.680851,16.864399,18.119402
2255,09/24/2018,00:00:31,34.727,-92.239,ra,2018-09-24 00:00:00,8,16599.537109,15282.209961,14189.093750,...,10.789054,11.467782,12.163944,14.154389,16.949282,18.470636,18.970104,17.617397,17.239021,18.103844
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4343,09/24/2018,23:00:31,41.914,-88.246,ra,2018-09-24 23:00:00,8,16515.677734,15147.386719,14021.707031,...,8.452464,10.072167,11.745052,12.300518,13.053684,15.224464,16.004210,16.912743,17.163216,18.431194
4344,09/24/2018,23:10:31,30.400,-86.472,ra,2018-09-24 23:00:00,3,16642.958984,15330.167969,14238.613281,...,11.171460,12.629581,14.265008,15.831111,17.664354,19.348049,20.623487,21.941286,22.984804,23.522947
4345,09/24/2018,23:00:31,42.231,-83.331,ra,2018-09-24 23:00:00,10,16509.708984,15148.917969,14031.832031,...,5.723518,7.526731,9.331834,11.329353,14.242844,15.368957,15.544155,16.383518,17.640785,18.731733
4339,09/24/2018,23:00:31,42.409,-83.010,ra,2018-09-24 23:00:00,10,16505.021484,15145.074219,14030.207031,...,4.872472,7.367021,9.318354,11.079956,13.911264,15.306516,15.313981,16.071302,17.294260,18.292698


In [9]:
files_ASOS = [f for f in os.listdir(os.path.join(path_save, "ASOS_raw")) if f.endswith('.parquet')]
files_mPING = [f for f in os.listdir(os.path.join(path_save, "mPING_RH0")) if f.endswith('.parquet')]


In [12]:
for i,f in enumerate(files_ASOS):
    df_temp = pd.read_parquet(os.path.join(path_save, "ASOS_RH0", f))
    if df_temp.isna().sum().sum() > 0:
        print(i, f)
        print(df_temp[df_temp.isnull().any(axis=1)][['datetime'] + list(df_temp.columns[df_temp.isna().any()])])
        break


0 ASOS_rap_20140309.parquet
                datetime  T_DEWPOINT_200_C  T_DEWPOINT_525_C  \
1772 2014-03-09 01:00:00        -65.840767        -28.563976   
1717 2014-03-09 01:00:00               NaN        -33.032887   
1945 2014-03-09 03:00:00        -67.305504        -30.288883   
2134 2014-03-09 05:00:00        -66.440269        -29.385080   
2225 2014-03-09 06:00:00        -66.859535        -32.396675   
2375 2014-03-09 08:00:00        -68.008308        -13.859902   
2365 2014-03-09 08:00:00        -63.337532        -15.017653   
2678 2014-03-09 12:00:00               NaN        -25.644409   
2741 2014-03-09 13:00:00               NaN        -21.274437   
2750 2014-03-09 13:00:00               NaN        -21.669996   
2747 2014-03-09 13:00:00               NaN        -22.437828   
2754 2014-03-09 13:00:00        -69.130432        -44.840088   
2724 2014-03-09 13:00:00               NaN        -22.921701   
2886 2014-03-09 16:00:00        -72.696411               NaN   
3020 2014-03

In [13]:
idx = 2365
level = '750'
print(f'-- {level} --')
print("TMP", df_temp.at[idx, f'TMP_{level}_C'])
print("RH", df_temp.at[idx, f'RH_{level}_percent'])
print("DEWPOINT",df_temp.at[idx, f'T_DEWPOINT_{level}_C'])

level = '775'
print(f'-- {level} --')
print("TMP",df_temp.at[idx, f'TMP_{level}_C'])
print("RH", df_temp.at[idx, f'RH_{level}_percent'])
print("DEWPOINT",df_temp.at[idx, f'T_DEWPOINT_{level}_C'])

level = '800'
print(f'-- {level} --')
print("TMP",df_temp.at[idx, f'TMP_{level}_C'])
print("RH", df_temp.at[idx, f'RH_{level}_percent'])
print("DEWPOINT",df_temp.at[idx, f'T_DEWPOINT_{level}_C'])

level = '825'
print(f'-- {level} --')
print("TMP",df_temp.at[idx, f'TMP_{level}_C'])
print("RH", df_temp.at[idx, f'RH_{level}_percent'])
print("DEWPOINT",df_temp.at[idx, f'T_DEWPOINT_{level}_C'])


-- 750 --
TMP 7.3673706
RH 19.0
DEWPOINT -14.779534
-- 775 --
TMP 9.346344
RH 0.0
DEWPOINT nan
-- 800 --
TMP 10.532135
RH 0.0
DEWPOINT nan
-- 825 --
TMP 10.504852
RH 9.0
DEWPOINT -21.10855


In [10]:
pressure_levels = list(range(100, 1025, 25))
dewpoint_levels = [f'T_DEWPOINT_{p}_C' for p in list(range(100, 1025, 25))]

RH0_count_ASOS = 0
RH0_file_count_ASOS = 0
for i,f in enumerate(files_ASOS):
    df_temp = pd.read_parquet(os.path.join(path_save, "ASOS_raw", f))
    df_temp = df_temp.reset_index(drop=True)
    if df_temp.isna().sum().sum() > 0:
        idx, idx_col = np.where(df_temp[dewpoint_levels].isnull())
        RH0_count_ASOS += len(idx)
        RH0_file_count_ASOS += 1
        for ix, ixc in zip(idx, idx_col):
            press = pressure_levels[ixc]
            if df_temp.at[ix, f'RH_{press}_percent'] == 0.0:
                df_temp.at[ix, f'RH_{press}_percent'] == 1.0
                RH = units.Quantity(1.0/100., "dimensionless")
            else:
                print(f"RH not 0.0% at {ix, ixc} in file {f}")
                break
            TMP =  units.Quantity(df_temp.at[ix, f'TMP_{press}_C'] + 273.15, "K")
            df_temp.at[ix, dewpoint_levels[ixc]] = np.array(dewpoint_from_relative_humidity(TMP, RH))
    df_temp.to_parquet(os.path.join(path_save, "ASOS", f))

print(f"Files affected by 0% RH: {RH0_file_count_ASOS}")
print(f"Elements affected by 0% RH: {RH0_count_ASOS}")


Files affected by 0% RH: 746
Elements affected by 0% RH: 366160


In [33]:
rh_levels = [f'RH_{p}_percent' for p in list(range(100, 1025, 25))]

RH0_count_mPING = 0
RH0_file_count_mPING = 0
for i,f in enumerate(files_mPING):
    df_temp = pd.read_parquet(os.path.join(path_save, "mPING_RH0", f))
    df_temp = df_temp.reset_index(drop=True)
    df_temp[rh_levels] = df_temp[rh_levels].replace(0.0, 1.0)
    df_temp.to_parquet(os.path.join(path_save, "mPING", f))

print(f"Files affected by 0% RH: {RH0_file_count_mPING}")
print(f"Elements affected by 0% RH: {RH0_count_mPING}")


Files affected by 0% RH: 0
Elements affected by 0% RH: 0


In [21]:
RH0_count_mPING = 0
RH0_file_count_mPING = 0
for i,f in enumerate(files_mPING):
    df_temp = pd.read_parquet(os.path.join(path_save, "mPING_RH0", f))
    df_temp = df_temp.reset_index(drop=True)
    if df_temp.isna().sum().sum() > 0:
        idx, idx_col = np.where(df_temp[dewpoint_levels].isnull())
        RH0_count_mPING += len(idx)
        RH0_file_count_mPING += 1
        for ix, ixc in zip(idx, idx_col):
            press = pressure_levels[ixc]
            if df_temp.at[ix, f'RH_{press}_percent'] == 0.0:
                
                RH = units.Quantity(1.0/100., "dimensionless")
            else:
                print(f"RH not 0.0% at {ix, ixc} in file {f}")
                break
            TMP =  units.Quantity(df_temp.at[ix, f'TMP_{press}_C'] + 273.15, "K")
            df_temp.at[ix, dewpoint_levels[ixc]] = np.array(dewpoint_from_relative_humidity(TMP, RH))
    df_temp.to_parquet(os.path.join(path_save, "mPING", f))

print(f"Files affected by 0% RH: {RH0_file_count_mPING}")
print(f"Elements affected by 0% RH: {RH0_count_mPING}")


Files affected by 0% RH: 982
Elements affected by 0% RH: 98333


In [22]:
# check if any file has all NaN rows
for i,f in enumerate(files_ASOS):
    df_temp = pd.read_parquet(os.path.join(path_save, "ASOS", f))
    if df_temp.isnull().apply(lambda x: all(x), axis=1).any():
        print(i, f)
        print(df_temp[df_temp.isnull().any(axis=1)][['datetime'] + list(df_temp.columns[df_temp.isna().any()])])
        break

for i,f in enumerate(files_mPING):
    df_temp = pd.read_parquet(os.path.join(path_save, "mPING", f))
    if df_temp.isnull().apply(lambda x: all(x), axis=1).any():
        print(i, f)
        print(df_temp[df_temp.isnull().any(axis=1)][['datetime'] + list(df_temp.columns[df_temp.isna().any()])])
        break
        