In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from glob import glob
%matplotlib inline

# P-Type
The goal is to load our precipitation-type (p-type) dataset and reduce it to a file that is small enough to load easily but still representative of the full data. We load the original file, then subset and rearrange the columns to make the data more user-friendly.

In [19]:
# Read the mPING parquet file and turn its index into a regular column in one step.
mping_file = "/glade/campaign/cisl/aiml/ai2es/winter_ptypes/ptype_qc/mPING_interpolated_QC2.parquet"
data = pd.read_parquet(mping_file).reset_index()

Print all the columns.

In [20]:
# List every column name of `data`, one per line.
print(*data.columns, sep="\n")

level_0
T_DEWPOINT_C_0_m
T_DEWPOINT_C_250_m
T_DEWPOINT_C_500_m
T_DEWPOINT_C_750_m
T_DEWPOINT_C_1000_m
T_DEWPOINT_C_1250_m
T_DEWPOINT_C_1500_m
T_DEWPOINT_C_1750_m
T_DEWPOINT_C_2000_m
T_DEWPOINT_C_2250_m
T_DEWPOINT_C_2500_m
T_DEWPOINT_C_2750_m
T_DEWPOINT_C_3000_m
T_DEWPOINT_C_3250_m
T_DEWPOINT_C_3500_m
T_DEWPOINT_C_3750_m
T_DEWPOINT_C_4000_m
T_DEWPOINT_C_4250_m
T_DEWPOINT_C_4500_m
T_DEWPOINT_C_4750_m
T_DEWPOINT_C_5000_m
T_DEWPOINT_C_5250_m
T_DEWPOINT_C_5500_m
T_DEWPOINT_C_5750_m
T_DEWPOINT_C_6000_m
T_DEWPOINT_C_6250_m
T_DEWPOINT_C_6500_m
T_DEWPOINT_C_6750_m
T_DEWPOINT_C_7000_m
T_DEWPOINT_C_7250_m
T_DEWPOINT_C_7500_m
T_DEWPOINT_C_7750_m
T_DEWPOINT_C_8000_m
T_DEWPOINT_C_8250_m
T_DEWPOINT_C_8500_m
T_DEWPOINT_C_8750_m
T_DEWPOINT_C_9000_m
T_DEWPOINT_C_9250_m
T_DEWPOINT_C_9500_m
T_DEWPOINT_C_9750_m
T_DEWPOINT_C_10000_m
T_DEWPOINT_C_10250_m
T_DEWPOINT_C_10500_m
T_DEWPOINT_C_10750_m
T_DEWPOINT_C_11000_m
T_DEWPOINT_C_11250_m
T_DEWPOINT_C_11500_m
T_DEWPOINT_C_11750_m
T_DEWPOINT_C_12000_m
T_DEWPOIN

The first filtering step removes the relative humidity (RH) and vertical velocity (VVEL) columns on pressure levels, because they are not needed.

In [21]:
# Pick out the columns whose names contain "RH" or "VVEL" and show both groups.
column_names = data.columns
rh_cols = column_names[column_names.str.contains("RH")]
vvel_cols = column_names[column_names.str.contains("VVEL")]
for matched in (rh_cols, vvel_cols):
    print(matched)

# Drop both groups in one call; `data` itself is left unchanged.
drop_cols = np.concatenate([rh_cols.to_numpy(), vvel_cols.to_numpy()])
data_smaller = data.drop(columns=drop_cols)

Index(['RH_1000_percent', 'RH_975_percent', 'RH_950_percent', 'RH_925_percent',
       'RH_900_percent', 'RH_875_percent', 'RH_850_percent', 'RH_825_percent',
       'RH_800_percent', 'RH_775_percent', 'RH_750_percent', 'RH_725_percent',
       'RH_700_percent', 'RH_675_percent', 'RH_650_percent', 'RH_625_percent',
       'RH_600_percent', 'RH_575_percent', 'RH_550_percent', 'RH_525_percent',
       'RH_500_percent', 'RH_475_percent', 'RH_450_percent', 'RH_425_percent',
       'RH_400_percent', 'RH_375_percent', 'RH_350_percent', 'RH_325_percent',
       'RH_300_percent', 'RH_275_percent', 'RH_250_percent', 'RH_225_percent',
       'RH_200_percent', 'RH_175_percent', 'RH_150_percent', 'RH_125_percent',
       'RH_100_percent'],
      dtype='object')
Index(['VVEL_1000_Pa/s', 'VVEL_975_Pa/s', 'VVEL_950_Pa/s', 'VVEL_925_Pa/s',
       'VVEL_900_Pa/s', 'VVEL_875_Pa/s', 'VVEL_850_Pa/s', 'VVEL_825_Pa/s',
       'VVEL_800_Pa/s', 'VVEL_775_Pa/s', 'VVEL_750_Pa/s', 'VVEL_725_Pa/s',
       'VVEL_7

The columns are then regrouped into metadata columns and profile (data) columns. The inputs to our ML model are the temperature, dewpoint, and u and v winds at different height levels; the pressure at each height level is kept alongside them.

In [29]:
# Metadata columns kept in front of the profile columns in the output files.
meta_cols = ["datetime",  # valid time of the record
             "x_m",  # x-coordinate in m (Lambert conformal projection)
             "y_m",  # y-coordinate in m
             "lon",  # Longitude of grid cell
             "lat",  # latitude of grid cell
             "usa",  # binary flag of whether or not grid cell is in the US
             "report_count",  # How many mPING reports are in each grid cell over the hour period
             "ra_percent",  # Fraction of rain reports (0-1)
             "sn_percent",  # Fraction of snow reports (0-1)
             "pl_percent",  # Fraction of sleet/ice pellet reports (0-1)
             "fzra_percent",  # Fraction of freezing rain reports (0-1)
             "cprecip",  # Does the RAP produce precipitation at this location
             "CRAIN",  # RAP Rain p-type
             "CSNOW",  # RAP Snow p-type
             "CICEP",  # RAP Ice pellet p-type
             "CFRZR",  # RAP Freezing rain p-type
             "HGT_ON_SFC_m",  # Height of surface above sea level
             "PRES_ON_SURFACE_Pa",  # Pressure at the surface in Pa
             "MEAN_SEA_LEVEL_Pa",  # Mean sea level pressure in Pa
             "TMP_ON_SURFACE_C",  # Temperature at the surface
             "DEWPOINT_2M_C",  # Dewpoint at 2 m above ground level
             "POT_TEMP_2M_C",  # Potential temperature 2 m above ground level
             "TEMPERATURE_2M_C",  # Temperature 2 m above ground level
             "SNOW_DEPTH_m",  # Depth of the snow in RAP
             "HGT_ON_0CISOTHM_m",  # Height of the freezing level above sea level
             "wetbulb_temp_0m_C",  # Wet bulb temperature at the surface
             "wetbulb3.0_filter",  # Whether the wetbulb temperature is <=3 C + frozen p-type
             "wetbulb4.0_filter",  # Whether the wetbulb temperature is <=4 C + frozen p-type
             "wetbulb5.0_filter",  # Whether the wetbulb temperature is <=5 C + frozen p-type
             "wetbulb6.0_filter",  # Whether the wetbulb temperature is <=6 C + frozen p-type
            ]

# Fail fast with a readable message if an expected metadata column is absent.
# This replaces a bare `data_smaller[meta_cols]` expression whose result was
# discarded (it copied the full frame only to act as an implicit existence check).
missing_meta = [name for name in meta_cols if name not in data_smaller.columns]
if missing_meta:
    raise KeyError(f"Metadata columns missing from data_smaller: {missing_meta}")

# Profile columns are found by literal substring match on the column name
# (regex=False: the fragments are plain text, not regular expressions).
col_names = data_smaller.columns
temp_cols = col_names[col_names.str.contains("TEMP_C_", regex=False)].values
dewp_cols = col_names[col_names.str.contains("T_DEWPOINT_C_", regex=False)].values
u_cols = col_names[col_names.str.contains("UGRD_m/s_", regex=False)].values
v_cols = col_names[col_names.str.contains("VGRD_m/s_", regex=False)].values
pres_cols = col_names[col_names.str.contains("PRES_Pa_", regex=False)].values

# Final column order: metadata first, then temperature, dewpoint, u, v, pressure.
data_cols = np.concatenate([temp_cols, dewp_cols, u_cols, v_cols, pres_cols])
all_cols = np.concatenate([meta_cols, data_cols])

# Last expression of the cell: display the reordered frame.
data_smaller[all_cols]

Unnamed: 0,datetime,x_m,y_m,lon,lat,usa,report_count,ra_percent,sn_percent,pl_percent,...,PRES_Pa_14250_m,PRES_Pa_14500_m,PRES_Pa_14750_m,PRES_Pa_15000_m,PRES_Pa_15250_m,PRES_Pa_15500_m,PRES_Pa_15750_m,PRES_Pa_16000_m,PRES_Pa_16250_m,PRES_Pa_16500_m
0,2015-01-01 01:00:00,225,84,-97.940979,29.902275,1.0,3,1.0,0.0,0.0,...,138.757945,133.082737,127.407530,122.293426,117.592729,112.892031,108.191333,103.490635,98.789938,94.089240
1,2015-01-01 01:00:00,305,195,-85.684326,42.898926,1.0,1,0.0,1.0,0.0,...,124.801602,120.321203,115.840803,111.360403,106.880004,102.399604,97.919204,93.438805,88.958405,84.478005
2,2015-01-01 02:00:00,114,131,-114.296082,34.470554,1.0,1,0.0,1.0,0.0,...,131.897914,126.541871,121.838565,117.399128,112.959690,108.520252,104.080814,99.641377,95.201939,90.762501
3,2015-01-01 02:00:00,135,110,-110.899445,32.289280,1.0,6,1.0,0.0,0.0,...,121.816182,117.320057,112.823932,108.327807,103.831682,99.335557,94.839432,90.343307,85.847182,81.351057
4,2015-01-01 02:00:00,136,110,-110.757507,32.303352,1.0,1,1.0,0.0,0.0,...,120.923701,116.427576,111.931451,107.435326,102.939201,98.443076,93.946951,89.450826,84.954702,80.458577
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1281079,2022-06-30 00:00:00,179,162,-105.185547,38.985016,1.0,2,1.0,0.0,0.0,...,93.558669,88.857122,84.155575,79.454027,74.752480,70.050933,65.349386,60.647838,55.946291,51.244744
1281080,2022-06-30 00:00:00,178,173,-105.465805,40.266838,1.0,1,1.0,0.0,0.0,...,95.491584,90.818052,86.144521,81.470989,76.797457,72.123925,67.450393,62.776861,58.103329,53.429797
1281081,2022-06-30 00:00:00,127,129,-112.380432,34.443092,1.0,2,0.0,0.0,0.0,...,110.161068,105.416131,100.671193,95.926256,91.181318,86.436381,81.691444,76.946506,72.201569,67.456632
1281082,2022-06-30 00:00:00,395,206,-71.371246,42.900169,1.0,1,1.0,0.0,0.0,...,140.924502,135.557172,130.189841,124.854022,120.439613,116.025205,111.610797,107.196389,102.781981,98.367573


Save the full dataset to disk

In [30]:
# Write every row, restricted to the selected columns in `all_cols`, to one parquet file.
full_output_path = "/glade/derecho/scratch/dgagne/ptype_qc_all.parquet"
data_full = data_smaller[all_cols]
data_full.to_parquet(full_output_path)

Pick random subsets of 50,000 and 10,000 points (with a fixed random seed so the draw is reproducible) and save each to disk.

In [38]:
# Draw 50,000 distinct row positions; the fixed seed makes the draw repeatable.
np.random.seed(21259)
n_rows = data_smaller.shape[0]
rand_indices = np.random.choice(np.arange(n_rows), size=50000, replace=False)

# rand_indices are integer positions (0..n_rows-1), so rows are selected with the
# positional indexer .iloc. The label-based .loc only gives the same rows when the
# index labels happen to equal the positions.
data_smallest = data_smaller.iloc[rand_indices][all_cols]
# reset_index() moves each sampled row's original index label into a regular column
# and returns a new frame, so re-running the cell never stacks extra index columns.
data_smallest = data_smallest.reset_index()
data_smallest.to_parquet("/glade/derecho/scratch/dgagne/ptype_qc_50k.parquet")

In [39]:
# Draw 10,000 distinct row positions; the fixed seed makes the draw repeatable.
np.random.seed(21259)
n_rows = data_smaller.shape[0]
rand_indices = np.random.choice(np.arange(n_rows), size=10000, replace=False)

# rand_indices are integer positions (0..n_rows-1), so rows are selected with the
# positional indexer .iloc. The label-based .loc only gives the same rows when the
# index labels happen to equal the positions.
data_tiny = data_smaller.iloc[rand_indices][all_cols]
# reset_index() moves each sampled row's original index label into a regular column
# and returns a new frame, so re-running the cell never stacks extra index columns.
data_tiny = data_tiny.reset_index()
data_tiny.to_parquet("/glade/derecho/scratch/dgagne/ptype_qc_10k.parquet")

In [40]:
# Read the ptype_qc_10k.parquet file hosted on Zenodo directly from its URL.
zenodo_url = "https://zenodo.org/record/10719142/files/ptype_qc_10k.parquet"
new_data = pd.read_parquet(zenodo_url)

In [41]:
# Display the frame loaded from Zenodo (the notebook renders a truncated preview).
new_data

Unnamed: 0,index,datetime,x_m,y_m,lon,lat,usa,report_count,ra_percent,sn_percent,...,PRES_Pa_14250_m,PRES_Pa_14500_m,PRES_Pa_14750_m,PRES_Pa_15000_m,PRES_Pa_15250_m,PRES_Pa_15500_m,PRES_Pa_15750_m,PRES_Pa_16000_m,PRES_Pa_16250_m,PRES_Pa_16500_m
0,483809,2018-08-03 00:00:00,294,176,-87.581390,40.772118,1.0,1,1.0,0.0,...,142.094012,136.703635,131.313258,125.922881,121.304003,116.844498,112.384994,107.925489,103.465984,99.006480
1,531283,2018-11-15 23:00:00,384,191,-73.459595,41.394444,1.0,1,0.0,1.0,...,136.537534,130.948653,125.359773,120.690254,116.083989,111.477724,106.871459,102.265194,97.658929,93.052664
2,589093,2019-03-04 01:00:00,381,185,-74.063812,40.756828,1.0,1,0.0,1.0,...,133.227241,127.831437,122.863523,118.368061,113.872600,109.377138,104.881676,100.386215,95.890753,91.395291
3,806167,2020-02-29 04:00:00,304,125,-86.531311,34.662445,1.0,1,1.0,0.0,...,130.719615,125.404711,120.955116,116.576841,112.198566,107.820291,103.442017,99.063742,94.685467,90.307192
4,156222,2016-06-04 23:00:00,328,180,-82.292358,40.943787,1.0,1,1.0,0.0,...,136.613960,130.991936,125.369912,120.774832,116.252081,111.729330,107.206579,102.683829,98.161078,93.638327
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,647578,2019-06-05 01:00:00,232,75,-96.943054,28.824713,1.0,1,1.0,0.0,...,149.903566,144.118674,138.333782,132.548889,126.763997,121.646187,116.821031,111.995875,107.170719,102.345563
9996,685978,2019-08-22 12:00:00,318,132,-84.414703,35.381802,1.0,1,1.0,0.0,...,141.790520,136.100367,130.410213,124.772686,120.152217,115.531749,110.911281,106.290813,101.670344,97.049876
9997,762515,2020-01-16 06:00:00,304,125,-86.531311,34.662445,1.0,1,1.0,0.0,...,135.762597,129.999993,124.390200,119.782296,115.174393,110.566490,105.958586,101.350683,96.742779,92.134876
9998,717270,2019-11-11 12:00:00,323,194,-82.869446,42.619057,1.0,1,0.0,1.0,...,128.590046,123.445381,118.965035,114.484688,110.004342,105.523996,101.043649,96.563303,92.082957,87.602610
