In [1]:
#Code to format the .csv files into a publication-ready format.

#Also creates additional files, such as station info, radar info, etc.

In [2]:
import pandas as pd
import numpy as np
import xarray as xr
import glob
import datetime as dt
import pytz

def load_scws(rid,tz):
    """Read the SCW event table for one radar and add derived columns.

    Parameters
    ----------
    rid : str
        Radar id. Used to build the input file name and stored in the ``rid`` column.
    tz : str
        Time zone name passed on to ``add_lt`` to derive the local-time column ``lt``.

    Returns
    -------
    pandas.DataFrame
        The CSV contents indexed by a DatetimeIndex built from ``dt_utc`` (the
        ``dt_utc`` column itself is kept), with the extra columns ``cluster_new``,
        ``lt``, ``year``, ``month``, ``hour``, ``rid`` and ``scw`` (always 1 here).
    """
    print("loading " + rid + "...")
    csv_path = "/g/data/eg3/ab4502/ExtremeWind/points/" + rid + "_scw_envs_df.csv"
    events = pd.read_csv(csv_path)

    # Relabel the cluster ids (0 -> 2, 2 -> 1, 1 -> 0); any other value becomes NaN.
    cluster_relabel = {0: 2, 2: 1, 1: 0}
    events["cluster_new"] = events["cluster"].map(cluster_relabel)

    # Index by the dt_utc timestamps, then attach the local-time column.
    utc_index = pd.DatetimeIndex(events["dt_utc"])
    events = add_lt(events.set_index(utc_index), tz)

    # year/month are taken from the dt_utc index, hour from the local-time column.
    events["year"] = events.index.year
    events["month"] = events.index.month
    events["hour"] = events["lt"].dt.hour
    events["rid"] = rid
    events["scw"] = 1

    return events

def load_nulls(rid,tz):
    """Read the non-SCW (null) observation table for one radar and add derived columns.

    Parameters
    ----------
    rid : str
        Radar id. Used to build the input file name and stored in the ``rid`` column.
    tz : str
        Time zone name passed on to ``add_lt`` to derive the local-time column ``lt``.

    Returns
    -------
    pandas.DataFrame
        The CSV contents indexed by a DatetimeIndex built from ``dt_utc`` (the
        ``dt_utc`` column itself is kept), with the extra columns ``cluster_new``,
        ``lt``, ``year``, ``month``, ``hour``, ``rid`` and ``scw`` (always 0 here).
    """
    csv_path = "/g/data/eg3/ab4502/ExtremeWind/points/" + rid + "_non_scw_envs_df.csv"
    nulls = pd.read_csv(csv_path)

    # Relabel the cluster ids (0 -> 2, 2 -> 1, 1 -> 0); any other value becomes NaN.
    cluster_relabel = {0: 2, 2: 1, 1: 0}
    nulls["cluster_new"] = nulls["cluster"].map(cluster_relabel)

    # Index by the dt_utc timestamps, then attach the local-time column.
    utc_index = pd.DatetimeIndex(nulls["dt_utc"])
    nulls = add_lt(nulls.set_index(utc_index), tz)

    # year/month are taken from the dt_utc index, hour from the local-time column.
    nulls["year"] = nulls.index.year
    nulls["month"] = nulls.index.month
    nulls["hour"] = nulls["lt"].dt.hour
    nulls["rid"] = rid
    nulls["scw"] = 0

    return nulls

def add_lt(df,tz):
    """Add a local-time column ``lt`` computed from the frame's DatetimeIndex.

    A timezone-naive index is interpreted as UTC. An index that is already
    timezone-aware is converted directly (previously this raised, because
    ``tz_localize`` refuses tz-aware data).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a DatetimeIndex. It is modified in place.
    tz : str or tzinfo
        Target time zone, e.g. ``"Australia/Melbourne"``. pandas resolves the
        name itself, so no explicit ``pytz`` lookup is needed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the tz-aware ``lt`` column.
    """
    stamps = df.index
    if stamps.tz is None:
        # Naive timestamps are taken to be UTC.
        stamps = stamps.tz_localize("UTC")
    # An Index is assigned positionally, so no index alignment takes place here.
    df["lt"] = stamps.tz_convert(tz)
    return df

def remove_suspect_gusts(df):
    """Drop the rows whose ``dt_utc`` value is one of the hard-coded suspect gust times.

    The comparison is an exact match between the ``dt_utc`` column and the
    timestamp strings below, so ``dt_utc`` is expected to hold strings in the
    same ``YYYY-MM-DD HH:MM:SS`` form (as it does when read with ``pd.read_csv``
    without date parsing).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``dt_utc`` column.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose ``dt_utc`` is not in the suspect list.
    """
    dts = ["2010-12-14 07:03:00","2011-01-11 03:49:00","2015-12-15 23:33:00","2020-02-09 01:00:00","2020-02-09 03:18:00","2020-05-25 06:11:00",
          "2012-11-02 18:58:00","2012-12-20 21:19:00","2012-12-15 13:00:00","2012-12-29 16:15:00","2012-12-30 06:25:00","2012-12-30 18:01:00","2013-01-02 08:15:00",
          "2013-01-05 03:36:00","2013-01-12 15:22:00","2013-02-11 07:56:00"]
    # np.isin replaces np.in1d, which is deprecated since NumPy 2.0.
    keep = np.isin(df["dt_utc"], dts, invert=True)
    return df[keep]

def assign_storm_class(data):
    """Label each row with a parent-storm class in the ``class2`` column.

    Adds ``aspect_ratio`` (major / minor axis length) and then applies the rules
    below in order. A later rule overwrites an earlier one, and rows matching no
    rule are left as NaN. ``data`` is modified in place and also returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ``major_axis_length``, ``minor_axis_length``,
        ``local_max``, ``max_alt`` and ``azi_shear60``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with ``aspect_ratio`` and ``class2`` set.
    """
    data["aspect_ratio"] = data["major_axis_length"] / data["minor_axis_length"]

    # Building blocks shared by several rules. Comparisons with NaN are False,
    # so the "< 3" and "< 100" tests are written out rather than negated.
    is_large = data["major_axis_length"] >= 100
    is_small = data["major_axis_length"] < 100
    is_elongated = data["aspect_ratio"] >= 3
    is_compact = data["aspect_ratio"] < 3
    deep_and_rotating = (data["max_alt"] >= 7) & (data["azi_shear60"] > 4)

    # Order matters: each rule may overwrite labels set by the rules before it.
    ordered_rules = [
        ("Linear", is_elongated & is_large),
        ("Non-linear", is_compact & is_large),
        ("Cellular", data["local_max"] == 1),
        ("Cell cluster", (data["local_max"] >= 2) & is_small),
        ("Supercellular", deep_and_rotating & (is_compact | is_small)),
        ("Embedded supercell", deep_and_rotating & is_large),
    ]
    for label, mask in ordered_rules:
        data.loc[mask, "class2"] = label

    return data

In [3]:
#Radar "2", local time Australia/Melbourne.
#Events: SCW rows with the suspect gusts removed, then labelled with a storm class.
df_events = assign_storm_class(
    remove_suspect_gusts(
        load_scws("2", "Australia/Melbourne")
    )
)
#Nulls: non-SCW rows, labelled with a storm class (no suspect-gust filtering).
df_nulls = assign_storm_class(
    load_nulls("2", "Australia/Melbourne")
)

loading 2...


In [14]:
#Inspect the in10km and uid10 columns of the null observations
df_nulls.loc[:, ["in10km", "uid10"]]

Unnamed: 0_level_0,in10km,uid10
dt_utc,Unnamed: 1_level_1,Unnamed: 2_level_1
2008-01-01 00:01:00,0.0,-1.0
2008-01-01 00:04:00,0.0,-1.0
2008-01-01 00:06:00,0.0,-1.0
2008-01-01 00:06:00,0.0,-1.0
2008-01-01 00:06:00,0.0,-1.0
...,...,...
2020-12-31 23:41:00,0.0,913.0
2020-12-31 23:47:00,0.0,913.0
2020-12-31 23:47:00,0.0,913.0
2020-12-31 23:50:00,0.0,913.0


In [57]:
#For null observations with in10km == 0, blank every radar-derived column except
#the in10km flag and the radar id.
#NOTE: radar_list is defined in the "Columns to keep for publishing" cell, which
#appears later in this notebook; that cell must be run before this one.
df_nulls.loc[
    df_nulls["in10km"] == 0,
    [col for col in radar_list if col not in ("in10km", "rid")],
] = np.nan

In [30]:
#Columns to keep for publishing

#Environmental (ERA5-derived) columns, grouped by theme
ind_list = (
    #ERA5 details
    ["time_y", "era5_lat", "era5_lon"]
    #Clustering
    + ["cluster_new"]
    #Wind indices
    + ["Umean06", "Umean01", "U10", "wg10", "s06", "ebwd", "Umeanwindinf", "srhe_left", "srh06_left"]
    #Downburst indices
    + ["dmi", "lr_subcloud", "lr_freezing", "lr03", "lr13", "wmsi_ml", "bdsd", "hmi", "convgust_wet", "convgust_dry"]
    + ["gustex", "dmgwind", "dmgwind_fixed", "dcape", "wmpi", "windex", "ddraft_temp", "te_diff", "tei", "wndg"]
    #Storm mode
    + ["dcp", "scp", "scp_fixed"]
    #Severe storm indices
    + ["sherb", "eff_sherb", "sweat", "mucape*s06", "mlcape*s06", "effcape*s06", "t_totals", "k_index"]
    #Instability indices
    + ["eff_cape", "eff_lcl", "ml_cape", "ml_lcl", "mu_cape", "mu_lcl", "qmean01", "qmean06"]
)

#Station gust observation columns
gust_list = ["stn_id", "gust", "wgr_4", "scw"]

#Radar / storm-object columns
radar_list = [
    "rid",
    "speed",
    "angle",
    "class2",
    "in10km",
    "major_axis_length",
    "minor_axis_length",
    "local_max",
    "max_alt",
    "azi_shear60",
]

#Lightning column
lightning_list = ["Lightning_observed"]

In [5]:
#Mapping from the working column names to the names used in the published files.
#Columns that are not listed here keep their name when passed through DataFrame.rename.
renames = {
    #Wind indices
    "Umean06": "Umean06",
    "Umean01": "Umean01",
    "U10": "U10",
    "wg10": "WindGust10",
    "s06": "S06",
    "ebwd": "EBWD",
    "Umeanwindinf": "Umeanwindinf",
    "srhe_left": "SRHE",
    "srh06_left": "SRH06",
    #Downburst indices
    "dmi": "DMI",
    "lr_subcloud": "LR_subcloud",
    "lr_freezing": "LR_freezing",
    "lr03": "LR03",
    "lr13": "LR13",
    "wmsi_ml": "WMSI",
    "bdsd": "BDSD",
    "bdsd_cv": "BDSD_CV",
    "hmi": "HMI",
    "convgust_wet": "ConvGust_wet",
    "convgust_dry": "ConvGust_dry",
    "gustex": "GUSTEX",
    "dmgwind": "DmgWind",
    "dmgwind_fixed": "DmgWind_fixed",
    "dcape": "DCAPE",
    "wmpi": "WMPI",
    "windex": "WINDEX",
    "ddraft_temp": "DowndraftTemp",
    "te_diff": "ThetaeDiff",
    "tei": "TEI",
    "wndg": "WNDG",
    #Storm mode
    "dcp": "DCP",
    "scp": "SCP",
    "scp_fixed": "SCP_fixed",
    #Severe storm indices
    "sherb": "SHERB",
    "eff_sherb": "SHERBE",
    "sweat": "SWEAT",
    "mucape*s06": "MUCS6",
    "mlcape*s06": "MLCS6",
    "effcape*s06": "EffCS6",
    "t_totals": "T_Totals",
    "k_index": "K_Index",
    #Instability indices
    "eff_cape": "Eff_CAPE",
    "eff_lcl": "Eff_LCL",
    "ml_cape": "MLCAPE",
    "ml_lcl": "ML_LCL",
    "mu_cape": "MUCAPE",
    "mu_lcl": "MU_LCL",
    "qmean01": "Qmean01",
    "qmean06": "Qmean06",
    #Gust, radar, clustering and ERA5 metadata columns
    "angle": "Storm_angle",
    "azi_shear60": "Azimuthal_shear",
    "class2": "Parent_storm_class",
    "cluster_new": "Environmental_cluster",
    "gust": "Wind_gust_observed",
    "in10km": "Storm_in10km",
    "local_max": "Local_reflectivity_maxima",
    "major_axis_length": "Major_axis_length",
    "max_alt": "Maximum_storm_altitude",
    "minor_axis_length": "Minor_axis_length",
    "rid": "Radar_id",
    "stn_id": "Station_id",
    "scw": "SCW",
    "speed": "Storm_speed",
    "wgr_4": "Peak_to_mean_wind_gust_ratio",
    "time_y": "ERA5_time",
    "era5_lat": "ERA5_latitude",
    "era5_lon": "ERA5_longitude",
}
            

In [9]:
#Stack events and nulls restricted to the published columns (renamed), ordered by dt_utc.
#Append the commented-out .to_csv(...) call to write the result to disk.
pd.concat(
    [
        frame[gust_list + lightning_list + radar_list + ind_list].rename(columns=renames)
        for frame in (df_events, df_nulls)
    ],
    axis=0,
).sort_values("dt_utc")#.to_csv("/scratch/eg3/ab4502/scw_data_pub/gust_observations_2.csv")

Unnamed: 0_level_0,Station_id,Wind_gust_observed,Peak_to_mean_wind_gust_ratio,SCW,Lightning_observed,Radar_id,Storm_speed,Storm_angle,Parent_storm_class,Storm_in10km,...,T_Totals,K_Index,Eff_CAPE,Eff_LCL,MLCAPE,ML_LCL,MUCAPE,MU_LCL,Qmean01,Qmean06
dt_utc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2008-01-01 00:01:00,86371,7.7,1.307273,0,0.0,2,,,,0.0,...,39.131836,14.551758,216.6250,3096.5000,0.0000,3565.3125,259.0625,3469.1875,6.018555,3.795898
2008-01-01 00:04:00,86077,12.9,1.207000,0,0.0,2,,,,0.0,...,40.753906,15.080078,266.1875,3096.5000,0.0000,3722.8750,266.1875,3576.0625,6.073242,3.845703
2008-01-01 00:06:00,86361,12.9,1.377043,0,0.0,2,,,,0.0,...,39.131836,14.954102,216.6250,3096.5000,0.0000,3476.6875,259.0625,3469.1875,6.018555,3.813477
2008-01-01 00:06:00,87113,15.9,1.310973,0,0.0,2,,,,0.0,...,40.933594,17.815430,0.0000,0.0000,0.0000,3770.6250,141.3125,3635.6875,7.179688,4.372070
2008-01-01 00:06:00,89002,13.9,1.216749,0,0.0,2,,,,0.0,...,39.663086,15.872070,0.0000,0.0000,0.0000,3693.4375,1.1250,3487.2500,5.441406,3.855469
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-12-31 23:41:00,86038,5.7,2.061897,0,0.0,2,2.573,99.169,Cellular,0.0,...,52.777344,34.307617,609.5000,1259.8125,609.5000,1190.1250,1167.3125,1528.1875,10.283203,5.759766
2020-12-31 23:47:00,86282,9.3,3.254237,0,0.0,2,2.573,99.169,Cell cluster,0.0,...,52.777344,34.307617,609.5000,1259.8125,609.5000,1121.6875,1167.3125,1528.1875,10.283203,5.832031
2020-12-31 23:47:00,86104,4.6,3.789272,0,0.0,2,2.573,99.169,Cell cluster,0.0,...,51.908203,33.244140,609.5000,1234.5000,609.5000,1266.8125,1167.3125,1988.7500,10.283203,5.759766
2020-12-31 23:50:00,90035,12.4,1.101678,0,0.0,2,2.573,99.169,Cell cluster,0.0,...,52.333984,32.017580,468.0625,1668.5625,426.3125,988.6875,854.5000,1782.1875,10.220703,5.685547


In [34]:
def station_stats(state, rid, start_year, max_stn_dist=100, return_dropped=True, print_stats=True):
    """Select the weather stations that lie near a given radar.

    Stations are read with ``read_stn_info(state)``, their distance from the radar
    origin is computed, and they are kept when they are within ``max_stn_dist`` km
    and their ``y2`` year is not earlier than ``start_year``. Stations listed by
    ``return_drop_list(state)`` are treated as dropped.

    Parameters
    ----------
    state : str
        State key understood by ``read_stn_info`` and ``return_drop_list``.
    rid : str
        Radar id; selects the level-2 radar directory the origin is read from.
    start_year : int or str
        First year of radar data; converted with ``int``.
    max_stn_dist : float, default 100
        Maximum station distance from the radar, in km.
    return_dropped : bool, default True
        If True, return the station table *without* the dropped stations.
        If False, return only the number of stations left after dropping.
    print_stats : bool, default True
        Print summary information. The radar instrument name is printed
        regardless of this flag.

    Returns
    -------
    pandas.DataFrame or int
        See ``return_dropped``.
    """
    #Read station info
    stn_df = read_stn_info(state)

    #Radar metadata (origin and name) from one column-max-reflectivity file.
    #Sorting makes the chosen file deterministic (glob order is arbitrary), and the
    #context manager closes the file once the attributes have been read.
    radar_files = sorted(glob.glob("/g/data/rq0/level_2/"+rid+"/COLUMNMAXREFLECTIVITY/*.nc"))
    with xr.open_dataset(radar_files[-1]) as f:
        lat0 = f.attrs["origin_latitude"]
        lon0 = f.attrs["origin_longitude"]
        print(f.attrs["instrument_name"])

    #Calculate distance between radar and each station
    stn_df["dist_from_radar_km"] = latlon_dist(lat0, lon0, stn_df.lat.values, stn_df.lon.values)

    #Subset stations based on a max distance threshold, and having at least one year overlap with the radar data
    nearby = stn_df.dist_from_radar_km <= max_stn_dist
    overlaps_radar = stn_df.y2 >= int(start_year)
    stn_df = stn_df[nearby & overlaps_radar]

    #Flag the stations on the manual drop list (computed once; np.isin replaces the deprecated np.in1d)
    is_dropped = np.isin(stn_df.stn_no, return_drop_list(state))
    drops = stn_df[is_dropped]
    n_kept = stn_df.shape[0] - drops.shape[0]

    if return_dropped:
        if print_stats:
            print("Percent QC: ",pd.to_numeric(stn_df["Y%"],errors="coerce").mean(),"\n")
            print("Total stations: ",n_kept,"\n")
            print("Dropped stations: \n",drops[["stn_no","stn_name","hgt_asl","lat","lon"]])

        return stn_df[~is_dropped]
    else:
        if print_stats:
            #Report the threshold actually used rather than a hard-coded 100 km
            print(f"All stations within {max_stn_dist} km: ")
            print(stn_df[["stn_no","stn_name","hgt_asl","y1","y2","lat","lon","Y%","dist_from_radar_km"]],"\n\n")
        return n_kept

def return_drop_list(state):
    """Return the station numbers to exclude for a given state key.

    Contains lists of stations to drop from each state. Generally either too high
    above sea level, or offshore.

    Parameters
    ----------
    state : str
        One of "qld", "nsw", "vic", "sa", "wa", "vic_nsw", "nt", "tas".

    Returns
    -------
    list of int
        A new list of station numbers (empty for "sa").

    Raises
    ------
    AssertionError
        If ``state`` is not one of the supported keys.
    """
    assert state in ["qld","nsw","vic","sa","wa","vic_nsw","nt","tas"]

    drop_lists = {
        "qld": [
            41175, 200840, 200601, 200736, 200783, 200701, 200831, 200732, 200704, 200001,
            200283, 39122, 39059, 27058, 27054,
            40927, 40926, 40925, 40043, 31037,
        ],
        "vic": [83084, 86376, 79103, 82139, 86381, 85291, 83024, 83085, 79101, 86344],
        # Sydney observatory is manually dropped here, but is automatically dropped
        # in post_process_tracks.py as it is not QC'd
        "nsw": [
            56238, 72161, 56243, 63292, 70349, 62100, 71075, 71032, 200288, 200839, 66196,
            66062, 70349, 69017,
        ],
        "wa": [9091, 9193, 9255, 9256],
        "sa": [],
        # NOTE(review): this is the "vic" list followed by the first 11 "nsw" entries;
        # the last three "nsw" entries are not included here. Confirm that is intended.
        "vic_nsw": [
            83084, 86376, 79103, 82139, 86381, 85291, 83024, 83085, 79101, 86344,
            56238, 72161, 56243, 63292, 70349, 62100, 71075, 71032, 200288, 200839, 66196,
        ],
        "tas": [94087],
        "nt": [14056, 14274],
    }
    return drop_lists[state]
    
def read_stn_info(state):
    """Read the station-detail (``HD01D_StnDet_*.txt``) table(s) for a state.

    * "vic_nsw": the first matching file for "vic" and for "nsw", stacked.
    * "nt" / "tas": every matching file, read without a header row, stacked and
      sorted by ``stn_name``.
    * anything else: the first matching file for that state.

    Where only the first matching file is used, its first row is treated as a
    header and replaced by the fixed column names below.

    Parameters
    ----------
    state : str
        State key used to build the ``<state>_one_min_gust`` directory name.

    Returns
    -------
    pandas.DataFrame
        Station table with ``y1`` and ``y2`` converted to numbers (unparseable
        values become NaN).
    """
    columns = ["id", "stn_no", "district", "stn_name", "site_open", "site_close", "lat", "lon", "latlon_method", "state",
               "hgt_asl", "hgt_asl_baro", "wmo_idx", "y1", "y2", "comp%", "Y%", "N%", "W%", "S%", "I%", "#"]

    def station_files(region):
        # All station-detail files for one region directory.
        return glob.glob("/g/data/eg3/ab4502/ExtremeWind/obs/aws/" + region + "_one_min_gust/HD01D_StnDet_*.txt")

    def read_first(region):
        # Only the first match is read; its header row is replaced by `columns`.
        return pd.read_csv(station_files(region)[0], names=columns, header=0)

    if state == "vic_nsw":
        stn_df = pd.concat([read_first("vic"), read_first("nsw")], axis=0)
    elif state in ("nt", "tas"):
        pieces = [pd.read_csv(path, names=columns, header=None) for path in station_files(state)]
        stn_df = pd.concat(pieces, axis=0).sort_values("stn_name")
    else:
        stn_df = read_first(state)

    for year_col in ("y1", "y2"):
        stn_df[year_col] = pd.to_numeric(stn_df[year_col], errors="coerce")

    return stn_df

def latlon_dist(lat, lon, lats, lons):
    """Great-circle (haversine) distance from one point to one or more points.

    Parameters
    ----------
    lat, lon : float
        Reference point, in degrees.
    lats, lons : float or array-like
        Target point(s), in degrees.

    Returns
    -------
    float or numpy.ndarray
        Distance(s) computed with a sphere radius of 6373.0 (kilometres,
        going by how callers name the result).
    """
    earth_radius = 6373.0

    # Work in radians throughout.
    ref_lat = np.deg2rad(lat)
    ref_lon = np.deg2rad(lon)
    tgt_lat = np.deg2rad(lats)
    tgt_lon = np.deg2rad(lons)

    half_dlat = (tgt_lat - ref_lat) / 2
    half_dlon = (tgt_lon - ref_lon) / 2

    # Haversine term, then the central angle via the arctan2 form.
    hav = np.sin(half_dlat)**2 + np.cos(ref_lat) * np.cos(tgt_lat) * np.sin(half_dlon)**2
    central_angle = 2 * np.arctan2(np.sqrt(hav), np.sqrt(1 - hav))

    return (earth_radius * central_angle)
    
#Columns taken from the station table and their published names (the same for every radar)
cols = ["stn_no","stn_name","lat","lon","hgt_asl","y1","y2","comp%","Y%","dist_from_radar_km"]
station_renames = {"stn_no":"Station_id","stn_name":"Station_name","lat":"Latitude","lon":"Longitude","hgt_asl":"Station_height_above_sea_level_meters",
                   "y1":"Start_year_for_data_used_here","y2":"End_year_for_data_used_here","comp%":"Percentage_of_data_complete",
                   "Y%":"Percentage_of_data_passing_quality_control","dist_from_radar_km":"Distance_from_radar_kilometers"}

#Build one station table per radar and concatenate once at the end. Growing a
#DataFrame with pd.concat inside the loop re-copies all earlier rows on every
#iteration, and starting from an empty DataFrame can trigger a pandas FutureWarning
#about concatenating empty entries.
station_frames = []
for rid, state, start_year in zip(
    ["2","66","69","70","71","64","8","72","75","19","73","78","49","4","40","48","68","63","76","77"],
    ["vic","qld","nsw","wa","nsw","sa","qld","qld","qld","qld","qld","qld","vic_nsw","nsw","nsw","wa","vic","nt","tas","nt"],
    [2008,2006,2010,2013,2009,2005,2007,2010,2012,2014,2015,2015,2006,2013,2013,2014,2014,2001,2012,2012]):

    station_df = station_stats(state,rid,start_year,print_stats=False)[cols].rename(columns=station_renames).set_index("Station_id")
    #Tag each station with the radar it was selected for
    station_df["Closest_radar_id"] = rid
    station_frames.append(station_df)

station_details = pd.concat(station_frames, axis=0)


Melbourne
Brisbane (Mt Stapylton)
Namoi (Blackjack Mountain)
Perth (Serpentine)
Sydney (Terrey Hills)
Adelaide (Buckland Park)
Gympie
Emerald
Mount Isa
Cairns
Townsville (Hervey Range)
Weipa
Yarrawonga
Newcastle
Canberra (Captains Flat)
Kalgoorlie
Bairnsdale
Darwin (Berrimah)
Hobart (Mt Koonya)
Warruwi


In [46]:
#Write the station table, ordered by its Station_id index, to the publication directory
(
    station_details
    .sort_index()
    .to_csv("/scratch/eg3/ab4502/scw_data_pub/station_details.csv")
)