In [1]:
import pandas as pd
import numpy as np
import glob
import datetime as dt
import tqdm
import pytz

In [2]:
def load_scws(rid,tz,data_dir="/g/data/eg3/ab4502/ExtremeWind/points/"):
    """Load the SCW-event environment table for one radar and add helper columns.

    Parameters
    ----------
    rid : str
        Radar ID; the file read is ``<data_dir>/<rid>_scw_envs_df.csv``.
    tz : str
        Time zone name forwarded to ``add_lt`` to build the local-time column.
    data_dir : str, optional
        Directory holding the per-radar CSV files. Defaults to the original
        hard-coded location so existing two-argument calls are unaffected.

    Returns
    -------
    pandas.DataFrame
        The CSV contents indexed by a ``DatetimeIndex`` built from ``dt_utc``,
        with the extra columns ``cluster_new``, ``lt``, ``year``, ``month``,
        ``hour``, ``rid`` and ``scw`` (always 1 here).
    """
    import os

    print("loading "+rid+"...")
    events = pd.read_csv(os.path.join(data_dir, rid+"_scw_envs_df.csv"))

    # Relabel cluster ids (0->2, 2->1, 1->0); any other id maps to NaN.
    events["cluster_new"] = events["cluster"].map({0:2,2:1,1:0})
    events = events.set_index(pd.DatetimeIndex(events["dt_utc"]))
    events = add_lt(events,tz)
    # NOTE(review): year/month come from the UTC index while hour comes from
    # the local-time column -- confirm this mix is intended.
    events["year"] = events.index.year
    events["month"] = events.index.month
    events["hour"] = events["lt"].dt.hour
    events["rid"] = rid
    events["scw"] = 1

    return events

def load_nulls(rid,tz,data_dir="/g/data/eg3/ab4502/ExtremeWind/points/"):
    """Load the non-SCW (null) environment table for one radar and add helper columns.

    Parameters
    ----------
    rid : str
        Radar ID; the file read is ``<data_dir>/<rid>_non_scw_envs_df.csv``.
    tz : str
        Time zone name forwarded to ``add_lt`` to build the local-time column.
    data_dir : str, optional
        Directory holding the per-radar CSV files. Defaults to the original
        hard-coded location so existing two-argument calls are unaffected.

    Returns
    -------
    pandas.DataFrame
        The CSV contents indexed by a ``DatetimeIndex`` built from ``dt_utc``,
        with the extra columns ``cluster_new``, ``lt``, ``year``, ``month``,
        ``hour``, ``rid`` and ``scw`` (always 0 here).
    """
    import os

    nulls = pd.read_csv(os.path.join(data_dir, rid+"_non_scw_envs_df.csv"))

    # Relabel cluster ids (0->2, 2->1, 1->0); any other id maps to NaN.
    nulls["cluster_new"] = nulls["cluster"].map({0:2,2:1,1:0})
    nulls = nulls.set_index(pd.DatetimeIndex(nulls["dt_utc"]))
    nulls = add_lt(nulls,tz)
    # NOTE(review): year/month come from the UTC index while hour comes from
    # the local-time column -- confirm this mix is intended.
    nulls["year"] = nulls.index.year
    nulls["month"] = nulls.index.month
    nulls["hour"] = nulls["lt"].dt.hour
    nulls["rid"] = rid
    nulls["scw"] = 0

    return nulls

def add_lt(df,tz):
    """Attach a local-time column ``lt`` derived from the frame's index.

    The index is treated as naive UTC timestamps: it is localized to UTC and
    then converted to the time zone named by ``tz``. The column is written
    onto ``df`` itself, and the same object is returned.
    """
    utc_index = df.index.tz_localize(pytz.utc)
    local_zone = pytz.timezone(tz)
    df["lt"] = utc_index.tz_convert(local_zone)
    return df

def remove_suspect_gusts(df):
    """Return ``df`` without rows whose ``dt_utc`` is a known suspect gust time.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``dt_utc`` column; values are compared against the
        timestamp strings listed below.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose ``dt_utc`` is not in the suspect list. The
        input frame is not modified.
    """
    suspect_times = [
        "2010-12-14 07:03:00","2011-01-11 03:49:00","2015-12-15 23:33:00",
        "2020-02-09 01:00:00","2020-02-09 03:18:00","2020-05-25 06:11:00",
        "2012-11-02 18:58:00","2012-12-20 21:19:00","2012-12-15 13:00:00",
        "2012-12-29 16:15:00","2012-12-30 06:25:00","2012-12-30 18:01:00",
        "2013-01-02 08:15:00","2013-01-05 03:36:00","2013-01-12 15:22:00",
        "2013-02-11 07:56:00",
    ]
    # Series.isin replaces np.in1d, which is deprecated since NumPy 2.0.
    # Converting to a plain array keeps the mask positional, so duplicate
    # index labels cannot trigger any index alignment.
    is_suspect = df["dt_utc"].isin(suspect_times).to_numpy()
    return df[~is_suspect]

def assign_storm_class(data):
    """Label each row of ``data`` with a storm type in the ``class2`` column.

    Rules are applied in the order listed, so a later rule overwrites the
    label written by an earlier one for any row that matches both. Rows that
    match no rule keep whatever ``class2`` value they already had (NaN if the
    column is created here). ``data`` is modified in place and returned.
    """
    aspect = data["aspect_ratio"]
    length = data["major_axis_length"]
    n_max = data["local_max"]

    # Shared part of the two supercell rules.
    deep_rotating = (data["max_alt"] >= 7) & (data["azi_shear60"] > 4)

    # Explicit < and >= comparisons are kept (rather than negating one mask)
    # so rows with NaN inputs fail both sides, as they do in a direct test.
    ordered_rules = [
        ("Linear", (aspect >= 3) & (length >= 100)),
        ("Non-linear", (aspect < 3) & (length >= 100)),
        ("Cellular", n_max == 1),
        ("Cell cluster", (n_max >= 2) & (length < 100)),
        ("Supercellular", deep_rotating & ((aspect < 3) | (length < 100))),
        ("Embedded supercell", deep_rotating & (length >= 100)),
    ]
    for label, mask in ordered_rules:
        data.loc[mask, "class2"] = label

    return data

In [3]:
# Radar ID -> time zone name used to derive local time for that radar.
# NOTE(review): 'Australia/Queensland', 'Australia/Victoria',
# 'Australia/Canberra', 'Australia/West' and 'Australia/NSW' are legacy alias
# names in the IANA tz database; they are left as-is here.
tzs = {"68":'Australia/Melbourne',
       "64":'Australia/Adelaide',
       "8":'Australia/Brisbane',
       "72":'Australia/Queensland',
       "75":'Australia/Queensland',
       "19":'Australia/Queensland',
       "73":'Australia/Queensland',
       "78":'Australia/Queensland',
       "77":'Australia/Darwin',
       "49":'Australia/Victoria',
       "4":'Australia/Sydney',
       "40":'Australia/Canberra',
       "48":'Australia/West',
       "2":'Australia/Melbourne',
       "66":'Australia/Brisbane',
       "69":'Australia/NSW',
       "70":'Australia/Perth',
       "71":'Australia/Sydney',
       "63":'Australia/Darwin',
       "76":'Australia/Hobart'}

# Full radar list, kept for reference; only radar "2" is loaded below.
#rids = ["2","66","69","70","71","64","8","72","75","19","73","78","49","4","40","48","68","63","76","77"]
rids = ["2"]

# Collect the per-radar frames and concatenate once at the end instead of
# growing a DataFrame inside the loop (repeated copying, and concatenating
# onto an empty frame is deprecated in recent pandas).
frames = []
for rid in rids:
    # Suspect gusts are filtered from the SCW events only, not from the nulls.
    frames.append(remove_suspect_gusts(load_scws(rid,tzs[rid])))
    frames.append(load_nulls(rid,tzs[rid]))
# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(frames,axis=0) if frames else pd.DataFrame()

loading 2...


In [4]:
from statsmodels.tools.tools import add_constant
from statsmodels.discrete.discrete_model import Logit
import warnings
from sklearn.linear_model import LogisticRegression
import multiprocessing

def resample_events(df, event, N, M, conserve_prop=True, fixed_ratio=None): 
    """Draw N bootstrap samples of event and non-event index labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the binary column named by ``event``.
    event : str
        Column name; rows equal to 1 are events and rows equal to 0 are non-events.
    N : int
        Number of resamples to draw.
    M : int
        Number of event labels drawn (with replacement) per resample.
    conserve_prop : bool, optional
        If True, each resample draws ``int(M * ratio)`` non-event labels,
        where ``ratio = round(len(df) / df[event].sum())``.
        NOTE(review): this ratio is total rows per event, not non-events per
        event -- confirm that is the intended proportion.
    fixed_ratio : float, optional
        Required when ``conserve_prop`` is False: each resample then draws
        ``round(n_non_events * fixed_ratio)`` non-event labels.

    Returns
    -------
    list
        ``[rand_event_inds, rand_non_event_inds]``; each is a list of N arrays
        of index labels taken from ``df.index``.

    Raises
    ------
    ValueError
        If ``df`` has no event rows, or if ``conserve_prop`` is False and
        ``fixed_ratio`` is not given.
    """
    event_inds = df[df[event]==1].index.values
    non_inds = df[df[event]==0].index.values
    if len(event_inds) == 0:
        raise ValueError("resample_events: no rows with %s == 1 to sample from" % event)

    # The non-event sample size is the same for every resample, so compute it
    # once. The ratio is only needed (and only safe to divide for) when
    # conserve_prop is requested.
    if conserve_prop:
        ratio = round(df.shape[0] / df[event].sum())
        n_non = int(M*ratio)
    else:
        if fixed_ratio is None:
            raise ValueError("resample_events: fixed_ratio is required when conserve_prop is False")
        n_non = int(round(len(non_inds)*fixed_ratio))

    rand_event_inds = []; rand_non_event_inds = []
    for _ in np.arange(N):
        # Draw order (events, then non-events) is kept so results under a
        # fixed np.random seed are unchanged.
        rand_event_inds.append(event_inds[np.random.randint(0, high=len(event_inds), size=M)])
        rand_non_event_inds.append(non_inds[np.random.randint(0, high=len(non_inds), size=n_non)])
    return [rand_event_inds, rand_non_event_inds]

In [5]:
# Logistic-regression estimator shared by the cells below.
logit = LogisticRegression(
    solver="liblinear",
    class_weight="balanced",
    max_iter=1000,
)
#pool = multiprocessing.Pool()
# presumably the number of resamples passed to resample_events -- TODO confirm
N = 100
# Seed NumPy's global RNG so the random draws are repeatable.
np.random.seed(0)

In [6]:
# Names of the candidate predictor columns, as a NumPy string array.
preds = np.array([
    "ml_cape", "mu_cape", "sb_cape",
    "ml_cin", "sb_cin", "mu_cin", "ml_lcl", "mu_lcl", "sb_lcl", "eff_cape",
    "eff_cin", "eff_lcl", "lr01", "lr03", "lr13", "lr36", "lr24", "lr_freezing",
    "lr_subcloud", "qmean01", "qmean03", "qmean06", "qmeansubcloud", "q_melting",
    "q1", "q3", "q6", "rhmin01", "rhmin03", "rhmin13", "rhminsubcloud", "tei", "wbz",
    "mhgt", "mu_el", "ml_el", "sb_el", "eff_el", "pwat",
    "te_diff", "dpd850", "dpd700", "dcape", "ddraft_temp", "sfc_thetae",
    "srhe_left", "srh01_left", "srh03_left", "srh06_left", "ebwd", "s010", "s06",
    "s03", "s01", "s13", "s36", "scld", "U500", "U10", "U1", "U3", "U6", "Ust_left",
    "Usr01_left", "Usr03_left", "Usr06_left", "Uwindinf", "Umeanwindinf",
    "Umean800_600", "Umean06", "Umean01", "Umean03",
])

#mod = Logit(df[event],add_constant(df[preds])["const"]).fit()

In [9]:
# Display the (n_rows, n_columns) shape of the combined frame `df`.
df.shape

(1654763, 181)