In [3]:
import pandas as pd
import numpy as np
import glob
import datetime as dt
import tqdm
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

In [4]:
def load_scws(rid):
    print("loading "+rid+"...")
    df1 = pd.read_csv("/g/data/eg3/ab4502/ExtremeWind/points/"+rid+"_scw_envs_df.csv")
    df2 = pd.read_csv("/g/data/eg3/ab4502/ExtremeWind/points/"+rid+"_non_scw_envs_df.csv")
    
    df1["cluster_new"] = df1.cluster.map({1:2,2:1,0:0})
    df1 = df1.set_index(pd.DatetimeIndex(df1.dt_utc))
    df1["month"] = df1.index.month
    df1["hour"] = df1.index.hour
    df1["year"] = df1.index.year
    df1["aspect_ratio"] = df1.major_axis_length / df1.minor_axis_length    
    
    df2["cluster_new"] = df2.cluster.map({1:2,2:1,0:0})
    df2 = df2.set_index(pd.DatetimeIndex(df2.dt_utc))
    df2["month"] = df2.index.month
    df2["hour"] = df2.index.hour
    df2["year"] = df2.index.year    
    df2["aspect_ratio"] = df2.major_axis_length / df2.minor_axis_length     
    
    df1["rid"] = rid
    df2["rid"] = rid
    
    return df1, df2

In [5]:
melb_scw, melb_null = load_scws("2")
bris_scw, bris_null = load_scws("66")
namoi_scw, namoi_null = load_scws("69")
perth_scw, perth_null = load_scws("70")
syd_scw, syd_null = load_scws("71")

loading 2...
loading 66...
loading 69...
loading 70...
loading 71...


In [8]:
from imblearn.ensemble import BalancedRandomForestClassifier

In [12]:
var = ["Umean06","U1","wg10","mu_cape","qmean01","lr13","ebwd"]

scw = pd.concat([melb_scw[var], syd_scw[var], bris_scw[var], perth_scw[var], namoi_scw[var]], axis=0)
null = pd.concat([melb_null[var], syd_null[var], bris_null[var], perth_null[var], namoi_scw[var]], axis=0)
scw["target"]=1; null["target"]=0

X = pd.concat([scw,null],axis=0)[var]
y = pd.concat([scw,null],axis=0)["target"]

In [32]:
rf_clf = BalancedRandomForestClassifier(n_estimators=100,verbose=100,oob_score=True)
rf_clf.fit(X,y)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
building tree 1 of 100
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.3s remaining:    0.0s
building tree 2 of 100
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.5s remaining:    0.0s
building tree 3 of 100
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    3.8s remaining:    0.0s
building tree 4 of 100
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    5.0s remaining:    0.0s
building tree 5 of 100
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    6.3s remaining:    0.0s
building tree 6 of 100
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    7.5s remaining:    0.0s
building tree 7 of 100
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    8.8s remaining:    0.0s
building tree 8 of 100
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   10.0s remaining:    0.0s
building tree 9 of 100
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   11.3s remaining: 

  warn("Some inputs do not have OOB scores. "


BalancedRandomForestClassifier(oob_score=True, verbose=100)

In [33]:
pd.DataFrame({"importance":rf_clf.feature_importances_},index=var).sort_values("importance")

Unnamed: 0,importance
qmean01,0.075817
Umean06,0.111276
lr13,0.126979
U1,0.136224
mu_cape,0.151233
ebwd,0.19437
wg10,0.204101


In [38]:
rf_clf

False

In [39]:
from sklearn.model_selection import cross_val_score

In [50]:
preds=rf_clf.predict(null[var])

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    1.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    1.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    1.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    1.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    1.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    2.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  12 out of  

In [53]:
(preds==1).sum()

511891

In [54]:
preds.shape

(4747978,)

In [59]:
t=0.8
(melb_null["bdsd"] > t).sum() + (perth_null["bdsd"] > t).sum() + (syd_null["bdsd"] > t).sum() + (bris_null["bdsd"] > t).sum() + (namoi_null["bdsd"] > t).sum()

338748

In [60]:
(preds==1).sum()

511891