In [1]:
import pandas as pd
from metar import Metar
import numpy as np
from IOfuncs import *
import datetime as dt
import warnings
from json import JSONDecodeError
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFdr,SelectFpr,f_regression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.inspection import permutation_importance
from sklearn.neural_network import MLPClassifier
warnings.filterwarnings('ignore')

In [2]:
def make_ml_data_row(taf_time, station, lat, lon, metar_path, glamp_path, hrrr_path, delay_hours = 2):
    if isinstance(metar_path, str):
        metar_path = read_metar(metar_path)
    metarDF = pd.DataFrame()
    glampDF = pd.DataFrame()
    hrrrDF = pd.DataFrame()    
    
    for time in range(-6, -delay_hours, 1):
        metar_at_time = get_metar_at_time(taf_time + dt.timedelta(hours = time), metar_path).T
        metarDF[f'metar {time}'] = metar_at_time
    
    work_time = dt.timedelta(hours=-delay_hours)
    glamp_data = get_glamp_at_time(taf_time + work_time, glamp_path, station, download=True)
    hrrr_data = get_hrrr_at_time(taf_time + work_time, hrrr_path, lat, lon, download=True)
    glamp_synoptic_offset = (taf_time.hour - delay_hours) % 6 - 1
    for time in range(-delay_hours, 7, 1):
        glampDF[f'glamp {time}'] = glamp_data.iloc[time + delay_hours + glamp_synoptic_offset]
        hrrrDF[f'hrrr {time}'] = hrrr_data.iloc[time + delay_hours]    
        
    
    df = pd.concat([metarDF, glampDF, hrrrDF])
    df.drop(['ftime', 'ftime_utc', 'model', 'runtime', 'runtime_utc', 'station', 'metar', 'peak_wind_time', 'valid', 'Unnamed: 0'], inplace=True)

    v = df.unstack().to_frame().sort_index(level=1).T
    v.columns = v.columns.map('_'.join)

    final = v.dropna(axis = 1)
    
    return final

In [3]:
taf_time = dt.datetime(year = 2021, month = 8, day = 21, hour = 18, minute = 0)

In [4]:
def make_ml_training_data_row(taf_time, station, lat, lon, metar_path, glamp_path, hrrr_path, asos5_path, delay_hours = 2, tplus_hours = 6):
    if isinstance(metar_path, str):
        metar_path = full_metar_list = read_metar(metar_path)
    if isinstance(asos5_path, str):
        asos5_path = read_metar(asos5_path)
    
    df = make_ml_data_row(taf_time, station, lat, lon, metar_path, glamp_path, hrrr_path, delay_hours = delay_hours)
    
    for i in range(tplus_hours):
        df[f'flight category {i}'] = get_conditions_from_asos(taf_time + dt.timedelta(hours = i), metar_path)
        time_series = pd.date_range(taf_time + dt.timedelta(hours = i), taf_time + dt.timedelta(hours = i+1), freq = '5T')
        verification_series = np.asarray(asos5_path.truncate(before = taf_time + dt.timedelta(hours = i), 
                                                             after = taf_time + dt.timedelta(hours = i+1, minutes = -1))['conditions'])
        df[f'verification list {i}'] = [None]
        df[f'verification list {i}'][0] = verification_series
        

    return df

In [5]:
metar = read_metar('Data/BOS.csv')
asos5 = read_metar('Data/BOS_5min.csv')

In [6]:
%%time
make_ml_training_data_row(taf_time, 'kbos', 42.3656, -71.0096, metar, 'Data/GLAMP data/', 'Data/hrrr/', asos5)

CPU times: user 114 ms, sys: 2.31 ms, total: 116 ms
Wall time: 188 ms


Unnamed: 0,hrrr -1_DPT_1000mb,hrrr -2_DPT_1000mb,hrrr 0_DPT_1000mb,hrrr 1_DPT_1000mb,hrrr 2_DPT_1000mb,hrrr 3_DPT_1000mb,hrrr 4_DPT_1000mb,hrrr 5_DPT_1000mb,hrrr 6_DPT_1000mb,hrrr -1_DPT_2m_above_ground,...,flight category 1,verification list 1,flight category 2,verification list 2,flight category 3,verification list 3,flight category 4,verification list 4,flight category 5,verification list 5
0,292.8,293.5,292.2,292.0,291.8,294.0,293.5,292.2,293.2,293.2,...,3,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]",3,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3]",3,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]",3,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3]",3,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]"


In [7]:
def make_ml_training_data_set(timelist, station, lat, lon, metar_path, glamp_path, hrrr_path, asos5_path, delay_hours = 2, frequency = '5H'):
    training_df = pd.DataFrame()
    time_series = pd.Series()
    for timepair in timelist:
        start_time = timepair[0]
        end_time = timepair[1]
        time_series = pd.concat([time_series, pd.Series(pd.date_range(start_time, end_time, freq = frequency))])
    if isinstance(metar_path, str):
        metar_path = read_metar(metar_path)
    if isinstance(asos5_path, str):
        asos5_path = read_metar(asos5_path)
    for time in tqdm(time_series):
        try:
            training_row = make_ml_training_data_row(time, station, lat, lon, metar_path, glamp_path, hrrr_path, asos5_path, delay_hours = delay_hours)
            training_df = pd.concat([training_df, training_row])
        except (FileNotFoundError, JSONDecodeError):
            continue

    training_df = training_df.fillna(-99999)
    return training_df

In [8]:
def prob_of_detection(predicted_results, actual_results, flight_cat):
    result_locations = np.where(actual_results==flight_cat)[0]
    num_predict = np.sum(predicted_results[result_locations]==flight_cat)
    return num_predict / len(result_locations)

def false_alarm_rate(predicted_results, actual_results, flight_cat):
    num_predict = np.sum(predicted_results==flight_cat)
    predict_locations = np.where(predicted_results==flight_cat)[0]
    predict_subset = predicted_results[predict_locations]
    actual_subset = actual_results[predict_locations]
    
    false_alarm_count = np.sum(predict_subset!=actual_subset)
    
    return false_alarm_count/num_predict

def critical_success_index(predicted_results, actual_results, flight_cat):
    num_predict = np.sum(predicted_results==flight_cat)
    predict_locations = np.where(predicted_results==flight_cat)[0]
    non_predict_locations = np.where(predicted_results!=flight_cat)[0]
    predict_subset = predicted_results[predict_locations]
    actual_subset = actual_results[predict_locations]
    actual_subset_compliment = actual_results[non_predict_locations]
    
    hits = np.sum(predict_subset==actual_subset)
    false_alarm_count = np.sum(predict_subset!=actual_subset)
    misses = np.sum(actual_subset_compliment==flight_cat)
    
    return hits / (hits + false_alarm_count + misses)

In [9]:
def data_split(data):
    y_keys = np.asarray([key for key in data if 'flight category' in key])
    val_keys = np.asarray([key for key in data if 'verification list' in key])
    X = data.drop(np.concatenate([y_keys, val_keys]), axis=1)
    y_list = data[y_keys]
    val_list = data[val_keys]
    
    return X, y_list, val_list


In [10]:
start_date = dt.datetime(year = 2020, month = 1, day = 1, hour = 0, minute = 0)
end_date = dt.datetime.now()
data = make_ml_training_data_set([(start_date, end_date)], 'kbos', 42.3656, -71.0096, 'Data/BOS.csv', 'Data/GLAMP data/', 
                                 'Data/hrrr/', 'Data/BOS_5min.csv', frequency = 'H')

  0%|▏                                                                             | 55/24160 [00:23<2:52:26,  2.33it/s]


KeyboardInterrupt: 

In [11]:
data.to_csv('since2020.csv')

NameError: name 'data' is not defined

In [12]:
start_date_winter_1 = dt.datetime(year = 2020, month = 12, day = 1, hour = 0, minute = 0)
end_date_winter_1 = dt.datetime(year = 2021, month = 2, day = 28, hour = 23, minute = 0)
start_date_winter_2 = dt.datetime(year = 2019, month = 12, day = 1, hour = 0, minute = 0)
end_date_winter_2 = dt.datetime(year = 2020, month = 2, day = 28, hour = 23, minute = 0)
winter_data = make_ml_training_data_set([(start_date_winter_1, end_date_winter_1), (start_date_winter_2, end_date_winter_2)], 
                                        'kbos', 42.3656, -71.0096, 'Data/BOS.csv',
                                        'Data/GLAMP data/', 'Data/hrrr/', 'Data/BOS_5min.csv', frequency = 'H')

100%|███████████████████████████████████████████████████████████████████████████████| 4320/4320 [22:04<00:00,  3.26it/s]


In [13]:
start_date_spring_1 = dt.datetime(year = 2021, month = 3, day = 1, hour = 0, minute = 0)
end_date_spring_1 = dt.datetime(year = 2021, month = 5, day = 31, hour = 23, minute = 0)
start_date_spring_2 = dt.datetime(year = 2020, month = 3, day = 1, hour = 0, minute = 0)
end_date_spring_2 = dt.datetime(year = 2020, month = 5, day = 31, hour = 23, minute = 0)
spring_data = make_ml_training_data_set([(start_date_spring_1, end_date_spring_1), (start_date_spring_2, end_date_spring_2)], 
                                         'kbos', 42.3656, -71.0096, 'Data/BOS.csv', 
                                        'Data/GLAMP data/', 'Data/hrrr/', 'Data/BOS_5min.csv', frequency = 'H')

100%|███████████████████████████████████████████████████████████████████████████████| 4416/4416 [23:03<00:00,  3.19it/s]


In [117]:
def test_accuracy_metrics(classifier_rf, training_data, flight_cat, use_ci=None):
    result_df = pd.DataFrame()
    (X, y_list, val_list) = data_split(training_data)
    for y, val in zip(y_list,val_list):
        X_train, X_test, y_train, y_test, _, val_test = train_test_split(X, y_list[y], val_list[val], train_size=0.7, random_state=42)
        classifier_rf.fit(X_train, y_train)
        if use_ci is not None: #keep only ones where prob is above CI
            initial_count = np.sum(classifier_rf.predict(X_test)==flight_cat)
            predict_probs_mask = classifier_rf.predict_proba(X_test)[:,flight_cat] >= use_ci
            val_test = val_test[predict_probs_mask]
            X_test = X_test[predict_probs_mask]
            print(f'Total predicted events of this type: {initial_count}, number of events with prob above ci: {np.sum(predict_probs_mask)}')
        prob = np.repeat(classifier_rf.predict(X_test), [len(verifications) for verifications in val_test])
        results = np.concatenate(np.asarray(val_test))
        result_df[y] = (prob_of_detection(prob, results, flight_cat), 
                        false_alarm_rate(prob, results, flight_cat), 
                        critical_success_index(prob, results, flight_cat))
        result_df.rename({0: 'POD', 1: 'FAR', 2: 'CSI'}, inplace=True)
    return result_df

In [136]:
classifier_rf = RandomForestClassifier(random_state=42, n_jobs = -1)
classifier_nn = MLPClassifier(random_state=42)
#params determined via hyperparam tuning
n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]
max_depth = [x for x in range(10, 120, 10)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
tuned_rf = RandomizedSearchCV(estimator = classifier_rf, param_distributions = random_grid, random_state = 42, n_jobs = -1)
k = 0.35

In [137]:
%%time
test_accuracy_metrics(classifier_rf, spring_data, 3, use_ci = k) #VFR

Total predicted events: 581, number of events with prob above ci: 586
Total predicted events: 578, number of events with prob above ci: 584
Total predicted events: 584, number of events with prob above ci: 587
Total predicted events: 577, number of events with prob above ci: 580
Total predicted events: 581, number of events with prob above ci: 583
Total predicted events: 584, number of events with prob above ci: 586
CPU times: user 8.02 s, sys: 2.68 s, total: 10.7 s
Wall time: 3.51 s


Unnamed: 0,flight category 0,flight category 1,flight category 2,flight category 3,flight category 4,flight category 5
POD,0.999513,0.999346,0.998699,0.99984,0.99725,1.0
FAR,0.02224,0.020349,0.029249,0.013909,0.016746,0.020962
CSI,0.977294,0.979023,0.969525,0.985936,0.980595,0.979038


In [138]:
%%time
test_accuracy_metrics(classifier_rf, spring_data, 3) #VFR

CPU times: user 6.97 s, sys: 2.06 s, total: 9.03 s
Wall time: 2.6 s


Unnamed: 0,flight category 0,flight category 1,flight category 2,flight category 3,flight category 4,flight category 5
POD,0.994993,0.995928,0.996106,0.997442,0.993714,0.995961
FAR,0.023307,0.022229,0.029249,0.013909,0.016746,0.020962
CSI,0.971915,0.973877,0.967081,0.983604,0.977175,0.975166


In [139]:
%%time
test_accuracy_metrics(classifier_rf, winter_data, 3, use_ci = k) #VFR

Total predicted events: 515, number of events with prob above ci: 529
Total predicted events: 504, number of events with prob above ci: 524
Total predicted events: 510, number of events with prob above ci: 529
Total predicted events: 506, number of events with prob above ci: 526
Total predicted events: 514, number of events with prob above ci: 530
Total predicted events: 524, number of events with prob above ci: 540
CPU times: user 9.32 s, sys: 2.61 s, total: 11.9 s
Wall time: 3.62 s


Unnamed: 0,flight category 0,flight category 1,flight category 2,flight category 3,flight category 4,flight category 5
POD,0.987689,0.987674,0.988172,0.975541,0.989702,0.988204
FAR,0.059355,0.054482,0.050642,0.044388,0.064637,0.064516
CSI,0.929744,0.934491,0.938691,0.933251,0.926347,0.925153


In [140]:
%%time
test_accuracy_metrics(classifier_rf, winter_data, 3) #VFR

CPU times: user 8.39 s, sys: 2.11 s, total: 10.5 s
Wall time: 2.78 s


Unnamed: 0,flight category 0,flight category 1,flight category 2,flight category 3,flight category 4,flight category 5
POD,0.963473,0.955167,0.969965,0.963877,0.967742,0.976461
FAR,0.061876,0.055834,0.050642,0.044388,0.066932,0.065825
CSI,0.905904,0.9041,0.922246,0.922571,0.904923,0.913601


In [141]:
%%time
test_accuracy_metrics(classifier_rf, spring_data, 2, use_ci = k) #MVFR

Total predicted events: 29, number of events with prob above ci: 38
Total predicted events: 36, number of events with prob above ci: 46
Total predicted events: 25, number of events with prob above ci: 37
Total predicted events: 35, number of events with prob above ci: 46
Total predicted events: 31, number of events with prob above ci: 38
Total predicted events: 25, number of events with prob above ci: 39
CPU times: user 7.76 s, sys: 2.35 s, total: 10.1 s
Wall time: 3.17 s


Unnamed: 0,flight category 0,flight category 1,flight category 2,flight category 3,flight category 4,flight category 5
POD,0.825397,0.784848,0.732,0.901099,0.817427,0.772059
FAR,0.315789,0.314815,0.296154,0.286957,0.386293,0.198473
CSI,0.597701,0.576837,0.559633,0.66129,0.539726,0.648148


In [142]:
%%time
test_accuracy_metrics(classifier_rf, spring_data, 2) #MVFR

CPU times: user 7.36 s, sys: 2.11 s, total: 9.47 s
Wall time: 2.69 s


Unnamed: 0,flight category 0,flight category 1,flight category 2,flight category 3,flight category 4,flight category 5
POD,0.54026,0.625604,0.469231,0.634021,0.553371,0.526316
FAR,0.315789,0.314815,0.296154,0.286957,0.386293,0.198473
CSI,0.432432,0.485929,0.391863,0.505133,0.410417,0.465632


In [143]:
%%time
test_accuracy_metrics(classifier_rf, winter_data, 2, use_ci = k) #MVFR

Total predicted events: 88, number of events with prob above ci: 115
Total predicted events: 99, number of events with prob above ci: 113
Total predicted events: 90, number of events with prob above ci: 117
Total predicted events: 95, number of events with prob above ci: 121
Total predicted events: 86, number of events with prob above ci: 103
Total predicted events: 70, number of events with prob above ci: 88
CPU times: user 8.69 s, sys: 2.45 s, total: 11.1 s
Wall time: 3.28 s


Unnamed: 0,flight category 0,flight category 1,flight category 2,flight category 3,flight category 4,flight category 5
POD,0.778947,0.81913,0.82852,0.847122,0.82963,0.880597
FAR,0.315871,0.364372,0.334783,0.337553,0.292259,0.211832
CSI,0.572903,0.557396,0.584713,0.591709,0.617931,0.712069


In [144]:
%%time
test_accuracy_metrics(classifier_rf, winter_data, 2) #MVFR

CPU times: user 7.88 s, sys: 2.32 s, total: 10.2 s
Wall time: 2.68 s


Unnamed: 0,flight category 0,flight category 1,flight category 2,flight category 3,flight category 4,flight category 5
POD,0.61326,0.66948,0.652916,0.67525,0.609227,0.578512
FAR,0.315871,0.376147,0.334783,0.344444,0.303876,0.216418
CSI,0.477933,0.476954,0.491435,0.498416,0.481243,0.498812


In [145]:
%%time
test_accuracy_metrics(classifier_rf, spring_data, 1, use_ci = k) #IFR

Total predicted events: 41, number of events with prob above ci: 47
Total predicted events: 34, number of events with prob above ci: 41
Total predicted events: 43, number of events with prob above ci: 53
Total predicted events: 36, number of events with prob above ci: 44
Total predicted events: 37, number of events with prob above ci: 43
Total predicted events: 40, number of events with prob above ci: 46
CPU times: user 7.4 s, sys: 2.56 s, total: 9.96 s
Wall time: 3.13 s


Unnamed: 0,flight category 0,flight category 1,flight category 2,flight category 3,flight category 4,flight category 5
POD,0.915094,0.843077,0.908078,0.853186,0.867257,0.947853
FAR,0.324826,0.255435,0.281938,0.214286,0.259446,0.255422
CSI,0.635371,0.653938,0.669405,0.692135,0.665158,0.715278


In [146]:
%%time
test_accuracy_metrics(classifier_rf, spring_data, 1) #IFR

CPU times: user 7.35 s, sys: 1.99 s, total: 9.35 s
Wall time: 2.7 s


Unnamed: 0,flight category 0,flight category 1,flight category 2,flight category 3,flight category 4,flight category 5
POD,0.678322,0.625571,0.737557,0.733796,0.693364,0.804688
FAR,0.324826,0.255435,0.281938,0.209476,0.257353,0.274648
CSI,0.511424,0.515038,0.57193,0.614341,0.559041,0.616766


In [147]:
%%time
test_accuracy_metrics(classifier_rf, winter_data, 1, use_ci = k) #IFR

Total predicted events: 28, number of events with prob above ci: 33
Total predicted events: 26, number of events with prob above ci: 32
Total predicted events: 29, number of events with prob above ci: 33
Total predicted events: 30, number of events with prob above ci: 33
Total predicted events: 29, number of events with prob above ci: 35
Total predicted events: 36, number of events with prob above ci: 43
CPU times: user 8.59 s, sys: 2.59 s, total: 11.2 s
Wall time: 3.26 s


Unnamed: 0,flight category 0,flight category 1,flight category 2,flight category 3,flight category 4,flight category 5
POD,0.860825,0.699029,0.904523,0.951872,0.896552,0.923077
FAR,0.350195,0.283582,0.318182,0.290837,0.315789,0.360502
CSI,0.588028,0.547529,0.636042,0.684615,0.634146,0.607143


In [148]:
%%time
test_accuracy_metrics(classifier_rf, winter_data, 1) #IFR

CPU times: user 8.28 s, sys: 2.11 s, total: 10.4 s
Wall time: 2.77 s


Unnamed: 0,flight category 0,flight category 1,flight category 2,flight category 3,flight category 4,flight category 5
POD,0.577855,0.509091,0.532394,0.566879,0.608696,0.733813
FAR,0.374532,0.288136,0.315217,0.307393,0.3,0.381818
CSI,0.429306,0.422111,0.427602,0.452926,0.482759,0.50495


In [149]:
%%time
test_accuracy_metrics(classifier_rf, spring_data, 0, use_ci = k) #LIFR

Total predicted events: 10, number of events with prob above ci: 13
Total predicted events: 13, number of events with prob above ci: 15
Total predicted events: 9, number of events with prob above ci: 12
Total predicted events: 13, number of events with prob above ci: 13
Total predicted events: 12, number of events with prob above ci: 13
Total predicted events: 12, number of events with prob above ci: 14
CPU times: user 7.91 s, sys: 2.48 s, total: 10.4 s
Wall time: 3.28 s


Unnamed: 0,flight category 0,flight category 1,flight category 2,flight category 3,flight category 4,flight category 5
POD,0.836364,1.0,0.915789,0.957627,1.0,0.99187
FAR,0.192982,0.243056,0.13,0.162963,0.162963,0.089552
CSI,0.69697,0.756944,0.805556,0.807143,0.837037,0.903704


In [150]:
%%time
test_accuracy_metrics(classifier_rf, spring_data, 0) #LIFR

CPU times: user 6.94 s, sys: 2.23 s, total: 9.17 s
Wall time: 2.62 s


Unnamed: 0,flight category 0,flight category 1,flight category 2,flight category 3,flight category 4,flight category 5
POD,0.609272,0.717105,0.608392,0.807143,0.824818,0.835616
FAR,0.192982,0.243056,0.13,0.204225,0.162963,0.089552
CSI,0.531792,0.582888,0.557692,0.668639,0.710692,0.772152


In [151]:
%%time
test_accuracy_metrics(classifier_rf, winter_data, 0, use_ci = k) #LIFR

Total predicted events: 16, number of events with prob above ci: 18
Total predicted events: 18, number of events with prob above ci: 19
Total predicted events: 18, number of events with prob above ci: 17
Total predicted events: 16, number of events with prob above ci: 19
Total predicted events: 18, number of events with prob above ci: 21
Total predicted events: 17, number of events with prob above ci: 16
CPU times: user 9.2 s, sys: 2.54 s, total: 11.7 s
Wall time: 3.41 s


Unnamed: 0,flight category 0,flight category 1,flight category 2,flight category 3,flight category 4,flight category 5
POD,0.875,0.766667,0.959184,0.921875,0.829268,1.0
FAR,0.204545,0.192982,0.364865,0.233766,0.291667,0.326923
CSI,0.714286,0.647887,0.618421,0.719512,0.618182,0.673077


In [152]:
%%time
test_accuracy_metrics(classifier_rf, winter_data, 0) #LIFR

CPU times: user 8.09 s, sys: 2.12 s, total: 10.2 s
Wall time: 2.7 s


Unnamed: 0,flight category 0,flight category 1,flight category 2,flight category 3,flight category 4,flight category 5
POD,0.636364,0.630137,0.716216,0.786667,0.622951,0.425743
FAR,0.204545,0.333333,0.3375,0.233766,0.283019,0.328125
CSI,0.546875,0.479167,0.524752,0.634409,0.5,0.352459


In [153]:
%%time
(X, y_list, val_list) = data_split(winter_data)
X_train, X_test, y_train, y_test = train_test_split(X, y_list['flight category 0'], train_size=0.7, random_state=42)
classifier_rf.fit(X_train, y_train)

imports = classifier_rf.feature_importances_

CPU times: user 1.45 s, sys: 312 ms, total: 1.76 s
Wall time: 418 ms


In [157]:
classifier_rf.predict_proba(X_test)[:,1]>0.35

array([False, False, False, False, False, False, False, False,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
       False, False,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False,  True,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False,  True,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [None]:
importances = [(feature, sig) for feature, sig in zip(classifier_rf.feature_names_in_, imports)]
importances.sort(key = lambda x: x[1])
importances

In [None]:
%%time
result = permutation_importance(
    classifier_rf, X_test, y_test, random_state=42, n_jobs=-1
)
#takes about 10 min

In [None]:
importances = [(feature, sig) for feature, sig in zip(classifier_rf.feature_names_in_, result['importances_mean'])]
#importances.sort(key = lambda x: x[0].split('_')[1])
importances.sort(key = lambda x: -x[1])
importances = pd.DataFrame(importances)

In [None]:
np.max(importances[1]), np.min(importances[1]), np.mean(importances[1]), np.std(importances[1])

In [None]:
plt.plot(importances[1])

In [None]:
with pd.option_context('display.max_rows', 999):
    print(importances)

In [None]:
data

In [None]:
condition_list = []
asos5 = read_metar('Data/BOS_5min.csv')
for _, metar_at_time in tqdm(asos5.iterrows()):
    vis = metar_at_time['vsby']
    cld_list = np.asarray(metar_at_time[['skyc1', 'skyc2', 'skyc3', 'skyc4']])
    hgt_list = np.asarray(metar_at_time[['skyl1', 'skyl2', 'skyl3', 'skyl4']])
    ovc_hgt = 100000
    bkn_hgt = 100000

    if 3 in list(cld_list):
        ovc_hgt = hgt_list[cld_list == 3]
        if len(ovc_hgt) > 1:
            ovc_hgt = np.min(ovc_hgt)
    if 2 in list(cld_list):
        bkn_hgt = hgt_list[cld_list == 2]
        if len(bkn_hgt) > 1:
            bkn_hgt = np.min(bkn_hgt)
    ceiling = np.min([ovc_hgt, bkn_hgt])

    if ceiling < 500 or vis < 1:
        conditions = 0
    elif ceiling < 1000 or vis < 3:
        conditions = 1
    elif ceiling < 3000 or vis < 5:
        conditions = 2
    else:
        conditions = 3
    condition_list.append(conditions)
asos5['conditions'] = condition_list

In [None]:
asos5.to_csv('Data/BOS_5min.csv')