In [1]:
import pandas as pd
from metar import Metar
import numpy as np
from IOfuncs import *
import datetime as dt
import warnings
from json import JSONDecodeError
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFdr,SelectFpr,f_regression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.inspection import permutation_importance
from sklearn.neural_network import MLPClassifier
warnings.filterwarnings('ignore')

In [2]:
def make_ml_data_row(taf_time, station, lat, lon, metar_path, glamp_path, hrrr_path, delay_hours = 2):
    if isinstance(metar_path, str):
        metar_path = read_metar(metar_path)
    metarDF = pd.DataFrame()
    glampDF = pd.DataFrame()
    hrrrDF = pd.DataFrame()    
    
    for time in range(-6, -delay_hours, 1):
        metar_at_time = get_metar_at_time(taf_time + dt.timedelta(hours = time), metar_path).T
        metarDF[f'metar {time}'] = metar_at_time
    
    work_time = dt.timedelta(hours=-delay_hours)
    glamp_data = get_glamp_at_time(taf_time + work_time, glamp_path, station, download=True)
    hrrr_data = get_hrrr_at_time(taf_time + work_time, hrrr_path, lat, lon, download=True)
    glamp_synoptic_offset = (taf_time.hour - delay_hours) % 6 - 1
    for time in range(-delay_hours, 7, 1):
        glampDF[f'glamp {time}'] = glamp_data.iloc[time + delay_hours + glamp_synoptic_offset]
        hrrrDF[f'hrrr {time}'] = hrrr_data.iloc[time + delay_hours]    
        
    
    df = pd.concat([metarDF, glampDF, hrrrDF])
    df.drop(['ftime', 'ftime_utc', 'model', 'runtime', 'runtime_utc', 'station', 'metar', 'peak_wind_time', 'valid', 'Unnamed: 0'], inplace=True)

    v = df.unstack().to_frame().sort_index(level=1).T
    v.columns = v.columns.map('_'.join)

    final = v.dropna(axis = 1)
    
    return final

In [3]:
taf_time = dt.datetime(year = 2021, month = 8, day = 21, hour = 18, minute = 0)

In [4]:
def make_ml_training_data_row(taf_time, station, lat, lon, metar_path, glamp_path, hrrr_path, asos5_path, delay_hours = 2, tplus_hours = 6):
    if isinstance(metar_path, str):
        metar_path = full_metar_list = read_metar(metar_path)
    if isinstance(asos5_path, str):
        asos5_path = read_metar(asos5_path)
    
    df = make_ml_data_row(taf_time, station, lat, lon, metar_path, glamp_path, hrrr_path, delay_hours = delay_hours)
    
    for i in range(tplus_hours):
        df[f'flight category {i}'] = get_conditions_from_asos(taf_time + dt.timedelta(hours = i), metar_path)
        time_series = pd.date_range(taf_time + dt.timedelta(hours = i), taf_time + dt.timedelta(hours = i+1), freq = '5T')
        verification_series = np.asarray(asos5_path.truncate(before = taf_time + dt.timedelta(hours = i), 
                                                             after = taf_time + dt.timedelta(hours = i+1, minutes = -1))['conditions'])
        df[f'verification list {i}'] = [None]
        df[f'verification list {i}'][0] = verification_series
        

    return df

In [5]:
metar = read_metar('Data/BOS.csv')
asos5 = read_metar('Data/BOS_5min.csv')

In [6]:
%%time
make_ml_training_data_row(taf_time, 'kbos', 42.3656, -71.0096, metar, 'Data/GLAMP data/', 'Data/hrrr/', asos5)

CPU times: user 92.3 ms, sys: 113 µs, total: 92.4 ms
Wall time: 135 ms


Unnamed: 0,hrrr -1_DPT_1000mb,hrrr -2_DPT_1000mb,hrrr 0_DPT_1000mb,hrrr 1_DPT_1000mb,hrrr 2_DPT_1000mb,hrrr 3_DPT_1000mb,hrrr 4_DPT_1000mb,hrrr 5_DPT_1000mb,hrrr 6_DPT_1000mb,hrrr -1_DPT_2m_above_ground,...,flight category 1,verification list 1,flight category 2,verification list 2,flight category 3,verification list 3,flight category 4,verification list 4,flight category 5,verification list 5
0,292.8,293.5,292.2,292.0,291.8,294.0,293.5,292.2,293.2,293.2,...,3,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]",3,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3]",3,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]",3,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3]",3,"[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]"


In [7]:
def make_ml_training_data_set(timelist, station, lat, lon, metar_path, glamp_path, hrrr_path, asos5_path, delay_hours = 2, frequency = '5H'):
    training_df = pd.DataFrame()
    time_series = pd.Series()
    for timepair in timelist:
        start_time = timepair[0]
        end_time = timepair[1]
        time_series = pd.concat([time_series, pd.Series(pd.date_range(start_time, end_time, freq = frequency))])
    if isinstance(metar_path, str):
        metar_path = read_metar(metar_path)
    if isinstance(asos5_path, str):
        asos5_path = read_metar(asos5_path)
    for time in tqdm(time_series):
        try:
            training_row = make_ml_training_data_row(time, station, lat, lon, metar_path, glamp_path, hrrr_path, asos5_path, delay_hours = delay_hours)
            training_df = training_df.append(training_row)
        except (FileNotFoundError, JSONDecodeError):
            continue

    training_df = training_df.fillna(-99999)
    return training_df

In [8]:
def prob_of_detection(predicted_results, actual_results, flight_cat):
    result_locations = np.where(actual_results==flight_cat)[0]
    num_predict = np.sum(predicted_results[result_locations]==flight_cat)
    return num_predict / len(result_locations)

def false_alarm_rate(predicted_results, actual_results, flight_cat):
    num_predict = np.sum(predicted_results==flight_cat)
    predict_locations = np.where(predicted_results==flight_cat)[0]
    predict_subset = predicted_results[predict_locations]
    actual_subset = actual_results[predict_locations]
    
    false_alarm_count = np.sum(predict_subset!=actual_subset)
    
    return false_alarm_count/num_predict

def critical_success_index(predicted_results, actual_results, flight_cat):
    num_predict = np.sum(predicted_results==flight_cat)
    predict_locations = np.where(predicted_results==flight_cat)[0]
    non_predict_locations = np.where(predicted_results!=flight_cat)[0]
    predict_subset = predicted_results[predict_locations]
    actual_subset = actual_results[predict_locations]
    actual_subset_compliment = actual_results[non_predict_locations]
    
    hits = np.sum(predict_subset==actual_subset)
    false_alarm_count = np.sum(predict_subset!=actual_subset)
    misses = np.sum(actual_subset_compliment==flight_cat)
    
    return hits / (hits + false_alarm_count + misses)

In [9]:
def data_split(data):
    y_keys = np.asarray([key for key in data if 'flight category' in key])
    val_keys = np.asarray([key for key in data if 'verification list' in key])
    X = data.drop(np.concatenate([y_keys, val_keys]), axis=1)
    y_list = data[y_keys]
    val_list = data[val_keys]
    
    return X, y_list, val_list


In [10]:
taf_time = dt.datetime(year = 2021, month = 8, day = 21, hour = 18, minute = 0)
asos5 = read_metar('Data/BOS_5min.csv')

In [11]:
def glamp_predict(taf_time, station, glamp_path, asos5_path, delay_hours = 2, t_plus_max = 6):
    if isinstance(asos5_path, str):
        asos5_path = read_metar(asos5_path)
    work_time = dt.timedelta(hours=-delay_hours)
    glamp_data = get_glamp_at_time(taf_time + work_time, glamp_path, station, download=True)
    glamp_data['ftime'] = pd.to_datetime(glamp_data['ftime'])
    df = pd.DataFrame()
    verifications = []
    for time in range(t_plus_max):
        index = np.where(glamp_data['ftime']==(taf_time + dt.timedelta(hours=time)))[0][0]
        cei, vis = glamp_data.iloc[index]['cig'], glamp_data.iloc[index]['vis']
        if cei == 1 or cei == 2 or vis == 1 or vis == 2:
            prediction = 0
        elif cei == 3 or vis == 3 or vis == 4:
            prediction = 1
        elif cei == 4 or cei == 5 or vis == 5:
            prediction = 2
        else:
            prediction = 3
        df[f't plus {time} prediction'] = [prediction]
        verification_series = np.asarray(asos5_path.truncate(before = taf_time + dt.timedelta(hours = time), 
                                                             after = taf_time + dt.timedelta(hours = time+1, minutes = -1))['conditions'])
        df[f't plus {time} verification'] = [None]
        df[f't plus {time} verification'][0] = verification_series
    
    return df

In [12]:
def glamp_time_series(timelist, station, glamp_path, asos5_path, delay_hours=2, t_plus_max=6, frequency = 'H'):
    result = pd.DataFrame()
    time_series = pd.Series()
    for timepair in timelist:
        start_time = timepair[0]
        end_time = timepair[1]
        time_series = pd.concat([time_series, pd.Series(pd.date_range(start_time, end_time, freq = frequency))])
    for time in tqdm(time_series):
        try:
            result = result.append(glamp_predict(time, station, glamp_path, asos5_path, delay_hours = delay_hours, t_plus_max = t_plus_max))
        except FileNotFoundError:
            continue
    return result

In [13]:
start_date = dt.datetime(year = 2020, month = 8, day = 21, hour = 0, minute = 0)
end_date = dt.datetime.now()
data = make_ml_training_data_set([(start_date, end_date)], 'kbos', 42.3656, -71.0096, 'Data/BOS.csv', 'Data/GLAMP data/', 
                                 'Data/hrrr/', 'Data/BOS_5min.csv', frequency = 'H')

  0%|                                                                              | 27/18660 [00:10<1:50:45,  2.80it/s]

<xarray.Dataset>
Dimensions:     ()
Coordinates:
    x           float64 2.15e+06
    y           float64 7.407e+05
Data variables:
    chunk_id    object ...
    chunk_x     int32 ...
    chunk_y     int32 ...
    in_chunk_x  int32 ...
    in_chunk_y  int32 ...
    index_x     int32 ...
    index_y     int32 ...
    latitude    float64 ...
    longitude   float64 ...


  0%|                                                                              | 27/18660 [00:13<2:31:39,  2.05it/s]


ValueError: Cannot set a frame with no defined index and a value that cannot be converted to a Series

In [None]:
start_date_winter_1 = dt.datetime(year = 2020, month = 12, day = 1, hour = 0, minute = 0)
end_date_winter_1 = dt.datetime(year = 2021, month = 2, day = 28, hour = 23, minute = 0)
start_date_winter_2 = dt.datetime(year = 2019, month = 12, day = 1, hour = 0, minute = 0)
end_date_winter_2 = dt.datetime(year = 2020, month = 2, day = 28, hour = 23, minute = 0)
winter_data = make_ml_training_data_set([(start_date_winter_1, end_date_winter_1), (start_date_winter_2, end_date_winter_2)], 
                                        'kbos', 42.3656, -71.0096, 'Data/BOS.csv',
                                        'Data/GLAMP data/', 'Data/hrrr/', 'Data/BOS_5min.csv', frequency = 'H')

In [None]:
start_date_spring_1 = dt.datetime(year = 2021, month = 3, day = 1, hour = 0, minute = 0)
end_date_spring_1 = dt.datetime(year = 2021, month = 5, day = 31, hour = 23, minute = 0)
start_date_spring_2 = dt.datetime(year = 2020, month = 3, day = 1, hour = 0, minute = 0)
end_date_spring_2 = dt.datetime(year = 2020, month = 5, day = 31, hour = 23, minute = 0)
spring_data = make_ml_training_data_set([(start_date_spring_1, end_date_spring_1), (start_date_spring_2, end_date_spring_2)], 
                                         'kbos', 42.3656, -71.0096, 'Data/BOS.csv', 
                                        'Data/GLAMP data/', 'Data/hrrr/', 'Data/BOS_5min.csv', frequency = 'H')

In [None]:
start_date_summer_1 = dt.datetime(year = 2021, month = 6, day = 1, hour = 0, minute = 0)
end_date_summer_1 = dt.datetime(year = 2021, month = 8, day = 31, hour = 23, minute = 0)
start_date_summer_2 = dt.datetime(year = 2020, month = 6, day = 1, hour = 0, minute = 0)
end_date_summer_2 = dt.datetime(year = 2020, month = 8, day = 31, hour = 23, minute = 0)
summer_data = make_ml_training_data_set([(start_date_summer_1, end_date_summer_1), (start_date_summer_2, end_date_summer_2)], 
                                         'kbos', 42.3656, -71.0096, 'Data/BOS.csv', 
                                        'Data/GLAMP data/', 'Data/hrrr/', 'Data/BOS_5min.csv', frequency = 'H')

In [None]:
start_date_fall_1 = dt.datetime(year = 2021, month = 9, day = 1, hour = 0, minute = 0)
end_date_fall_1 = dt.datetime(year = 2021, month = 11, day = 30, hour = 23, minute = 0)
start_date_fall_2 = dt.datetime(year = 2020, month = 9, day = 1, hour = 0, minute = 0)
end_date_fall_2 = dt.datetime(year = 2020, month = 11, day = 30, hour = 23, minute = 0)
fall_data = make_ml_training_data_set([(start_date_fall_1, end_date_fall_1), (start_date_fall_2, end_date_fall_2)], 
                                         'kbos', 42.3656, -71.0096, 'Data/BOS.csv', 
                                        'Data/GLAMP data/', 'Data/hrrr/', 'Data/BOS_5min.csv', frequency = 'H')

In [None]:
#The ci interval currently isn't implemented correctly
def test_accuracy_metrics(classifier_rf, training_data, flight_cat, use_ci=None, compare=None):
    result_df = pd.DataFrame()
    (X, y_list, val_list) = data_split(training_data)
    for y, val in zip(y_list,val_list):
        X_train, X_test, y_train, y_test, _, val_test = train_test_split(X, y_list[y], val_list[val], train_size=0.7, random_state=42)
        classifier_rf.fit(X_train, y_train)
        predictions = classifier_rf.predict(X_test)
        if use_ci is not None: #keep only ones where prob is above CI
            predict_probs_mask = classifier_rf.predict_proba(X_test)[:,flight_cat] >= use_ci
            for i,prediction in enumerate(predictions):
                if prediction == flight_cat and not predict_probs_mask[i]:
                    predictions[i] = -1
        prob = np.repeat(predictions, [len(verifications) for verifications in val_test])
        results = np.concatenate(np.asarray(val_test))
        if not compare is None:
            result_df[y] = (prob_of_detection(prob, results, flight_cat), 
                            false_alarm_rate(prob, results, flight_cat), 
                            critical_success_index(prob, results, flight_cat))
        else:
            result_df[y] = (prob_of_detection(prob, results, flight_cat) >= compare[0], 
                            false_alarm_rate(prob, results, flight_cat) <= compare[1], 
                            critical_success_index(prob, results, flight_cat) >= compare[2])            
        result_df.rename({0: 'POD', 1: 'FAR', 2: 'CSI'}, inplace=True)
    return result_df

In [None]:
def gfs_accuracy_metrics(gfs_data, flight_cat, t_plus_max=6):
    result_df = pd.DataFrame()
    for time in range(t_plus_max):
        prob = np.repeat(np.asarray(gfs_data[f't plus {time} prediction']), 
                                [len(verifications) for verifications in gfs_data[f't plus {time} verification']])
        results = np.concatenate(np.asarray(gfs_data[f't plus {time} verification']))
        result_df[f't plus {time} data'] = (prob_of_detection(prob, results, flight_cat), 
                            false_alarm_rate(prob, results, flight_cat), 
                            critical_success_index(prob, results, flight_cat))
    return result_df

In [None]:
def overall_comparison(classifier_rf, training_data, compare):
    flight_cats = [0, 1, 2, 3]
    total_hits = 0
    total_tests = 0
    for cat in flight_cats:
        result = test_accuracy_metrics(classifier_rf, training_data, cat, compare=compare)
        total_hits += result.to_numpy().sum()
        total_tests += len(result) * len(result.T)
    return total_hits, total_tests

In [None]:
classifier_rf = RandomForestClassifier(random_state=42, n_jobs = -1, class_weight = "balanced")
classifier_nn = MLPClassifier(random_state=42)
#params determined via hyperparam tuning
n_estimators = [int(x) for x in np.linspace(200, 2000, 10)]
max_depth = [x for x in range(10, 120, 10)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
tuned_rf = RandomizedSearchCV(estimator = classifier_rf, param_distributions = random_grid, random_state = 42, n_jobs = -1)
k = 0.5
gipra_goal = [0.65, 0.38, 0.47]
station_goal = [0.75, 0.25, 0.6]

In [344]:
gfs_winter = glamp_time_series([(start_date_winter_1, end_date_winter_1)], 
                  'kbos', 'Data/GLAMP data/', asos5)

100%|███████████████████████████████████████████████████████████████████████████████| 2160/2160 [01:05<00:00, 32.89it/s]


In [31]:
x = np.arange(-1, 2)
y = np.arange(-1, 2)
x_g, y_g = np.meshgrid(x,y)
x_g.flatten(), y_g.flatten()

(array([-1,  0,  1, -1,  0,  1, -1,  0,  1]),
 array([-1, -1, -1,  0,  0,  0,  1,  1,  1]))

In [356]:
gfs_accuracy_metrics(gfs_winter, 0)

Unnamed: 0,t plus 0 data,t plus 1 data,t plus 2 data,t plus 3 data,t plus 4 data,t plus 5 data
0,0.675497,0.645695,0.652318,0.701987,0.791391,0.798013
1,0.58871,0.664948,0.70597,0.725389,0.720468,0.741139
2,0.343434,0.283019,0.254194,0.24594,0.260349,0.242944


In [352]:
%%time
test_accuracy_metrics(classifier_rf, winter_data, 0)

CPU times: user 7.25 s, sys: 2.01 s, total: 9.27 s
Wall time: 2.6 s


Unnamed: 0,flight category 0,flight category 1,flight category 2,flight category 3,flight category 4,flight category 5
POD,0.818182,0.657534,0.594595,0.786667,0.565574,0.524752
FAR,0.318182,0.142857,0.352941,0.233766,0.378378,0.231884
CSI,0.592105,0.592593,0.44898,0.634409,0.420732,0.452991


In [126]:
%%time
test_accuracy_metrics(classifier_rf, spring_data, 3) #VFR

CPU times: user 11.8 s, sys: 3.1 s, total: 14.9 s
Wall time: 5.77 s


Unnamed: 0,flight category 0,flight category 1,flight category 2,flight category 3,flight category 4,flight category 5
POD,0.993539,0.998208,0.997404,0.995524,0.997582,0.995153
FAR,0.02272,0.026684,0.032421,0.012214,0.019642,0.015345
CSI,0.971108,0.971619,0.965149,0.983418,0.978034,0.979955


In [127]:
%%time
test_accuracy_metrics(classifier_rf, summer_data, 3) #VFR

CPU times: user 13.1 s, sys: 3.36 s, total: 16.4 s
Wall time: 5.06 s


Unnamed: 0,flight category 0,flight category 1,flight category 2,flight category 3,flight category 4,flight category 5
POD,0.982595,0.982088,0.981535,0.984618,0.976833,0.981244
FAR,0.058237,0.048189,0.060061,0.069622,0.070081,0.06168
CSI,0.926311,0.935569,0.923607,0.917049,0.909852,0.921788


In [128]:
%%time
test_accuracy_metrics(classifier_rf, fall_data, 3) #VFR

CPU times: user 24.4 s, sys: 3.22 s, total: 27.6 s
Wall time: 7.44 s


Unnamed: 0,flight category 0,flight category 1,flight category 2,flight category 3,flight category 4,flight category 5
POD,0.979034,0.988166,0.979395,0.98188,0.985417,0.982226
FAR,0.052926,0.054789,0.054579,0.056719,0.055619,0.056197
CSI,0.928247,0.934631,0.926983,0.927141,0.931364,0.927955


In [129]:
%%time
test_accuracy_metrics(classifier_rf, winter_data, 2) #MVFR

CPU times: user 12.8 s, sys: 3.33 s, total: 16.1 s
Wall time: 5.54 s


Unnamed: 0,flight category 0,flight category 1,flight category 2,flight category 3,flight category 4,flight category 5
POD,0.618785,0.668073,0.715505,0.763948,0.620081,0.578512
FAR,0.302181,0.314574,0.335535,0.306494,0.291473,0.240506
CSI,0.488017,0.511302,0.525601,0.571123,0.494054,0.488941


In [130]:
%%time
test_accuracy_metrics(classifier_rf, spring_data, 2) #MVFR

CPU times: user 10.2 s, sys: 3.21 s, total: 13.4 s
Wall time: 4.53 s


Unnamed: 0,flight category 0,flight category 1,flight category 2,flight category 3,flight category 4,flight category 5
POD,0.553247,0.516908,0.389744,0.662371,0.497191,0.546366
FAR,0.270548,0.298361,0.371901,0.261494,0.332075,0.212996
CSI,0.459052,0.423762,0.316667,0.536534,0.398649,0.475983


In [131]:
%%time
test_accuracy_metrics(classifier_rf, summer_data, 2) #MVFR

CPU times: user 12.9 s, sys: 3.1 s, total: 16 s
Wall time: 4.93 s


Unnamed: 0,flight category 0,flight category 1,flight category 2,flight category 3,flight category 4,flight category 5
POD,0.444844,0.473411,0.40621,0.426846,0.402473,0.428005
FAR,0.235052,0.272908,0.32618,0.326271,0.379237,0.234043
CSI,0.39135,0.401982,0.339459,0.353726,0.323043,0.378505


In [132]:
%%time
test_accuracy_metrics(classifier_rf, fall_data, 2) #MVFR

CPU times: user 21.4 s, sys: 2.95 s, total: 24.3 s
Wall time: 6.04 s


Unnamed: 0,flight category 0,flight category 1,flight category 2,flight category 3,flight category 4,flight category 5
POD,0.560032,0.539732,0.589604,0.582888,0.593727,0.541387
FAR,0.257479,0.132743,0.27619,0.235471,0.162551,0.234984
CSI,0.468961,0.498547,0.481317,0.494171,0.532374,0.464194


In [133]:
%%time
test_accuracy_metrics(classifier_rf, winter_data, 1) #IFR

CPU times: user 11.8 s, sys: 3.02 s, total: 14.9 s
Wall time: 4.94 s


Unnamed: 0,flight category 0,flight category 1,flight category 2,flight category 3,flight category 4,flight category 5
POD,0.647059,0.572727,0.509859,0.56051,0.550725,0.708633
FAR,0.277992,0.297398,0.334559,0.22467,0.312217,0.374603
CSI,0.518006,0.460976,0.40583,0.482192,0.44058,0.497475


In [134]:
%%time
test_accuracy_metrics(classifier_rf, spring_data, 1) #IFR

CPU times: user 10.9 s, sys: 3 s, total: 13.9 s
Wall time: 5.27 s


Unnamed: 0,flight category 0,flight category 1,flight category 2,flight category 3,flight category 4,flight category 5
POD,0.724942,0.643836,0.723982,0.798611,0.741419,0.8125
FAR,0.29955,0.271318,0.284116,0.188235,0.230404,0.330472
CSI,0.553381,0.519337,0.56239,0.673828,0.606742,0.579926


In [135]:
%%time
test_accuracy_metrics(classifier_rf, summer_data, 1) #IFR

CPU times: user 13.1 s, sys: 3.07 s, total: 16.2 s
Wall time: 5.31 s


Unnamed: 0,flight category 0,flight category 1,flight category 2,flight category 3,flight category 4,flight category 5
POD,0.727273,0.676596,0.614141,0.471311,0.627083,0.63286
FAR,0.442202,0.396584,0.353191,0.40874,0.359574,0.427523
CSI,0.461305,0.468336,0.459909,0.355487,0.46379,0.429752


In [136]:
%%time
test_accuracy_metrics(classifier_rf, fall_data, 1) #IFR

CPU times: user 21.1 s, sys: 3.01 s, total: 24.1 s
Wall time: 5.94 s


Unnamed: 0,flight category 0,flight category 1,flight category 2,flight category 3,flight category 4,flight category 5
POD,0.63249,0.687671,0.540501,0.534743,0.656897,0.625874
FAR,0.35724,0.324361,0.319109,0.410982,0.422727,0.448382
CSI,0.46798,0.516993,0.431257,0.389439,0.443539,0.414832


In [137]:
%%time
test_accuracy_metrics(classifier_rf, winter_data, 0) #LIFR

CPU times: user 12.2 s, sys: 3.04 s, total: 15.2 s
Wall time: 5.1 s


Unnamed: 0,flight category 0,flight category 1,flight category 2,flight category 3,flight category 4,flight category 5
POD,0.818182,0.657534,0.594595,0.786667,0.565574,0.524752
FAR,0.318182,0.142857,0.352941,0.233766,0.378378,0.231884
CSI,0.592105,0.592593,0.44898,0.634409,0.420732,0.452991


In [138]:
%%time
test_accuracy_metrics(classifier_rf, spring_data, 0) #LIFR

CPU times: user 10.7 s, sys: 3.04 s, total: 13.7 s
Wall time: 4.94 s


Unnamed: 0,flight category 0,flight category 1,flight category 2,flight category 3,flight category 4,flight category 5
POD,0.609272,0.717105,0.608392,0.807143,0.824818,0.760274
FAR,0.269841,0.296774,0.09375,0.181159,0.162963,0.075
CSI,0.497297,0.550505,0.572368,0.684848,0.710692,0.716129


In [139]:
%%time
test_accuracy_metrics(classifier_rf, summer_data, 0) #LIFR

CPU times: user 13.3 s, sys: 3.07 s, total: 16.4 s
Wall time: 5.47 s


Unnamed: 0,flight category 0,flight category 1,flight category 2,flight category 3,flight category 4,flight category 5
POD,0.406699,0.508876,0.689655,0.502857,0.552632,0.605556
FAR,0.440789,0.508571,0.465241,0.47619,0.16,0.32716
CSI,0.307971,0.333333,0.431034,0.345098,0.5,0.467811


In [140]:
%%time
test_accuracy_metrics(classifier_rf, fall_data, 0) #LIFR

CPU times: user 21.3 s, sys: 2.97 s, total: 24.3 s
Wall time: 6.04 s


Unnamed: 0,flight category 0,flight category 1,flight category 2,flight category 3,flight category 4,flight category 5
POD,0.648649,0.758278,0.726937,0.655172,0.446203,0.482866
FAR,0.307692,0.266026,0.318339,0.243028,0.284264,0.314159
CSI,0.503497,0.594805,0.5427,0.541311,0.379032,0.395408


In [144]:
%%time
overall_comparison(classifier_rf, winter_data, gipra_goal)

CPU times: user 45.5 s, sys: 12.6 s, total: 58 s
Wall time: 18.5 s


(55, 72)

In [145]:
%%time
overall_comparison(classifier_rf, spring_data, gipra_goal)

CPU times: user 43.3 s, sys: 12.3 s, total: 55.5 s
Wall time: 19.1 s


(60, 72)

In [146]:
%%time
overall_comparison(classifier_rf, summer_data, gipra_goal)

CPU times: user 49.5 s, sys: 12.6 s, total: 1min 2s
Wall time: 19 s


(32, 72)

In [147]:
%%time
overall_comparison(classifier_rf, fall_data, gipra_goal)

CPU times: user 1min 27s, sys: 11.8 s, total: 1min 39s
Wall time: 25.7 s


(47, 72)

In [150]:
%%time
overall_comparison(classifier_rf, winter_data, station_goal)

CPU times: user 45.6 s, sys: 12.6 s, total: 58.2 s
Wall time: 18.3 s


(27, 72)

In [151]:
%%time
overall_comparison(classifier_rf, spring_data, station_goal)

CPU times: user 40.7 s, sys: 12.1 s, total: 52.8 s
Wall time: 17.7 s


(35, 72)

In [152]:
%%time
overall_comparison(classifier_rf, summer_data, station_goal)

CPU times: user 50.8 s, sys: 12.4 s, total: 1min 3s
Wall time: 20.1 s


(21, 72)

In [153]:
%%time
overall_comparison(classifier_rf, fall_data, station_goal)

CPU times: user 1min 26s, sys: 11.7 s, total: 1min 37s
Wall time: 24.5 s


(24, 72)

In [141]:
%%time
(X, y_list, val_list) = data_split(winter_data)
X_train, X_test, y_train, y_test = train_test_split(X, y_list['flight category 0'], train_size=0.7, random_state=42)
classifier_rf.fit(X_train, y_train)

imports = classifier_rf.feature_importances_

CPU times: user 1.86 s, sys: 508 ms, total: 2.37 s
Wall time: 743 ms


In [34]:
classifier_rf.predict_proba(X_test)[classifier_rf.predict_proba(X_test)[:,1]>0.5]

array([[0.05, 0.62, 0.17, 0.16],
       [0.01, 0.96, 0.03, 0.  ],
       [0.04, 0.53, 0.22, 0.21],
       [0.02, 0.76, 0.14, 0.08],
       [0.02, 0.51, 0.45, 0.02],
       [0.03, 0.54, 0.28, 0.15],
       [0.03, 0.61, 0.18, 0.18],
       [0.05, 0.68, 0.16, 0.11],
       [0.01, 0.79, 0.1 , 0.1 ],
       [0.05, 0.67, 0.07, 0.21],
       [0.32, 0.52, 0.08, 0.08],
       [0.34, 0.58, 0.06, 0.02],
       [0.11, 0.74, 0.1 , 0.05],
       [0.41, 0.51, 0.05, 0.03],
       [0.02, 0.76, 0.13, 0.09],
       [0.03, 0.67, 0.28, 0.02],
       [0.02, 0.78, 0.11, 0.09],
       [0.31, 0.59, 0.05, 0.05]])

In [None]:
importances = [(feature, sig) for feature, sig in zip(classifier_rf.feature_names_in_, imports)]
importances.sort(key = lambda x: x[1])
importances

In [None]:
%%time
result = permutation_importance(
    classifier_rf, X_test, y_test, random_state=42, n_jobs=-1
)
#takes about 10 min

In [None]:
importances = [(feature, sig) for feature, sig in zip(classifier_rf.feature_names_in_, result['importances_mean'])]
#importances.sort(key = lambda x: x[0].split('_')[1])
importances.sort(key = lambda x: -x[1])
importances = pd.DataFrame(importances)

In [None]:
np.max(importances[1]), np.min(importances[1]), np.mean(importances[1]), np.std(importances[1])

In [None]:
plt.plot(importances[1])

In [None]:
with pd.option_context('display.max_rows', 999):
    print(importances)

In [None]:
data

In [None]:
condition_list = []
asos5 = read_metar('Data/BOS_5min.csv')
for _, metar_at_time in tqdm(asos5.iterrows()):
    vis = metar_at_time['vsby']
    cld_list = np.asarray(metar_at_time[['skyc1', 'skyc2', 'skyc3', 'skyc4']])
    hgt_list = np.asarray(metar_at_time[['skyl1', 'skyl2', 'skyl3', 'skyl4']])
    ovc_hgt = 100000
    bkn_hgt = 100000

    if 3 in list(cld_list):
        ovc_hgt = hgt_list[cld_list == 3]
        if len(ovc_hgt) > 1:
            ovc_hgt = np.min(ovc_hgt)
    if 2 in list(cld_list):
        bkn_hgt = hgt_list[cld_list == 2]
        if len(bkn_hgt) > 1:
            bkn_hgt = np.min(bkn_hgt)
    ceiling = np.min([ovc_hgt, bkn_hgt])

    if ceiling < 500 or vis < 1:
        conditions = 0
    elif ceiling < 1000 or vis < 3:
        conditions = 1
    elif ceiling < 3000 or vis < 5:
        conditions = 2
    else:
        conditions = 3
    condition_list.append(conditions)
asos5['conditions'] = condition_list

In [None]:
asos5.to_csv('Data/BOS_5min.csv')