In [1]:
import pandas as pd
from metar import Metar
import numpy as np
from IOfuncs import *
import datetime as dt
import warnings
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
warnings.filterwarnings('ignore')

In [2]:
def make_ml_data_row(taf_time, station, lat, lon, metar_path, glamp_path, hrrr_path, delay_hours = 2):
    if isinstance(metar_path, str):
        metar_path = read_metar(metar_path)
    metarDF = pd.DataFrame()
    glampDF = pd.DataFrame()
    hrrrDF = pd.DataFrame()    
    
    #right now hardcoded to look at metars for 2 hours before TAF due and then predictions after
    for time in range(-6, -delay_hours, 1):
        metar_at_time = get_metar_at_time(taf_time + dt.timedelta(hours = time), metar_path).T
        metarDF[f'metar {time}'] = metar_at_time
    
    work_time = dt.timedelta(hours=-delay_hours)
    glamp_data = get_glamp_at_time(taf_time + work_time, glamp_path, station, download=True)
    hrrr_data = get_hrrr_at_time(taf_time + work_time, hrrr_path, lat, lon, download=True)
    glamp_synoptic_offset = (taf_time.hour - delay_hours) % 6 - 1
    for time in range(-delay_hours, 7, 1):
        glampDF[f'glamp {time}'] = glamp_data.iloc[time + delay_hours + glamp_synoptic_offset]
        hrrrDF[f'hrrr {time}'] = hrrr_data.iloc[time + delay_hours]    
        
    
    df = pd.concat([metarDF, glampDF, hrrrDF])
    df.drop(['ftime', 'ftime_utc', 'model', 'runtime', 'runtime_utc', 'station', 'metar', 'peak_wind_time', 'valid', 'Unnamed: 0'], inplace=True)

    v = df.unstack().to_frame().sort_index(level=1).T
    v.columns = v.columns.map('_'.join)

    final = v.dropna(axis = 1)
    
    return final

In [3]:
taf_time = dt.datetime(year = 2021, month = 8, day = 21, hour = 18, minute = 16)

In [4]:
make_ml_data_row(taf_time, 'kbos', 42.3656, -71.0096, 'Data/BOS.csv', 'Data/GLAMP data/', 'Data/hrrr/')

Unnamed: 0,hrrr -1_DPT_1000mb,hrrr -2_DPT_1000mb,hrrr 0_DPT_1000mb,hrrr 1_DPT_1000mb,hrrr 2_DPT_1000mb,hrrr 3_DPT_1000mb,hrrr 4_DPT_1000mb,hrrr 5_DPT_1000mb,hrrr 6_DPT_1000mb,hrrr -1_DPT_2m_above_ground,...,glamp 6_wdr,glamp -1_wsp,glamp -2_wsp,glamp 0_wsp,glamp 1_wsp,glamp 2_wsp,glamp 3_wsp,glamp 4_wsp,glamp 5_wsp,glamp 6_wsp
0,292.8,293.5,292.2,292.0,291.8,294.0,293.5,292.2,293.2,293.2,...,90.0,8.0,7.0,9.0,9.0,9.0,9.0,8.0,8.0,7.0


In [5]:
def make_ml_training_data_row(taf_time, station, lat, lon, metar_path, glamp_path, hrrr_path, delay_hours = 2):
    if isinstance(metar_path, str):
        metar_path = full_metar_list = read_metar(metar_path)
    
    df = make_ml_data_row(taf_time, station, lat, lon, metar_path, glamp_path, hrrr_path, delay_hours = delay_hours)
    metar_at_time = get_metar_at_time(taf_time, metar_path)
    vis = metar_at_time['vsby']
    ceiling = find_ceiling_height(metar_at_time['metar'])
    if ceiling is None:
        ceiling = 100000
        
    if ceiling < 500 or vis < 1:
        conditions = 0
    elif ceiling < 1000 or vis < 3:
        conditions = 1
    elif ceiling < 3000 or vis < 5:
        conditions = 2
    else:
        conditions = 3
    
    df['flight category'] = conditions
    
    return df

In [6]:
make_ml_training_data_row(taf_time, 'kbos', 42.3656, -71.0096, 'Data/BOS.csv', 'Data/GLAMP data/', 'Data/hrrr/')

Unnamed: 0,hrrr -1_DPT_1000mb,hrrr -2_DPT_1000mb,hrrr 0_DPT_1000mb,hrrr 1_DPT_1000mb,hrrr 2_DPT_1000mb,hrrr 3_DPT_1000mb,hrrr 4_DPT_1000mb,hrrr 5_DPT_1000mb,hrrr 6_DPT_1000mb,hrrr -1_DPT_2m_above_ground,...,glamp -1_wsp,glamp -2_wsp,glamp 0_wsp,glamp 1_wsp,glamp 2_wsp,glamp 3_wsp,glamp 4_wsp,glamp 5_wsp,glamp 6_wsp,flight category
0,292.8,293.5,292.2,292.0,291.8,294.0,293.5,292.2,293.2,293.2,...,8.0,7.0,9.0,9.0,9.0,9.0,8.0,8.0,7.0,3


In [7]:
def make_ml_training_data_set(start_time, end_time, station, lat, lon, metar_path, glamp_path, hrrr_path, delay_hours = 2, frequency = '5H'):
    training_df = pd.DataFrame()
    time_series = pd.date_range(start_time, end_time, freq = frequency)
    if isinstance(metar_path, str):
        metar_path = read_metar(metar_path)
    for time in tqdm(time_series):
        try:
            training_row = make_ml_training_data_row(time, station, lat, lon, metar_path, glamp_path, hrrr_path, delay_hours = delay_hours)
            training_df = pd.concat([training_df, training_row])
        except FileNotFoundError:
            continue

    training_df = training_df.fillna(-99999)
    return training_df

In [None]:
start_date = dt.datetime(year = 2021, month = 1, day = 1, hour = 20, minute = 0)
end_date = dt.datetime(year = 2021, month = 12, day = 31, hour = 21, minute = 0)
data = make_ml_training_data_set(start_date, end_date, 'kbos', 42.3656, -71.0096, 'Data/BOS.csv', 'Data/GLAMP data/', 'Data/hrrr/', frequency = 'H')

 33%|█████████████████████████▏                                                  | 2899/8738 [17:34<77:39:44, 47.88s/it]

In [15]:
X = data.drop('flight category', axis=1)
y = data['flight category']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

In [16]:
classifier_rf = RandomForestClassifier(random_state=42)

In [17]:
%%time
classifier_rf.fit(X_train, y_train)

CPU times: user 1.03 s, sys: 9.25 ms, total: 1.04 s
Wall time: 1.04 s


In [18]:
misses = 0 
for val in classifier_rf.predict(X_test) - y_test:
    if val != 0:
        misses += 1
1 - misses / len(X_test)

0.8983644859813085