In [7]:
#%matplotlib inline
import seaborn as sns
import numpy as np
import pandas as pd
import re
import sys
import os
from matplotlib import pyplot as plt
from IPython.core.display import display, HTML
from IPython.display import IFrame
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides library deprecation notices (pandas, mlforecast, ...).
warnings.filterwarnings('ignore')

# Widen the pandas display limits so wide / long frames are not truncated in cell output.
pd.set_option('display.max_colwidth',150)
pd.set_option('display.max_columns',150)
pd.set_option('display.max_rows',500)

# Apply matplotlib's 'bmh' style sheet to all figures created afterwards.
plt.style.use('bmh')

from collections import OrderedDict
from pandas import ExcelWriter

import json
import pickle
from datetime import datetime as dt
from tqdm import tqdm
import nbformat
from nbconvert.preprocessors import ExecutePreprocessor
import gc
import itertools
from glob import glob
import shutil
from dateutil.relativedelta import relativedelta
import sweetviz

from sklearn import metrics
from sklearn import feature_selection
from sklearn import preprocessing

from window_ops.rolling import rolling_mean, rolling_max, rolling_min, seasonal_rolling_mean
from numba import njit

import mlforecast
from mlforecast import MLForecast
from mlforecast import target_transforms

import xgboost as xgb

In [1]:
def get_preprocessing(df):
    '''
    Data preprocessing function: turns raw METAR strings into an hourly feature table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least 2 columns:
        - date - day of metar string (its .month / .year are passed to the parser)
        - metar - METAR string
        The caller's frame is left untouched; cleaning happens on a copy.

    Returns
    -------
    pandas.DataFrame
        One row per hour (last report of each hour), indexed by `time`, with:
        - numeric columns wind_dir, wind_speed, wind_gust, vis, temp, dewpt, press
          (gaps filled forward, then backward);
        - flags vis_unclear_flg, wind_speed_flg, wind_gust_flg, wind_dir_flg;
        - weather_rain, weather_rain_flg, weather_ts, weather_fog, weather_fog_flg;
        - sky_cnt_<T>, sky_flg_<T>, sky_avg_<T> for every T in
          BKN, FEW, SCT, OVC, CLR, CB, TCU (always present, 0 when T never occurs);
        - the raw `sky` column, which is kept in the output.

    Raises
    ------
    ValueError
        If no report with an observation time is left after filtering / parsing.
    '''
    from metar import Metar

    sky_used = ['BKN','FEW','SCT','OVC','CLR','CB','TCU']
    # unstack() yields its columns sorted, so the guaranteed layout uses the same order
    sky_cols = sorted(sky_used)

    def parse_metar(row):
        try:
            return Metar.Metar(row.metar, month = row.date.month, year = row.date.year)
        except Exception:
            # Unparseable report: retry with the 4-character station prefix only.
            # NOTE(review): presumably this yields an object without an observation time,
            # so the row is removed by the null-time filter below - confirm against the metar package.
            return Metar.Metar(row.metar[:4], month = row.date.month, year = row.date.year)

    def conv_val(x):
        # parsed quantities expose .value(); None / NaN / anything else becomes NaN
        try:
            return x.value()
        except Exception:
            return np.nan

    def conv_list(frame, feat, min_cols):
        '''
        # unflat lists: one output row per list element, `id1` repeated for each element.
        # Columns 0..min_cols-1 are guaranteed to exist even when nothing is flattened.
        '''
        rows, ids = [], []
        for items, key in zip(frame[feat], frame['id1']):
            rows.extend(items)
            ids.extend([key] * len(items))
        flat = pd.DataFrame(rows)
        for col in range(min_cols):
            if col not in flat.columns:
                flat[col] = np.nan
        flat['id1'] = np.asarray(ids, dtype = 'int64')
        return flat

    def attach(base, extra):
        # left join on the observation id; merge() resets the index, so restore the time index
        joined = base.merge(extra, how = 'left', on = 'id1')
        joined.index = base.index
        return joined

    def add_sky(base, wide, prefix):
        # add one column per sky type (prefix + type); types never seen in the data become 0
        names = [prefix + c for c in sky_cols]
        if wide is None:
            for name in names:
                base[name] = 0.0
            return base
        table = wide.reindex(columns = sky_cols).fillna(0).astype(float)
        table.columns = names
        base = attach(base, table.reset_index())
        for name in names:
            base[name] = base[name].fillna(0)
        return base

    # clean from RMK - we don't need (done on a copy so the caller's frame is not modified)
    df = df.assign(metar = df['metar'].map(lambda x: x.split(' RMK')[0].strip().replace('SPECI ','')))
    df = df[df.metar.str[:4].isin(['KMIA','SPEC'])].reset_index(drop = True)

    # parsing
    reports = [parse_metar(row) for row in df[['date','metar']].itertuples(index = False)]
    if not reports:
        raise ValueError('get_preprocessing: no METAR reports left after station filtering')

    # choose only useful features
    feats_used = [
        'time',
        'wind_dir',
        'wind_speed',
        'wind_gust',
        'vis',
        'temp',
        'dewpt',
        'press',
        'weather',
        'sky',
        # 'code',
    ]
    df_out = pd.DataFrame([m.__dict__ for m in reports])[feats_used]

    # time preprocessing: drop reports without a time, keep the last report of every hour
    df_out = df_out[df_out['time'].notnull()]
    if df_out.empty:
        raise ValueError('get_preprocessing: no METAR report could be parsed with an observation time')
    df_out = df_out.sort_values('time').set_index('time')
    # lowercase alias: 'H' is deprecated since pandas 2.2, 'h' is accepted by older releases too
    df_out = df_out.resample('h').last()

    ####################
    # numerical features
    for feat in ['wind_dir','wind_speed','wind_gust','vis','temp','dewpt','press']:
        df_out[feat] = df_out[feat].map(conv_val)

    # corrections: direction 0 is treated as "no direction", calm wind carries no gust
    df_out.loc[df_out['wind_dir'] == 0,   'wind_dir'] = np.nan
    df_out.loc[df_out['wind_speed'] == 0, 'wind_gust'] = np.nan

    # fill nulls by the forward fill method (then backward for leading gaps)
    for feat in ['temp','dewpt','press','vis','wind_speed']:
        df_out[feat] = df_out[feat].ffill().bfill()

    # add flags that have value - computed before wind_dir / wind_gust are filled
    df_out['vis_unclear_flg'] = (df_out['vis'] < 10).astype(int)
    df_out['wind_speed_flg'] = (df_out['wind_speed'] > 0).astype(int)
    df_out['wind_gust_flg'] = (df_out['wind_gust'].notnull()).astype(int)
    df_out['wind_dir_flg'] = (df_out['wind_dir'].notnull()).astype(int)

    # fill nulls again
    df_out['wind_dir'] = df_out['wind_dir'].ffill().bfill()
    df_out['wind_gust'] = df_out['wind_gust'].ffill().bfill()

    #######################
    # categorical features
    df_out['id1'] = range(len(df_out))   # temporary join key, removed before returning

    # weather
    feats1 = ['weather_rain', 'weather_rain_flg', 'weather_ts', 'weather_fog', 'weather_fog_flg']
    wx_rows = df_out[df_out['weather'].notnull()]
    wx_rows = wx_rows[wx_rows['weather'].map(len) > 0][['weather','id1']]
    if len(wx_rows):
        wx = conv_list(wx_rows, 'weather', 4)
        is_precip = wx[2].isin(['RA','DZ'])

        # rain level from the marker in position 0: '-' -> 1, '' -> 2, '+' -> 3
        wx['weather_rain'] = 0
        wx.loc[(wx[0] == '-') & is_precip, 'weather_rain'] = 1
        wx.loc[(wx[0] == '')  & is_precip, 'weather_rain'] = 2
        wx.loc[(wx[0] == '+') & is_precip, 'weather_rain'] = 3
        wx['weather_rain_flg'] = (wx['weather_rain'] > 0).astype(int)

        wx['weather_ts'] = 0
        wx.loc[(wx[1] == 'TS'), 'weather_ts'] = 1

        # code in position 3: BR -> 1, HZ -> 2, FG -> 3, FU -> 4
        wx['weather_fog'] = 0
        for level, code in enumerate(['BR','HZ','FG','FU'], start = 1):
            wx.loc[wx[3] == code, 'weather_fog'] = level
        wx['weather_fog_flg'] = (wx['weather_fog'] > 0).astype(int)

        # several phenomena can share one hour: keep the strongest value of each feature
        df_out = attach(df_out, wx.groupby('id1')[feats1].max().reset_index())
        for f in feats1:
            df_out[f] = df_out[f].fillna(0)
    else:
        # no weather phenomena in the whole window: every weather feature is 0
        for f in feats1:
            df_out[f] = 0.0

    # sky
    sky_cnt = None
    sky_avg = None
    sky_rows = df_out[df_out['sky'].notnull()]
    sky_rows = sky_rows[sky_rows['sky'].map(len) > 0][['sky','id1']]
    if len(sky_rows):
        layers = conv_list(sky_rows, 'sky', 3)
        layers[1] = layers[1].map(conv_val).fillna(0)
        layers[2] = layers[2].fillna('')

        # positions 0 and 2 both hold codes of interest; stack them into one 'var' column,
        # each paired with the numeric value from position 1
        stacked = pd.concat([
            layers[['id1',0,1]].rename(columns = {0:'var'}),
            layers[['id1',2,1]].rename(columns = {2:'var'}) ])
        stacked = stacked[stacked['var'].isin(sky_used)]
        if len(stacked):
            by_type = stacked.groupby(['id1','var'])
            sky_cnt = by_type.size().unstack()
            sky_avg = by_type[1].mean().unstack()

    df_out = add_sky(df_out, sky_cnt, 'sky_cnt_')
    for f in sky_used:
        df_out['sky_flg_'+f] = (df_out['sky_cnt_'+f] > 0).astype(int)
    df_out = add_sky(df_out, sky_avg, 'sky_avg_')

    # the raw 'sky' column is kept on purpose; only 'weather' and the join key are dropped
    df_out = df_out.drop(columns = ['weather','id1'])

    return df_out

In [8]:
def get_features(df, feats):
    '''
    Feature engineering function.

    For every column named in `feats` the following are built with MLForecast.preprocess:
    lag 1, lag 24, a rolling mean (window 3) of lag 1 and a seasonal rolling mean
    (season length 24, window 3) of lag 1, plus two ratios:
        <feat>_lag1to3   = lag1  / (rolling mean + eps)
        <feat>_lag24to72 = lag24 / (seasonal rolling mean + eps)
    Calendar columns dt_month and dt_hour are appended at the end.

    Parameters
    ----------
    df : pandas.DataFrame
        Hourly frame whose index becomes a `time` column after reset_index()
        (e.g. the output of get_preprocessing) and that contains every column in `feats`.
    feats : iterable of str
        Names of the columns to derive features from.

    Returns
    -------
    pandas.DataFrame
        `time`, the original and derived columns of every feature, dt_month and dt_hour.
        Rows where the anchor feature is null after the left merge are dropped.
        The anchor is `wind_dir` when it is among `feats`, otherwise the first entry
        of `feats`; with no features at all nothing is dropped.
    '''
    feats = list(feats)
    df = df.reset_index()
    # MLForecast expects a series identifier; the whole frame is treated as one series
    df['unique_id'] = 'a'

    # feature engine
    model = MLForecast(
        models = {},
        freq = 'H',
        target_transforms = [
        ],
        lags = [1] + [24],
        lag_transforms = {
            1:  [
                (rolling_mean, 3),
                (seasonal_rolling_mean, 24, 3),
            ],
        },
        # date_features=['month','hour'],
    )

    def shorten(name):
        # compact the generated column names; the order of the replacements matters
        return (name.replace('rolling_mean','RM')
                    .replace('window_size','WS')
                    .replace('seasonal','S')
                    .replace('season_length','SL'))

    eps = 1e-12   # keeps the ratio denominators away from exactly zero

    # main feats
    df_all = df[['time']].copy()
    for feat in feats:
        df1 = model.preprocess(df[['unique_id','time',feat]], id_col = 'unique_id', time_col = 'time', target_col = feat)
        df1 = df1.drop(columns = 'unique_id')
        # first two columns are `time` and the feature itself; the rest are generated features
        # NOTE(review): relies on the generated names lag1, lag24, rolling_mean_lag1_window_size3 and
        # seasonal_rolling_mean_lag1_season_length24_window_size3 - confirm for the installed mlforecast
        kept = list(df1.columns)[:2]
        df1.columns = kept + [f'{feat}_' + shorten(c) for c in list(df1.columns)[2:]]

        df1[f'{feat}_lag1to3'] = df1[f'{feat}_lag1'] / (df1[f'{feat}_RM_lag1_WS3'] + eps)
        df1[f'{feat}_lag24to72'] = df1[f'{feat}_lag24'] / (df1[f'{feat}_S_RM_lag1_SL24_WS3'] + eps)

        df_all = df_all.merge(df1, on = 'time', how = 'left')

    # date feats
    df_all['dt_month'] = df_all['time'].dt.month
    df_all['dt_hour'] = df_all['time'].dt.hour

    # keep only timestamps for which the engineered rows exist; do not assume that
    # 'wind_dir' is always among the requested features
    if 'wind_dir' in df_all.columns:
        anchor = 'wind_dir'
    elif feats:
        anchor = feats[0]
    else:
        return df_all
    return df_all[df_all[anchor].notnull()]