In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from scipy import stats
from statsmodels.robust.scale import mad

import ipywidgets as widgets
from IPython.display import display
from IPython.core.display import HTML
from ipywidgets import Layout

%matplotlib inline

## Global variables

In [2]:
# Name of the boolean flag column used to select the rows that are modelled.
EVENT_TYPE = 'weather_event' # options: weather_event or nas_event
# Delay-minutes column summarised by the dashboard statistics.
# NOTE(review): presumably meant to be kept in step with EVENT_TYPE
# (WEATHER_DELAY with weather_event, NAS_DELAY with nas_event) — confirm.
WEATHER_COL = 'WEATHER_DELAY' # options: WEATHER_DELAY or NAS_DELAY

## Get data

In [3]:
# Download the cleaned delay dataset from GitHub (needs network access).
print('Retrieving data from git')
url = 'https://raw.githubusercontent.com/bdbritt/weather_flight_delays/main/weather_delay_data_cleaned.csv'
# Alternative kept for reference: read a local CSV and keep a 30% random sample.
# df = pd.read_csv('weather_delay_data.csv').sample(frac = 0.30)
df = pd.read_csv(url)
print(f'Retrieved {df.shape[0]} records')

Retrieving data from git
Retrieved 840029 records


In [4]:
# Left-pad departure times with zeros so every value is at least four characters wide
df['DEP_TIME'] = df['DEP_TIME'].astype(str).str.rjust(4, fillchar='0')

# Add event flags: True wherever the corresponding delay column is anything other than 0
df['weather_event'] = df['WEATHER_DELAY'].ne(0)
df['nas_event'] = df['NAS_DELAY'].ne(0)

In [5]:
# Columns kept for the analysis. The order matters: later cells select the
# model feature by position, and position 5 is WEATHER_DELAY.
wanted_cols = [
    'FL_DATE', 'ORIGIN', 'OP_CARRIER', 'DEP_TIME', 'DEP_DELAY',
    'WEATHER_DELAY', 'NAS_DELAY', 'month', 'year',
    'travel_season', 'weather_event', 'nas_event',
]
df = df.loc[:, wanted_cols]

In [64]:
# df = df.drop(['DEST', 'OP_CARRIER', 'ARR_TIME', 'ARR_DELAY', 'CRS_ARR_TIME', 'ACTUAL_ELAPSED_TIME'], axis=1)

In [65]:
# Preview the first rows of the trimmed frame.
df.head()

Unnamed: 0,FL_DATE,ORIGIN,OP_CARRIER,DEP_TIME,DEP_DELAY,WEATHER_DELAY,NAS_DELAY,month,year,travel_season,weather_event,nas_event
0,2018-01-01,ORD,UA,2230,76.0,11.0,0.0,1,2018,low,True,False
1,2018-01-01,ORD,UA,2111,41.0,26.0,0.0,1,2018,low,True,False
2,2018-01-01,ORD,UA,2020,45.0,31.0,0.0,1,2018,low,True,False
3,2018-01-01,ORD,UA,1044,104.0,75.0,0.0,1,2018,low,True,False
4,2018-01-01,ORD,UA,754,49.0,49.0,8.0,1,2018,low,True,True


## Main functions

In [6]:
def run_isf_model(X: np.ndarray, outliers_fraction = 'auto', random_state = 42) -> IsolationForest:
    """
    Fit an IsolationForest on a feature matrix and return the fitted model.

    Parameters
    ----------
    X : np.ndarray
        Training samples, handed unchanged to ``IsolationForest.fit``
        (the notebook passes a 2-D array with one column).
    outliers_fraction : 'auto' or float, default 'auto'
        Forwarded as the ``contamination`` argument of ``IsolationForest``.
    random_state : int, default 42
        Seed forwarded to ``IsolationForest``; the default keeps the value
        that was previously hard-coded.

    Returns
    -------
    IsolationForest
        The fitted estimator, built with ``n_jobs=-1``.
    """
    forest = IsolationForest(contamination=outliers_fraction,
                             random_state=random_state, n_jobs=-1)
    # fit() returns the estimator itself, so the fitted model can be returned directly.
    return forest.fit(X)


def update_data(df, model, col_index) ->pd.DataFrame:
    """
    Score the rows of ``df`` with a fitted model and flag anomalies.

    Two columns are written to ``df`` in place, and the same frame is
    returned:

    * ``scores``  - output of ``model.decision_function``
    * ``anomaly`` - True where ``model.predict`` returned -1, else False

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the feature columns; modified in place.
    model
        Fitted estimator exposing ``decision_function`` and ``predict``.
    col_index : list
        Positional indices of the feature columns, used with ``df.iloc``.

    Returns
    -------
    pd.DataFrame
        ``df`` itself, with the two extra columns.
    """
    # Extract the feature matrix once, before any column is appended, so both
    # model calls see exactly the same columns (negative positions would
    # otherwise shift once 'scores' is added).
    features = df.iloc[:, col_index].values

    df['scores'] = model.decision_function(features)
    # A prediction of -1 is treated as an anomaly; anything else is not.
    df['anomaly'] = np.asarray(model.predict(features)) == -1

    return df


def get_model_results(df: pd.DataFrame, col_index: list, model) -> pd.DataFrame:
    """
    Return a scored copy of ``df``, leaving the caller's frame untouched.

    Parameters
    ----------
    df : pd.DataFrame
        Rows to score.
    col_index : list
        Positional indices of the feature columns.
    model
        Fitted estimator passed through to ``update_data``.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with the columns added by ``update_data``.
    """
    # update_data writes its columns in place, so give it a private copy.
    return update_data(df.copy(), model, col_index)


def run_model(df: pd.DataFrame, col_index: list, models: dict, group_id=None) -> pd.DataFrame:
    """
    Score every group of ``df`` with the model fitted for that group.

    Parameters
    ----------
    df : pd.DataFrame
        Rows to score; not modified.
    col_index : list
        Positional indices of the feature columns.
    models : dict
        Fitted models keyed by group key, i.e. by whatever
        ``df.groupby(group_id)`` yields as the key of each group.
    group_id : str or list, optional
        Column name(s) to group by. Defaults to ``['ORIGIN', 'year']``.

    Returns
    -------
    pd.DataFrame
        The scored groups concatenated in groupby order.

    Raises
    ------
    KeyError
        If a group found in ``df`` has no entry in ``models``.
    """
    # None stands in for the former list default so no mutable object is shared between calls.
    if group_id is None:
        group_id = ['ORIGIN', 'year']

    scored_groups = []
    # No up-front copy of the whole frame: get_model_results copies each group itself.
    for key, group in df.groupby(group_id):
        if key not in models:
            raise KeyError(
                f'No fitted model for group {key!r}; '
                f'available keys: {sorted(models, key=str)}'
            )
        scored_groups.append(get_model_results(group, col_index, models[key]))

    return pd.concat(scored_groups)


# Latitude / longitude for each origin airport code handled by the notebook.
_AIRPORT_COORDS = {
    'DEN': {'lat': 39.849312, 'lon': -104.673828},
    'ORD': {'lat': 41.978611, 'lon': -87.904724},
    'ATL': {'lat': 33.640411, 'lon': -84.419853},
    'DFW': {'lat': 32.897480, 'lon': -97.040443},
    'LAX': {'lat': 33.942791, 'lon': -118.410042},
}


def get_coords(df, group):
    """
    Return ``df`` with constant ``lat`` and ``lon`` columns for one airport.

    Parameters
    ----------
    df : pd.DataFrame
        Rows belonging to a single origin; not modified.
    group : str
        Origin airport code used to look up the coordinates.

    Returns
    -------
    pd.DataFrame
        New frame with ``lat`` and ``lon`` appended after the existing columns.

    Raises
    ------
    KeyError
        If ``group`` is not one of the known airport codes.
    """
    try:
        location = _AIRPORT_COORDS[group]
    except KeyError:
        raise KeyError(
            f'No coordinates known for origin {group!r}; '
            f'known origins: {sorted(_AIRPORT_COORDS)}'
        ) from None

    # assign() builds a new frame, so a groupby slice passed in by the caller
    # is never written to (no SettingWithCopyWarning, no hidden mutation).
    return df.assign(lat=location['lat'], lon=location['lon'])

## Split the data into training and target sets

In [7]:
# Only rows flagged with the configured event type are modelled.
# Every year except 2018 is used for fitting; 2018 is the year that gets scored.
training_mask = df['year'].ne(2018) & df[EVENT_TYPE].eq(True)
target_mask = df['year'].eq(2018) & df[EVENT_TYPE].eq(True)

# Independent copies, so later column additions do not write back into df.
training_weather = df[training_mask].copy()
target_weather = df[target_mask].copy()

print(f'training data size: {len(training_weather)}, target data size: {len(target_weather)}')

training data size: 126342, target data size: 41215


## Train a model per origin

In [8]:
# Fit one IsolationForest per origin on the training rows and keep the
# fitted models in a dict keyed by the ORIGIN value.
origin_isf_model = {}
groups = training_weather.groupby('ORIGIN')
for origin, data in groups:
    X = data.iloc[:,[5]].values # single feature: column 5 of wanted_cols, i.e. WEATHER_DELAY (month is not included)
    model = run_isf_model(X)
    origin_isf_model[origin] = model

## Run IsolationForest

In [9]:
# Score the 2018 rows: each origin group is evaluated with the model fitted for
# that origin, using the same single feature column (position 5) as in training.
target_weather_isf = run_model(target_weather, [5], origin_isf_model, 'ORIGIN')

In [10]:
# Summary statistics of WEATHER_DELAY for the rows flagged as anomalies, per origin and travel season.
target_weather_isf.loc[target_weather_isf['anomaly']==True].groupby(['ORIGIN','travel_season'])['WEATHER_DELAY'].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
ORIGIN,travel_season,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ATL,high,1055.0,143.963033,155.637855,1.0,71.0,94.0,152.5,1116.0
ATL,low,806.0,160.866005,162.812805,1.0,77.0,111.0,174.0,1036.0
ATL,shoulder,543.0,194.123389,257.889879,1.0,65.0,89.0,181.0,1206.0
DEN,high,382.0,131.727749,151.243036,1.0,74.0,98.5,163.5,1070.0
DEN,low,153.0,164.712418,256.86551,1.0,2.0,76.0,133.0,1050.0
DEN,shoulder,365.0,144.641096,197.189831,1.0,3.0,95.0,139.0,1046.0
DFW,high,824.0,121.899272,100.727467,1.0,77.75,106.5,146.25,1161.0
DFW,low,671.0,129.467958,142.225903,1.0,73.0,100.0,144.0,1214.0
DFW,shoulder,1236.0,128.210356,132.684898,1.0,73.0,99.0,149.0,1204.0
LAX,high,34.0,206.705882,211.82333,1.0,87.75,162.0,233.25,1017.0


In [11]:
# Share of scored rows flagged as anomalies versus not flagged.
target_weather_isf['anomaly'].value_counts(normalize=True)

False    0.760961
True     0.239039
Name: anomaly, dtype: float64

In [12]:
# Anomaly / non-anomaly proportions within each origin and travel season.
target_weather_isf.groupby(['ORIGIN','travel_season'])['anomaly'].value_counts(normalize=True).to_frame('anomaly_%').reset_index()

Unnamed: 0,ORIGIN,travel_season,anomaly,anomaly_%
0,ATL,high,False,0.808946
1,ATL,high,True,0.191054
2,ATL,low,False,0.73617
3,ATL,low,True,0.26383
4,ATL,shoulder,False,0.824442
5,ATL,shoulder,True,0.175558
6,DEN,high,False,0.731553
7,DEN,high,True,0.268447
8,DEN,low,False,0.817857
9,DEN,low,True,0.182143


## Analyze data

In [13]:
# Independent copy of the scored frame, used by the dashboard and plots below.
data = target_weather_isf.copy()

In [14]:
ALL = 'ALL'
def unique_sorted_values_plus_ALL(array):
    """Return the distinct values of ``array`` in ascending order, preceded by ``ALL``."""
    distinct_values = array.unique().tolist()
    return [ALL, *sorted(distinct_values)]


def get_describe(df):
    """Display describe() statistics of WEATHER_DELAY for anomalous rows, per year and travel season."""
    flagged = df.loc[df['anomaly'].eq(True)]
    summary = (
        flagged
        .groupby(['year', 'travel_season'])['WEATHER_DELAY']
        .describe()
        .reset_index()
    )
    display(summary)

    
def get_bar_graph(df):
    """
    Show a grouped bar chart of total anomalous WEATHER_DELAY minutes.

    Bars are grouped by year and coloured by travel season. The title names
    every origin present in ``df``; with a single origin it reads
    ``'<ORIGIN> Total Season by Year'``.

    Parameters
    ----------
    df : pd.DataFrame
        Scored rows with ``ORIGIN``, ``anomaly``, ``year``, ``travel_season``
        and ``WEATHER_DELAY`` columns.
    """
    # Name every origin in the frame: taking only the first one mislabels the
    # chart when several airports are plotted together, and fails on an empty frame.
    origins = sorted(df['ORIGIN'].astype(str).unique())
    origin_label = ', '.join(origins) if origins else 'No data'

    anomalies = df.loc[df['anomaly'] == True]
    totals = (
        anomalies
        .groupby(['year', 'travel_season'])
        .agg({'WEATHER_DELAY': 'sum'})
        .reset_index()
    )

    fig = px.bar(totals, color='travel_season', y='WEATHER_DELAY', x='year',
                 barmode='group', title=f'{origin_label} Total Season by Year')
    fig.show()


def determine_est_of_location(df, col1):
    """
    Print common estimates of location for the anomalous rows of ``df``.

    Prints the max, min, mean, 10%-trimmed mean and median of ``col1``,
    restricted to rows where ``anomaly`` is True.

    Parameters
    ----------
    df : pd.DataFrame
        Scored rows containing an ``anomaly`` column and ``col1``.
    col1 : str
        Name of the numeric column to summarise.
    """
    values = df.loc[df['anomaly'] == True, col1]

    # Ask for max/min by name: positional lookups such as describe()[7] on a
    # label-indexed Series are deprecated in pandas 2.x. float() keeps the
    # same floating-point rendering that describe() produced.
    print(f'\nMax: {float(values.max())}')
    print(f'Min: {float(values.min())}')
    print(f'Mean: {round(np.mean(values),2)}')
    print(f'Trimmed Mean: {round(stats.trim_mean(values, proportiontocut=0.1),2)}')
    print(f'Median: {values.median()}')


def determine_est_of_variability(df, col1):
    """
    Print common estimates of variability for the anomalous rows of ``df``.

    Prints the population standard deviation, a trimmed standard deviation,
    the inter-quartile range, the mean absolute deviation and the median
    absolute deviation of ``col1``, restricted to rows where ``anomaly``
    is True.

    Parameters
    ----------
    df : pd.DataFrame
        Scored rows containing an ``anomaly`` column and ``col1``.
    col1 : str
        Name of the numeric column to summarise.
    """
    values = df.loc[df['anomaly'] == True, col1]

    print(f'\nSTD: {round(np.std(values),2)}') # population STD

    # Trim at the 10th / 90th percentiles of col1 itself. Reading them from
    # describe() of the whole frame picked up the first numeric column,
    # whichever that happened to be.
    lower, upper = values.quantile([0.10, 0.90])
    tstd = stats.tstd(values, (lower, upper))
    print(f'Trimmed STD: {round(tstd,2)}')

    print(f'IQR: {values.quantile(0.75) - values.quantile(0.25)}')

    # Series.mad() was removed in pandas 2.0; compute the mean absolute
    # deviation around the mean directly.
    mean_abs_dev = (values - values.mean()).abs().mean()
    print(f'Mean absolute deviation: {round(mean_abs_dev,2)}')
    print(f'Median absolute deviation: {round(mad(values),2)}')


def get_histogram(df):
    """
    Show a box plot of WEATHER_DELAY for anomalous rows, by year and travel season.

    NOTE(review): a second ``get_histogram`` is defined further down in this
    cell and replaces this one when the cell runs, so this version is never
    called as the notebook stands — confirm which one is intended and give
    them distinct names.
    """
    # The former unused origin lookup is gone: it did nothing for the plot
    # and raised IndexError on an empty frame.
    anomalies = df.loc[df['anomaly'] == True]

    fig = px.box(anomalies, x="year", y="WEATHER_DELAY", color="travel_season")
    fig.show()
    
    
def get_line_graph(df):
    """
    Plot monthly total WEATHER_DELAY minutes for non-anomalies and anomalies.

    Two red lines are drawn against month: solid for non-anomalous rows
    ('2018-Non') and dashed for anomalous rows ('2018-Anomalies').

    Parameters
    ----------
    df : pd.DataFrame
        Scored rows with ``anomaly``, ``month`` and ``WEATHER_DELAY`` columns.
    """
    def _monthly_total(is_anomaly):
        # Total WEATHER_DELAY minutes per month for one anomaly class.
        subset = df.loc[df['anomaly'] == is_anomaly]
        return subset.groupby('month')['WEATHER_DELAY'].sum().to_frame('total_min').reset_index()

    non_anomalies_monthly_sum = _monthly_total(False)
    anomalies_monthly_sum = _monthly_total(True)

    fig = go.Figure()

    fig.add_trace(go.Scatter(x=non_anomalies_monthly_sum['month'],
                             y=non_anomalies_monthly_sum['total_min'],
                             name='2018-Non',
                             line=dict(color='red', width=2)))

    fig.add_trace(go.Scatter(x=anomalies_monthly_sum['month'],
                             y=anomalies_monthly_sum['total_min'],
                             name='2018-Anomalies',
                             line=dict(color='red', width=2, dash='dash')))

    fig.update_layout(
        title="Anomalies vs. Non",
        xaxis_title="Month",
        yaxis_title="Total Delay Minutes",
        legend_title="Legend",
        font=dict(
            family="Courier New, monospace",
            size=18,
            color="RebeccaPurple"
        ))

    fig.show()

    
def map_data(df, title, size_column='total_min', total=True, lat='lat', lon='lon'):
    """
    Main mapping function: bubble map of anomalous delays per origin.

    The anomalous rows are aggregated per origin and travel season, airport
    coordinates are attached with ``get_coords``, and the result is drawn
    with ``px.scatter_geo`` using ``travel_season`` as the animation frame.

    Parameters
    ----------
    df : pd.DataFrame
        Scored rows with ``anomaly``, ``ORIGIN``, ``travel_season`` and
        ``WEATHER_DELAY`` columns.
    title : str
        Figure title.
    size_column : str, default 'total_min'
        Aggregated column that drives the bubble size. It must match the
        column produced by the chosen mode: ``'total_min'`` when ``total``
        is True, ``'total_delays'`` when it is False.
    total : bool, default True
        True sums the delay minutes; False counts the delays longer than
        45 minutes.
    lat, lon : str
        Names of the latitude / longitude columns added by ``get_coords``.
    """
    anomalies = df.loc[df['anomaly'] == True]
    group_keys = ['ORIGIN', 'travel_season']

    if total:
        metric = anomalies.groupby(group_keys)['WEATHER_DELAY'].sum().to_frame('total_min')
    else:
        # Only delays strictly longer than 45 minutes are counted.
        long_delays = anomalies.loc[anomalies['WEATHER_DELAY'] > 45]
        metric = long_delays.groupby(group_keys)['WEATHER_DELAY'].count().to_frame('total_delays')
    metric = metric.reset_index()

    # Attach the airport coordinates one origin at a time.
    located = pd.concat([get_coords(rows, origin) for origin, rows in metric.groupby('ORIGIN')])

    # Both modes share the same figure; only the aggregated column differs.
    fig = px.scatter_geo(located, locationmode='USA-states', lat=lat, lon=lon,
                         hover_name="ORIGIN", size=size_column,
                         animation_frame="travel_season")
    fig.update_layout(
        title_text=title,
        showlegend=True,
        geo=dict(
            scope='usa',
            landcolor='rgb(217, 217, 217)'))
    fig.show()


def get_line_graph_all(df):
    """
    Plot monthly total WEATHER_DELAY minutes per origin, anomalies vs. non.

    Each origin gets two lines in its own colour: solid for non-anomalous
    rows ('<ORIGIN> 2018-Non') and dashed for anomalous rows
    ('<ORIGIN> 2018-Anomalies'). Traces are added origin by origin, in the
    order of the colour table below.

    Parameters
    ----------
    df : pd.DataFrame
        Scored rows with ``anomaly``, ``ORIGIN``, ``month`` and
        ``WEATHER_DELAY`` columns.
    """
    # One colour per origin; dict order fixes the trace / legend order.
    origin_colors = {
        'ATL': 'red',
        'DEN': 'blue',
        'DFW': 'green',
        'LAX': 'goldenrod',
        'ORD': 'magenta',
    }

    def _monthly_total(is_anomaly):
        # Total WEATHER_DELAY minutes per origin and month for one anomaly class.
        subset = df.loc[df['anomaly'] == is_anomaly]
        return subset.groupby(['ORIGIN', 'month'])['WEATHER_DELAY'].sum().to_frame('total_min').reset_index()

    non_monthly_sum = _monthly_total(False)
    anom_monthly_sum = _monthly_total(True)

    fig = go.Figure()

    for origin, color in origin_colors.items():
        trace_specs = (
            (non_monthly_sum, f'{origin} 2018-Non', dict(color=color, width=2)),
            (anom_monthly_sum, f'{origin} 2018-Anomalies', dict(color=color, width=2, dash='dash')),
        )
        for monthly, trace_name, line_style in trace_specs:
            rows = monthly.loc[monthly['ORIGIN'] == origin]
            fig.add_trace(go.Scatter(x=rows['month'], y=rows['total_min'],
                                     name=trace_name, line=line_style))

    fig.update_layout(
        title="Anomalies vs. Non",
        xaxis_title="Month",
        yaxis_title="Total Delay Minutes",
        legend_title="Legend",
        font=dict(
            family="Courier New, monospace",
            size=18,
            color="RebeccaPurple"
        ))

    fig.show()

def get_histogram(df):
    """Show a box plot of WEATHER_DELAY for anomalous rows, one group per origin, coloured by travel season."""
    flagged_rows = df[df['anomaly'].eq(True)]

    box_fig = px.box(flagged_rows, x="ORIGIN", y="WEATHER_DELAY", color="travel_season")
    box_fig.show()

In [15]:
# Dropdown listing every origin found in `data`, with 'ALL' as the first option.
dropdown_state = widgets.Dropdown(options = unique_sorted_values_plus_ALL(data.ORIGIN), description='Origin: ')

# One Output widget per dashboard tab; event_action() clears and refills them.
output_origin = widgets.Output()
describe_output = widgets.Output()
bar_graph_output = widgets.Output()
histo_graph_output = widgets.Output()
line_graph_output = widgets.Output()

def event_action():
    """
    Refresh every dashboard panel for the origin selected in the dropdown.

    Clears the five output widgets, filters ``data`` to the selected origin
    (or keeps everything for ``ALL``), then fills each panel: printed
    statistics, the describe table, the bar chart, the box plot and the
    line chart.
    """
    # clear the previous selection on each iteration
    for panel in (output_origin, describe_output, bar_graph_output,
                  histo_graph_output, line_graph_output):
        panel.clear_output()

    if dropdown_state.value == ALL:
        common_filter = data
    else:
        common_filter = data[data.ORIGIN == dropdown_state.value]

    with output_origin:
        # The statistics below only describe rows flagged as anomalies, so the
        # count reports those rows rather than every row in the selection.
        anomaly_count = int((common_filter['anomaly'] == True).sum())
        print(f'Anomaly Record count: {anomaly_count}')
        determine_est_of_location(common_filter, WEATHER_COL)
        determine_est_of_variability(common_filter, WEATHER_COL)

        print('\nDelays 45 min >=')

        # Filter on the same column the statistics are computed on.
        filter_45min = common_filter.loc[common_filter[WEATHER_COL] >= 45]
        anomaly_count_45min = int((filter_45min['anomaly'] == True).sum())
        print(f'Record count: {anomaly_count_45min}')
        determine_est_of_location(filter_45min, WEATHER_COL)
        determine_est_of_variability(filter_45min, WEATHER_COL)

    with describe_output:
        get_describe(common_filter)

    with bar_graph_output:
        get_bar_graph(common_filter)

    with histo_graph_output:
        get_histogram(common_filter)

    with line_graph_output:
        get_line_graph(common_filter)
        
        
def dropdown_state_eventhandler(change):
    """Observer callback for the dropdown: re-render the dashboard; ``change`` itself is not used."""
    event_action()
    
def graphit():
    """Render the dashboard for the current dropdown selection (thin wrapper around event_action)."""
    event_action()
    
def tab_chg(chg):
    """
    Observer callback for the Tab widget.

    Renders the dashboard only when the change event reports an empty dict
    as its previous value.
    NOTE(review): presumably this is how the first display of the tabs is
    detected so the panels get an initial render — confirm.
    """
    if chg.old == {}:
        graphit()

# Re-render the dashboard whenever the dropdown's selected value changes.
dropdown_state.observe(dropdown_state_eventhandler, names='value')

In [16]:
input_widgets = widgets.HBox([dropdown_state])

# One tab per output panel; a title's position in the list below is its tab index.
tab = widgets.Tab([output_origin, describe_output, bar_graph_output, histo_graph_output, line_graph_output])
for tab_index, tab_title in enumerate(['Dataset', 'Describe Data', 'Travel Season By Year',
                                       'Distribution', 'Line Chart']):
    tab.set_title(tab_index, tab_title)
# tab.set_title(5, 'State Density Plot')
tab.observe(tab_chg)

# Selector on top, fixed-height tab area underneath.
dashboard = widgets.VBox(children=[tab], layout=Layout(height='700px'))
display(input_widgets, dashboard)

HBox(children=(Dropdown(description='Origin: ', options=('ALL', 'ATL', 'DEN', 'DFW', 'LAX', 'ORD'), value='ALL…

VBox(children=(Tab(children=(Output(), Output(), Output(), Output(), Output()), _titles={'0': 'Dataset', '1': …

## Map total delay mins

In [17]:
# Bubble map sized by the total anomalous delay minutes per origin, animated over travel season.
map_data(data, 'Total Anomaly Delay Mins By Travel Season')

## Map count of delays > 45 mins

In [18]:
# total=False makes map_data count anomalous delays above 45 minutes;
# the bubble size then comes from the 'total_delays' column.
map_data(data, 'Count of 45 min > Anomalies', 'total_delays', total=False)

In [19]:
# Monthly delay totals for every origin, anomalies versus non-anomalies.
get_line_graph_all(data)

In [31]:
# Box plot of anomalous WEATHER_DELAY per origin (uses the last get_histogram defined above).
get_histogram(data)

In [48]:
# describe() table of anomalous WEATHER_DELAY per year and travel season, all origins combined.
get_describe(data)

Unnamed: 0,year,travel_season,count,mean,std,min,25%,50%,75%,max
0,2018,high,3240.0,120.727469,138.409626,1.0,67.0,92.0,145.0,1161.0
1,2018,low,2733.0,122.212953,160.345452,1.0,3.0,88.0,142.0,1214.0
2,2018,shoulder,3879.0,129.21423,175.894686,1.0,3.0,88.0,142.0,1466.0
