In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split



import ipywidgets as widgets
from IPython.display import display
from IPython.core.display import HTML
from ipywidgets import Layout

%matplotlib inline

## Get data

In [2]:
# Load the weather-delay CSV directly from the project's GitHub repository.
# NOTE(review): this needs network access on every fresh run - consider caching a local copy.
print('Retrieving data from git')
url = 'https://raw.githubusercontent.com/bdbritt/weather_flight_delays/main/weather_delay_data_cleaned.csv'
# Alternative kept for reference: read a local CSV and work on a 30% random sample.
# df = pd.read_csv('weather_delay_data.csv').sample(frac = 0.30)
df = pd.read_csv(url)
print(f'Retrieved {df.shape[0]} records')

Retrieving data from git
Retrieved 840029 records


In [3]:
# left-pad departure times with zeros so every value is at least 4 characters wide
df['DEP_TIME'] = df['DEP_TIME'].astype(str).str.rjust(4, fillchar='0')

# event flags: True wherever the recorded delay is anything other than 0
df['weather_event'] = df['WEATHER_DELAY'].ne(0)
df['nas_event'] = df['NAS_DELAY'].ne(0)

In [4]:
# Columns kept for the analysis. Their order matters: later cells address
# columns by position (e.g. col_index=5 is WEATHER_DELAY).
wanted_cols = [
    'FL_DATE',
    'ORIGIN',
    'OP_CARRIER',
    'DEP_TIME',
    'DEP_DELAY',
    'WEATHER_DELAY',
    'NAS_DELAY',
    'month',
    'year',
    'travel_season',
    'weather_event',
    'nas_event',
]
df = df.loc[:, wanted_cols]

In [None]:
# df = df.drop(['DEST', 'OP_CARRIER', 'ARR_TIME', 'ARR_DELAY', 'CRS_ARR_TIME', 'ACTUAL_ELAPSED_TIME'], axis=1)

In [None]:
df.head()

## Main functions

In [5]:
def run_isf_model(X: np.ndarray, outliers_fraction = 'auto') -> IsolationForest:
    """
    Fit an IsolationForest on ``X`` and return the fitted estimator.

    Parameters
    ----------
    X : np.ndarray
        2-D feature array, passed unchanged to ``IsolationForest.fit``.
    outliers_fraction : 'auto' or float, default 'auto'
        Forwarded as the estimator's ``contamination`` argument.

    Returns
    -------
    IsolationForest
        The fitted model. ``random_state=42`` keeps fits repeatable and
        ``n_jobs=-1`` is passed so scikit-learn may parallelise the work.
    """
    clf = IsolationForest(contamination=outliers_fraction,
                          random_state=42, n_jobs=-1)
    # fit() returns the estimator itself
    return clf.fit(X)


def update_data(df, model, col_index) ->pd.DataFrame:
    """
    Score one feature column with a fitted model and flag anomalies.

    Two columns are written to ``df`` **in place** and the same frame is
    returned:

    * ``scores``  - ``model.decision_function`` output for every row
    * ``anomaly`` - ``True`` where ``model.predict`` returns ``-1``

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the feature column; mutated by this call.
    model
        Fitted estimator exposing ``decision_function`` and ``predict``.
    col_index : int
        Positional index of the feature column in ``df``.

    Returns
    -------
    pd.DataFrame
        ``df`` itself, with the two columns added/overwritten.
    """
    # Read the feature once, before any column is appended: a negative
    # col_index would otherwise resolve to the freshly added 'scores' column
    # on the second lookup.
    X = df.iloc[:, [col_index]].values

    df['scores'] = model.decision_function(X)
    # predict() labels outliers as -1; compare on the whole array at once
    df['anomaly'] = np.asarray(model.predict(X)) == -1

    return df


def get_model_results(df: pd.DataFrame, col_index):
    """
    Fit an IsolationForest on a single column of ``df`` and return a copy
    of ``df`` carrying the resulting ``scores`` and ``anomaly`` columns.

    The caller's frame is left untouched; the model is fitted and applied
    on the same rows.
    """
    frame = df.copy()

    # single column selected with a list so the result stays 2-D
    features = frame.iloc[:, [col_index]].values
    fitted = run_isf_model(features)

    return update_data(frame, fitted, col_index)


def run_model(df, group_id = ['ORIGIN', 'year'], col_index=8) -> pd.DataFrame:
    """
    Fit one IsolationForest per group and stack the scored groups.

    ``df`` is grouped by ``group_id``; each group is scored independently
    on the column at position ``col_index`` and the results are
    concatenated in group order.

    NOTE(review): with the column order used in this notebook the default
    ``col_index=8`` is ``year``; the notebook's own call passes 5 - confirm
    the default is intended.
    """
    frame = df.copy()

    scored_groups = []
    for _key, group in frame.groupby(group_id):
        scored_groups.append(get_model_results(group, col_index))

    return pd.concat(scored_groups)


## Run IsolationForest

In [70]:
# Keep only flights flagged with a weather event, then score WEATHER_DELAY
# (positional column 5) separately for each ORIGIN/year group.
weather_df = df.loc[df['weather_event']].copy()
# weather_df = weather_df.query('WEATHER_DELAY >= 45')
weather_isf_data = run_model(weather_df, col_index=5)
weather_isf_data.shape

(167557, 14)

In [71]:
weather_isf_data['anomaly'].value_counts()

False    134132
True      33425
Name: anomaly, dtype: int64

In [72]:
weather_isf_data.head()

Unnamed: 0,FL_DATE,ORIGIN,OP_CARRIER,DEP_TIME,DEP_DELAY,WEATHER_DELAY,NAS_DELAY,month,year,travel_season,weather_event,nas_event,scores,anomaly
84849,2014-01-01,ATL,WN,1723,63.0,63.0,35.0,1,2014,low,True,True,-0.006616,True
85180,2014-01-02,ATL,DL,2256,55.0,1.0,0.0,1,2014,low,True,False,-0.014018,True
85181,2014-01-02,ATL,DL,1955,23.0,1.0,0.0,1,2014,low,True,False,-0.014018,True
85182,2014-01-02,ATL,DL,1802,26.0,26.0,5.0,1,2014,low,True,True,0.071311,False
85183,2014-01-02,ATL,DL,1812,37.0,6.0,0.0,1,2014,low,True,False,0.078049,False


In [None]:
# isf_data.groupby(['ORIGIN', 'year', 'travel_season'])['WEATHER_DELAY'].describe().reset_index()

## Analyze data

In [73]:
data = weather_isf_data.copy()

In [74]:
import plotly.graph_objects as go
from scipy import stats
from statsmodels.robust.scale import mad

In [176]:
ALL = 'ALL'
def unique_sorted_values_plus_ALL(array):
    """
    Return the distinct values of ``array`` in ascending order, preceded
    by the ``ALL`` sentinel.
    """
    distinct = array.unique().tolist()
    return [ALL, *sorted(distinct)]

def get_describe(df):
    """
    Display ``describe()`` statistics of WEATHER_DELAY for the anomalous
    rows of ``df``, one row per (year, travel_season) pair.
    """
    flagged = df.loc[df['anomaly'].eq(True)]
    summary = (
        flagged
        .groupby(['year', 'travel_season'])['WEATHER_DELAY']
        .describe()
        .reset_index()
    )
    display(summary)

def get_bar_graph(df):
    """
    Show a grouped bar chart of summed anomalous WEATHER_DELAY per year,
    with one bar per travel season.

    The title is prefixed with the origin when ``df`` contains exactly one
    origin and with ``ALL`` otherwise, so a multi-origin frame is no longer
    labelled with whichever origin happens to come first.
    """
    origins = df['ORIGIN'].unique()
    origin = origins[0] if len(origins) == 1 else ALL

    anomalies = df.loc[df['anomaly']==True]
    totals = anomalies.groupby(['year', 'travel_season']).agg({'WEATHER_DELAY':'sum'}).reset_index()

    fig = px.bar(totals, color='travel_season', y='WEATHER_DELAY', x='year', barmode='group', title=f'{origin} Total Season by Year')
    fig.show()


def determine_est_of_location(df, col1):
    """
    Prints common estimate of location
    information for data

    Only rows where ``df['anomaly']`` is True are considered. Printed, in
    order: max, min, mean, 10%-trimmed mean and median of ``df[col1]``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a boolean ``anomaly`` column.
    col1 : str
        Name of the numeric column to summarise.
    """
    values = df.loc[df['anomaly']==True, col1]

    # Ask for max/min directly instead of indexing describe() by integer
    # position: describe() is label-indexed, and positional access on such a
    # Series is deprecated in pandas 2.x.
    print(f'\nMax: {values.max()}')
    print(f'Min: {values.min()}')
    print(f'Mean: {round(np.mean(values),2)}')
    print(f'Trimmed Mean: {round(stats.trim_mean(values, proportiontocut=0.1),2)}')
    print(f'Median: {values.median()}')


def determine_est_of_variability(df, col1):
    """
    Prints common estimate of variability
    information for data

    Only rows where ``df['anomaly']`` is True are considered. Printed, in
    order: population standard deviation, standard deviation of the values
    lying between the column's 10th and 90th percentiles, inter-quartile
    range, mean absolute deviation and normalized median absolute deviation.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a boolean ``anomaly`` column.
    col1 : str
        Name of the numeric column to summarise.
    """
    values = df.loc[df['anomaly']==True, col1]
    arr = values.to_numpy(dtype=float)

    print(f'\nSTD: {round(values.std(ddof=0),2)}') # population STD

    # Trim limits must come from col1 itself; taking them from the first
    # numeric column of the frame trims against an unrelated scale.
    lower, upper = values.quantile([0.10, 0.90]).tolist()
    tstd = stats.tstd(arr, (lower, upper))
    print(f'Trimmed STD: {round(tstd,2)}')

    print(f'IQR: {values.quantile(0.75) - values.quantile(0.25)}')

    # Series.mad() was removed in pandas 2.0; this is the same quantity.
    mean_abs_dev = (values - values.mean()).abs().mean()
    print(f'Mean absolute deviation: {round(mean_abs_dev,2)}')

    # Median absolute deviation divided by the normal 75th-percentile point
    # (about 0.6745), i.e. what statsmodels' mad() returns with its defaults.
    median_abs_dev = np.median(np.abs(arr - np.median(arr))) / stats.norm.ppf(0.75)
    print(f'Median absolute deviation: {round(median_abs_dev,2)}')


def get_histogram(df):
    """
    Show box plots of anomalous WEATHER_DELAY values per year, coloured by
    travel season.

    Despite its name this draws a box plot (``px.box``), not a histogram.
    """
    # The former ``origin`` lookup was never used and raised on an empty frame.
    anomalies = df.loc[df['anomaly']==True]

    fig = px.box(anomalies, x="year", y="WEATHER_DELAY", color="travel_season")
    fig.show()
    
def get_line_graph(df):
    """
    Show monthly WEATHER_DELAY totals as one pair of lines per year:
    solid for non-anomalous rows, dashed for anomalous rows.

    The plotted years and their colours are fixed (2014-2018); rows from
    any other year are not drawn.
    """
    # Fixed year -> colour mapping; insertion order sets the trace order.
    year_colors = {
        2014: 'red',
        2015: 'green',
        2016: 'blue',
        2017: 'goldenrod',
        2018: 'magenta',
    }

    def _monthly_totals(anomaly_flag):
        # Sum WEATHER_DELAY per (year, month) over rows with the given flag.
        return (
            df.loc[df['anomaly']==anomaly_flag]
            .groupby(['year', 'month'])['WEATHER_DELAY']
            .sum()
            .to_frame('total_min')
            .reset_index()
        )

    non_anomalies_monthly_sum = _monthly_totals(False)
    anomalies_monthly_sum = _monthly_totals(True)

    fig = go.Figure()

    for year, color in year_colors.items():
        non_year = non_anomalies_monthly_sum.loc[non_anomalies_monthly_sum['year']==year]
        ano_year = anomalies_monthly_sum.loc[anomalies_monthly_sum['year']==year]

        fig.add_trace(go.Scatter(x=non_year['month'], y=non_year['total_min'], name=f'{year}-Non',
                                 line=dict(color=color, width=2)))

        fig.add_trace(go.Scatter(x=ano_year['month'], y=ano_year['total_min'], name=f'{year}-Anomalies',
                                 line=dict(color=color, width=2,
                                           dash='dash')))

    fig.show()

In [177]:
# Origin selector: every distinct ORIGIN found in `data`, preceded by the ALL entry.
dropdown_state = widgets.Dropdown(options = unique_sorted_values_plus_ALL(data.ORIGIN), description='Origin: ')

# One Output widget per dashboard view; event_action() clears and refills each of them.
output_origin = widgets.Output()
describe_output = widgets.Output()
bar_graph_output = widgets.Output()
histo_graph_output = widgets.Output()
line_graph_output = widgets.Output()

def event_action():
    """
    Redraw every dashboard output for the origin selected in
    ``dropdown_state``.

    All five Output widgets are cleared, ``data`` is filtered to the chosen
    origin (or used whole when ``ALL`` is selected) and each output is
    refilled:

    * ``output_origin``      - record counts plus location / variability
      statistics of WEATHER_DELAY, for the selection and again for rows
      with ``WEATHER_DELAY >= 45``
    * ``describe_output``    - ``get_describe``
    * ``bar_graph_output``   - ``get_bar_graph``
    * ``histo_graph_output`` - ``get_histogram``
    * ``line_graph_output``  - ``get_line_graph``
    """
    # clear the previous selection on each iteration
    for out in (output_origin, describe_output, bar_graph_output,
                histo_graph_output, line_graph_output):
        out.clear_output()

    if dropdown_state.value == ALL:
        common_filter = data
    else:
        common_filter = data[data.ORIGIN == dropdown_state.value]

    with output_origin:
        # The statistics below look at anomalous rows only, so report that
        # count explicitly instead of labelling the total row count as anomalies.
        anomaly_count = int(common_filter['anomaly'].sum())
        print(f'Record count: {common_filter.shape[0]}')
        print(f'Anomaly record count: {anomaly_count}')
        determine_est_of_location(common_filter, 'WEATHER_DELAY')
        determine_est_of_variability(common_filter, 'WEATHER_DELAY')

        print('\nDelays 45 min >=')

        query_syn = 'WEATHER_DELAY >= 45'
        filter_45min = common_filter.query(query_syn)
        anomaly_count_45min = int(filter_45min['anomaly'].sum())
        print(f'Record count: {filter_45min.shape[0]}')
        print(f'Anomaly record count: {anomaly_count_45min}')
        determine_est_of_location(filter_45min, 'WEATHER_DELAY')
        determine_est_of_variability(filter_45min, 'WEATHER_DELAY')

    with describe_output:
        get_describe(common_filter)

    with bar_graph_output:
        get_bar_graph(common_filter)

    with histo_graph_output:
        get_histogram(common_filter)

    with line_graph_output:
        get_line_graph(common_filter)
        
        

def dropdown_state_eventhandler(change):
    """Observer callback for the origin dropdown: ignores ``change`` and redraws the dashboard."""
    event_action()
    
def graphit():
    """Redraw the dashboard; a thin alias for ``event_action`` used by ``tab_chg``."""
    event_action()
    
def tab_chg(chg):
    """
    Observer callback for the Tab widget: redraws the dashboard when the
    reported change has an empty dict as its previous value.
    """
    # NOTE(review): presumably this fires once when the widget is first
    # displayed, so the outputs get an initial render - confirm which trait
    # actually reports ``old == {}``.
    if chg.old == {}:
        graphit()

dropdown_state.observe(dropdown_state_eventhandler, names='value')

In [179]:
input_widgets = widgets.HBox([dropdown_state])

# Each tab's title sits next to the Output widget it labels, so the two
# cannot drift apart when tabs are added or reordered.
_tab_specs = [
    ('Dataset', output_origin),
    ('Describe Data', describe_output),
    ('Travel Season By Year', bar_graph_output),
    ('Distribution', histo_graph_output),
    ('Line Chart', line_graph_output),
]
tab = widgets.Tab([_tab_output for _tab_title, _tab_output in _tab_specs])
for _tab_idx, (_tab_title, _tab_output) in enumerate(_tab_specs):
    tab.set_title(_tab_idx, _tab_title)
# tab.set_title(5, 'State Density Plot')
tab.observe(tab_chg)

dashboard = widgets.VBox([tab], layout=Layout(height='700px'))
display(input_widgets, dashboard)

HBox(children=(Dropdown(description='Origin: ', index=2, options=('ALL', 'ATL', 'DEN', 'DFW', 'LAX', 'ORD'), v…

VBox(children=(Tab(children=(Output(outputs=({'output_type': 'stream', 'text': 'Anamoly Record count: 19579\n\…

## Testing

In [181]:
data.head()
testing = data.loc[data['ORIGIN']=='ATL'].copy()

In [180]:
import seaborn as sns

In [183]:
testing2 = testing.loc[testing['anomaly']==True].copy()
testing2.shape

(9721, 14)

In [185]:
testing2.head()

Unnamed: 0,FL_DATE,ORIGIN,OP_CARRIER,DEP_TIME,DEP_DELAY,WEATHER_DELAY,NAS_DELAY,month,year,travel_season,weather_event,nas_event,scores,anomaly
84849,2014-01-01,ATL,WN,1723,63.0,63.0,35.0,1,2014,low,True,True,-0.006616,True
85180,2014-01-02,ATL,DL,2256,55.0,1.0,0.0,1,2014,low,True,False,-0.014018,True
85181,2014-01-02,ATL,DL,1955,23.0,1.0,0.0,1,2014,low,True,False,-0.014018,True
85191,2014-01-02,ATL,DL,1955,51.0,1.0,0.0,1,2014,low,True,False,-0.014018,True
85194,2014-01-02,ATL,DL,2120,85.0,81.0,0.0,1,2014,low,True,False,-0.028418,True


In [None]:
# Seeded generator shared by the cells below; every draw advances its state,
# so results depend on these cells being run in order.
rng = np.random.RandomState(42)

# Generate train data
# 100 Gaussian points (scale 0.3), duplicated around (2, 2) and (-2, -2) -> 200 rows
X = 0.3 * rng.randn(100, 2)
X_train = np.r_[X + 2, X - 2]
# Generate some regular novel observations
# 20 fresh points, again shifted to both cluster centres -> 40 rows
X = 0.3 * rng.randn(20, 2)
X_test = np.r_[X + 2, X - 2]
# Generate some abnormal novel observations
# 20 points drawn uniformly from [-4, 4) on both axes
X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))

In [None]:
# fit the model
clf = IsolationForest(max_samples=100, random_state=rng)
clf.fit(X_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
y_pred_outliers = clf.predict(X_outliers)

In [None]:
# plot the line, the samples, and the nearest vectors to the plane
xx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50))
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

In [None]:
# Decision-function surface as filled contours, with the three point sets drawn on top.
plt.title("IsolationForest")
plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)

# white = training points, green = regular test points, red = uniformly drawn outliers
b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c="white", s=20, edgecolor="k")
b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c="green", s=20, edgecolor="k")
c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c="red", s=20, edgecolor="k")
plt.axis("tight")
# match the axes to the extent of the evaluation grid
plt.xlim((-5, 5))
plt.ylim((-5, 5))
plt.legend(
    [b1, b2, c],
    ["training observations", "new regular observations", "new abnormal observations"],
    loc="upper left",
)
plt.show()

In [None]:
def model_testing(X, X_outliers):
    """
    Fit an IsolationForest on an 80/20 split of ``X`` and plot its decision
    surface together with the train, test and ``X_outliers`` points.

    The contour is evaluated with the model fitted inside this function, on
    a grid spanning the supplied data, so the background matches the points
    drawn on top of it. Nothing is read from notebook globals.

    Parameters
    ----------
    X : np.ndarray
        2-D array whose first two columns are plotted; it is split into
        train (80%) and test (20%) with a fixed seed.
    X_outliers : np.ndarray
        Points with the same column layout as ``X``, drawn in red.
    """
    X_train, X_test = train_test_split(X, test_size = 0.20, random_state=42)

    # fit the model (fixed seed instead of the shared global generator, so
    # the result does not depend on which cells ran before)
    clf = IsolationForest(max_samples=100, random_state=42)
    clf.fit(X_train)

    # Grid covering every plotted point, padded by 5% of the range per axis
    # (or by 1.0 when an axis has no spread at all).
    points = np.vstack([X_train, X_test, X_outliers]).astype(float)
    lows, highs = points.min(axis=0), points.max(axis=0)
    pads = np.where(highs > lows, 0.05 * (highs - lows), 1.0)
    xx, yy = np.meshgrid(
        np.linspace(lows[0] - pads[0], highs[0] + pads[0], 50),
        np.linspace(lows[1] - pads[1], highs[1] + pads[1], 50),
    )
    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

    plt.title("IsolationForest")
    plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)

    b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c="white", s=20, edgecolor="k")
    b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c="green", s=20, edgecolor="k")
    c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c="red", s=20, edgecolor="k")
    plt.axis("tight")
    plt.legend(
        [b1, b2, c],
        ["training observations", "new regular observations", "new abnormal observations"],
        loc="upper left",
    )
    plt.show()
    
    
    

In [None]:
# NOTE(review): `df_testing` is not defined in any visible cell of this notebook, so this
# raises NameError on a fresh kernel unless it is created elsewhere - confirm its origin.
# Split by year: every year except 2018 goes to training, 2018 is held out.
df_training = df_testing.loc[df_testing['year'] != 2018].copy(deep=True)
df_pred = df_testing.loc[df_testing['year'] == 2018].copy()

In [None]:
model_testing(df_training.iloc[:, [8,9]].values, df_pred.iloc[:, [8,9]].values)

In [None]:
df_pred.info()

In [None]:
# NOTE(review): `org_test` is not defined in any visible cell of this notebook - confirm where it comes from.
# Each line takes a single column by position (8, then 9) as a 2-D array.
X = org_test.iloc[:, [8]].values
X1 = org_test.iloc[:, [9]].values