In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
%matplotlib inline
# Show every column when a DataFrame is rendered (None removes the column cap).
pd.options.display.max_columns = None

In [None]:
# Column dtypes handed to pd.read_csv, chosen to keep the frame small in memory.
#
# np.float16 has an 11-bit significand: integers are exact only up to 2048 and
# above that only every 2nd (then 4th, ...) integer is representable, e.g.
# np.float16(2359) == 2360.  Columns whose values can plausibly exceed 2048 are
# therefore stored as np.float32; small flags/groups/durations stay float16.
# NOTE(review): value ranges are inferred from the column names (HHMM clock
# times, miles, minutes of delay) -- confirm against the data dictionary.
dtypes = {
    # calendar fields and identifiers (no missing values expected -> integer)
    'YEAR': np.int16,
    'MONTH': np.int16,
    'DAY_OF_MONTH': np.int16,
    'DAY_OF_WEEK': np.int16,
    'FL_NUM': np.int16,
    'ORIGIN_AIRPORT_ID': np.int16,
    'DEST_AIRPORT_ID': np.int16,

    # departure
    'CRS_DEP_TIME': np.int16,
    'DEP_TIME': np.float32,         # presumably HHMM up to 2400 -> not exact in float16
    'DEP_DELAY': np.float32,        # minutes; may exceed 2048 for extreme delays
    'DEP_DELAY_NEW': np.float32,
    'DEP_DEL15': np.float16,
    'DEP_DELAY_GROUP': np.float16,
    'TAXI_OUT': np.float16,
    'WHEELS_OFF': np.float32,       # presumably HHMM

    # arrival
    'WHEELS_ON': np.float32,        # presumably HHMM
    'TAXI_IN': np.float16,
    'CRS_ARR_TIME': np.int16,
    'ARR_TIME': np.float32,         # presumably HHMM
    'ARR_DELAY': np.float32,
    'ARR_DELAY_NEW': np.float32,
    'ARR_DEL15': np.float16,
    'ARR_DELAY_GROUP': np.float16,

    # flight status and durations
    'CANCELLED': np.float16,
    'DIVERTED': np.float16,
    'CRS_ELAPSED_TIME': np.float16,
    'ACTUAL_ELAPSED_TIME': np.float16,
    'AIR_TIME': np.float16,
    'FLIGHTS': np.float16,
    'DISTANCE': np.float32,         # long-haul distances can exceed 2048
    'DISTANCE_GROUP': np.int16,

    # delay causes (minutes)
    'CARRIER_DELAY': np.float32,
    'WEATHER_DELAY': np.float32,
    'NAS_DELAY': np.float32,
    'SECURITY_DELAY': np.float32,
    'LATE_AIRCRAFT_DELAY': np.float32,
}

# Columns that pd.read_csv should parse as datetimes.
parse_dates = ['FL_DATE']

In [None]:
# Read the CSV using the explicit per-column dtypes and the datetime columns
# listed in `parse_dates`.
data = pd.read_csv(
    "../airOT201201.csv",
    dtype=dtypes,
    parse_dates=parse_dates,
)

In [None]:
# Preview the first five rows of the loaded frame.
data.head(5)

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Show the dtype of every column in the loaded frame.
data.dtypes

In [None]:
# Second look at the first five rows before the aggregation cells.
data.head(n=5)

In [None]:
# Share of flights with ARR_DEL15 == 1 within each distance group.
#
# size() counts rows per (DISTANCE_GROUP, ARR_DEL15) pair directly, instead of
# running count() over every column and reading the row count back from "YEAR".
# Rows whose ARR_DEL15 is NaN are excluded by groupby, so they are not part of
# the denominator.
by_distgrp = (
    data.groupby(['DISTANCE_GROUP', 'ARR_DEL15'])
    .size()
    .reset_index(name='N_FLIGHTS')
)
by_distgrp['DELAY_PERCENT'] = (
    by_distgrp['N_FLIGHTS']
    / by_distgrp.groupby('DISTANCE_GROUP')['N_FLIGHTS'].transform('sum')
)
by_distgrp = by_distgrp.query('ARR_DEL15==1')[["DISTANCE_GROUP", "DELAY_PERCENT"]]

# DELAY_PERCENT is kept as a 0-1 fraction in the frame; scale it by 100 only
# for display so the plotted values match the "Percentage" axis label.
plt.plot(by_distgrp["DISTANCE_GROUP"], by_distgrp["DELAY_PERCENT"] * 100)
plt.xlabel("Distance Group")
plt.ylabel("Percentage of Delay")
plt.show()

In [None]:
# Share of flights with ARR_DEL15 == 1 per carrier and day of week.
#
# size() counts rows per (carrier, day, flag) directly rather than count()-ing
# every column; rows with a NaN ARR_DEL15 are excluded by groupby.
by_carrier_dayofweek = (
    data.groupby(['UNIQUE_CARRIER', 'DAY_OF_WEEK', 'ARR_DEL15'])
    .size()
    .reset_index(name='N_FLIGHTS')
)
by_carrier_dayofweek['DELAY_PERCENT'] = (
    by_carrier_dayofweek['N_FLIGHTS']
    / by_carrier_dayofweek.groupby(['UNIQUE_CARRIER', 'DAY_OF_WEEK'])['N_FLIGHTS'].transform('sum')
)
by_carrier_dayofweek = by_carrier_dayofweek.query('ARR_DEL15==1')[["UNIQUE_CARRIER", "DAY_OF_WEEK", "DELAY_PERCENT"]]

selected_carrier = ['AA', 'AS', 'UA', 'US', 'CO', 'DL', 'WN', 'NW']
for carrier in selected_carrier:
    # Filter once per carrier and reuse the result for both axes.
    carrier_rows = by_carrier_dayofweek[by_carrier_dayofweek["UNIQUE_CARRIER"] == carrier]
    if carrier_rows.empty:
        # No data for this carrier: skip it so the legend lists only drawn lines.
        continue
    # DELAY_PERCENT is a 0-1 fraction; scale to percent to match the axis label.
    plt.plot(carrier_rows["DAY_OF_WEEK"], carrier_rows["DELAY_PERCENT"] * 100, label=carrier)
plt.xlabel("Day of the week")
plt.ylabel("Percentage of Delay")
# Labels come from each line's `label=`, so they cannot drift out of order.
plt.legend()
plt.show()

In [None]:
# Delay-cause columns tallied in the stacked bar charts.
delay_type = ['CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY']

# Among rows with ARR_DEL15 == 1, count per day of week how many rows have a
# positive value in each delay-cause column.  The DAY_OF_WEEK key is discarded,
# leaving a 0-based positional index with one row per day present in the data.
by_delay_dayofweek = (
    data.query('ARR_DEL15==1')
    .groupby('DAY_OF_WEEK')[delay_type]
    .apply(lambda day_rows: (day_rows > 0).sum())
    .reset_index(drop=True)
)

by_delay_dayofweek.plot.bar(stacked=True, colormap='Set3')
plt.legend(loc="upper left", ncol=2)
plt.xlabel("Day of Week")
plt.ylabel("Total Number")
# Relabel the seven bar positions 0..6 as day numbers 1..7.
plt.xticks(list(range(7)), list(range(1,8)))
plt.show()

In [None]:
# Turn the per-day counts into row-wise proportions: each row is divided by its
# own total across the delay-cause columns (a row whose total is 0 yields NaN).
# To inspect the table itself, evaluate `by_delay_dayofweek_prop` as the last
# line of its own cell; a bare expression in the middle of a cell is not shown.
by_delay_dayofweek_prop = by_delay_dayofweek.div(by_delay_dayofweek.sum(axis=1), axis=0)

by_delay_dayofweek_prop.plot(kind='bar',
                    stacked=True,
                    colormap='Set3')
plt.legend(loc="upper left", ncol=2)
plt.xlabel("Day of Week")
plt.ylabel("Proportion")
# Relabel the seven bar positions 0..6 as day numbers 1..7.
plt.xticks(list(range(7)), list(range(1,8)))
plt.show()