# Late Flights & Missing Data


In [1]:
import numpy as np
import pandas as pd
import plotly.express as px

# Raw JSON export of the flights dataset, fetched straight from GitHub.
url = "".join(
    [
        "https://raw.githubusercontent.com/byuidatascience/data4missing/master/",
        "data-raw/flights_missing/flights_missing.json",
    ]
)

# Read the file and discard the airport_name column in one step.
df = pd.read_json(url).drop(columns=["airport_name"])
df

Unnamed: 0,airport_code,month,year,num_of_flights_total,num_of_delays_carrier,num_of_delays_late_aircraft,num_of_delays_nas,num_of_delays_security,num_of_delays_weather,num_of_delays_total,minutes_delayed_carrier,minutes_delayed_late_aircraft,minutes_delayed_nas,minutes_delayed_security,minutes_delayed_weather,minutes_delayed_total
0,ATL,January,2005.0,35048,1500+,-999,4598,10,448,8355,116423.0,104415,207467.0,297,36931,465533
1,DEN,January,2005.0,12687,1041,928,935,11,233,3153,53537.0,70301,36817.0,363,21779,182797
2,IAD,January,2005.0,12381,414,1058,895,4,61,2430,,70919,35660.0,208,4497,134881
3,ORD,January,2005.0,28194,1197,2255,5415,5,306,9178,88691.0,160811,364382.0,151,24859,638894
4,SAN,January,2005.0,7283,572,680,638,7,56,1952,27436.0,38445,21127.0,218,4326,91552
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
919,IAD,December,2015.0,2799,182,183,61,0,17,443,,15438,2826.0,0,1825,31164
920,ORD,December,2015.0,25568,923,1755,1364,11,180,4233,80962.0,132055,72045.0,435,22459,307956
921,SAN,,2015.0,6231,480,606,256,5,37,1383,25402.0,35796,9038.0,161,2742,73139
922,SFO,December,2015.0,13833,757,1180,2372,9,147,4465,55283.0,96703,193525.0,285,13788,359584


In [2]:
# Mapping from the original column names to the shorter names used in the
# rest of the notebook.
new_headers = {
    # identifiers / totals
    "airport_code": "code",
    "num_of_flights_total": "total_flights",
    # delay counts ("num_of_delays_*" -> "*_delays")
    "num_of_delays_carrier": "carrier_delays",
    "num_of_delays_late_aircraft": "late_aircraft_delays",
    "num_of_delays_nas": "nas_delays",
    "num_of_delays_security": "security_delays",
    "num_of_delays_weather": "weather_delays",
    "num_of_delays_total": "total_delays",
    # delay minutes ("minutes_delayed_*" -> "*_time")
    "minutes_delayed_carrier": "carrier_time",
    "minutes_delayed_late_aircraft": "late_aircraft_time",
    "minutes_delayed_nas": "nas_time",
    "minutes_delayed_security": "security_time",
    "minutes_delayed_weather": "weather_time",
    "minutes_delayed_total": "total_time",
}

df = df.rename(columns=new_headers)
# Snapshot of the renamed column labels as a NumPy array.
col_list = df.columns.values

In [3]:
# Print each column's dtype and non-null count to spot columns with missing
# values or an unexpected (object) dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 924 entries, 0 to 923
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   code                  924 non-null    object 
 1   month                 924 non-null    object 
 2   year                  901 non-null    float64
 3   total_flights         924 non-null    int64  
 4   carrier_delays        924 non-null    object 
 5   late_aircraft_delays  924 non-null    int64  
 6   nas_delays            924 non-null    int64  
 7   security_delays       924 non-null    int64  
 8   weather_delays        924 non-null    int64  
 9   total_delays          924 non-null    int64  
 10  carrier_time          872 non-null    float64
 11  late_aircraft_time    924 non-null    int64  
 12  nas_time              893 non-null    float64
 13  security_time         924 non-null    int64  
 14  weather_time          924 non-null    int64  
 15  total_time            9

## Checking the text data


In [4]:
# Row count per airport code; dropna=False keeps missing values as their own bucket.
df["code"].value_counts(dropna=False)

ATL    132
DEN    132
IAD    132
ORD    132
SAN    132
SFO    132
SLC    132
Name: code, dtype: int64

In [5]:
# Row count per month label; dropna=False keeps missing values as their own bucket.
df["month"].value_counts(dropna=False)

April        77
July         77
October      77
Febuary      76
November     76
June         75
August       75
September    74
January      73
May          73
December     73
March        71
n/a          27
Name: month, dtype: int64

In [6]:
# carrier_delays has object dtype, so list its distinct values and their frequencies.
df["carrier_delays"].value_counts(dropna=False)

1500+    73
342       4
601       4
411       4
393       4
         ..
391       1
759       1
591       1
1078      1
483       1
Name: carrier_delays, Length: 591, dtype: int64

In [7]:
# Let's turn the "n/a" placeholder into a real missing value (NaN) and, while
# we're at it, fix the "Febuary" misspelling.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on the `df.month` accessor: an in-place call on a
# column pulled out of the frame is not guaranteed to update `df` itself
# (under pandas Copy-on-Write it never does).
df["month"] = df["month"].replace(["n/a", "Febuary"], [np.nan, "February"])
df["month"].value_counts(dropna=False)

April        77
July         77
October      77
February     76
November     76
June         75
August       75
September    74
January      73
May          73
December     73
March        71
NaN          27
Name: month, dtype: int64

## Checking numeric data


In [8]:
# Summary statistics for every non-object column.
(
    df
    .select_dtypes(exclude="object")
    .describe()
)

Unnamed: 0,year,total_flights,late_aircraft_delays,nas_delays,security_delays,weather_delays,total_delays,carrier_time,late_aircraft_time,nas_time,security_time,weather_time,total_time
count,901.0,924.0,924.0,924.0,924.0,924.0,924.0,872.0,924.0,893.0,924.0,924.0,924.0
mean,2010.0,16607.544372,1017.844156,1376.467532,5.417749,100.971861,3437.391775,51902.25344,75511.96645,70057.12318,216.735931,8353.722944,206436.175325
std,3.170699,9868.315498,853.942405,1348.719957,5.414833,103.584998,2561.922043,37085.043011,58110.787045,85937.281155,274.848647,9164.925368,176132.914461
min,2005.0,2684.0,-999.0,61.0,0.0,3.0,320.0,6065.0,6199.0,-999.0,0.0,294.0,18872.0
25%,2007.0,8027.75,488.75,357.75,2.0,34.75,1389.75,23837.75,30411.25,12422.0,67.75,2429.5,71210.5
50%,2010.0,12544.0,804.0,960.0,4.0,66.0,2801.5,38784.5,58472.0,35660.0,150.0,4906.0,151410.5
75%,2013.0,25580.5,1473.75,1869.25,7.0,129.0,4714.75,73581.5,104091.0,95299.0,274.0,10684.25,287184.5
max,2015.0,38241.0,3969.0,8704.0,64.0,812.0,13699.0,220796.0,345456.0,574857.0,4949.0,76770.0,989367.0


In [9]:
# -999 shows up as the minimum of some columns in the summary above; treat it
# as a missing-value sentinel and convert it to NaN across the whole frame.
df = df.replace(to_replace=-999, value=np.nan)

In [10]:
# Fill gaps in month/year from neighbouring rows: ATL rows take the value
# of the following row (back-fill), all other rows take the value of the
# preceding row (forward-fill).
# NOTE(review): this presumably relies on rows being ordered so that ATL is the
# first airport of each month/year group - confirm against the raw data.
is_atl = df["code"] == "ATL"
for col in ("month", "year"):
    df[col] = np.where(is_atl, df[col].bfill(), df[col].ffill())

In [11]:
# Airport codes are an unordered category.
airport_codes = ["ATL", "DEN", "IAD", "ORD", "SAN", "SFO", "SLC"]

# Months are an ordered category so they sort in calendar order.
month_order = [
    "January",
    "February",
    "March",
    "April",
    "May",
    "June",
    "July",
    "August",
    "September",
    "October",
    "November",
    "December",
]

df["code"] = pd.Categorical(df["code"], categories=airport_codes, ordered=False)
df["month"] = pd.Categorical(df["month"], categories=month_order, ordered=True)

In [12]:
# Count the non-null `code` entries in every (year, month) cell.
df.pivot_table(
    index="year",
    columns="month",
    values="code",
    aggfunc="count",
)

month,January,February,March,April,May,June,July,August,September,October,November,December
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2005.0,7,7,7,7,7,7,7,7,7,7,7,7
2006.0,7,7,7,7,7,7,7,7,7,7,7,7
2007.0,7,7,7,7,7,7,7,7,7,7,7,7
2008.0,7,7,7,7,7,7,7,7,7,7,7,7
2009.0,7,7,7,7,7,7,7,7,7,7,7,7
2010.0,7,7,7,7,7,7,7,7,7,7,7,7
2011.0,7,7,7,7,7,7,7,7,7,7,7,7
2012.0,7,7,7,7,7,7,7,7,7,7,7,7
2013.0,7,7,7,7,7,7,7,7,7,7,7,7
2014.0,7,7,7,7,7,7,7,7,7,7,7,7


In [13]:
# Missing entries in late_aircraft_delays, carrier_time and nas_time are
# already NaN at this point, so no fill is needed here. The previous loop called
# fillna(np.nan, inplace=True) on each column, which replaces NaN with NaN (a
# no-op) and is a chained in-place call that modern pandas warns about.
# Just report how many values are missing per column.
df.isna().sum()

code                     0
month                    0
year                     0
total_flights            0
carrier_delays           0
late_aircraft_delays    40
nas_delays               0
security_delays          0
weather_delays           0
total_delays             0
carrier_time            52
late_aircraft_time       0
nas_time                48
security_time            0
weather_time             0
total_time               0
dtype: int64

In [14]:
# Convert year to the smallest integer dtype that can hold its values.
year_as_int = pd.to_numeric(df["year"], downcast="integer")
df = df.assign(year=year_as_int)
df.dtypes

code                    category
month                   category
year                       int16
total_flights              int64
carrier_delays            object
late_aircraft_delays     float64
nas_delays                 int64
security_delays            int64
weather_delays             int64
total_delays               int64
carrier_time             float64
late_aircraft_time         int64
nas_time                 float64
security_time              int64
weather_time               int64
total_time                 int64
dtype: object

In [15]:
# Show the first row whose late_aircraft_delays value is missing.
df[df["late_aircraft_delays"].isna()].head(1)

Unnamed: 0,code,month,year,total_flights,carrier_delays,late_aircraft_delays,nas_delays,security_delays,weather_delays,total_delays,carrier_time,late_aircraft_time,nas_time,security_time,weather_time,total_time
0,ATL,January,2005,35048,1500+,,4598,10,448,8355,116423.0,104415,207467.0,297,36931,465533


In [16]:
# Per-row delay metrics:
#   ratio_delayed  = delayed flights / total flights
#   mins_per_delay = total delay minutes / delayed flights
base_cols = ["code", "month", "year", "total_flights", "total_delays", "total_time"]

worst = df[base_cols].copy()
worst["ratio_delayed"] = worst["total_delays"] / worst["total_flights"]
worst["mins_per_delay"] = worst["total_time"] / worst["total_delays"]

worst

Unnamed: 0,code,month,year,total_flights,total_delays,total_time,ratio_delayed,mins_per_delay
0,ATL,January,2005,35048,8355,465533,0.238387,55.719090
1,DEN,January,2005,12687,3153,182797,0.248522,57.975579
2,IAD,January,2005,12381,2430,134881,0.196268,55.506584
3,ORD,January,2005,28194,9178,638894,0.325530,69.611462
4,SAN,January,2005,7283,1952,91552,0.268021,46.901639
...,...,...,...,...,...,...,...,...
919,IAD,December,2015,2799,443,31164,0.158271,70.347630
920,ORD,December,2015,25568,4233,307956,0.165559,72.751240
921,SAN,December,2015,6231,1383,73139,0.221955,52.884309
922,SFO,December,2015,13833,4465,359584,0.322779,80.533931


In [17]:
# Mean and median of both per-row metrics, one output row per airport.
airport_stats = {
    "avg_percent_delayed": ("ratio_delayed", "mean"),
    "median_percent_delayed": ("ratio_delayed", "median"),
    "avg_time_delay": ("mins_per_delay", "mean"),
    "median_time_delay": ("mins_per_delay", "median"),
}

by_code = worst.groupby("code", observed=True).agg(**airport_stats).reset_index()

by_code

Unnamed: 0,code,avg_percent_delayed,median_percent_delayed,avg_time_delay,median_time_delay
0,ATL,0.201649,0.195276,59.370106,58.409254
1,DEN,0.185782,0.183316,52.95484,52.273664
2,IAD,0.195161,0.192159,60.194677,59.771772
3,ORD,0.228812,0.222261,66.343044,66.403633
4,SAN,0.189708,0.185764,47.041653,46.433666
5,SFO,0.260612,0.252961,61.059171,60.180951
6,SLC,0.144063,0.13439,49.415215,48.019765


In [18]:
# Axis / legend / hover captions for the chart below.
percent_labels = {
    "avg_percent_delayed": "Percent of Flights Delayed (Average)",
    "median_percent_delayed": "Percent of Flights Delayed (Median)",
    "code": "Airport",
}

# Bar height is the average delayed share per airport; bar colour encodes the
# median. The y-axis is restricted to the 10%-30% range.
fig = px.bar(
    data_frame=by_code,
    x="code",
    y="avg_percent_delayed",
    color="median_percent_delayed",
    range_y=[0.1, 0.3],
    text_auto=".1%",
    hover_data=["avg_percent_delayed", "median_percent_delayed"],
    labels=percent_labels,
)
fig.update_layout(title_text="Percent of Flights Delayed by Airport")

# Show both the y-axis ticks and the colour-bar ticks as whole percentages.
fig.update_yaxes(tickformat=".0%")
fig.update_coloraxes(colorbar_tickformat=".0%")

fig.show()

In [19]:
# Axis / legend / hover captions for the chart below.
time_labels = {
    "avg_time_delay": "Average Time Delay (Minutes)",
    "median_time_delay": "Median Time Delay (Minutes)",
    "code": "Airport",
}

# Bar height is the average minutes per delay for each airport; bar colour
# encodes the median. The y-axis is restricted to the 40-70 range.
fig = px.bar(
    data_frame=by_code,
    x="code",
    y="avg_time_delay",
    color="median_time_delay",
    range_y=[40, 70],
    text_auto=".1f",
    hover_data=["avg_time_delay", "median_time_delay"],
    labels=time_labels,
)
fig.update_layout(title_text="Time Delays by Airport")

fig.show()

In [20]:
# Mean and median of both per-row metrics, one output row per calendar month.
month_stats = {
    "avg_percent_delayed": ("ratio_delayed", "mean"),
    "median_percent_delayed": ("ratio_delayed", "median"),
    "avg_time_delay": ("mins_per_delay", "mean"),
    "median_time_delay": ("mins_per_delay", "median"),
}

by_month = worst.groupby("month", observed=True).agg(**month_stats).reset_index()

by_month

Unnamed: 0,month,avg_percent_delayed,median_percent_delayed,avg_time_delay,median_time_delay
0,January,0.216022,0.210417,57.473489,56.125798
1,February,0.21565,0.209804,56.524162,55.750883
2,March,0.199513,0.186395,55.900842,55.222067
3,April,0.179019,0.174432,54.795227,53.647523
4,May,0.186128,0.182134,55.896611,55.676295
5,June,0.23953,0.239995,60.954467,60.718715
6,July,0.225045,0.221496,60.885143,60.189522
7,August,0.200899,0.193962,57.612562,57.487072
8,September,0.158473,0.14364,53.446794,53.535769
9,October,0.173353,0.154455,52.747688,53.133315


In [21]:
# Axis / legend / hover captions for the chart below.
percent_month_labels = {
    "avg_percent_delayed": "Average",
    "median_percent_delayed": "Median",
    "month": "Month",
}

# Bar height is the average delayed share per month; bar colour encodes the
# median. The y-axis is restricted to the 14%-26% range.
fig = px.bar(
    data_frame=by_month,
    x="month",
    y="avg_percent_delayed",
    color="median_percent_delayed",
    range_y=[0.14, 0.26],
    text_auto=".1%",
    hover_data=["avg_percent_delayed", "median_percent_delayed"],
    labels=percent_month_labels,
)
fig.update_layout(title_text="Percent of Flights Delayed by Month")

# Show both the y-axis ticks and the colour-bar ticks as whole percentages.
fig.update_yaxes(tickformat=".0%")
fig.update_coloraxes(colorbar_tickformat=".0%")

fig.show()

In [22]:
# Axis / legend / hover captions for the chart below.
time_month_labels = {
    "avg_time_delay": "Average",
    "median_time_delay": "Median",
    "month": "Month",
}

# Bar height is the average minutes per delay for each month; bar colour
# encodes the median. The y-axis is restricted to the 50-62 range.
fig = px.bar(
    data_frame=by_month,
    x="month",
    y="avg_time_delay",
    color="median_time_delay",
    range_y=[50, 62],
    text_auto=".1f",
    hover_data=["avg_time_delay", "median_time_delay"],
    labels=time_month_labels,
)
fig.update_layout(title_text="Average Delay Time by Month")

fig.show()

In [23]:
# Months in which 40% of NAS delays are attributed to weather; in every other
# month the share used is 65%.
months = ["April", "May", "June", "July", "August"]

weather_cols = [
    "code",
    "month",
    "year",
    "total_flights",
    "total_delays",
    "late_aircraft_delays",
    "nas_delays",
    "weather_delays",
]

# Estimate weather-related delays per row.
# The missing late_aircraft_delays values are imputed with the column mean
# inside `assign`, which builds a new frame. The earlier version called
# fillna(..., inplace=True) on a column of a frame sliced from `df`, which
# raised SettingWithCopyWarning and is not guaranteed to modify `weather`.
# NOTE(review): the 0.3 / 0.4 / 0.65 weights are taken as given; their source
# is not shown in this notebook.
weather = df[weather_cols].assign(
    late_aircraft_delays=lambda x: x.late_aircraft_delays.fillna(
        x.late_aircraft_delays.mean()
    ),
    # weather delays + 30% of late-aircraft delays + month-dependent share of NAS delays
    total_weather_delays=lambda x: np.round(
        x.weather_delays
        + (x.late_aircraft_delays * 0.3)
        + (x.nas_delays * np.where((x.month.isin(months)), 0.4, 0.65))
    ),
    percent_weather_delays=lambda x: x.total_weather_delays / x.total_delays,
)
weather.head()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,code,month,year,total_flights,total_delays,late_aircraft_delays,nas_delays,weather_delays,total_weather_delays,percent_weather_delays
0,ATL,January,2005,35048,8355,1109.104072,4598,448,3769.0,0.451107
1,DEN,January,2005,12687,3153,928.0,935,233,1119.0,0.3549
2,IAD,January,2005,12381,2430,1058.0,895,61,960.0,0.395062
3,ORD,January,2005,28194,9178,2255.0,5415,306,4502.0,0.490521
4,SAN,January,2005,7283,1952,680.0,638,56,675.0,0.345799


In [24]:
# Average weather-delay share per airport, as a two-column frame
# (code, avg_ratio_weather_delays).
weather_share_by_code = weather.groupby("code", observed=True)["percent_weather_delays"]
dat = weather_share_by_code.mean().rename("avg_ratio_weather_delays").reset_index()

dat

Unnamed: 0,code,avg_ratio_weather_delays
0,ATL,0.347733
1,DEN,0.316976
2,IAD,0.301014
3,ORD,0.369628
4,SAN,0.278841
5,SFO,0.371091
6,SLC,0.290095


In [25]:
# Axis captions for the chart below.
weather_labels = {"avg_ratio_weather_delays": "Percent", "code": "Airport"}

# One bar per airport showing the average share of delays attributed to
# weather. The y-axis is restricted to the 25%-40% range.
fig = px.bar(
    data_frame=dat,
    x="code",
    y="avg_ratio_weather_delays",
    range_y=[0.25, 0.4],
    text_auto=".1%",
    labels=weather_labels,
)
fig.update_layout(title_text="Percent of Total Delays caused by Weather")

# Show the y-axis ticks as whole percentages.
fig.update_yaxes(tickformat=".0%")

fig.show()