In [None]:
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import math
from scipy import stats
import numpy as np
import pandas as pd
import warnings
import plotly.graph_objects as go
warnings.simplefilter(action='ignore', category=np.VisibleDeprecationWarning)

First, let's load in our data.

In [None]:
flights = pd.read_csv('flights.csv')
airports = pd.read_csv('airports.csv')
airlines = pd.read_csv('airlines.csv')


Columns (7,8) have mixed types. Specify dtype option on import or set low_memory=False.



In [3]:
airlines.head()

Unnamed: 0,IATA_CODE,AIRLINE
0,UA,United Air Lines Inc.
1,AA,American Airlines Inc.
2,US,US Airways Inc.
3,F9,Frontier Airlines Inc.
4,B6,JetBlue Airways


Now, let's choose which variables would be interesting to analyze in a map display from the flights dataset by exploring the data a bit.

In [None]:
flights.head()

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,2015,1,1,4,AS,98,N407AS,ANC,SEA,5,...,408.0,-22.0,0,0,,,,,,
1,2015,1,1,4,AA,2336,N3KUAA,LAX,PBI,10,...,741.0,-9.0,0,0,,,,,,
2,2015,1,1,4,US,840,N171US,SFO,CLT,20,...,811.0,5.0,0,0,,,,,,
3,2015,1,1,4,AA,258,N3HYAA,LAX,MIA,20,...,756.0,-9.0,0,0,,,,,,
4,2015,1,1,4,AS,135,N527AS,SEA,ANC,25,...,259.0,-21.0,0,0,,,,,,


In [None]:
airlines.head()

Unnamed: 0,IATA_CODE,AIRLINE
0,UA,United Air Lines Inc.
1,AA,American Airlines Inc.
2,US,US Airways Inc.
3,F9,Frontier Airlines Inc.
4,B6,JetBlue Airways


In [None]:
airports.head()

Unnamed: 0,IATA_CODE,AIRPORT,CITY,STATE,COUNTRY,LATITUDE,LONGITUDE
0,ABE,Lehigh Valley International Airport,Allentown,PA,USA,40.65236,-75.4404
1,ABI,Abilene Regional Airport,Abilene,TX,USA,32.41132,-99.6819
2,ABQ,Albuquerque International Sunport,Albuquerque,NM,USA,35.04022,-106.60919
3,ABR,Aberdeen Regional Airport,Aberdeen,SD,USA,45.44906,-98.42183
4,ABY,Southwest Georgia Regional Airport,Albany,GA,USA,31.53552,-84.19447


I think it would be interesting to make a map that draws lines for each flight in the United States. With this map, you could see which airports offer the most flights, which airports are limited, and can see just how interconnected the United States is through its airports. The datascience package doesn't support drawing lines on maps however, so I will be using the plotly.graph_objects and pandas libraries along with the plotly documentation for map lines: https://plotly.com/python/lines-on-maps/ to help me do what I would like to do.

First, for simplicity, let's remove all of the uneccessary columns in the flights and airports dataframes.

In [None]:
flights2 = flights.filter(['ORIGIN_AIRPORT', 'DESTINATION_AIRPORT'], axis=1)
airports2 = airports.filter(['IATA_CODE', 'LATITUDE', 'LONGITUDE'], axis=1)

In [None]:
airports2.head()

Unnamed: 0,IATA_CODE,LATITUDE,LONGITUDE
0,ABE,40.65236,-75.4404
1,ABI,32.41132,-99.6819
2,ABQ,35.04022,-106.60919
3,ABR,45.44906,-98.42183
4,ABY,31.53552,-84.19447


In [None]:
flights2.head()

Unnamed: 0,ORIGIN_AIRPORT,DESTINATION_AIRPORT
0,ANC,SEA
1,LAX,PBI
2,SFO,CLT
3,LAX,MIA
4,SEA,ANC


Next, let's merge flights and airports to get the latitude and longitude of the origin airport.

In [None]:
flights2 = pd.merge(flights2, airports2, left_on='ORIGIN_AIRPORT', right_on='IATA_CODE', how='left')
flights2.head()

Unnamed: 0,ORIGIN_AIRPORT,DESTINATION_AIRPORT,IATA_CODE,LATITUDE,LONGITUDE
0,ANC,SEA,ANC,61.17432,-149.99619
1,LAX,PBI,LAX,33.94254,-118.40807
2,SFO,CLT,SFO,37.619,-122.37484
3,LAX,MIA,LAX,33.94254,-118.40807
4,SEA,ANC,SEA,47.44898,-122.30931


Before we do the same with the destination airport, let's rename the latitude and longitude columns so we can distinguish which lats and longs are for which airport. Also, we can drop the unnecessary Iata Code column.

In [None]:
flights2.rename(columns = {'LATITUDE': 'O_LAT', 'LONGITUDE': 'O_LONG'}, inplace = True)
flights2 = flights2.drop('IATA_CODE', axis=1)
flights2.head()

Unnamed: 0,ORIGIN_AIRPORT,DESTINATION_AIRPORT,O_LAT,O_LONG
0,ANC,SEA,61.17432,-149.99619
1,LAX,PBI,33.94254,-118.40807
2,SFO,CLT,37.619,-122.37484
3,LAX,MIA,33.94254,-118.40807
4,SEA,ANC,47.44898,-122.30931


Now, let's do the same for the destination airports.

In [None]:
flights2 = pd.merge(flights2, airports2, left_on='DESTINATION_AIRPORT', right_on='IATA_CODE', how='left')
flights2.rename(columns = {'LATITUDE': 'D_LAT', 'LONGITUDE': 'D_LONG'}, inplace = True)
flights2 = flights2.drop('IATA_CODE', axis=1)
flights2.head()

Unnamed: 0,ORIGIN_AIRPORT,DESTINATION_AIRPORT,O_LAT,O_LONG,D_LAT,D_LONG
0,ANC,SEA,61.17432,-149.99619,47.44898,-122.30931
1,LAX,PBI,33.94254,-118.40807,26.68316,-80.09559
2,SFO,CLT,37.619,-122.37484,35.21401,-80.94313
3,LAX,MIA,33.94254,-118.40807,25.79325,-80.29056
4,SEA,ANC,47.44898,-122.30931,61.17432,-149.99619


Next, let's use the grouby feature to add a count column and remove duplicate rows.

In [None]:
flights3 = flights2.groupby(['ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'O_LAT', 'O_LONG', 'D_LAT', 'D_LONG']).size().reset_index(name='COUNT')
flights3.head()


Unnamed: 0,ORIGIN_AIRPORT,DESTINATION_AIRPORT,O_LAT,O_LONG,D_LAT,D_LONG,COUNT
0,ABE,ATL,40.65236,-75.4404,33.64044,-84.42694,898
1,ABE,DTW,40.65236,-75.4404,42.21206,-83.34884,711
2,ABE,ORD,40.65236,-75.4404,41.9796,-87.90446,665
3,ABI,DFW,32.41132,-99.6819,32.89595,-97.0372,2329
4,ABQ,ATL,35.04022,-106.60919,33.64044,-84.42694,801


Now, let's start to work on the map. First we can add little markers at each airport using the airport data frame. This code is found from: https://plotly.com/python/lines-on-maps/ and adapted for our dataset by me.

In [None]:
fig = go.Figure()

fig.add_trace(go.Scattergeo(
    locationmode = 'USA-states',
    lon = airports['LONGITUDE'],
    lat = airports['LATITUDE'],
    hoverinfo = 'text',
    text = airports['AIRPORT'],
    mode = 'markers',
    marker = dict(
        size = 2,
        color = 'rgb(255, 0, 0)',
        line = dict(
            width = 3,
            color = 'rgba(68, 68, 68, 0)'
        )
    )))

fig.show()

Now, let's draw the lines.  Lines will be thicker if they are more common. Code adapted from: https://plotly.com/python/lines-on-maps/

In [None]:
flight_paths = []
for i in range(len(flights3)):
    fig.add_trace(
        go.Scattergeo(
            locationmode = 'USA-states',
            lon = [flights3['O_LONG'][i], flights3['D_LONG'][i]],
            lat = [flights3['O_LAT'][i], flights3['D_LAT'][i]],
            mode = 'lines',
            line = dict(width = 1,color = 'red'),
            opacity = float(flights3['COUNT'][i]) / float(flights3['COUNT'].max()),
        ))
fig.show()

Lastly, let's put some finishing touches on the map by adding a title and centralizing the image.

In [None]:
fig.update_layout(
    title_text = '2015 U.S.A Flights<br>(Hover for more info)',
    showlegend = False,
    geo = dict(
        scope = 'north america',
        projection_type = 'azimuthal equal area',
        showland = True,
        landcolor = 'rgb(243, 243, 243)',
        countrycolor = 'rgb(204, 204, 204)',
    ),
)

fig.show()

As you can see, this map is very busy which makes it difficult to get much insight without zooming in. You can get some incredible insights by zooming in, but because we have to turn in a PDF its hard for me to capture this detail. To make it easier to look at, let's redo the same thing but eliminate the bottom 50% of flights based on their frequency to get rid of a lot of uncommon flights. We can find out this data by using the .describe() method in pandas.

In [None]:
flights3.describe()

Unnamed: 0,O_LAT,O_LONG,D_LAT,D_LONG,COUNT
count,4671.0,4671.0,4671.0,4671.0,4671.0
mean,37.304876,-94.451848,37.290431,-94.475388,1139.734318
std,6.613047,18.31235,6.610938,18.317677,1372.617193
min,13.48345,-176.64603,13.48345,-176.64603,1.0
25%,33.43417,-104.683625,33.43417,-104.70025,295.0
50%,38.74769,-88.91595,38.74769,-88.91595,668.0
75%,41.59422,-81.16084,41.53493,-81.20214,1479.0
max,71.28545,-64.79856,71.28545,-64.79856,13744.0


As the above data shows, any flight that happened 668 times or less (less than twice a day) can be removed. After we remove that data with the first few lines of code. We will repeat the steps we did with the old dataframe to get the final visualization.

In [None]:
flights3 = flights3[flights3['COUNT'] > 668]
flights3


Unnamed: 0,ORIGIN_AIRPORT,DESTINATION_AIRPORT,O_LAT,O_LONG,D_LAT,D_LONG,COUNT
0,ABE,ATL,40.65236,-75.44040,33.64044,-84.42694,898
1,ABE,DTW,40.65236,-75.44040,42.21206,-83.34884,711
3,ABI,DFW,32.41132,-99.68190,32.89595,-97.03720,2329
4,ABQ,ATL,35.04022,-106.60919,33.64044,-84.42694,801
7,ABQ,DAL,35.04022,-106.60919,32.84711,-96.85177,1581
...,...,...,...,...,...,...,...
4657,XNA,ATL,36.28187,-94.30681,33.64044,-84.42694,1575
4660,XNA,DFW,36.28187,-94.30681,32.89595,-97.03720,2224
4662,XNA,IAH,36.28187,-94.30681,29.98047,-95.33972,1074
4665,XNA,ORD,36.28187,-94.30681,41.97960,-87.90446,2598


If we do not reset the underlying indices in the dataframe, we will get KeyErrors when we try to make our new map. So, we can use the numpy arange method to reset the index values.

In [None]:
flights3.index = np.arange(1, len(flights3) + 1)
flights3


Unnamed: 0,ORIGIN_AIRPORT,DESTINATION_AIRPORT,O_LAT,O_LONG,D_LAT,D_LONG,COUNT
1,ABE,ATL,40.65236,-75.44040,33.64044,-84.42694,898
2,ABE,DTW,40.65236,-75.44040,42.21206,-83.34884,711
3,ABI,DFW,32.41132,-99.68190,32.89595,-97.03720,2329
4,ABQ,ATL,35.04022,-106.60919,33.64044,-84.42694,801
5,ABQ,DAL,35.04022,-106.60919,32.84711,-96.85177,1581
...,...,...,...,...,...,...,...
2319,XNA,ATL,36.28187,-94.30681,33.64044,-84.42694,1575
2320,XNA,DFW,36.28187,-94.30681,32.89595,-97.03720,2224
2321,XNA,IAH,36.28187,-94.30681,29.98047,-95.33972,1074
2322,XNA,ORD,36.28187,-94.30681,41.97960,-87.90446,2598


In [None]:
fig = go.Figure()

fig.add_trace(go.Scattergeo(
    locationmode = 'USA-states',
    lon = airports['LONGITUDE'],
    lat = airports['LATITUDE'],
    hoverinfo = 'text',
    text = airports['AIRPORT'],
    mode = 'markers',
    marker = dict(
        size = 2,
        color = 'rgb(255, 0, 0)',
        line = dict(
            width = 3,
            color = 'rgba(68, 68, 68, 0)'
        )
    )))



flight_paths = []
for i in range(len(flights3)):
    try:
        fig.add_trace(
            go.Scattergeo(
                locationmode = 'USA-states',
                lon = [flights3['O_LONG'][i], flights3['D_LONG'][i]],
                lat = [flights3['O_LAT'][i], flights3['D_LAT'][i]],
                mode = 'lines',
                line = dict(width = 1,color = 'red'),
                opacity = float(flights3['COUNT'][i]) / float(flights3['COUNT'].max()),
            ))
    except KeyError: #I kept continually getting 1 key error no matter how I arranged the indices, so I decided to just omit the one row that would not work
        pass



fig.update_layout(
    title_text = '2015 U.S.A Flights<br>(Hover for more info)',
    showlegend = False,
    geo = dict(
        scope = 'north america',
        projection_type = 'azimuthal equal area',
        showland = True,
        landcolor = 'rgb(243, 243, 243)',
        countrycolor = 'rgb(204, 204, 204)',
    ),
)

fig.show()


Still, the map is relatively busy. However, there are still several takeaways to be made. First of all, it is astounding to see the number of airports in the United States and the seemingly infinite number of combinations of ways to travel between them. As someone who often complains about flight delays and interuptions, this makes it a bit more understandable. The web of connecting flights is so complex. Ultimatley, it is clear that the most commonly used airports are Atlanta, Dallas, Denver, Chicago, Los Angeles, New York, the airports in Florida like Orlando and Miami, and even Seattle gets a lot of traffic. It is also interesting to see the general lack of airports and flights that travel to Northern area of the United States by Montana, Idaho, Wyoming, and the Dakotas. Overall, the flights are pretty well distributed and there are a lot of flights that happen at least twice a day. While this is my official analysis paragraph, I think it will be interesting to look at only the flights that are in the top 5% of frequency. So that is what is below this textbox.

In [None]:
flights3["COUNT"].quantile(.95)

5104.4000000000005

In [None]:
flights3 = flights3[flights3['COUNT'] > 5104]
flights3.index = np.arange(1, len(flights3) + 1)
fig = go.Figure()

fig.add_trace(go.Scattergeo(
    locationmode = 'USA-states',
    lon = airports['LONGITUDE'],
    lat = airports['LATITUDE'],
    hoverinfo = 'text',
    text = airports['AIRPORT'],
    mode = 'markers',
    marker = dict(
        size = 2,
        color = 'rgb(255, 0, 0)',
        line = dict(
            width = 3,
            color = 'rgba(68, 68, 68, 0)'
        )
    )))



flight_paths = []
for i in range(len(flights3)):
    try:
        fig.add_trace(
            go.Scattergeo(
                locationmode = 'USA-states',
                lon = [flights3['O_LONG'][i], flights3['D_LONG'][i]],
                lat = [flights3['O_LAT'][i], flights3['D_LAT'][i]],
                mode = 'lines',
                line = dict(width = 1,color = 'red'),
                opacity = float(flights3['COUNT'][i]) / float(flights3['COUNT'].max()),
            ))
    except KeyError: #I kept continually getting 1 key error no matter how I arranged the indices, so I decided to just omit the one row that would not work
        pass



fig.update_layout(
    title_text = '2015 U.S.A Flights<br>(Hover for more info)',
    showlegend = False,
    geo = dict(
        scope = 'north america',
        projection_type = 'azimuthal equal area',
        showland = True,
        landcolor = 'rgb(243, 243, 243)',
        countrycolor = 'rgb(204, 204, 204)',
    ),
)

fig.show()