In [20]:
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns

In [5]:
# Read the two input tables from the working directory: the per-flight
# records and the airport reference table.
flights, airports = (pd.read_csv(path) for path in ('Airlines.csv', 'airports.csv'))

In [4]:
# Preview the first five flight records.
flights.head(n=5)

Unnamed: 0,id,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length,Delay
0,1,CO,269,SFO,IAH,3,15,205,1
1,2,US,1558,PHX,CLT,3,15,222,1
2,3,AA,2400,LAX,DFW,3,20,165,1
3,4,AA,2466,SFO,DFW,3,20,195,1
4,5,AS,108,ANC,SEA,3,30,202,0


In [5]:
# Preview the first five rows of the airport reference table.
airports.head(n=5)

Unnamed: 0,IATA,AIRPORT,CITY,STATE,COUNTRY,LATITUDE,LONGITUDE
0,ABQ,Albuquerque International,Albuquerque,NM,USA,35.040222,-106.609194
1,ANC,Ted Stevens Anchorage International,Anchorage,AK,USA,61.17432,-149.996186
2,ATL,William B Hartsfield-Atlanta Intl,Atlanta,GA,USA,33.640444,-84.426944
3,AUS,Austin-Bergstrom International,Austin,TX,USA,30.194533,-97.669872
4,BDL,Bradley International,Windsor Locks,CT,USA,41.938874,-72.683228


## Finalized Visualizations:

Example:

\(Not a great visualization, because there are too many airports to read individually, but it looks kind of cool.\)


Description of visualization in markdown cell

- What is being shown
- What are the main takeaways

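We see that larger airports \(Atlanta, O'Hare, Denver, DFW, LAX\) account for the largest shares of all delayed flights. Note that each slice is an airport's number of delayed departures divided by the total number of delayed flights, so it reflects how much traffic the airport handles as well as how often its flights are delayed.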



In [6]:
# Pie chart: each departure airport's share of all delayed flights.
# `Delay` appears to be a 0/1 flag (rows are filtered on `Delay == 1` below),
# so summing it per airport counts that airport's delayed departures.
delayed = flights[flights['Delay'] == 1]

# Select the Delay column *before* aggregating so the string columns
# (Airline, AirportFrom, AirportTo) are not summed along with it.
airport_from_delay_props = (
    flights.groupby('AirportFrom')['Delay'].sum() / delayed.shape[0]
).sort_values()
labels = airport_from_delay_props.index
values = airport_from_delay_props.values

# sort=False keeps the slices in the ascending order computed above instead
# of letting plotly reorder them.
fig = go.Figure(data=[go.Pie(labels=labels, values=values, sort=False, title="Proportion of Delays by Take Off Airport")])
# Labels are drawn inside the slices; labels that cannot be shown at 12pt are hidden.
fig.update_traces(textposition='inside')
fig.update_layout(uniformtext_minsize=12, uniformtext_mode='hide')
fig.write_html('airport_pie.html')
fig.show()

In [16]:
# Pairwise correlation between the numeric columns only. The string columns
# (Airline, AirportFrom, AirportTo) are excluded explicitly because recent
# pandas versions raise instead of silently dropping them.
flights.select_dtypes(include='number').corr()

Unnamed: 0,id,Flight,DayOfWeek,Time,Length,Delay
id,1.0,-0.006156,-0.055474,0.029156,0.020781,0.139762
Flight,-0.006156,1.0,0.000416,-0.00575,-0.341481,-0.046175
DayOfWeek,-0.055474,0.000416,1.0,0.001273,0.013397,-0.026199
Time,0.029156,-0.00575,0.001273,1.0,-0.020612,0.150454
Length,0.020781,-0.341481,0.013397,-0.020612,1.0,0.040489
Delay,0.139762,-0.046175,-0.026199,0.150454,0.040489,1.0


### Delayed Airports in the United States and Territories

Here we find 292 airports spread across the United States, Guam, and Puerto Rico. This visualization shows all the airports, placed using information from the "Airport Coordinates" comma-separated values file. This file includes each airport's acronym, latitude, longitude, name, and percentage of flights that were delayed.



In [17]:
# Map of airports drawn as fixed-size markers coloured by their DELAY value.
# The columns read below are IATA, AIRPORT, DELAY, LATITUDE and LONGITUDE.
df = pd.read_csv("Airport Coordinates.csv")

# Hover label: "<IATA>, <airport name>", then the delay figure on a new line.
df['text'] = (
    df['IATA'] + ', ' + df['AIRPORT']
    + '</br>' + df['DELAY'].astype(str) + '% delay'
)

fig = go.Figure(data=go.Scattergeo(
    lon=df['LONGITUDE'],
    lat=df['LATITUDE'],
    text=df['text'],
    mode='markers',
    marker={
        'size': 8,
        'opacity': 0.8,
        'symbol': 'circle',
        'line': {'width': 0.3, 'color': 'rgba(102, 102, 102)'},
        # The scale is listed red -> green and then reversed, so cmin (0)
        # is drawn green and cmax (the largest DELAY) is drawn red.
        'colorscale': [[0, 'red'], [0.3, 'orange'], [0.5, 'yellow'], [1, 'green']],
        'reversescale': True,
        'autocolorscale': False,
        'color': df['DELAY'],
        'cmin': 0,
        'cmax': df['DELAY'].max(),
        'colorbar_title': "Likelihood of Delays<br>(In percentages)",
    },
))

fig.update_layout(
    title='Most delayed United States airports!<br>(Hover for airport names)',
    geo={
        'scope': 'north america',
        'projection_type': 'albers',
        'showland': True,
        'landcolor': "lightblue",
        'subunitcolor': "rgb(217, 217, 217)",
        'subunitwidth': 0.5,
        'countrycolor': "rgb(217, 217, 217)",
        'countrywidth': 0.25,
    },
)
fig.show()

In [21]:
# Same airport map, but marker size now scales with the SIZE column while
# colour still encodes DELAY.
df = pd.read_csv("Airport Coordinates.csv")

# Hover label: airport code and name, the scaled SIZE value, then the delay.
# NOTE(review): the label calls SIZE * 1000 "sq. ft." while describing it as
# a sum of runway lengths — confirm the intended unit.
df['text'] = (
    df['IATA'] + ', ' + df['AIRPORT']
    + '</br>' + (df["SIZE"] * 1000).astype(str) + ' sq. ft. (sum of runway length) </br>'
    + df['DELAY'].astype(str) + '% delay'
)

fig = go.Figure(data=go.Scattergeo(
    lon=df['LONGITUDE'],
    lat=df['LATITUDE'],
    text=df['text'],
    mode='markers',
    marker={
        # Halved so the largest airports do not swamp the map.
        'size': df['SIZE'] / 2,
        'opacity': 0.8,
        'symbol': 'circle',
        'line': {'width': 0.3, 'color': 'rgba(102, 102, 102)'},
        # The scale is listed red -> green and then reversed, so cmin (0)
        # is drawn green and cmax (the largest DELAY) is drawn red.
        'colorscale': [[0, 'red'], [0.3, 'orange'], [0.5, 'yellow'], [1, 'green']],
        'reversescale': True,
        'autocolorscale': False,
        'color': df['DELAY'],
        'cmin': 0,
        'cmax': df['DELAY'].max(),
        'colorbar_title': "Likelihood of Delays<br>(In percentages)",
    },
))

fig.update_layout(
    title='Most delayed United States airports!<br>(Hover for airport names)',
    geo={
        'scope': 'north america',
        'projection_type': 'albers',
        'showland': True,
        'landcolor': "lightblue",
        'subunitcolor': "rgb(217, 217, 217)",
        'subunitwidth': 0.5,
        'countrycolor': "rgb(217, 217, 217)",
        'countrywidth': 0.25,
    },
)
fig.show()

In [24]:
# Write whatever `df` currently holds to disk, without the row index.
# NOTE(review): `df` is reassigned by several cells; this presumably expects
# the airport-coordinates frame (including its added 'text' column) — confirm.
df.to_csv(path_or_buf='lat_long.csv', index=False)

In [2]:
# Number of delayed flights per airline.
# `Delay` appears to be a 0/1 flag, so the per-airline *sum* is the number of
# delayed flights. A "count" aggregation would return the total number of
# flights instead, which contradicts the chart title and y-axis label.
data = pd.read_csv("Airlines.csv").drop(columns="id")
df = data.groupby("Airline", as_index=False)["Delay"].sum()

# One bar per airline; every airline is plotted (no row cap).
fig = px.histogram(df, x="Airline", y="Delay", color="Airline", title="Comparing Airlines to the Amount of Delays")
fig.update_yaxes(title_text='Number of Delays')
fig.update_xaxes(title_text='Airline Flown')
fig.show()

In [25]:
# Percentage of delayed flights for each departure airport, read from a
# pre-computed CSV.
df = pd.read_csv("Janna/AirportFrom, Delay, Instance, Probability,Percentage.csv")

# Order the rows alphabetically by airport code and plot *that* frame, so the
# bars follow the sorted order (previously the sorted frame was never used).
delay = df.sort_values(by="AirportFrom", ascending=True)

# NOTE(review): px.histogram aggregates y per x value; this assumes the CSV
# has one row per airport — confirm.
prob_hist = px.histogram(delay, x="AirportFrom", y="Percentage", title="Percentage of Delays by AirportFrom", color="AirportFrom")
prob_hist.update_yaxes(title_text='Percentage of Delays')
prob_hist.update_xaxes(title_text='AirportFrom')
prob_hist.show()

The three airports that have the highest percentages of delays \(from highest to lowest\) are MDW \(Chicago Midway\), DAL \(Dallas Love\), and OAK \(Metropolitan Oakland International\). This could be due to the fact that all three of these airports are in or near major cities. MDW and DAL are located in the cities for which they are named, while OAK is located near San Francisco. Chicago, Dallas, and the San Francisco Bay Area each have populations of over a million.

The three airports with the lowest percentages \(from highest to lowest\) are HTS \(Tri\-State Walker\-Long\), FLG \(Flagstaff Pulliam\), and TXK \(Texarkana Regional\-Webb\). HTS is located in Huntington, West Virginia, FLG is located in Flagstaff, Arizona, and TXK is located in Texarkana, Arkansas. All three cities have populations below 75,000.

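This evidence suggests that the higher rates of flight delays at airports in larger cities may be related to their larger populations, and likewise that the lower rates at airports in smaller cities may be related to their smaller populations. Six airports are too few to establish a cause, however; population may simply stand in for factors such as traffic volume and congestion, which would need to be examined directly.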


In [29]:
# Percentage of delayed flights for each arrival airport, read from a
# pre-computed CSV.
df = pd.read_csv("Janna/AirportTo, Delay, Instance, Percentage.csv")

# Order the rows alphabetically by airport code and plot *that* frame, so the
# bars follow the sorted order (previously the sorted frame was never used).
delay = df.sort_values(by="AirportTo", ascending=True)

# NOTE(review): px.histogram aggregates y per x value; this assumes the CSV
# has one row per airport — confirm.
at_hist = px.histogram(delay, x="AirportTo", y="Percentage", title="Percentage of Delays by AirportTo", color="AirportTo")
at_hist.update_yaxes(title_text='Percentage of Delays')
at_hist.update_xaxes(title_text='AirportTo')
at_hist.show()