#Project 1
#By Niteen Kumar, Alexander Low, Jagruti Solao, Brian Weinfeld

In [17]:
import csv
from scipy import stats
import pandas as pd
import networkx as nx
from statistics import mean
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats

A data set consisting of every airport in the world, along with various statistics about each airport, and a second data set containing every connection between these airports, was loaded into python.

In [2]:
df1 = pd.read_table('https://raw.githubusercontent.com/brian-cuny/620project1/master/openflights_airports.txt', sep=' ')
df1[:10]

Unnamed: 0,Airport ID,Name,City,Country,IATA/FAA,ICAO,Latitude,Longitude,Altitude,Timezone,DST
0,1,Goroka,Goroka,Papua New Guinea,GKA,AYGA,-6.081689,145.391881,5282,10.0,U
1,2,Madang,Madang,Papua New Guinea,MAG,AYMD,-5.207083,145.7887,20,10.0,U
2,3,Mount Hagen,Mount Hagen,Papua New Guinea,HGU,AYMH,-5.826789,144.295861,5388,10.0,U
3,4,Nadzab,Nadzab,Papua New Guinea,LAE,AYNZ,-6.569828,146.726242,239,10.0,U
4,5,Port Moresby Jacksons Intl,Port Moresby,Papua New Guinea,POM,AYPY,-9.443383,147.22005,146,10.0,U
5,6,Wewak Intl,Wewak,Papua New Guinea,WWK,AYWK,-3.583828,143.669186,19,10.0,U
6,7,Narsarsuaq,Narssarssuaq,Greenland,UAK,BGBW,61.160517,-45.425978,112,-3.0,E
7,8,Nuuk,Godthaab,Greenland,GOH,BGGH,64.190922,-51.678064,283,-3.0,E
8,9,Sondre Stromfjord,Sondrestrom,Greenland,SFJ,BGSF,67.016969,-50.689325,165,-3.0,E
9,10,Thule Air Base,Thule,Greenland,THU,BGTL,76.531203,-68.703161,251,-4.0,E


In [3]:
df2 = pd.read_table('https://raw.githubusercontent.com/brian-cuny/620project1/master/openflights.txt', 
                    sep=' ', header=None)
df2[:10]

Unnamed: 0,0,1,2
0,1,5,1
1,2,4,1
2,2,5,1
3,2,6,2
4,2,5430,1
5,3,2,1
6,3,5,2
7,3,5431,1
8,3,5434,2
9,4,2,1


This data was loaded into a Neo4j server and subset to only include airports that are located in Canada and the United Kingdom. From there, each of the centrality calculations was computed for each airport

In [4]:
with open('airports_sub.csv') as read_file:
    airports = {int(r[0]): r for r in csv.reader(read_file, delimiter=',')}

with open('connections_sub.csv') as read_file:
    connections = [(int(r[0]), int(r[1])) for r in csv.reader(read_file, delimiter=',')]

G = nx.Graph()
G.add_edges_from(connections)

degree_centrality = nx.degree(G)
degree_closeness = nx.closeness_centrality(G)
eigenvector_centrality = nx.eigenvector_centrality(G)
betweenness_centrality = nx.betweenness_centrality(G)
for ap, deg in degree_centrality:
    airports[ap].append(deg)
    airports[ap].append(degree_closeness[ap])
    airports[ap].append(eigenvector_centrality[ap])
    airports[ap].append(betweenness_centrality[ap])


In order to facilitate calculations, a class was designed to house all the information about a single Airport, along with a helper method designed to extract all the needed values for the various queries.

In [5]:
class Airport():
    def __init__(self, id, airport, country, centrality, closeness, eigenvector, betweenness, international):
        self.id = id
        self.airport = airport
        self.country = country
        self.centrality = float(centrality)
        self.closeness = float(closeness)
        self.eigenvector = float(eigenvector)
        self.betweenness = float(betweenness)
        self.international = international

    def __repr__(self):
        return self.__str__()

    def __str__(self):
        return f'{self.id} "{self.airport}" {self.country} {self.international} {self.centrality}'


def extract(comparison, element, attr):
    return [getattr(x, attr) for x in airports if getattr(x, comparison) == element]

with open('calculations_international.csv') as read_file:
    reader = csv.reader(read_file, delimiter=',')
    next(reader)
    airports = [Airport(*r) for r in reader]

##Centrality Comparison beteween Canada and United Kingdom Airports


In [8]:
canada = extract('country', 'Canada', 'centrality')
uk = extract('country', 'United Kingdom', 'centrality')
f'Canada: {mean(canada)} UK: {mean(uk)}'

'Canada: 3.9351351351351354 UK: 6.186046511627907'

In [9]:

stats.ttest_ind(canada, uk)

Ttest_indResult(statistic=-3.0154243735731856, pvalue=0.0028595361869049748)

In [10]:
canada = extract('country', 'Canada', 'closeness')
uk = extract('country', 'United Kingdom', 'closeness')
f'Canada: {mean(canada)} UK: {mean(uk)}'

'Canada: 0.22580906714515855 UK: 0.19588782248151104'

In [11]:

stats.ttest_ind(canada, uk)

![UK](UKCentralityEdinburgh.png)
![Canada](canadaCentralityVancouverIntl.png)

The above picture show every airport connected to Edinburgh and Vancouver international. It turns out that 40% of all airports in the UK fly into Edinburgh while less than 15% fly into Vancouver International. Canada's higher closeness stems from the fact that every interntional airport connects to every other international airport.

##Centrality Comparison between International and Domestic Airports

In [12]:
inter = extract('international', 'True', 'centrality')
dom = extract('international', 'False', 'centrality')
f'International: {mean(inter)} Domestic: {mean(dom)}'

'International: 17.083333333333332 Domestic: 3.6527777777777777'

In [13]:
stats.ttest_ind(inter, dom)

Ttest_indResult(statistic=13.56153263468452, pvalue=4.742052671702762e-31)

In [14]:
sorted(airports, key=lambda a: a.centrality, reverse=True)[:10]

[156 "Vancouver Intl" Canada True 26.0,
 178 "Calgary Intl" Canada True 25.0,
 49 "Edmonton Intl" Canada True 23.0,
 193 "Lester B Pearson Intl" Canada True 23.0,
 146 "Pierre Elliott Trudeau Intl" Canada True 20.0,
 467 "City" United Kingdom False 19.0,
 532 "Dyce" United Kingdom False 18.0,
 169 "Sioux Lookout" Canada False 17.0,
 160 "Winnipeg Intl" Canada False 16.0,
 534 "Glasgow" United Kingdom True 16.0]

In [15]:
inter = extract('international', 'True', 'betweenness')
dom = extract('international', 'False', 'betweenness')
f'International: {mean(inter)} Domestic: {mean(dom)}'

'International: 0.10039363053781986 Domestic: 0.008939914947524354'

In [16]:
stats.ttest_ind(inter, dom)

![Highest Centrality](smallMap.png)

In conclusion, we find several statistically significant difference between Canada/UK airports and International/Domestic airports. 

Canadian airports have a wider spread than the UK due to the larger size of the country and International airports have a higher centrality and betweeness than domestic due to the spoke and hub nature of airport travel design.

##Plane Analysis

Next we explored whether there was a statistically significant difference in the centrality of airports based on the most common plane at airport.

In [24]:
airport_df1 = pd.read_csv("https://raw.githubusercontent.com/brian-cuny/620project1/master/airport_final.csv")
airport_df1 = airport_df1.drop(['Unnamed: 0'], axis=1)
airport_df2 = pd.read_csv("https://raw.githubusercontent.com/brian-cuny/620project1/master/calculations_international.csv")
airport_df3 = pd.merge(airport_df1, airport_df2, on='ID')
airport_df3.head()

Unnamed: 0,ID,total,Equipment,Airport_Name,City,Country_x,Airport,Country_y,Centrality,Closeness,Eigenvector,Betweenness,International
0,100,18,CRJ,Ottawa Macdonald-Cartier International Airport,Ottawa,Canada,Ottawa Macdonald Cartier Intl,Canada,15,0.344337,0.188821,0.117226,True
1,108,2,DH1,Prince Rupert Airport,Prince Pupert,Canada,Prince Rupert,Canada,2,0.252541,0.023661,0.0,False
2,108,2,DH3,Prince Rupert Airport,Prince Pupert,Canada,Prince Rupert,Canada,2,0.252541,0.023661,0.0,False
3,109,4,J31,Fort Chipewyan Airport,Fort Chipewyan,Canada,Fort Chipewyan,Canada,2,0.224632,0.008939,1.9e-05,False
4,111,8,DH1,Quebec Jean Lesage International Airport,Quebec,Canada,Quebec Jean Lesage Intl,Canada,10,0.289167,0.0584,0.053029,False


In [25]:
equipment_summary = airport_df3['Equipment'].value_counts()
equipment_summary[:10]

BEH    34
DHT    28
DH8    26
DH1    20
AT4    18
DH4    12
SWM    12
738    10
73W    10
BE1     8
Name: Equipment, dtype: int64

We found that the most common type of plane were not the large passenger jets that we are used to but significantly smaller planes. The 'BEH' is 'Beechcraft 1900' and seats 19 passengers. The 'DHT' is the 'Twin Otter' and holds 19 passengers as well.

In [26]:
equipment_rev = ['BEH','DHT','DH8','DH1','AT4','DH4','SWM','738','73W']
airport_df4 = airport_df3[airport_df3.Equipment.isin(equipment_rev)]
airport_hist = airport_df4['Eigenvector'].hist(by=airport_df3['Equipment'])
plt.show()

The data was restricted to only planes that had at least 10 occurances. We will now examine their eigenvector centrality.

In [27]:
airport_df4.groupby('Equipment')['Eigenvector'].mean()

Equipment
738    0.101597
73W    0.113216
AT4    0.007396
BEH    0.012623
DH1    0.016030
DH4    0.090312
DH8    0.004547
DHT    0.002965
SWM    0.015667
Name: Eigenvector, dtype: float64

In [28]:
equip_groups = airport_df4.groupby('Equipment').groups
eigenvector=airport_df4['Eigenvector']
BEH=eigenvector[equip_groups['BEH']]
DHT=eigenvector[equip_groups['DHT']]
DH8=eigenvector[equip_groups['DH8']]
DH1=eigenvector[equip_groups['DH1']]
AT4=eigenvector[equip_groups['AT4']]
DH4=eigenvector[equip_groups['DH4']]
SWM=eigenvector[equip_groups['SWM']]
Seven38=eigenvector[equip_groups['738']]
Seven3W=eigenvector[equip_groups['73W']]
# Source: http://hamelg.blogspot.com/2015/11/python-for-data-analysis-part-16_23.html
stats.f_oneway(DHT,DH8,DH1,AT4,DH4,SWM,Seven38,Seven3W)

F_onewayResult(statistic=26.201802268517, pvalue=5.596882900356146e-22)

We found the difference in centrality to be extremely significant. This supports our earlier conclusion supporting the spoke and hub model. Smaller, less influencial airports also have smaller planes that filter into more central (often international) airports that have much larger planes. 

In conclusion, we found significant differences when comparing Canadian and UK airports, international and domestic airports and when focusing on the types of planes used at the airport. 

All of these support the idea that airports vary greatly based on a number of factors from the size of the country, to their location, to the populuous they serve. Airports appear very adaptable to suit the needs of the populations they support.