In [None]:
# Data source: OpenFlights, http://openflights.org/data.html

# Data preparation

In [None]:
import pandas as pd
import csv, sys
import matplotlib.pyplot as plt
import matplotlib.cm as cm

def _read_csv_rows(filename):
    """Return every row of the CSV file `filename` as a list of lists.

    Exits the interpreter (via sys.exit) with a message naming the file and
    the offending line when the csv module reports a parse error.
    """
    # csv.reader expects a binary file on Python 2, but a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] >= 3:
        # NOTE(review): assumes the OpenFlights dumps are UTF-8 encoded -- confirm.
        handle = open(filename, 'r', newline='', encoding='utf-8')
    else:
        handle = open(filename, 'rb')
    with handle as f:
        reader = csv.reader(f)
        try:
            return list(reader)
        except csv.Error as e:
            sys.exit('file %s, line %d: %s' % (filename, reader.line_num, e))


# Raw rows (lists of strings) of the three data files.
airports_raw = _read_csv_rows('airports.dat')
airlines_raw = _read_csv_rows('airlines.dat')
routes_raw = _read_csv_rows('routes.dat')



In [None]:
# Wrap each list of parsed rows in a DataFrame; columns keep pandas' default
# integer labels until they are named in the next cell.
airports_df, airlines_df, routes_df = map(
    pd.DataFrame, (airports_raw, airlines_raw, routes_raw)
)

In [None]:
# Name the positional columns of each table. The number of names must match
# the number of fields per row in the corresponding data file.
airports_df.columns = [
    'ID', 'Name', 'City', 'Country',
    'IATA/FAA', 'ICAO',
    'Latitude', 'Longitude', 'Altitude',
    'Timezone', 'DST', 'Tz',
]
airlines_df.columns = [
    'ID', 'Name', 'Alias',
    'IATA', 'ICAO', 'Callsign',
    'Country', 'Active',
]
routes_df.columns = [
    'Airline', 'AirlineID',
    'Source', 'Source ID',
    'Destination', 'Destination ID',
    'Codeshare', 'Stops', 'Equipment',
]

In [None]:
# The csv reader yields strings; convert the two coordinate columns to
# numbers so they can be plotted and clustered.
for coord_col in ('Longitude', 'Latitude'):
    airports_df[coord_col] = pd.to_numeric(airports_df[coord_col])

In [None]:
# Quick look at the raw data: one small green dot per airport.
fig, ax = plt.subplots()
ax.plot(airports_df['Longitude'], airports_df['Latitude'], 'g.', markersize=1)
plt.show()

In [None]:
# Two-column (Longitude, Latitude) frame used as input for every clustering
# algorithm below; .copy() keeps it independent of airports_df.
geo_values = airports_df[['Longitude', 'Latitude']].copy()

# Part 1: Geo Clustering

## K-means

In [None]:
from sklearn.cluster import KMeans

# Fit one K-means model per cluster count from 2 to 7 (fixed seed for
# reproducibility), then collect the labels and centroids of each model.
kmeans_array = [
    KMeans(n_clusters=k, random_state=0).fit(geo_values)
    for k in range(2, 8)
]
labels_array = [model.labels_ for model in kmeans_array]
centroids_array = [model.cluster_centers_ for model in kmeans_array]

In [None]:
# Plot every fitted K-means model with its centroids.
# Labels and centroids are read from the models themselves: the module-level
# `labels_array` / `centroids_array` are reassigned by the spectral and
# hierarchical sections, so indexing them here would plot the wrong labels
# (or raise IndexError) when this cell is re-run after those sections.
for kmeans in kmeans_array:
    labels = kmeans.labels_
    centroids = kmeans.cluster_centers_
    plt.scatter(geo_values['Longitude'], geo_values['Latitude'], s=10, c=labels + 1, cmap=cm.rainbow)
    # Centroid columns follow geo_values: column 0 is longitude, column 1 latitude.
    plt.scatter(centroids[:, 0], centroids[:, 1], s=30)
    plt.title('K-means, %d clusters' % kmeans.n_clusters)
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.show()

## Spectral clustering

In [None]:
from sklearn.cluster import SpectralClustering

# Fit one spectral-clustering model per cluster count from 3 to 7.
spectral_array = []
labels_array = []
centroids_array = []  # reset; nothing is appended to it in this section
for n_clusters in range(3, 8):
    # Progress indicator. The parenthesised single-argument form prints the
    # same text on Python 2 and Python 3.
    print(n_clusters)
    spectral = SpectralClustering(
        n_clusters=n_clusters,
        eigen_solver='arpack',
        affinity="nearest_neighbors",
        random_state=0,
    ).fit(geo_values)
    spectral_array.append(spectral)
    labels_array.append(spectral.labels_)

In [None]:
# Plot every fitted spectral-clustering model.
# Labels are read from the models themselves rather than from the shared
# `labels_array`, which the K-means and hierarchical sections also assign;
# this keeps the cell correct regardless of which section ran last.
for spectral in spectral_array:
    labels = spectral.labels_
    plt.scatter(geo_values['Longitude'], geo_values['Latitude'], s=10, c=labels + 1, cmap=cm.rainbow)
    plt.title('Spectral clustering, %d clusters' % spectral.n_clusters)
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.show()

## Hierarchical clustering

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Fit one Ward-linkage agglomerative model per cluster count from 2 to 8.
# The last entry (8 clusters) is the one the continent extraction in Part 2
# reads via agglo_array[-1].
agglo_array = []
labels_array = []
centroids_array = []  # reset; nothing is appended to it in this section
for n_clusters in range(2, 9):
    # Progress indicator. The parenthesised single-argument form prints the
    # same text on Python 2 and Python 3.
    print(n_clusters)
    agglo = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward').fit(geo_values)
    agglo_array.append(agglo)
    labels_array.append(agglo.labels_)

In [None]:
# Plot every fitted hierarchical (agglomerative) model.
# Labels are read from the models themselves rather than from the shared
# `labels_array`, which the K-means and spectral sections also assign;
# this keeps the cell correct regardless of which section ran last.
for agglo in agglo_array:
    labels = agglo.labels_
    plt.scatter(geo_values['Longitude'], geo_values['Latitude'], s=10, c=labels + 1, cmap=cm.rainbow)
    plt.title('Hierarchical clustering, %d clusters' % agglo.n_clusters)
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.show()

# Part 2 - data analysis

In [None]:
import math
import numpy as np

def route_distance(route, airports_df):
    """Return the straight-line distance between a route's two airports.

    The distance is Euclidean in (latitude, longitude) degree space; it is
    not a great-circle distance.

    Parameters
    ----------
    route : one-row DataFrame or single row (Series / mapping)
        Must provide 'Source' and 'Destination' airport codes.
    airports_df : DataFrame
        Must provide 'IATA/FAA', 'Latitude' and 'Longitude' columns, with
        numeric coordinates.

    Returns
    -------
    float
        Distance in degrees. When a code matches several airports, the first
        match is used for both endpoints.

    Raises
    ------
    IndexError
        If either code matches no row of `airports_df`.
    """
    def first_coordinates(code):
        # First matching airport wins; [0] raises IndexError when none match.
        matches = airports_df[airports_df['IATA/FAA'] == code]
        return np.array(matches['Latitude'])[0], np.array(matches['Longitude'])[0]

    # atleast_1d accepts both a one-row DataFrame column and a plain scalar
    # taken from a single row.
    source = np.atleast_1d(route['Source'])[0]
    dest = np.atleast_1d(route['Destination'])[0]

    source_lat, source_lon = first_coordinates(source)
    dest_lat, dest_lon = first_coordinates(dest)
    return math.sqrt((source_lat - dest_lat) ** 2 + (source_lon - dest_lon) ** 2)

## 2.1 Continents extraction

In [None]:
import copy
# Work on a copy of the labels of the last hierarchical model so the fitted
# model itself is not modified.
continents_num_labels = copy.copy(agglo_array[-1].labels_)

# Merge cluster 3 into cluster 2 and cluster 5 into cluster 0.
continents_num_labels[continents_num_labels == 3] = 2
continents_num_labels[continents_num_labels == 5] = 0

# Remaining cluster number -> continent name.
continent_by_cluster = {
    0: 'North America',
    1: 'South America',
    2: 'Asia',
    4: 'Europe',
    6: 'Australia',
    7: 'Africa',
}

# One name per airport, in airport order; a label with no entry in the
# mapping contributes nothing to the list.
continents_labels = [
    continent_by_cluster[label]
    for label in continents_num_labels
    if label in continent_by_cluster
]


In [None]:
# Attach the continent name of every airport as a new 'Continent' column
# (rows are matched by index).
continents_df = pd.DataFrame(continents_labels, columns=['Continent'])
airports_new = pd.concat([airports_df, continents_df], axis=1)

# Continent names in the order used for matrix rows and columns.
continents = [
    'North America',
    'South America',
    'Asia',
    'Europe',
    'Australia',
    'Africa',
]

In [None]:
# Airports coloured by their merged continent cluster number.
fig, ax = plt.subplots()
ax.scatter(
    geo_values['Longitude'],
    geo_values['Latitude'],
    s=10,
    c=continents_num_labels,
    cmap=cm.rainbow,
)
plt.show()

## 2.2 Questions we want to answer

1. Which two continents are best connected?
2. Which two continents are worst connected?
3. Which country has the biggest number of internal flights?
4. Which country has the biggest ratio of internal vs external flights?
5. Which country has the lowest ratio of internal vs external flights?
6. What is the best connected airport in the world?
7. What is the worst connected airport in the world?
8. Is there any one-way flight?

### Question 1 & 2

In [None]:
# 1. Merge routes with continents
# 2. Build a continent-by-continent matrix of route counts

In [None]:
# Show the continent names; they label the rows and columns of the
# inter-continent matrix built in the next cell.
continents

In [None]:
# Count the routes between every pair of continents and tag each route with
# the continents of its two endpoints.
source_continent_column = []
dest_continent_column = []

# Route counts: rows are destination continents, columns source continents.
inter_continents = pd.DataFrame(
    np.zeros((len(continents), len(continents))),
    index=continents,
    columns=continents,
)

# Group the airports by code once, so each route is resolved with two dict
# lookups instead of two scans of the whole airports table.
continents_by_code = dict(
    (code, np.array(group))
    for code, group in airports_new.groupby('IATA/FAA', sort=False)['Continent']
)
no_match = np.array([], dtype=object)

n_routes = len(routes_df)
for index, route in routes_df.iterrows():
    # Progress report; this form prints the same text on Python 2 and 3.
    if index % 3500 == 0 or index == n_routes - 1:
        print("%s %%" % (float(index) / n_routes * 100))
    source = route['Source']
    dest = route['Destination']
    # Arrays of every continent recorded for the code (empty when unknown).
    source_continent = continents_by_code.get(source, no_match)
    dest_continent = continents_by_code.get(dest, no_match)
    source_continent_column.append(source_continent)
    dest_continent_column.append(dest_continent)
    # Only routes whose two endpoints are both known are counted.
    if source_continent.size != 0 and dest_continent.size != 0:
        # .loc is the label-based indexer; .ix no longer exists in pandas >= 1.0.
        inter_continents.loc[dest_continent[0], source_continent[0]] += 1

# One value per route: the first matching continent, or NaN when the airport
# code is unknown. Taking the first match keeps the frame at exactly one
# column even if a code matches several airports.
source_continent_df = pd.DataFrame({
    'Source Continent': [
        match[0] if match.size else np.nan for match in source_continent_column
    ]
})
dest_continent_df = pd.DataFrame({
    'Destination Continent': [
        match[0] if match.size else np.nan for match in dest_continent_column
    ]
})

routes_new = pd.concat([routes_df, source_continent_df, dest_continent_df], axis=1)

In [None]:
# Display the route-count matrix (rows: destination continent,
# columns: source continent).
inter_continents