In [1]:
import os
import datetime, warnings, scipy 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from sklearn import metrics, linear_model
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from scipy.optimize import curve_fit

from pandas.plotting import scatter_matrix

# Show every column when a DataFrame is rendered (no column truncation).
pd.set_option('display.max_columns', None)

# Left disabled: machine-specific working directory of the original author.
#os.chdir('/Users/Marta/Dropbox/2019-Move-to-NL/UvA/info_viz_course/project')

# The CSV files below are read with relative paths, so echo the directory
# they will be resolved against.
print(os.getcwd())

c:\Users\Marta\Dropbox\2019-Move-to-NL\UvA\info_viz_course\infovis-group36\jupyter


In [2]:
# Airline lookup table, resolved relative to the current working directory.
# Later cells join it to the flight records on its IATA_CODE column and
# group by its AIRLINE column.
airline_names = pd.read_csv(filepath_or_buffer='airlines.csv')
airline_names

FileNotFoundError: [Errno 2] File airlines.csv does not exist: 'airlines.csv'

In [3]:
# GLOSSARY of the column names:
# https://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236&DB_Short_Name=On-Time

# Load the flight records (path is relative to the working directory).
df = pd.read_csv("2018.csv")
print('Dataframe dimensions:', df.shape)

#____________________________________________________________
# Per-column overview: dtype, number of nulls and percentage of nulls.
# The three one-row frames are stacked with pd.concat because
# DataFrame.append is deprecated and no longer exists in pandas >= 2.0.
null_counts = df.isnull().sum()  # computed once, reused for both null rows
tab_info = pd.concat([
    pd.DataFrame(df.dtypes).T.rename(index={0: 'column type'}),
    pd.DataFrame(null_counts).T.rename(index={0: 'null values (nb)'}),
    pd.DataFrame(null_counts / df.shape[0] * 100).T.rename(index={0: 'null values (%)'}),
])
tab_info

Dataframe dimensions: (7213446, 26)


Unnamed: 0,YEAR,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,DEP_DEL15,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,ARR_DEL15,CANCELLED,CANCELLATION_CODE,DIVERTED,AIR_TIME,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,V26
column type,int64,int64,int64,int64,object,object,object,int64,float64,float64,float64,int64,float64,float64,float64,int64,object,int64,float64,int64,float64,float64,float64,float64,float64,float64
null values (nb),0,0,0,0,0,0,0,0,112317,117234,117234,0,119245,137040,137040,0,7096862,0,134442,0,5860736,5860736,5860736,5860736,5860736,7213446
null values (%),0,0,0,0,0,0,0,0,1.55705,1.62521,1.62521,0,1.65309,1.89979,1.89979,0,98.3838,0,1.86377,0,81.2474,81.2474,81.2474,81.2474,81.2474,100


In [4]:
# Completeness report: one row per column of df with its null count and the
# share of rows that are populated ("filling factor").
missing_df = (
    df.isnull()
    .sum(axis=0)
    .rename_axis('variable')
    .reset_index(name='missing values')
)
missing_df['filling factor (%)'] = (
    (df.shape[0] - missing_df['missing values']) / df.shape[0] * 100
)

# Least complete columns first.
missing_df.sort_values('filling factor (%)').reset_index(drop = True)

Unnamed: 0,variable,missing values,filling factor (%)
0,V26,7213446,0.0
1,CANCELLATION_CODE,7096862,1.616204
2,SECURITY_DELAY,5860736,18.752618
3,NAS_DELAY,5860736,18.752618
4,WEATHER_DELAY,5860736,18.752618
5,CARRIER_DELAY,5860736,18.752618
6,LATE_AIRCRAFT_DELAY,5860736,18.752618
7,ARR_DEL15,137040,98.100215
8,ARR_DELAY,137040,98.100215
9,AIR_TIME,134442,98.136231


In [5]:
# Numeric features: every column whose dtype is not `object`.
numeric_features = [name for name, dtype in df.dtypes.items() if dtype != 'O']

# Cardinality of each numeric column; NaN is counted as one distinct value.
for feature in numeric_features:
    distinct_count = df[feature].nunique(dropna=False)
    print("Unique features for", feature, "are", distinct_count, "out of", df.shape[0])

Unique features for YEAR are 1 out of 7213446
Unique features for MONTH are 12 out of 7213446
Unique features for DAY_OF_MONTH are 31 out of 7213446
Unique features for DAY_OF_WEEK are 7 out of 7213446
Unique features for CRS_DEP_TIME are 1369 out of 7213446
Unique features for DEP_TIME are 1441 out of 7213446
Unique features for DEP_DELAY are 1489 out of 7213446
Unique features for DEP_DEL15 are 3 out of 7213446
Unique features for CRS_ARR_TIME are 1431 out of 7213446
Unique features for ARR_TIME are 1441 out of 7213446
Unique features for ARR_DELAY are 1528 out of 7213446
Unique features for ARR_DEL15 are 3 out of 7213446
Unique features for CANCELLED are 2 out of 7213446
Unique features for DIVERTED are 2 out of 7213446
Unique features for AIR_TIME are 677 out of 7213446
Unique features for DISTANCE are 1555 out of 7213446
Unique features for CARRIER_DELAY are 1331 out of 7213446
Unique features for WEATHER_DELAY are 1038 out of 7213446
Unique features for NAS_DELAY are 953 out of 7

In [6]:
# Categorical features: every column stored with the `object` dtype.
categorical_features = [name for name, dtype in df.dtypes.items() if dtype == 'O']

# Cardinality of each categorical column; NaN is counted as one distinct value.
for feature in categorical_features:
    distinct_count = df[feature].nunique(dropna=False)
    print("Unique categories for", feature, "are", distinct_count, "out of", df.shape[0])

Unique categories for OP_CARRIER are 18 out of 7213446
Unique categories for ORIGIN are 358 out of 7213446
Unique categories for DEST are 358 out of 7213446
Unique categories for CANCELLATION_CODE are 5 out of 7213446


In [7]:
# Dropping redundant columns: per the null-value summary above, V26 is
# entirely null and CANCELLATION_CODE is null for ~98% of the rows.
variables_to_remove = ['V26', 'CANCELLATION_CODE']

# errors='ignore' keeps the cell idempotent: re-running it after the columns
# are already gone no longer raises KeyError. Rebinding instead of
# inplace=True avoids mutating the frame in place.
df = df.drop(columns=variables_to_remove, errors='ignore')

In [8]:
# Discard flights that have no recorded arrival delay; the count of nulls in
# ARR_DELAY is printed before and after so the effect is visible.
print("NA values BEFORE removal:", df['ARR_DELAY'].isna().sum())
df.dropna(subset=['ARR_DELAY'], inplace=True)
print("NA values AFTER removal:", df['ARR_DELAY'].isna().sum())

NA values BEFORE removal: 137040
NA values AFTER removal: 0


## Data for the Average Flight Delay by Airline and Month Visualization

In [9]:
# Keep only the columns needed for the per-airline, per-month delay view.
df_subset = df.loc[:, ['YEAR', 'MONTH', 'OP_CARRIER', 'ARR_DELAY']]

In [10]:
# Attach the airline name to every flight via its carrier code.
# how='left' keeps every flight even when its carrier has no match.
# validate='many_to_one' makes pandas raise MergeError if IATA_CODE is
# duplicated in airline_names, which would otherwise silently duplicate
# flights and bias the averages computed later.
joined = df_subset.merge(
    airline_names,
    how='left',
    left_on='OP_CARRIER',
    right_on='IATA_CODE',
    validate='many_to_one',
)

In [11]:
# Display the merged frame (rendered truncated to its first and last rows).
joined

Unnamed: 0,YEAR,MONTH,OP_CARRIER,ARR_DELAY,IATA_CODE,AIRLINE
0,2018,4,UA,120.0,UA,United Airlines
1,2018,4,UA,-20.0,UA,United Airlines
2,2018,4,UA,10.0,UA,United Airlines
3,2018,4,UA,-13.0,UA,United Airlines
4,2018,4,UA,-4.0,UA,United Airlines
...,...,...,...,...,...,...
7076401,2018,9,UA,-14.0,UA,United Airlines
7076402,2018,9,UA,-13.0,UA,United Airlines
7076403,2018,9,UA,-10.0,UA,United Airlines
7076404,2018,9,UA,7.0,UA,United Airlines


In [14]:
# Check if join is correct: OP_CARRIER == IATA_CODE.
# A carrier code without a partner in airline_names gets a null IATA_CODE
# from the left join, so the comparison is False for exactly those rows.
comparison_column = (joined["OP_CARRIER"] == joined["IATA_CODE"]).to_numpy()
joined["EQUAL"] = comparison_column
print("If no False values, then no missing airlines", joined["EQUAL"].unique())

# Rows whose carrier code was not matched, and the distinct codes involved.
missing_airlines = joined.loc[~joined["EQUAL"]]
print("If False values found, need to identify missing carrier in L_CARRIER_HISTORY.csv and update in AIRLINES.csv")
missing_airlines['OP_CARRIER'].unique()

If no False values, then no missing airlines [ True]
If False values found, need to identify missing carrier in L_CARRIER_HISTORY.csv and update in AIRLINES.csv


array([], dtype=object)

In [16]:
# Mean arrival delay for every (airline, month) pair; the result is indexed
# by AIRLINE and MONTH and has a single ARR_DELAY column.
byCarrier = joined.groupby(['AIRLINE', 'MONTH'])[['ARR_DELAY']].mean()

In [17]:
# Display the per-airline, per-month mean arrival delays.
byCarrier

Unnamed: 0_level_0,Unnamed: 1_level_0,ARR_DELAY
AIRLINE,MONTH,Unnamed: 2_level_1
Alaska Airlines,1,-6.991338
Alaska Airlines,2,-4.212754
Alaska Airlines,3,-4.373281
Alaska Airlines,4,-0.538546
Alaska Airlines,5,1.704911
...,...,...
United Airlines,11,8.235913
United Airlines,12,4.710846
Virgin America,1,-2.753631
Virgin America,2,-0.905862


In [21]:
# Check for missing values in byCarrier.
# The filling factor is computed against the number of rows of byCarrier
# itself (one row per airline/month pair), not against the row count of the
# raw flight table df, which would make the percentage meaningless.
missing_df = byCarrier.isnull().sum(axis=0).reset_index()
missing_df.columns = ['variable', 'missing values']
missing_df['filling factor (%)'] = (
    (byCarrier.shape[0] - missing_df['missing values']) / byCarrier.shape[0] * 100
)
missing_df.sort_values('filling factor (%)').reset_index(drop = True)

ARR_DELAY    0
dtype: int64

In [22]:
# Pivot Table View: one row per airline, one column per month, cells hold
# the mean arrival delay.
delays_by_carrier = byCarrier.pivot_table(
    values='ARR_DELAY',
    index='AIRLINE',
    columns='MONTH',
)
delays_by_carrier

MONTH,1,2,3,4,5,6,7,8,9,10,11,12
AIRLINE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Alaska Airlines,-6.991338,-4.212754,-4.373281,-0.538546,1.704911,-0.094814,0.03742,6.629404,-2.983903,0.992008,0.784328,-2.107323
Allegiant Air,5.949331,7.551872,8.005207,8.469854,12.87234,18.996748,20.339759,9.382006,3.051409,4.158883,7.545036,5.690137
American Airlines,0.429125,2.009694,-0.21383,0.515217,5.950072,9.798884,13.011832,11.225474,5.843163,6.403908,4.923866,3.895508
American Eagle Airlines,7.022381,9.120203,-0.108661,2.373132,5.450325,11.04851,7.389144,7.443578,3.151113,2.816541,4.246884,4.742595
Atlantic Southeast Airlines,7.96238,9.949914,4.044579,4.038905,2.181586,8.926578,9.385628,12.022194,9.90532,10.948643,15.747993,14.277988
Comair,10.646328,12.494252,5.604287,6.413445,10.877486,18.800804,11.297596,11.185461,3.669947,0.65753,5.353331,3.104275
Delta Air Lines,-1.165718,-3.456406,-2.043187,-0.654382,1.485662,5.707931,3.226279,3.036584,-3.006386,-3.546894,1.244556,-6.123091
Endeavor Air,3.202167,5.028379,3.413167,6.884534,1.901123,4.250141,7.504443,10.613545,2.739099,1.074403,5.876705,0.678493
Frontier Airlines,8.746292,10.480033,4.607815,9.930937,14.515536,25.450247,22.522248,22.172182,16.848213,13.964922,11.714082,7.22944
Hawaiian Airlines,1.22483,6.15404,3.056419,2.449204,0.469536,0.305873,-0.879572,0.770357,-0.949301,-0.595208,-1.430367,0.635152


In [24]:
# Export the long-format (AIRLINE, MONTH, ARR_DELAY) averages as UTF-8 CSV
# into the current working directory.
byCarrier.to_csv(path_or_buf='avg_flight_delay_month.csv', encoding='utf-8')