In [46]:
import os, sys
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline

Investigate the performance of flights over time or simply look at data for a given year and create a graphic that showcases your finding(s).

In [49]:
def addDate(df):
    """Add a ``Date`` column built from ``Year``, ``Month`` and ``DayofMonth``.

    The column is assigned on ``df`` itself (the input frame is modified in
    place) and that same object is returned, so callers may either ignore the
    return value or re-assign it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``Year``, ``Month`` and ``DayofMonth`` columns.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, now carrying the ``Date`` column.

    Raises
    ------
    KeyError
        If any of the three source columns is missing.
    """
    # pd.to_datetime assembles timestamps from a frame whose columns are named
    # year / month / day, so only a renamed three-column view is needed.
    date_parts = df[["Year", "Month", "DayofMonth"]].rename(
        columns={"Year": "year", "Month": "month", "DayofMonth": "day"}
    )
    df["Date"] = pd.to_datetime(date_parts)
    return df

In [104]:
## From http://www.transtats.bts.gov/ot_delay/OT_DelayCause1.asp?pn=1
## Download file: http://www.transtats.bts.gov/ot_delay/ot_delaycause1.asp?display=download&pn=0&month=6&year=2016

In [47]:
# Load the airline delay-causes table from the local rawData folder.
df_delays = pd.read_csv("./rawData/airline_delay_causes.csv")

In [105]:
## Download yearly files
## http://stat-computing.org/dataexpo/2009/1992.csv.bz2

http://stat-computing.org/dataexpo/2009/the-data.html

In [95]:
# Years to load; each one is read from ./rawData/<year>.csv.bz2.
#years = [1990,1991,1992,1993]
years = range(2004,2006)

# Read every yearly file first and concatenate once at the end: calling
# pd.concat inside the loop copies all previously loaded rows again on every
# iteration.
yearly_frames = []
for kyear in years:
    print (" Reading ", kyear)
    yearly_frames.append(pd.read_csv("./rawData/{0}.csv.bz2".format(kyear)))

# Default concat settings are kept, so each file's own row labels are preserved.
df_orig = pd.concat(yearly_frames)
del yearly_frames  # drop the per-year references once they are merged

df_orig = addDate(df_orig)

print(" Dataframe with {0} registers and the folloging data columns: \n {1}".format(
        len(df_orig), ", ".join(df_orig.columns)))

 Reading  2004
 Reading  2005
 Dataframe with 14269866 registers and the folloging data columns: 
 Year, Month, DayofMonth, DayOfWeek, DepTime, CRSDepTime, ArrTime, CRSArrTime, UniqueCarrier, FlightNum, TailNum, ActualElapsedTime, CRSElapsedTime, AirTime, ArrDelay, DepDelay, Origin, Dest, Distance, TaxiIn, TaxiOut, Cancelled, CancellationCode, Diverted, CarrierDelay, WeatherDelay, NASDelay, SecurityDelay, LateAircraftDelay, Date


# Carrier names

In [102]:
# Distinct carrier codes present in the flight records.
Carriers = set(df_orig["UniqueCarrier"].tolist())
print(
    " Number of carriers {0} (Code):\n\t{1}".format(
        len(Carriers),
        ", ".join(Carriers),
    )
)

 Number of carriers 20 (Code):
	HP, AS, EV, UA, TZ, CO, AA, B6, FL, OO, WN, US, OH, HA, DL, MQ, NW, DH, F9, XE


In [103]:
def genNamesDict(df_delays):
    """Build a ``{carrier code: carrier name}`` dict from ``df_delays``.

    Parameters
    ----------
    df_delays : pandas.DataFrame
        Frame with ``carrier`` and ``carrier_name`` columns.

    Returns
    -------
    dict
        Maps each ``carrier`` value to a ``carrier_name`` value. When a code
        is paired with several different names, only one name per code can
        survive in the dict.
    """
    code_name_pairs = df_delays.loc[:, ["carrier", "carrier_name"]].drop_duplicates()
    names_by_code = code_name_pairs.set_index("carrier")["carrier_name"]
    return names_by_code.to_dict()

# Look up the full name of every carrier code found in the flight data.
# The lookup dict keeps its own name rather than being rebound to the list
# of names; a code absent from df_delays raises KeyError here.
carrier_names_by_code = genNamesDict(df_delays)
cd = [carrier_names_by_code[code] for code in Carriers]
print(" Number of carriers {0} (Names):\n\t{1}".format(len(Carriers), ",\n\t".join(cd)))

 Number of carriers 20 (Names):
	America West Airlines Inc.,
	Alaska Airlines Inc.,
	ExpressJet Airlines Inc.,
	United Air Lines Inc.,
	ATA Airlines d/b/a ATA,
	Continental Air Lines Inc.,
	American Airlines Inc.,
	JetBlue Airways,
	AirTran Airways Corporation,
	SkyWest Airlines Inc.,
	Southwest Airlines Co.,
	US Airways Inc.,
	Comair Inc.,
	Hawaiian Airlines Inc.,
	Delta Air Lines Inc.,
	Envoy Air,
	Northwest Airlines Inc.,
	Independence Air,
	Frontier Airlines Inc.,
	ExpressJet Airlines Inc.


# Data

In [96]:
def defineDataFrame(dff, selK, keep=[]):
    """Aggregate ``dff`` by the ``selK`` keys into per-group mean, sum and count.

    Parameters
    ----------
    dff : pandas.DataFrame
        Input frame; it is not modified.
    selK : list
        Column names to group by.
    keep : list, optional
        Value columns to aggregate. When non-empty, every column of ``dff``
        that is neither in ``keep`` nor in ``selK`` is left out before
        grouping. When empty (the default), all columns are aggregated.

    Returns
    -------
    pandas.DataFrame
        One row per group with the group keys restored as ordinary columns.
        A value column ``X`` present in all three aggregates appears as
        ``X_avg``, ``X_sum`` and ``X_count``. The suffixes come from the
        joins, so they are only applied to names that collide between the
        joined frames.
    """
    if keep:
        # Filter on the columns of the frame that was passed in, not on a
        # notebook-level global, and select once instead of copying the whole
        # frame and dropping columns one at a time.
        wanted = list(keep) + list(selK)
        dfw = dff[[kcol for kcol in dff.columns if kcol in wanted]]
    else:
        dfw = dff

    # Build the grouping once and reuse it for the three aggregations.
    grouped = dfw.groupby(by=selK)
    print("Counting")
    grBCou = grouped.count()
    print("Summing")
    grBSum = grouped.sum()
    print("Averaging")
    grBAvg = grouped.mean()

    # First join: colliding names keep the mean under the bare name and tag
    # the sum with "_sum". Second join: the remaining bare (mean) names
    # collide with the counts and become "_avg" / "_count".
    dfr = grBAvg.join(
            grBSum, how="outer",
            rsuffix="_sum").join(
            grBCou, how="outer",
            lsuffix="_avg", rsuffix="_count").reset_index()

    return dfr

In [97]:
# Work on a copy so that df_orig itself is not altered through df.
df = df_orig.copy()

In [98]:
# Show the column names of the working frame.
df.columns

Index(['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime',
       'ArrTime', 'CRSArrTime', 'UniqueCarrier', 'FlightNum', 'TailNum',
       'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay',
       'DepDelay', 'Origin', 'Dest', 'Distance', 'TaxiIn', 'TaxiOut',
       'Cancelled', 'CancellationCode', 'Diverted', 'CarrierDelay',
       'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay',
       'Date'],
      dtype='object')

In [99]:
# Preview the first rows of the working frame.
df.head()

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,...,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,Date
0,2004,1,12,1,623.0,630,901.0,915,UA,462,...,11,0,,0,0,0,0,0,0,2004-01-12
1,2004,1,13,2,621.0,630,911.0,915,UA,462,...,16,0,,0,0,0,0,0,0,2004-01-13
2,2004,1,14,3,633.0,630,920.0,915,UA,462,...,15,0,,0,0,0,0,0,0,2004-01-14
3,2004,1,15,4,627.0,630,859.0,915,UA,462,...,10,0,,0,0,0,0,0,0,2004-01-15
4,2004,1,16,5,635.0,630,918.0,915,UA,462,...,13,0,,0,0,0,0,0,0,2004-01-16


In [100]:
# Group by the "Date" column.
selK = ["Date"]
# Value columns to aggregate.
# NOTE(review): the list is named `delays` but holds the "Cancelled" column, not a delay column.
delays = ["Cancelled"]
# Per-date mean, sum and count of the selected column(s).
df2 = defineDataFrame(df, selK, delays)

df2.head(2)

Counting
Summing
Averaging


Unnamed: 0,Date,Cancelled_avg,Cancelled_sum,Cancelled_count
0,2004-01-01,0.011564,199,17209
1,2004-01-02,0.032011,610,19056


In [101]:
### Date vs cancellations: df2 holds the per-date aggregates of the Cancelled column
# Date is moved to the index before the frame is written to CSV.
df2.set_index("Date").to_csv("data/fligths-DateCancelled.csv")