In [1]:
import pandas as pd
import os
import numpy as np
from matplotlib import pyplot as plt
import re
import time
import sys
import pickle
import datetime

%matplotlib inline

In [2]:
df = pd.read_csv('2008.csv')

In [3]:
df.shape

(7009728, 29)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7009728 entries, 0 to 7009727
Data columns (total 29 columns):
Year                 int64
Month                int64
DayofMonth           int64
DayOfWeek            int64
DepTime              float64
CRSDepTime           int64
ArrTime              float64
CRSArrTime           int64
UniqueCarrier        object
FlightNum            int64
TailNum              object
ActualElapsedTime    float64
CRSElapsedTime       float64
AirTime              float64
ArrDelay             float64
DepDelay             float64
Origin               object
Dest                 object
Distance             int64
TaxiIn               float64
TaxiOut              float64
Cancelled            int64
CancellationCode     object
Diverted             int64
CarrierDelay         float64
WeatherDelay         float64
NASDelay             float64
SecurityDelay        float64
LateAircraftDelay    float64
dtypes: float64(14), int64(10), object(5)
memory usage: 1.5+ GB


In [5]:
df['DepDateTime'] = pd.to_datetime(df['Year']*10000+df['Month']*100+\
                                   df['DayofMonth'],format='%Y%m%d')

In [6]:
df['DepHour']= pd.cut(df.DepTime,
                      bins=[0,100,200,300,400,500,600,700,800,900,1000,1100,
                            1200,1300,1400,1500,1600,1700,1800,1900,2000,2100,
                            2200,2300,2400],right=False)

In [7]:
df.ArrDelay.describe(percentiles=[0.95])

count    6.855029e+06
mean     8.168452e+00
std      3.850194e+01
min     -5.190000e+02
50%     -2.000000e+00
95%      7.500000e+01
max      2.461000e+03
Name: ArrDelay, dtype: float64

In [8]:
df['DepDelay_30min'] = pd.cut(df.DepDelay, bins=range(-30,2490,30), right=False)
df['ArrDelay_30min'] = pd.cut(df.ArrDelay, bins=range(-30,2490,30), right=False)

In [9]:
carriers=pd.read_csv('carriers.csv')
airports=pd.read_csv('airports.csv')

In [10]:
df=pd.merge(df,airports[['iata','airport','lat','long']],left_on='Origin',
            right_on='iata')
df=pd.merge(df,airports[['iata','airport','lat','long']],left_on='Dest',
            right_on='iata', suffixes=('_Origin','_Dest'))

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7009728 entries, 0 to 7009727
Data columns (total 41 columns):
Year                 int64
Month                int64
DayofMonth           int64
DayOfWeek            int64
DepTime              float64
CRSDepTime           int64
ArrTime              float64
CRSArrTime           int64
UniqueCarrier        object
FlightNum            int64
TailNum              object
ActualElapsedTime    float64
CRSElapsedTime       float64
AirTime              float64
ArrDelay             float64
DepDelay             float64
Origin               object
Dest                 object
Distance             int64
TaxiIn               float64
TaxiOut              float64
Cancelled            int64
CancellationCode     object
Diverted             int64
CarrierDelay         float64
WeatherDelay         float64
NASDelay             float64
SecurityDelay        float64
LateAircraftDelay    float64
DepDateTime          datetime64[ns]
DepHour              category
DepDe

In [12]:
# rename columns
df.columns

Index(['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime',
       'ArrTime', 'CRSArrTime', 'UniqueCarrier', 'FlightNum', 'TailNum',
       'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay',
       'DepDelay', 'Origin', 'Dest', 'Distance', 'TaxiIn', 'TaxiOut',
       'Cancelled', 'CancellationCode', 'Diverted', 'CarrierDelay',
       'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay',
       'DepDateTime', 'DepHour', 'DepDelay_30min', 'ArrDelay_30min',
       'iata_Origin', 'airport_Origin', 'lat_Origin', 'long_Origin',
       'iata_Dest', 'airport_Dest', 'lat_Dest', 'long_Dest'],
      dtype='object')

In [13]:
df=df[['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime',
       'ArrTime', 'CRSArrTime', 'UniqueCarrier', 'FlightNum', 'TailNum',
       'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay',
       'DepDelay', 'Origin', 'Dest', 'Distance', 'TaxiIn', 'TaxiOut',
       'Cancelled', 'CancellationCode', 'Diverted', 'CarrierDelay',
       'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay',
       'DepDateTime', 'DepHour', 'DepDelay_30min', 'ArrDelay_30min', 
       'airport_Origin', 'lat_Origin', 'long_Origin','airport_Dest', 
       'lat_Dest', 'long_Dest']]

In [68]:
# write to csv
#df.to_csv('2008_rf.csv', index=False)

In [26]:
df.sample(10).to_csv('2008_01.csv')

In [17]:
df_sample = pd.read_csv('2008_01.csv')

In [22]:
df_sample.pivot_table(values='lat_Origin',index='DayOfWeek', aggfunc=len)

Unnamed: 0_level_0,lat_Origin
DayOfWeek,Unnamed: 1_level_1
1,7.0
2,18.0
3,18.0
4,12.0
5,8.0
6,14.0
7,23.0


In [31]:
df_sample.query('DayOfWeek==1')[['long_Origin', 'lat_Origin', 'long_Dest',
                                 'lat_Dest']]

Unnamed: 0,long_Origin,lat_Origin,long_Dest,lat_Dest
19,-82.53325,27.975472,-75.241141,39.871953
45,-93.216922,44.880547,-97.0372,32.895951
51,-115.152333,36.080361,-116.506253,33.829216
54,-80.15275,26.072583,-81.849397,41.410894
58,-80.943126,35.214011,-84.426944,33.640444
75,-90.507539,41.448526,-87.904464,41.979595
79,-81.316028,28.428889,-84.426944,33.640444
