In [1]:
import pandas as pd
import numpy as np
from data_cleaning import clean_flights_df, clean_passengers_df, avg_passengers

In [2]:
flights_df = clean_flights_df('data/flights_sample.csv')
flights_df.head()

Unnamed: 0,mkt_unique_carrier,branded_code_share,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,...,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,fl_day,fl_month,fl_year
0,UA,UA,613,UA,N454UA,613,14771,SFO,"San Francisco, CA",12892,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17,7,2019
1,WN,WN,5610,WN,N712SW,5610,10821,BWI,"Baltimore, MD",10721,...,0.0,39.0,0.0,7.0,0.0,0.0,0.0,9,6,2018
2,B6,B6,737,B6,N705JB,737,10529,BDL,"Hartford, CT",13204,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,11,2019
3,WN,WN,6221,WN,N8503A,6221,12889,LAS,"Las Vegas, NV",13871,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24,12,2019
4,WN,WN,2663,WN,N446WN,2663,13204,MCO,"Orlando, FL",13232,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14,1,2018


### Feature Selection/Engineering

In [3]:
features = flights_df.columns.tolist()
col_to_remove = ['tail_num', 'origin_airport_id', 'origin_city_name','dest_airport_id','dest_city_name', 'flights', 'carrier_delay', 'weather_delay', 'nas_delay','security_delay', 'late_aircraft_delay', 'first_dep_time', 'total_add_gtime','longest_add_gtime', 'dep_time', 'arr_time', 'wheels_off', 'wheels_on', 'taxi_out', 'taxi_in', 'actual_elapsed_time', 'cancelled', 'air_time']

In [4]:
for col in col_to_remove:
    features.remove(col)
features

['mkt_unique_carrier',
 'branded_code_share',
 'mkt_carrier_fl_num',
 'op_unique_carrier',
 'op_carrier_fl_num',
 'origin',
 'dest',
 'crs_dep_time',
 'dep_delay',
 'crs_arr_time',
 'arr_delay',
 'crs_elapsed_time',
 'distance',
 'fl_day',
 'fl_month',
 'fl_year']

In [6]:
flights_df = flights_df[features]
flights_df.head()

Unnamed: 0,mkt_unique_carrier,branded_code_share,mkt_carrier_fl_num,op_unique_carrier,op_carrier_fl_num,origin,dest,crs_dep_time,dep_delay,crs_arr_time,arr_delay,crs_elapsed_time,distance,fl_day,fl_month,fl_year
0,UA,UA,613,UA,613,SFO,LAX,06:30:00,-3.0,08:16:00,-23.0,106.0,337,17,7,2019
1,WN,WN,5610,WN,5610,BWI,BOS,13:40:00,20.0,15:10:00,59.0,90.0,369,9,6,2018
2,B6,B6,737,B6,737,BDL,MCO,06:00:00,-9.0,09:06:00,-21.0,186.0,1050,6,11,2019
3,WN,WN,6221,WN,6221,LAS,OMA,09:50:00,8.0,14:25:00,-8.0,155.0,1099,24,12,2019
4,WN,WN,2663,WN,2663,MCO,MDW,20:45:00,-3.0,22:35:00,-4.0,170.0,990,14,1,2018


In [7]:
flights_df.shape

(4681523, 16)

In [8]:
flights_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4681523 entries, 0 to 4681522
Data columns (total 16 columns):
 #   Column              Dtype  
---  ------              -----  
 0   mkt_unique_carrier  object 
 1   branded_code_share  object 
 2   mkt_carrier_fl_num  int64  
 3   op_unique_carrier   object 
 4   op_carrier_fl_num   int64  
 5   origin              object 
 6   dest                object 
 7   crs_dep_time        object 
 8   dep_delay           float64
 9   crs_arr_time        object 
 10  arr_delay           float64
 11  crs_elapsed_time    float64
 12  distance            int64  
 13  fl_day              int32  
 14  fl_month            int32  
 15  fl_year             int32  
dtypes: float64(3), int32(3), int64(3), object(7)
memory usage: 517.9+ MB


In [10]:
cat_cols = flights_df.dtypes[flights_df.dtypes == 'object'].index.tolist()
cat_cols

['mkt_unique_carrier',
 'branded_code_share',
 'op_unique_carrier',
 'origin',
 'dest',
 'crs_dep_time',
 'crs_arr_time']

In [12]:
for col in cat_cols:
    print(flights_df[col].value_counts())

mkt_unique_carrier
AA    1207919
DL    1033946
UA     907047
WN     797048
AS     252649
B6     177457
NK     112397
F9      75084
G4      59600
HA      53226
VX       5150
Name: count, dtype: int64
branded_code_share
WN              797048
AA_CODESHARE    660914
DL              579670
AA              547005
UA_CODESHARE    536765
DL_CODESHARE    454276
UA              370282
B6              177457
AS              151665
NK              112397
AS_CODESHARE    100984
F9               75084
G4               59600
HA               50325
VX                5150
HA_CODESHARE      2901
Name: count, dtype: int64
op_unique_carrier
WN    797048
DL    579670
AA    547005
OO    473061
UA    370282
YX    187717
MQ    180145
B6    177457
OH    164381
AS    151665
9E    145075
YV    129181
NK    112397
EV     97525
F9     75084
QX     69089
PT     62825
ZW     60112
G4     59600
CP     56974
HA     50325
G7     49504
AX     45814
C5     30195
VX      5150
EM      2901
KS       825
9K       516
Name: 

In [13]:
passengers_df = clean_passengers_df('data/passengers.csv')
passengers_df.head()

Unnamed: 0,departures_scheduled,departures_performed,payload,seats,passengers,freight,mail,distance,ramp_to_ramp,air_time,...,dest_city_name,dest_country,dest_country_name,aircraft_group,aircraft_type,aircraft_config,year,month,distance_group,class
0,30,30,1122000,4500,3165,0,0,1404,6341,5443,...,"Fort Lauderdale, FL",US,United States,6,694,1,2015,4,3,F
1,30,30,1122000,4500,3732,0,0,1250,5954,5176,...,"Boston, MA",US,United States,6,694,1,2015,4,3,F
2,30,30,786000,3000,2182,0,0,952,4712,4056,...,"Washington, DC",US,United States,6,678,1,2015,4,2,F
3,30,30,786000,3000,2359,0,0,333,2581,1763,...,"Orlando, FL",US,United States,6,678,1,2015,4,1,F
4,30,30,1122000,4500,4072,0,0,720,3605,3077,...,"Fort Lauderdale, FL",US,United States,6,694,1,2015,4,2,F


In [14]:
flights_df = avg_passengers(flights_df, passengers_df)
flights_df.head()

Unnamed: 0,mkt_unique_carrier,branded_code_share,mkt_carrier_fl_num,op_unique_carrier,op_carrier_fl_num,origin,dest,crs_dep_time,dep_delay,crs_arr_time,arr_delay,crs_elapsed_time,distance,fl_day,fl_month,fl_year,monthly_avg_passengers
0,UA,UA,613,UA,613,SFO,LAX,06:30:00,-3.0,08:16:00,-23.0,106.0,337,17,7,2019,7306.693694
1,WN,WN,5610,WN,5610,BWI,BOS,13:40:00,20.0,15:10:00,59.0,90.0,369,9,6,2018,7343.459459
2,B6,B6,737,B6,737,BDL,MCO,06:00:00,-9.0,09:06:00,-21.0,186.0,1050,6,11,2019,3571.029412
3,WN,WN,6221,WN,6221,LAS,OMA,09:50:00,8.0,14:25:00,-8.0,155.0,1099,24,12,2019,2482.166667
4,WN,WN,2663,WN,2663,MCO,MDW,20:45:00,-3.0,22:35:00,-4.0,170.0,990,14,1,2018,9527.866667


In [16]:
# dropping more columns. op carrier contains more info that mkt_unique_carrier. 
flights_df.drop(columns=['mkt_unique_carrier', 'branded_code_share', 'mkt_carrier_fl_num'], inplace=True)

In [17]:
flights_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4681523 entries, 0 to 4681522
Data columns (total 14 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   op_unique_carrier       object 
 1   op_carrier_fl_num       int64  
 2   origin                  object 
 3   dest                    object 
 4   crs_dep_time            object 
 5   dep_delay               float64
 6   crs_arr_time            object 
 7   arr_delay               float64
 8   crs_elapsed_time        float64
 9   distance                int64  
 10  fl_day                  int32  
 11  fl_month                int32  
 12  fl_year                 int32  
 13  monthly_avg_passengers  float64
dtypes: float64(4), int32(3), int64(2), object(5)
memory usage: 446.5+ MB


In [18]:
# create dummy variables
dummy_df = pd.get_dummies(flights_df[['op_unique_carrier', 'origin', 'dest']])
dummy_df.head()

Unnamed: 0,op_unique_carrier_9E,op_unique_carrier_9K,op_unique_carrier_AA,op_unique_carrier_AS,op_unique_carrier_AX,op_unique_carrier_B6,op_unique_carrier_C5,op_unique_carrier_CP,op_unique_carrier_DL,op_unique_carrier_EM,...,dest_VEL,dest_VLD,dest_VPS,dest_WRG,dest_WYS,dest_XNA,dest_XWA,dest_YAK,dest_YKM,dest_YUM
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [21]:
flights_df = pd.concat([flights_df, dummy_df], axis=1)
flights_df.drop(columns=['op_unique_carrier', 'dest', 'origin'])

Unnamed: 0,op_carrier_fl_num,crs_dep_time,dep_delay,crs_arr_time,arr_delay,crs_elapsed_time,distance,fl_day,fl_month,fl_year,...,dest_VEL,dest_VLD,dest_VPS,dest_WRG,dest_WYS,dest_XNA,dest_XWA,dest_YAK,dest_YKM,dest_YUM
0,613,06:30:00,-3.0,08:16:00,-23.0,106.0,337,17,7,2019,...,False,False,False,False,False,False,False,False,False,False
1,5610,13:40:00,20.0,15:10:00,59.0,90.0,369,9,6,2018,...,False,False,False,False,False,False,False,False,False,False
2,737,06:00:00,-9.0,09:06:00,-21.0,186.0,1050,6,11,2019,...,False,False,False,False,False,False,False,False,False,False
3,6221,09:50:00,8.0,14:25:00,-8.0,155.0,1099,24,12,2019,...,False,False,False,False,False,False,False,False,False,False
4,2663,20:45:00,-3.0,22:35:00,-4.0,170.0,990,14,1,2018,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4681518,1784,05:25:00,-5.0,08:24:00,-21.0,119.0,599,15,12,2019,...,False,False,False,False,False,False,False,False,False,False
4681519,2450,12:40:00,-4.0,17:44:00,-25.0,184.0,1276,22,5,2019,...,False,False,False,False,False,False,False,False,False,False
4681520,504,19:52:00,-5.0,22:20:00,-29.0,148.0,746,5,8,2018,...,False,False,False,False,False,False,False,False,False,False
4681521,4661,13:03:00,-7.0,14:15:00,-23.0,72.0,198,8,4,2018,...,False,False,False,False,False,False,False,False,False,False


In [23]:
y = flights_df['arr_delay']
flights_df = flights_df.drop(columns=['op_carrier_fl_num', 'arr_delay'])


from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(flights_df, y, test_size=.3, random_state=58)
print(X_train.shape)
print(X_test.shape)


MemoryError: Unable to allocate 1.14 GiB for an array with shape (375, 3277066) and data type bool

In [35]:
from sklearn.ensemble import RandomForestRegressor

rfr = RandomForestRegressor()

rfr.fit(X_train, y_train)
y_pred = rfr.predict(X_test)

ValueError: could not convert string to float: 'DL'