In [1]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import datetime as dt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import f_regression, SelectKBest
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import cross_val_score, KFold
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df_flights= pd.read_csv(r'/Users/cameronstevens/Documents/Midterm_project/df_50000_flights.csv')
df_flights_original =df_flights.drop(['Unnamed: 0'], axis=1)
df_flights_original

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,dest_airport_id,dest,dest_city_name,crs_dep_time,crs_arr_time,dup,crs_elapsed_time,flights,distance,arr_delay
0,2018-07-09,DL,DL_CODESHARE,DL,7428,OO,N454SW,7428,14113,PIH,...,14869,SLC,"Salt Lake City, UT",650,750,N,60.0,1,150,-36.0
1,2019-04-07,AA,AA,AA,1539,AA,N123UW,1539,11057,CLT,...,11618,EWR,"Newark, NJ",1005,1210,N,125.0,1,529,-7.0
2,2018-03-29,AA,AA,AA,2493,AA,N172AJ,2493,13303,MIA,...,12478,JFK,"New York, NY",700,1000,N,180.0,1,1089,-3.0
3,2019-09-01,AA,AA,AA,569,AA,N715UW,569,11057,CLT,...,14635,RSW,"Fort Myers, FL",1700,1900,N,120.0,1,600,0.0
4,2019-01-05,WN,WN,WN,3228,WN,N759GS,3228,10821,BWI,...,13204,MCO,"Orlando, FL",1720,1945,N,145.0,1,787,-15.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,2018-09-29,AA,AA_CODESHARE,AA,2966,OO,N724SK,2966,13930,ORD,...,13931,ORF,"Norfolk, VA",1840,2151,N,131.0,1,717,-9.0
49996,2018-11-14,AA,AA,AA,1967,AA,N768US,1967,11433,DTW,...,11057,CLT,"Charlotte, NC",1711,1909,N,118.0,1,500,0.0
49997,2018-10-14,UA,UA_CODESHARE,UA,4619,AX,N12126,4619,11637,FAR,...,11292,DEN,"Denver, CO",1335,1436,N,121.0,1,627,80.0
49998,2019-07-15,DL,DL_CODESHARE,DL,3374,9E,N801AY,3374,11433,DTW,...,15096,SYR,"Syracuse, NY",725,848,N,83.0,1,374,0.0


In [3]:
def feature_exp(df):
    
    df= df.drop(['mkt_unique_carrier','branded_code_share','tail_num','op_carrier_fl_num',
                          'mkt_carrier','origin','origin_city_name','dest','dest_city_name','dup'], axis=1)

    df= convert_to_numeric(df)
    
    # because we want to use the predictive model for the future we can remove year
    # we don't need fl_date for our model remove that
    df= df.drop(['fl_date_year','fl_date'], axis=1)
    
    return df

In [4]:
# Function to get dummy variables and convert to numeric

def convert_to_numeric(df):
    
        # convert op_unique_carrier into dummy variable

        df= pd.get_dummies(df, columns=['op_unique_carrier'])
     
        # convert date to numeric variable 
        df['fl_date'] = pd.to_datetime(df['fl_date'], format = '%Y-%m-%dT', errors = 'coerce')
        df['fl_date_year'] = df['fl_date'].dt.year
        df['fl_date_month'] = df['fl_date'].dt.month
        df['fl_date_day'] = df['fl_date'].dt.day
        df['fl_date_week'] = df['fl_date'].dt.dayofweek
     
        return df

In [5]:
# funcion that reduces features based on feature engineering process 

def select_features(df):
    
    columns=['origin_airport_id', 'dest_airport_id', 'crs_dep_time',
             'crs_elapsed_time', 'op_unique_carrier_AA', 'op_unique_carrier_WN',
             'fl_date_month', 'fl_date_day', 'fl_date_week']
    
    df= df[columns]
    X = df.loc[:, df.columns != 'arr_delay'] # features 
        
    return (X)

In [6]:
df_flights= feature_exp(df_flights_original)

In [7]:
# removing early arrivals
 
df_flights = df_flights[df_flights['arr_delay'] > 0]
#(df_flights['arr_delay'] < 120) &
df_flights

Unnamed: 0,mkt_carrier_fl_num,origin_airport_id,dest_airport_id,crs_dep_time,crs_arr_time,crs_elapsed_time,flights,distance,arr_delay,op_unique_carrier_9E,...,op_unique_carrier_QX,op_unique_carrier_UA,op_unique_carrier_VX,op_unique_carrier_WN,op_unique_carrier_YV,op_unique_carrier_YX,op_unique_carrier_ZW,fl_date_month,fl_date_day,fl_date_week
6,4334,11278,12953,2200,2320,80.0,1,214,8.0,0,...,0,0,0,0,0,1,0,5,21,1
8,5658,11057,12448,1615,1713,118.0,1,562,42.0,0,...,0,0,0,0,0,0,0,3,6,2
10,49,12191,11697,2020,2335,135.0,1,957,41.0,0,...,0,0,0,1,0,0,0,12,13,3
17,4867,15412,13930,1100,1155,115.0,1,475,211.0,0,...,0,0,0,0,0,0,1,7,18,3
26,24,12892,12478,540,1358,318.0,1,2475,14.0,0,...,0,0,0,0,0,0,0,1,23,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49990,3420,12953,10693,1959,2139,160.0,1,764,23.0,0,...,0,0,0,0,0,0,0,4,7,6
49992,405,13204,11298,2045,2228,163.0,1,985,16.0,0,...,0,0,0,0,0,0,0,8,11,5
49993,3249,11298,11823,1825,2152,147.0,1,859,98.0,0,...,0,0,0,0,0,0,0,6,1,5
49994,2326,11259,10423,855,955,60.0,1,189,15.0,0,...,0,0,0,1,0,0,0,10,30,2


In [8]:
#df_flights.isnull().sum()
#df_flights.loc[df_flights['arr_delay']<0]

In [9]:
# analyse date based on it's numeric variable
# number of flights in each month which shows August has the most flights
df_flights.groupby('fl_date_month').size()

fl_date_month
1     1227
2     1294
3     1343
4     1297
5     1503
6     1648
7     1673
8     1618
9     1171
10    1416
11    1360
12    1472
dtype: int64

In [10]:
#plt.plot(df_flights.fl_date_month,df_flights.arr_delay)
#plt.show()

# Feature Engineering

In [11]:
# Removing Features With Small Variance

vt = VarianceThreshold(0.1)
df_transformed = vt.fit_transform(df_flights)

# get the columns to see which ones are removed
selected_columns = df_flights.columns[vt.get_support()]
# transforming an array back to a data-frame preserves column labels
df_transformed = pd.DataFrame(df_transformed, columns = selected_columns)
df_transformed

Unnamed: 0,mkt_carrier_fl_num,origin_airport_id,dest_airport_id,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,arr_delay,op_unique_carrier_AA,op_unique_carrier_WN,fl_date_month,fl_date_day,fl_date_week
0,4334.0,11278.0,12953.0,2200.0,2320.0,80.0,214.0,8.0,0.0,0.0,5.0,21.0,1.0
1,5658.0,11057.0,12448.0,1615.0,1713.0,118.0,562.0,42.0,0.0,0.0,3.0,6.0,2.0
2,49.0,12191.0,11697.0,2020.0,2335.0,135.0,957.0,41.0,0.0,1.0,12.0,13.0,3.0
3,4867.0,15412.0,13930.0,1100.0,1155.0,115.0,475.0,211.0,0.0,0.0,7.0,18.0,3.0
4,24.0,12892.0,12478.0,540.0,1358.0,318.0,2475.0,14.0,0.0,0.0,1.0,23.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17017,3420.0,12953.0,10693.0,1959.0,2139.0,160.0,764.0,23.0,0.0,0.0,4.0,7.0,6.0
17018,405.0,13204.0,11298.0,2045.0,2228.0,163.0,985.0,16.0,0.0,0.0,8.0,11.0,5.0
17019,3249.0,11298.0,11823.0,1825.0,2152.0,147.0,859.0,98.0,0.0,0.0,6.0,1.0,5.0
17020,2326.0,11259.0,10423.0,855.0,955.0,60.0,189.0,15.0,0.0,1.0,10.0,30.0,2.0


In [12]:
#Removing Correlated Features with correlation matrix
df_corr = df_transformed.corr().abs()

# step 2
indices = np.where(df_corr > 0.5) 
indices = [(df_corr.index[x], df_corr.columns[y]) 
for x, y in zip(*indices)
    if x != y and x < y]

# step 3
for idx in indices: #each pair
    try:
        df_transformed.drop(idx[1], axis = 1, inplace=True)
    except KeyError:
        pass

In [13]:
print(indices)

[('crs_dep_time', 'crs_arr_time'), ('crs_elapsed_time', 'distance')]


In [14]:
# Forward Regression for feature selection

y= df_flights.arr_delay # target 

skb = SelectKBest(f_regression, k=10)
X = skb.fit_transform(df_transformed, y)

# this will give us the position of top 10 columns
skb.get_support()
# column names
df_transformed.columns[skb.get_support()]
# transforming an array back to a data-frame preserves column labels
X = pd.DataFrame(X,columns=df_transformed.columns[skb.get_support()])

In [15]:
X= X.drop(['mkt_carrier_fl_num' ],axis=1)

In [16]:
X.columns # features

Index(['origin_airport_id', 'dest_airport_id', 'crs_dep_time',
       'crs_elapsed_time', 'arr_delay', 'op_unique_carrier_AA',
       'op_unique_carrier_WN', 'fl_date_month', 'fl_date_week'],
      dtype='object')

In [17]:
# split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1) # 80% training and 30% test 
print(X_train.shape)
print(X_test.shape)

(13617, 9)
(3405, 9)


In [18]:
# consider dataset with only 4 features
X4= df_flights[['crs_dep_time', 'crs_arr_time', 'crs_elapsed_time', 'distance']]
X4

Unnamed: 0,crs_dep_time,crs_arr_time,crs_elapsed_time,distance
6,2200,2320,80.0,214
8,1615,1713,118.0,562
10,2020,2335,135.0,957
17,1100,1155,115.0,475
26,540,1358,318.0,2475
...,...,...,...,...
49990,1959,2139,160.0,764
49992,2045,2228,163.0,985
49993,1825,2152,147.0,859
49994,855,955,60.0,189


In [19]:
X4_train, X4_test, y_train, y_test = train_test_split(X4, y, test_size=0.2,train_size=0.8, random_state=1) # 80% training and 30% test 
print(X4_train.shape)
print(X4_test.shape)

(13617, 4)
(3405, 4)


# Building Models

In [20]:
# LinearRegression with 10 features

regressor = LinearRegression()
regressor.fit(X_train, y_train)
coeff_df = pd.DataFrame(regressor.coef_, X.columns, columns=['Coefficient'])
coeff_df

Unnamed: 0,Coefficient
origin_airport_id,2.9941820000000005e-17
dest_airport_id,-3.092662e-17
crs_dep_time,1.764633e-15
crs_elapsed_time,1.23172e-16
arr_delay,1.0
op_unique_carrier_AA,7.810461e-15
op_unique_carrier_WN,-2.44615e-16
fl_date_month,1.547061e-16
fl_date_week,-5.728693e-16


In [21]:
# making prediction
y_pred = regressor.predict(X_test)

In [22]:
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df

Unnamed: 0,Actual,Predicted
27537,23.0,23.0
25113,64.0,64.0
21370,35.0,35.0
17340,12.0,12.0
35692,18.0,18.0
...,...,...
19140,113.0,113.0
2815,1.0,1.0
16111,4.0,4.0
42218,18.0,18.0


In [23]:
print(X4_test.shape)
print(y.shape)

(3405, 4)
(17022,)


## **Accuracy**

In [24]:

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

r2 = regressor.score(X_test, y_test)
print("r2: ",r2)

Mean Absolute Error: 7.050695481855458e-13
Mean Squared Error: 6.898466312888998e-25
Root Mean Squared Error: 8.30570064045713e-13
r2:  1.0


In [25]:
# LinearRegression with 4 features

r = LinearRegression()
r.fit(X4_train, y_train)
coeff_df4 = pd.DataFrame(r.coef_, X4.columns, columns=['Coefficient'])
coeff_df4

Unnamed: 0,Coefficient
crs_dep_time,0.007135
crs_arr_time,0.005021
crs_elapsed_time,0.444361
distance,-0.055186


In [26]:
# making prediction
y_pred4 = r.predict(X4_test)

In [27]:
df4 = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred4})
df4

Unnamed: 0,Actual,Predicted
27537,23.0,55.022660
25113,64.0,48.079770
21370,35.0,45.623603
17340,12.0,59.358635
35692,18.0,30.988061
...,...,...
19140,113.0,34.533854
2815,1.0,46.327420
16111,4.0,26.379914
42218,18.0,55.008889


In [28]:
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred4))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred4))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred4)))

r2 = r.score(X4_test, y_test)
print("r2: ",r2)

Mean Absolute Error: 38.53943800233593
Mean Squared Error: 5405.132986055776
Root Mean Squared Error: 73.5196095341629
r2:  0.014458603014275373


# Decision Tree 

In [29]:
# decision tree algorithm 

# Create Decision Tree classifer object
clf = DecisionTreeClassifier(criterion="entropy", max_depth=10)

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.9665198237885463


In [30]:
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df

Unnamed: 0,Actual,Predicted
27537,23.0,23.0
25113,64.0,64.0
21370,35.0,35.0
17340,12.0,12.0
35692,18.0,18.0
...,...,...
19140,113.0,113.0
2815,1.0,1.0
16111,4.0,4.0
42218,18.0,18.0


In [31]:
# # decision tree algorithm for 4 features

# Create Decision Tree classifer object
clf4 = DecisionTreeClassifier(criterion="entropy", max_depth=10)

# Train Decision Tree Classifer
clf4 = clf4.fit(X4_train,y_train)

#Predict the response for test dataset
y_pred4 = clf4.predict(X4_test)

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred4))

Accuracy: 0.0381791483113069


In [32]:
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred4})
df

Unnamed: 0,Actual,Predicted
27537,23.0,1.0
25113,64.0,19.0
21370,35.0,3.0
17340,12.0,39.0
35692,18.0,17.0
...,...,...
19140,113.0,5.0
2815,1.0,6.0
16111,4.0,2.0
42218,18.0,13.0


In [33]:
#XG boost for 10 features

import xgboost as xgb
from sklearn.metrics import mean_squared_error

#data_dmatrix = xgb.DMatrix(data=X,label=y)
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.01,
                max_depth = 200, alpha = 10, n_estimators = 10000)

xg_reg.fit(X_train,y_train)

xg_preds = xg_reg.predict(X_test)

In [34]:
print("Accuracy on training set for XGBoost(n=10000,depth=100): ", xg_reg.score(X_train, y_train))
df = pd.DataFrame({'Actual': y_test, 'Predicted':xg_preds})
df

Accuracy on training set for XGBoost(n=10000,depth=100):  0.9995470357304487


Unnamed: 0,Actual,Predicted
27537,23.0,29.422583
25113,64.0,68.782593
21370,35.0,35.126091
17340,12.0,20.777655
35692,18.0,16.304489
...,...,...
19140,113.0,86.226410
2815,1.0,8.588377
16111,4.0,0.592404
42218,18.0,30.915262


In [35]:
# Cross validation score to evaluate the training score

scores = cross_val_score(xg_reg, X_train, y_train,cv=10)
print("Mean cross-validation score for XBboost: %.2f" % scores.mean())

Mean cross-validation score for XBboost: 0.91


In [36]:
# KFlold method in cross-validation to evaluate the training score

kfold = KFold(n_splits=10, shuffle=True)
kf_cv_scores = cross_val_score(xg_reg, X_train, y_train, cv=kfold )
print("K-fold CV average score: %.2f" % kf_cv_scores.mean())

K-fold CV average score: 0.91


In [37]:
#XG boost for 4 features

import xgboost as xgb
from sklearn.metrics import mean_squared_error

#data_dmatrix = xgb.DMatrix(data=X,label=y)
xg_reg4 = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.01,
                max_depth = 200, alpha = 10, n_estimators = 10000)

xg_reg4.fit(X4_train,y_train)

xg_preds4 = xg_reg4.predict(X4_test)

In [39]:
print("Accuracy on training set for XGBoost(n=10000,depth=100): {:.3f}".format(xg_reg4.score(X4_train, y_train)))

Accuracy on training set for XGBoost(n=10000,depth=100): 0.352


In [40]:
df = pd.DataFrame({'Actual': y_test, 'Predicted':xg_preds})
df

Unnamed: 0,Actual,Predicted
27537,23.0,29.422583
25113,64.0,68.782593
21370,35.0,35.126091
17340,12.0,20.777655
35692,18.0,16.304489
...,...,...
19140,113.0,86.226410
2815,1.0,8.588377
16111,4.0,0.592404
42218,18.0,30.915262


In [41]:
X

Unnamed: 0,origin_airport_id,dest_airport_id,crs_dep_time,crs_elapsed_time,arr_delay,op_unique_carrier_AA,op_unique_carrier_WN,fl_date_month,fl_date_week
0,11278.0,12953.0,2200.0,80.0,8.0,0.0,0.0,5.0,1.0
1,11057.0,12448.0,1615.0,118.0,42.0,0.0,0.0,3.0,2.0
2,12191.0,11697.0,2020.0,135.0,41.0,0.0,1.0,12.0,3.0
3,15412.0,13930.0,1100.0,115.0,211.0,0.0,0.0,7.0,3.0
4,12892.0,12478.0,540.0,318.0,14.0,0.0,0.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...
17017,12953.0,10693.0,1959.0,160.0,23.0,0.0,0.0,4.0,6.0
17018,13204.0,11298.0,2045.0,163.0,16.0,0.0,0.0,8.0,5.0
17019,11298.0,11823.0,1825.0,147.0,98.0,0.0,0.0,6.0,5.0
17020,11259.0,10423.0,855.0,60.0,15.0,0.0,1.0,10.0,2.0


In [42]:
df_flights_original

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,dest_airport_id,dest,dest_city_name,crs_dep_time,crs_arr_time,dup,crs_elapsed_time,flights,distance,arr_delay
0,2018-07-09,DL,DL_CODESHARE,DL,7428,OO,N454SW,7428,14113,PIH,...,14869,SLC,"Salt Lake City, UT",650,750,N,60.0,1,150,-36.0
1,2019-04-07,AA,AA,AA,1539,AA,N123UW,1539,11057,CLT,...,11618,EWR,"Newark, NJ",1005,1210,N,125.0,1,529,-7.0
2,2018-03-29,AA,AA,AA,2493,AA,N172AJ,2493,13303,MIA,...,12478,JFK,"New York, NY",700,1000,N,180.0,1,1089,-3.0
3,2019-09-01,AA,AA,AA,569,AA,N715UW,569,11057,CLT,...,14635,RSW,"Fort Myers, FL",1700,1900,N,120.0,1,600,0.0
4,2019-01-05,WN,WN,WN,3228,WN,N759GS,3228,10821,BWI,...,13204,MCO,"Orlando, FL",1720,1945,N,145.0,1,787,-15.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,2018-09-29,AA,AA_CODESHARE,AA,2966,OO,N724SK,2966,13930,ORD,...,13931,ORF,"Norfolk, VA",1840,2151,N,131.0,1,717,-9.0
49996,2018-11-14,AA,AA,AA,1967,AA,N768US,1967,11433,DTW,...,11057,CLT,"Charlotte, NC",1711,1909,N,118.0,1,500,0.0
49997,2018-10-14,UA,UA_CODESHARE,UA,4619,AX,N12126,4619,11637,FAR,...,11292,DEN,"Denver, CO",1335,1436,N,121.0,1,627,80.0
49998,2019-07-15,DL,DL_CODESHARE,DL,3374,9E,N801AY,3374,11433,DTW,...,15096,SYR,"Syracuse, NY",725,848,N,83.0,1,374,0.0


In [43]:
# test our futuer data

flights_test_original= pd.read_csv(r'flights_test.csv')
flights_test= feature_exp(flights_test_original)
flights_test= select_features(flights_test)
flights_test

Unnamed: 0,origin_airport_id,dest_airport_id,crs_dep_time,crs_elapsed_time,op_unique_carrier_AA,op_unique_carrier_WN,fl_date_month,fl_date_day,fl_date_week
0,13891,14771,1810,95,0,1,1,1,2
1,13891,14771,1150,90,0,1,1,1,2
2,13891,14831,2020,70,0,1,1,1,2
3,13891,14831,1340,75,0,1,1,1,2
4,13891,14831,915,80,0,1,1,1,2
...,...,...,...,...,...,...,...,...,...
660551,11278,11193,1859,102,0,0,1,31,4
660552,11278,11193,1515,107,0,0,1,31,4
660553,12478,10785,2205,92,0,0,1,31,4
660554,13930,12478,1035,141,0,0,1,31,4


In [44]:
xg_preds = xg_reg.predict(flights_test)

In [45]:
df_sub = pd.DataFrame({'fl_date':flights_test_original.fl_date , 'mkt_carrier':flights_test_original.mkt_carrier, 
                       'mkt_carrier_fl_num':flights_test_original.mkt_carrier_fl_num, 
                       'origin':flights_test_original.origin, 'dest':flights_test_original.dest, 
                       'predicted_delay': xg_preds})
df_sub

Unnamed: 0,fl_date,mkt_carrier,mkt_carrier_fl_num,origin,dest,predicted_delay
0,2020-01-01 00:00:00,WN,5888,ONT,SFO,-0.554190
1,2020-01-01 00:00:00,WN,6276,ONT,SFO,7.767604
2,2020-01-01 00:00:00,WN,4598,ONT,SJC,-4.887969
3,2020-01-01 00:00:00,WN,4761,ONT,SJC,-4.465211
4,2020-01-01 00:00:00,WN,5162,ONT,SJC,-1.152472
...,...,...,...,...,...,...
660551,2020-01-31 00:00:00,DL,4954,DCA,CVG,8.995273
660552,2020-01-31 00:00:00,DL,4955,DCA,CVG,11.180797
660553,2020-01-31 00:00:00,DL,4956,JFK,BTV,-3.112993
660554,2020-01-31 00:00:00,DL,4957,ORD,JFK,7.779301


In [None]:
df_sub.to_csv('Submission.csv')