In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
%matplotlib inline
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)
import warnings
warnings.filterwarnings('ignore')
import pickle
from sklearn.model_selection import train_test_split

In [2]:
# Read the flight records from data.csv (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='data.csv')

In [3]:
# Print the column labels of the freshly loaded frame.
print(df.columns)

Index(['fl_date', 'mkt_unique_carrier', 'tail_num', 'origin', 'dest',
       'arr_delay', 'distance', 'month', 'month_day', 'week_day', 'dep_hour',
       'arr_hour'],
      dtype='object')


In [4]:
# Count the missing values in each column (isna is the same check as isnull).
df.isna().sum()

fl_date                    0
mkt_unique_carrier         0
tail_num               49269
origin                     0
dest                       0
arr_delay             311744
distance                   0
month                      0
month_day                  0
week_day                   0
dep_hour                2568
arr_hour               48937
dtype: int64

In [5]:
# Discard every row that lacks a tail number or an arrival delay; arr_delay is
# the column the model is later trained to predict, so it cannot be missing.
df = df.dropna(axis=0, how='any', subset=['tail_num', 'arr_delay'])

In [6]:
# Draw a reproducible random sample holding one tenth of the rows
# (len(df) // 10, i.e. rounded down). Sampling also shuffles the row order.
sample_size = len(df) // 10

# Chain reset_index rather than mutating in place: the cell becomes a single
# rebinding of df whose index runs 0..sample_size-1.
df = df.sample(n=sample_size, random_state=42).reset_index(drop=True)

In [7]:
# Replace every remaining missing value with 0.
# NOTE(review): any dep_hour / arr_hour gaps that survive the earlier dropna
# become hour 0 and are then indistinguishable from genuine hour-0 values --
# confirm this is intended.
df = df.fillna(value=0)

In [8]:
# Preview the first five rows after sampling and filling.
df.head(n=5)

Unnamed: 0,fl_date,mkt_unique_carrier,tail_num,origin,dest,arr_delay,distance,month,month_day,week_day,dep_hour,arr_hour
0,2019-12-12,DL,N270SY,DSM,LGA,-19.0,1031,12,12,3,13.0,17.0
1,2019-09-30,DL,N896DN,MCI,ATL,-17.0,692,9,30,0,15.0,18.0
2,2018-09-14,G4,415NV,SFB,IAG,7.0,996,9,14,4,15.0,17.0
3,2018-06-02,B6,N337JB,PSE,MCO,-21.0,1179,6,2,5,15.0,4.0
4,2018-09-28,AS,N551AS,YAK,CDV,-10.0,213,9,28,4,11.0,12.0


In [9]:
# Build an "ORIGIN-DEST" route key and attach each route's mean arrival delay,
# rounded to two decimals; the raw origin/dest columns are then dropped.
# NOTE(review): the mean is taken over every row of df, before the train/test
# split further down, so this feature is derived from the targets of rows that
# end up in the test set -- confirm whether this leakage is acceptable.
df['origin-dest'] = df['origin'] + '-' + df['dest']
df['origin-dest-mean'] = (
    df.groupby('origin-dest')['arr_delay'].transform('mean').round(2)
)
df = df.drop(columns=['origin', 'dest'])

In [10]:
# Attach the mean arrival delay of each (carrier, month) pair, rounded to two
# decimals. The temporary "CARRIER-MONTH" key and the two source columns are
# removed again, so only month_carrier_mean is added to the frame.
# NOTE(review): like the route mean, this average is computed over all rows
# before the train/test split -- confirm the target leakage is acceptable.
df = (
    df.assign(
        month_carrier=df['mkt_unique_carrier'] + '-' + df['month'].astype(str)
    )
    .assign(
        month_carrier_mean=lambda frame: (
            frame.groupby('month_carrier')['arr_delay'].transform('mean').round(2)
        )
    )
    .drop(columns=['month_carrier', 'mkt_unique_carrier', 'month'])
)

In [11]:
# Preview the first five rows with the two mean-delay features attached.
df.head(n=5)

Unnamed: 0,fl_date,tail_num,arr_delay,distance,month_day,week_day,dep_hour,arr_hour,origin-dest,origin-dest-mean,month_carrier_mean
0,2019-12-12,N270SY,-19.0,1031,12,3,13.0,17.0,DSM-LGA,26.84,-0.5
1,2019-09-30,N896DN,-17.0,692,30,0,15.0,18.0,MCI-ATL,-1.84,-0.86
2,2018-09-14,415NV,7.0,996,14,4,15.0,17.0,SFB-IAG,6.81,2.55
3,2018-06-02,N337JB,-21.0,1179,2,5,15.0,4.0,PSE-MCO,-1.48,10.71
4,2018-09-28,N551AS,-10.0,213,28,4,11.0,12.0,YAK-CDV,0.81,-1.39


In [12]:
# Remove the date, aircraft id, route key and distance columns so they are not
# part of the frame the model is built from.
df = df.drop(columns=['fl_date', 'tail_num', 'origin-dest', 'distance'])

In [13]:
# Preview the first five rows of the reduced frame.
df.head(n=5)

Unnamed: 0,arr_delay,month_day,week_day,dep_hour,arr_hour,origin-dest-mean,month_carrier_mean
0,-19.0,12,3,13.0,17.0,26.84,-0.5
1,-17.0,30,0,15.0,18.0,-1.84,-0.86
2,7.0,14,4,15.0,17.0,6.81,2.55
3,-21.0,2,5,15.0,4.0,-1.48,10.71
4,-10.0,28,4,11.0,12.0,0.81,-1.39


In [14]:
# One-hot encode the day-of-month, weekday and hour columns as float
# indicators. Each source column is replaced by "<column>_<value>" columns,
# appended after the untouched columns in the order listed here.
dummies = ['month_day', 'week_day', 'dep_hour', 'arr_hour']
df = pd.get_dummies(df, columns=dummies, dtype=float)

In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

# Standardise the columns, because feature magnitudes affect how strongly
# regularization penalises each weight. The transformer is configured to hand
# back a DataFrame rather than a bare array.
# NOTE(review): the scaler is fit on the whole of df -- the arr_delay target
# included, and before any train/test split -- confirm that is intended.
scaler = StandardScaler()
scaler.set_output(transform='pandas')
df_scaled = scaler.fit_transform(df)
df_scaled.head(n=5)

Unnamed: 0,arr_delay,origin-dest-mean,month_carrier_mean,month_day_1,month_day_2,month_day_3,month_day_4,month_day_5,month_day_6,month_day_7,...,arr_hour_14.0,arr_hour_15.0,arr_hour_16.0,arr_hour_17.0,arr_hour_18.0,arr_hour_19.0,arr_hour_20.0,arr_hour_21.0,arr_hour_22.0,arr_hour_23.0
0,-0.488772,3.281929,-1.35347,-0.182733,-0.181675,-0.181532,-0.182549,-0.184022,-0.184581,-0.18394,...,-0.256388,-0.247764,-0.259235,3.924084,-0.252143,-0.255898,-0.243151,-0.25658,-0.219005,-0.217336
1,-0.44882,-1.122191,-1.435121,-0.182733,-0.181675,-0.181532,-0.182549,-0.184022,-0.184581,-0.18394,...,-0.256388,-0.247764,-0.259235,-0.254837,3.96601,-0.255898,-0.243151,-0.25658,-0.219005,-0.217336
2,0.030609,0.206109,-0.661707,-0.182733,-0.181675,-0.181532,-0.182549,-0.184022,-0.184581,-0.18394,...,-0.256388,-0.247764,-0.259235,3.924084,-0.252143,-0.255898,-0.243151,-0.25658,-0.219005,-0.217336
3,-0.528725,-1.066909,1.189042,-0.182733,5.504347,-0.181532,-0.182549,-0.184022,-0.184581,-0.18394,...,-0.256388,-0.247764,-0.259235,-0.254837,-0.252143,-0.255898,-0.243151,-0.25658,-0.219005,-0.217336
4,-0.308986,-0.715255,-1.555329,-0.182733,-0.181675,-0.181532,-0.182549,-0.184022,-0.184581,-0.18394,...,-0.256388,-0.247764,-0.259235,-0.254837,-0.252143,-0.255898,-0.243151,-0.25658,-0.219005,-0.217336


In [16]:
 # from sklearn.utils import shuffle
# Take the features from the standardised frame built in the scaling cell;
# previously X came from the unscaled df, so the scaling step had no effect on
# the model. The target is read from df so it stays in its original,
# unscaled form.
X, y = df_scaled.drop(columns='arr_delay'), df['arr_delay']

# Shuffle features and target together with a fixed seed so the row pairing is
# preserved and the order is reproducible.
X, y = shuffle(X, y, random_state=27)

In [17]:
# Fit an ordinary least-squares baseline and report R^2 on both splits.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so construction and training can be chained.
reg = LinearRegression().fit(X_train, y_train)

# Predictions are kept as named variables so they can be inspected afterwards.
y_train_pred, y_test_pred = reg.predict(X_train), reg.predict(X_test)

# Similar train and test scores indicate the model is not overfitting.
r2_train, r2_test = (
    r2_score(y_train, y_train_pred),
    r2_score(y_test, y_test_pred),
)

print(f'Train R^2:\t{r2_train}\nTest R^2:\t{r2_test}')

Train R^2:	0.03163727346964518
Test R^2:	0.031099161498131855
