# Flight Delay Pridiction Model


In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


In [15]:
df = pd.read_csv('airlines.csv')

## Dataframe

In [16]:
df

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y
...,...,...,...,...,...,...,...,...,...
99995,c-5,c-4,c-3,1618,OO,SFO,RDD,199,N
99996,c-1,c-18,c-3,804,CO,EWR,DAB,884,N
99997,c-1,c-24,c-2,1901,NW,DTW,IAH,1076,N
99998,c-4,c-27,c-4,1515,MQ,DFW,GGG,140,N


## Dataframe Head

In [17]:
print(df.head())

  Month DayofMonth DayOfWeek  DepTime UniqueCarrier Origin Dest  Distance  \
0   c-8       c-21       c-7     1934            AA    ATL  DFW       732   
1   c-4       c-20       c-3     1548            US    PIT  MCO       834   
2   c-9        c-2       c-5     1422            XE    RDU  CLE       416   
3  c-11       c-25       c-6     1015            OO    DEN  MEM       872   
4  c-10        c-7       c-6     1828            WN    MDW  OMA       423   

  dep_delayed_15min  
0                 N  
1                 N  
2                 N  
3                 N  
4                 Y  


## Dataframe Info

In [18]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   Month              100000 non-null  object
 1   DayofMonth         100000 non-null  object
 2   DayOfWeek          100000 non-null  object
 3   DepTime            100000 non-null  int64 
 4   UniqueCarrier      100000 non-null  object
 5   Origin             100000 non-null  object
 6   Dest               100000 non-null  object
 7   Distance           100000 non-null  int64 
 8   dep_delayed_15min  100000 non-null  object
dtypes: int64(2), object(7)
memory usage: 6.9+ MB
None


## Dataframe description

In [19]:
print(df.describe())

             DepTime      Distance
count  100000.000000  100000.00000
mean     1341.523880     729.39716
std       476.378445     574.61686
min         1.000000      30.00000
25%       931.000000     317.00000
50%      1330.000000     575.00000
75%      1733.000000     957.00000
max      2534.000000    4962.00000


## Find rows with missing or null values

In [20]:
row_with_nulls = df[df.isnull().any(axis=1)]
print(row_with_nulls)

Empty DataFrame
Columns: [Month, DayofMonth, DayOfWeek, DepTime, UniqueCarrier, Origin, Dest, Distance, dep_delayed_15min]
Index: []


## Encode categorical variables

In [21]:
df = pd.get_dummies(df, drop_first=True)  # Encode categorical variables if any
X = df.drop('dep_delayed_15min_Y', axis=1)
y = df['dep_delayed_15min_Y']

## Split the data into training and testing sets

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## Scale the features

In [23]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Convert back to DataFrame to retain feature names for Linear Regression

In [28]:
print(X.columns)
X_train_df = pd.DataFrame(X_train, columns=X.columns)
X_test_df = pd.DataFrame(X_test, columns=X.columns)

Index(['DepTime', 'Distance', 'Month_c-10', 'Month_c-11', 'Month_c-12',
       'Month_c-2', 'Month_c-3', 'Month_c-4', 'Month_c-5', 'Month_c-6',
       ...
       'Dest_TYS', 'Dest_VCT', 'Dest_VIS', 'Dest_VLD', 'Dest_VPS', 'Dest_WRG',
       'Dest_WYS', 'Dest_XNA', 'Dest_YAK', 'Dest_YUM'],
      dtype='object', length=646)


## Train Model

In [25]:
lr = LinearRegression()
lr.fit(X_train_df, y_train)
y_pred_lr = lr.predict(X_test_df)

## Make Predictions

In [26]:
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)
print(f'Linear Regression - MSE: {mse_lr}, R-squared: {r2_lr}')

Linear Regression - MSE: 1.9568535980516537e+20, R-squared: -1.277179608091028e+21
