In [1]:
import pandas as pd

In [131]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
import calmap
from plotly_calplot import calplot
from pathlib import Path

pd.set_option("display.max_columns", 500)
plt.style.use("seaborn-colorblind")
pal = sns.color_palette()

# Loading Data

In [151]:
df = pd.read_parquet("final_flight_status.parquet",engine = "pyarrow",
                     columns = ['FlightDate','Airline',
                                 "Cancelled","Diverted","CRSDepTime","OriginStateName",
                                 "DestStateName","CRSArrTime","Distance"])

In [152]:
df = df.sample(5000000)

In [153]:
df.shape

(5000000, 9)

In [154]:
# df_top = (
#     df["Airline"]
#     .value_counts()
#     .reset_index()
#     .query("Airline > 3_000_000")["index"]
#     .values.tolist()
# )

In [155]:
# df_top

In [156]:
# df = df.loc[df["Airline"].isin(df_top)].reset_index(drop=True).copy()

In [157]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000000 entries, 27551807 to 18001969
Data columns (total 9 columns):
 #   Column           Dtype         
---  ------           -----         
 0   FlightDate       datetime64[ns]
 1   Airline          object        
 2   Cancelled        bool          
 3   Diverted         bool          
 4   CRSDepTime       int64         
 5   OriginStateName  object        
 6   DestStateName    object        
 7   CRSArrTime       int64         
 8   Distance         float64       
dtypes: bool(2), datetime64[ns](1), float64(1), int64(2), object(3)
memory usage: 314.7+ MB


In [158]:
df.columns

Index(['FlightDate', 'Airline', 'Cancelled', 'Diverted', 'CRSDepTime',
       'OriginStateName', 'DestStateName', 'CRSArrTime', 'Distance'],
      dtype='object')

# Feature Engineering

In [159]:
# df = df[['FlightDate','Airline','Flight_Number_Marketing_Airline',
#          "Origin","Dest","Cancelled","Diverted","CRSDepTime","DepTime","DepDelayMinutes",
#          "OriginAirportID","OriginCityName","OriginStateName","DestAirportID",
#          "DestCityName","DestStateName","TaxiOut","TaxiIn",
#          "CRSArrTime","ArrTime","ArrDelayMinutes"]]

In [160]:
cat_cols = ["Airline", "OriginStateName", "DestStateName"]
for c in cat_cols:
    df[c] = df[c].astype("category")

In [161]:
df["Year"] = df["FlightDate"].dt.year
df['Month'] = df['FlightDate'].dt.month
df['day_of_week'] = df['FlightDate'].dt.dayofweek
df['day'] =  df['FlightDate'].dt.day

In [162]:
df.columns

Index(['FlightDate', 'Airline', 'Cancelled', 'Diverted', 'CRSDepTime',
       'OriginStateName', 'DestStateName', 'CRSArrTime', 'Distance', 'Year',
       'Month', 'day_of_week', 'day'],
      dtype='object')

In [163]:
#One hot encoding

In [164]:
y = df['Cancelled']

df = df[['Airline', 'Diverted', 'CRSDepTime',
       'OriginStateName', 'DestStateName', 'CRSArrTime', 'CRSArrTime', 'Distance',
       'Year', 'Month', 'day_of_week', 'day']]

In [165]:
y.shape

(5000000,)

In [166]:
df.shape

(5000000, 12)

In [167]:
from pandas import get_dummies as one_hot_encode

if "Airline" in df.columns:
    df = one_hot_encode(df, columns=["Airline"])

In [168]:
if "OriginStateName" in df.columns:
    df = one_hot_encode(df, columns=["OriginStateName"])

In [169]:
if "DestStateName" in df.columns:
    df = one_hot_encode(df, columns=["DestStateName"])

In [171]:
df.isnull().sum()

Diverted                       0
CRSDepTime                     0
CRSArrTime                     0
CRSArrTime                     0
Distance                       0
                              ..
DestStateName_Virginia         0
DestStateName_Washington       0
DestStateName_West Virginia    0
DestStateName_Wisconsin        0
DestStateName_Wyoming          0
Length: 143, dtype: int64

In [118]:
# df = df.fillna(-1)

In [172]:
y[df.index].value_counts()

False    4866489
True      133511
Name: Cancelled, dtype: int64

In [173]:
y.value_counts()

False    4866489
True      133511
Name: Cancelled, dtype: int64

# Scaling Data

In [174]:
df = (df - df.mean(axis=0)) / df.std(axis=0)

# Train Test Split

In [175]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(df, y, random_state=99)
print("TRAIN:", x_train.shape, y_train.shape)
print("TEST:", x_test.shape, y_test.shape)

TRAIN: (3750000, 143) (3750000,)
TEST: (1250000, 143) (1250000,)


# Model

In [176]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(random_state=99)
model.fit(x_train, y_train)

LogisticRegression(random_state=99)

In [177]:
from pandas import Series

coefs = Series(model.coef_[0], index=x_train.columns)
coefs.sort_values(ascending=False)

DestStateName_New Jersey         0.103027
OriginStateName_New Jersey       0.099529
Airline_Allegiant Air            0.094022
CRSDepTime                       0.090792
Year                             0.087934
                                   ...   
Airline_United Air Lines Inc.   -0.100776
Airline_Delta Air Lines Inc.    -0.113950
Distance                        -0.128781
Diverted                        -0.425462
Month                           -0.465068
Length: 143, dtype: float64

In [178]:
y_pred = model.predict(x_test)

# Score

In [179]:
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

print(classification_report(y_test, y_pred))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       False       0.97      1.00      0.99   1216682
        True       0.00      0.00      0.00     33318

    accuracy                           0.97   1250000
   macro avg       0.49      0.50      0.49   1250000
weighted avg       0.95      0.97      0.96   1250000



  _warn_prf(average, modifier, msg_start, len(result))


In [180]:
roc_auc_score(y_test, y_pred)

0.5

In [181]:
y_pred_proba = model.predict_proba(x_test)
roc_auc_score(y_test, y_pred_proba[:,1])

0.680465195985942

In [182]:
cmat = confusion_matrix(y_test, y_pred, labels=model.classes_)
cmat

array([[1216682,       0],
       [  33318,       0]], dtype=int64)

In [183]:
from pandas import DataFrame

confusion_df = DataFrame(cmat, index=model.classes_, columns=model.classes_)
confusion_df

Unnamed: 0,False,True
False,1216682,0
True,33318,0
