In [4]:
import matplotlib.pylab as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

from pandas_util import normalize_columns

In [5]:
# Read the DMBA flight-delay records. normalize_columns is a project helper
# called on the frame right after loading; its return value is not used
# (presumably it tidies the column labels in place -- confirm in pandas_util).
delays_df = pd.read_csv(filepath_or_buffer="./datasets/dmba/FlightDelays.csv")
normalize_columns(delays_df)
delays_df

Unnamed: 0,crs_dep_time,carrier,dep_time,dest,distance,fl_date,fl_num,origin,weather,day_week,day_of_month,tail_num,flight_status
0,1455,OH,1455,JFK,184,01/01/2004,5935,BWI,0,4,1,N940CA,ontime
1,1640,DH,1640,JFK,213,01/01/2004,6155,DCA,0,4,1,N405FJ,ontime
2,1245,DH,1245,LGA,229,01/01/2004,7208,IAD,0,4,1,N695BR,ontime
3,1715,DH,1709,LGA,229,01/01/2004,7215,IAD,0,4,1,N662BR,ontime
4,1039,DH,1035,LGA,229,01/01/2004,7792,IAD,0,4,1,N698BR,ontime
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2196,645,RU,644,EWR,199,1/31/2004,2761,DCA,0,6,31,N15555,ontime
2197,1700,RU,1653,EWR,213,1/31/2004,2497,IAD,0,6,31,N16976,ontime
2198,1600,RU,1558,EWR,199,1/31/2004,2361,DCA,0,6,31,N14902,ontime
2199,1359,RU,1403,EWR,199,1/31/2004,2216,DCA,0,6,31,N16961,ontime


In [6]:
# Treat the day of week and the outcome label as categorical variables.
delays_df["day_week"] = delays_df["day_week"].astype("category")
delays_df["flight_status"] = delays_df["flight_status"].astype("category")

# Collapse the scheduled departure time into hourly bins (value / 100, rounded
# with Python's built-in round) and store the result as a categorical as well.
delays_df["crs_dep_time"] = (
    delays_df["crs_dep_time"].map(lambda hhmm: round(hhmm / 100)).astype("category")
)

In [7]:
# Feature columns and the target column used by the model.
predictors = ["day_week", "crs_dep_time", "origin", "dest", "carrier"]
outcome = "flight_status"

# Dummy-code the predictors; keep the target as a categorical so that its
# category labels can be read off for reporting.
X = pd.get_dummies(delays_df.loc[:, predictors])
y = delays_df[outcome].astype("category")
classes = y.cat.categories.tolist()
classes

['delayed', 'ontime']

In [8]:
# Hold out 40% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=1, test_size=0.4)

In [9]:
# Fit a multinomial naive Bayes classifier (alpha=0.1) on the training rows.
# The fit call is the last expression, so the fitted estimator is displayed.
delays_nb = MultinomialNB(alpha=0.1)
delays_nb.fit(X=X_train, y=y_train)

MultinomialNB(alpha=0.1)

In [10]:
# Predicted class probabilities for the training and validation rows.
pred_prob_train, pred_prob_valid = (
    delays_nb.predict_proba(features) for features in (X_train, X_valid)
)

In [21]:
# Predicted class labels for the training and validation rows.
y_train_pred, y_valid_pred = map(delays_nb.predict, (X_train, X_valid))

## Pivot tables of flight status by each predictor (training data)

In [12]:
# Split the original data frame into training and validation parts using the
# same test_size and random_state as the X/y split above in this notebook.
train_df, valid_df = train_test_split(delays_df, test_size=0.4, random_state=1)

# The bare key "precision" is ambiguous in recent pandas (it matches both
# "display.precision" and "styler.format.precision") and raises OptionError,
# so the fully qualified name is used. option_context restores the previous
# setting on exit, even if building one of the tables fails.
with pd.option_context("display.precision", 4):
    # Probability of each flight status in the training data
    print(train_df.flight_status.value_counts() / len(train_df))

    for predictor in predictors:
        # Construct the frequency table
        df = train_df[["flight_status", predictor]]
        freq_table = df.pivot_table(index="flight_status", columns=predictor, aggfunc=len)

        # Divide each value by its row total to get conditional probabilities.
        # DataFrame.sum skips missing cells, so a status/level combination with
        # no rows no longer turns the whole row into NaN (the builtin sum did).
        prop_table = freq_table.div(freq_table.sum(axis=1), axis=0)
        print(prop_table)
        print()

ontime     0.8023
delayed    0.1977
Name: flight_status, dtype: float64
day_week            1       2       3       4       5      6       7
flight_status                                                       
delayed        0.1916  0.1494  0.1149  0.1264  0.1877  0.069  0.1609
ontime         0.1246  0.1416  0.1445  0.1794  0.1690  0.136  0.1048

crs_dep_time        6       7       8       9      10      11      12      13  \
flight_status                                                                   
delayed        0.0345  0.0536  0.0651  0.0192  0.0307  0.0115  0.0498  0.0460   
ontime         0.0623  0.0633  0.0850  0.0567  0.0519  0.0340  0.0661  0.0746   

crs_dep_time       14      15      16      17      18      19      20      21  
flight_status                                                                  
delayed        0.0383  0.2031  0.0728  0.1533  0.0192  0.0996  0.0153  0.0881  
ontime         0.0576  0.1171  0.0774  0.1001  0.0349  0.0397  0.0264  0.0529  

origi

## Scoring the example flight

In [19]:
# Score one particular flight by locating validation rows that share its
# predictor values. First line up, per validation row, the actual label, the
# predicted label and the predicted class probabilities.
df = pd.concat(
    objs=[
        pd.DataFrame({"actual": y_valid}).assign(predicted=y_valid_pred),
        pd.DataFrame(pred_prob_valid, index=y_valid.index),
    ],
    axis="columns",
)

# A row matches when every dummy column describing the flight equals 1.
mask = X_valid[
    ["carrier_DL", "day_week_7", "crs_dep_time_10", "dest_LGA", "origin_DCA"]
].eq(1).all(axis="columns")
df[mask]

Unnamed: 0,actual,predicted,0,1
1225,ontime,ontime,0.058299,0.941701


## Classification reports for flight delay using the naive Bayes classifier

In [24]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the training partition
print(
    classification_report(y_true=y_train, y_pred=y_train_pred, target_names=classes)
)

              precision    recall  f1-score   support

     delayed       0.48      0.20      0.28       261
      ontime       0.83      0.95      0.88      1059

    accuracy                           0.80      1320
   macro avg       0.65      0.57      0.58      1320
weighted avg       0.76      0.80      0.76      1320



In [25]:
# Per-class precision, recall and F1 on the validation partition
print(
    classification_report(y_true=y_valid, y_pred=y_valid_pred, target_names=classes)
)

              precision    recall  f1-score   support

     delayed       0.36      0.16      0.22       167
      ontime       0.83      0.93      0.88       714

    accuracy                           0.79       881
   macro avg       0.59      0.54      0.55       881
weighted avg       0.74      0.79      0.75       881

