# Predicting Flight Delays

This example shows use of classification models to predict flight delays. 
Original example can be found [here](https://github.com/frenchlam/dask_CDSW/blob/master/03_Dask_ML-LargeDS.ipynb) (dataset is [here](https://github.com/frenchlam/dask_CDSW/blob/master/data/1988.csv.bz2)).

In [3]:
import pandas as pd
import numpy as np
import bodo
import time

## Pre-processing in Pandas

### Read flights dataset

In [4]:
@bodo.jit(cache=True)
def read_flights(input_file):
    flight_df = pd.read_csv(input_file, sep=',', header=0,
        usecols=['Month', 'DayofMonth', 'DayOfWeek', 'CRSDepTime', 'CRSArrTime', 'UniqueCarrier', 'FlightNum', 'Origin', 'Dest','Cancelled'])    
    print(flight_df.head())    
    return flight_df

input_file = "s3://bodo-example-data/flights/1988.csv.bz2"
flight_df = read_flights(input_file)


   Month  DayofMonth  DayOfWeek  CRSDepTime  ...  FlightNum Origin  Dest Cancelled
0      1           9          6        1331  ...        942    SYR   BWI         0
1      1          10          7        1331  ...        942    SYR   BWI         0
2      1          11          1        1331  ...        942    SYR   BWI         0
3      1          12          2        1331  ...        942    SYR   BWI         0
4      1          13          3        1331  ...        942    SYR   BWI         0

[5 rows x 10 columns]


### Feature Engineering
Create routes from origin and destination:

In [5]:
@bodo.jit(cache=True)
def create_routes(flight_df):
    flight_df['route'] = flight_df['Origin'] + "_" + flight_df['Dest']
    # show top 20 routes - As defined by nb of flights
    top_routes = flight_df['route'].value_counts(ascending=False)
    print(top_routes.head(10))
    # focus on 50 biggest routes - As defined by nb of flights 
    route_lst = top_routes.head(50)
    flight_df = flight_df[flight_df['route'].isin(route_lst.index)]
    return flight_df

flight_df = create_routes(flight_df)

route
LAX_SFO    20750
SFO_LAX    20658
LAX_PHX    13461
PHX_LAX    13273
LAX_LAS    12175
LGA_BOS    12027
LAS_LAX    11801
SJC_LAX    11535
LAX_SJC    11292
BOS_LGA    11141
Name: count, dtype: int64


Look at their cancellations:

In [6]:
@bodo.jit(cache=True)
def check_cancelations(flight_df):
    res = flight_df[['route', 'Cancelled', 'Month']].groupby(by='route')\
         .agg({'Month':'size', 'Cancelled':'sum'})\
        .rename(columns={'Month':'count','Cancelled':'nb_cancelled'}) \
        .reset_index()\
        .sort_values(['count'], ascending=False)
    print(res.head(10))

check_cancelations(flight_df)

      route  count  nb_cancelled
0   LAX_SFO  20750           228
32  SFO_LAX  20658           206
43  LAX_PHX  13461            78
29  PHX_LAX  13273            71
35  LAX_LAS  12175            58
41  LGA_BOS  12027           287
      route  count  nb_cancelled
19  LAS_LAX  11801            47
10  SJC_LAX  11535            71
42  LAX_SJC  11292            71
24  BOS_LGA  11141           243


Bodo automatically distributes the data on the worker processes. You can view this distribution by running the simple JIT'd function below.

In [7]:
@bodo.wrap_python(bodo.types.none)
def print_info(flight_df):
    print(flight_df.shape)

@bodo.jit
def print_info_jit(flight_df):
    print_info(flight_df)

print_info_jit(flight_df)

(63970, 11)
(64152, 11)
(60700, 11)
(58127, 11)
(55781, 11)
(66801, 11)
(59454, 11)
(58268, 11)


Quick sanity check - count number of null values():

In [9]:
@bodo.jit
def check_count(flight_df):
    print(flight_df.isnull().sum())
    
check_count(flight_df)

Month            0
DayofMonth       0
DayOfWeek        0
CRSDepTime       0
CRSArrTime       0
UniqueCarrier    0
FlightNum        0
Origin           0
Dest             0
Cancelled        0
route            0
dtype: int64


### Feature and label encoding

#### Encode Labels using Cancelled column

In [10]:
@bodo.jit(cache=True)
def encode_labels(flight_df):
    flight_df['Cancelled'] = pd.Categorical(flight_df["Cancelled"])
    flight_df['Label'] = flight_df['Cancelled'].cat.codes
    return flight_df.drop(['Cancelled'], axis=1)

flight_df = encode_labels(flight_df)

#### Feature Encoding

This is needed because Scikit-learn only supports numerical values. Get airport unique values and encode origin, destination, and route features:

In [11]:
import numpy as np

@bodo.jit(cache=True)
def get_airport_list(flight_df):
    airport_list = np.sort((pd.concat((flight_df['Origin'], flight_df['Dest']))).unique())
    return airport_list

airport_list = get_airport_list(flight_df)

In [12]:
from sklearn.preprocessing import LabelEncoder

@bodo.jit(cache=True)
def encode_features(flight_df, airport_list):
    t1 = time.time()    
    # encode airlines 
    le_carrier = LabelEncoder()
    flight_df['Carrier_encoded'] = pd.Series(le_carrier.fit_transform(flight_df['UniqueCarrier'].values))
    # Encode airports : Using same encoder for both origin and dest ( consistent encoding of airports )
    le_airport = LabelEncoder()
    le_airport.fit(airport_list)
    flight_df['Origin_encoded'] = pd.Series(le_airport.transform(flight_df['Origin']))
    flight_df['Dest_encoded'] = pd.Series(le_airport.transform(flight_df['Dest']))
    # Encode routes 
    le_route = LabelEncoder()
    flight_df['route_encoded'] = pd.Series(le_route.fit_transform(flight_df['route'].values))
    print("Encoding time: ", (time.time()-t1), " sec")
    return flight_df

flight_df = encode_features(flight_df, airport_list)

Encoding time:  0.17063599999983126  sec


In [13]:
@bodo.jit(cache=True)
def sample(flight_df):
    print(flight_df[['UniqueCarrier','Carrier_encoded','Origin','Origin_encoded',
           'Dest', 'Dest_encoded', 'route', 'route_encoded' ]].sample(10))
    
sample(flight_df)

       UniqueCarrier  Carrier_encoded  ...    route  route_encoded
207963            PI                8  ...  MCO_MIA             25
450866            PS                9  ...  SFO_SEA             47

[2 rows x 8 columns]
        UniqueCarrier  Carrier_encoded  ...    route  route_encoded
3704375            CO                2  ...  ORD_EWR             30
3510950            TW               10  ...  SFO_LAX             45
3889736            US               12  ...  SFO_LAX             45

[3 rows x 8 columns]
        UniqueCarrier  Carrier_encoded  ...    route  route_encoded
2574835            US               12  ...  LAX_SEA             19
2310854            NW                6  ...  ORD_DTW             29
2094889            AA                0  ...  LAX_SFO             20

[3 rows x 8 columns]
        UniqueCarrier  Carrier_encoded  ...    route  route_encoded
2614541            TW               10  ...  LAX_SFO             20
3077243            UA               11  ...  DEN_ORD 

In [14]:
from sklearn.model_selection import train_test_split

@bodo.jit(cache=True)
def split_data(flight_df):
    t1 = time.time()
    X_train, X_test, y_train, y_test = train_test_split(flight_df.drop(['UniqueCarrier','Origin','Dest','route'],axis=1),
                                                    flight_df['Label'], 
                                                    test_size=0.3, train_size=0.7,
                                                    random_state=100)
    print("Data splitting time: ", (time.time()-t1), " sec")    

    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = split_data(flight_df)

Data splitting time:  0.7646199999999226  sec


## Model Training Using Scikit-learn

### RandomForestClassifier

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score # evaluation metric

@bodo.jit(cache=True)
def rf_model(X_train, X_test, y_train, y_test):
    start = time.time()
    rf = RandomForestClassifier()
    rf.fit(X_train.to_numpy(), y_train.values)
    y_pred = rf.predict(X_test)
    print("RandomForestClassifier fit and predict time: ", time.time()-start)    
    print('Accuracy score {}'.format(accuracy_score(y_test, y_pred)))

rf_model(X_train, X_test, y_train, y_test)

RandomForestClassifier fit and predict time:  1.9505760000001828
Accuracy score 1.0




### Logistic Regression

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score  # evaluation metric
@bodo.jit(cache=True)
def lr_model(X_train, X_test, y_train, y_test):
    start = time.time()
    lr = LogisticRegression()
    lr.fit(X_train, y_train.values)
    y_pred = lr.predict(X_test)
    print("Logistic Regression fit and predict time: ", time.time()-start)    
    print('Accuracy score {}'.format(accuracy_score(y_test, y_pred)))

lr_model(X_train, X_test, y_train, y_test)

  res = func(*args, **kwargs)


Logistic Regression fit and predict time:  0.6519339999999829
Accuracy score 0.9815770030647986


