# Preprocessing
### Dataset with weather

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
dataset = pd.read_hdf("dataset_with_weather.hdf5", key="data")
dataset.reset_index(inplace=True)

dataset["dep_delay_more_15"] = dataset.DepDelayMinutes >= 15

In [3]:
dataset.head()

Unnamed: 0,index,Year,Quarter,Month,DayofMonth,DayOfWeek,UniqueCarrier,AirlineID,Carrier,TailNum,...,"light ice pellets, rain and snow",fog,rain,heavy snow,nearby dust,"light snow, rain and ice pellets",light rain and snow,light ice pellets and rain,light ice pellets,dep_delay_more_15
0,166,2012,1,1,1,7,MQ,20398,MQ,N815MQ,...,0,0,0,0,0,0,0,0,0,False
1,188,2012,1,1,1,7,B6,20409,B6,N603JB,...,0,0,0,0,0,0,0,0,0,False
2,193,2012,1,1,1,7,B6,20409,B6,N630JB,...,0,0,0,0,0,0,0,0,0,False
3,198,2012,1,1,1,7,B6,20409,B6,N586JB,...,0,0,0,0,0,0,0,0,0,False
4,202,2012,1,1,1,7,MQ,20398,MQ,N816MQ,...,0,0,0,0,0,0,0,0,0,False


In [4]:
for column_name in dataset.columns:
    print(column_name, " : ", dataset[column_name].dtype)

index  :  int64
Year  :  int64
Quarter  :  int64
Month  :  int64
DayofMonth  :  int64
DayOfWeek  :  int64
UniqueCarrier  :  object
AirlineID  :  int64
Carrier  :  object
TailNum  :  object
FlightNum  :  int64
OriginAirportID  :  int64
Origin  :  object
DestAirportID  :  int64
Dest  :  object
CRSDepTime  :  int64
DepTime  :  int64
DepDelayMinutes  :  int64
DepDel15  :  int64
DepartureDelayGroups  :  int64
CRSArrTime  :  int64
ArrTime  :  int64
ArrDelayMinutes  :  int64
ArrDel15  :  int64
ArrivalDelayGroups  :  int64
Cancelled  :  int64
CRSElapsedTime  :  int64
ActualElapsedTime  :  float64
Distance  :  int64
DistanceGroup  :  int64
CarrierDelay  :  int64
WeatherDelay  :  int64
NASDelay  :  int64
SecurityDelay  :  int64
LateAircraftDelay  :  int64
SCH_START_DATETIME  :  datetime64[ns]
SCH_END_DATETIME  :  datetime64[ns]
ACT_START_DATETIME  :  datetime64[ns]
ACT_END_DATETIME  :  datetime64[ns]
SCH_START_DATETIME_MIN  :  float64
SCH_END_DATETIME_MIN  :  float64
ACT_START_DATETIME_MIN  :  f

### Selecting Features and compression

In [5]:
dataset = dataset[["Quarter", "Month", "DayofMonth", "DayOfWeek", "AirlineID", "Origin", "TIME_FROM_LAST_FACT_LAND",
                   "LATE_ARRIVAL_BEFORE", "FLIGHT_TIME_TO_MIDNIGHT", "wind_speed_mps", "wind_gust_mps", "wind_dir",
                   "vis", "runway_visual_range_from", "first_layer_coverage", "first_layer_height", "first_layer_CB",
                   "first_layer_TCU", "clouds_hidden", "vertical_visibility", "heavy thunderstorm with rain and hail",
                   "patches of fog", "nearby fog", "ice pellets and rain", "dust", "rain and ice pellets",
                   "blowing snow", "light freezing rain, snow and ice pellets", "snow pellets", "snow", "drizzle",
                   "ice pellets and snow", "light freezing rain and snow", "nearby blowing dust", "light rain",
                   "heavy rain and ice pellets", "smoke", "heavy thunderstorm with rain", "squalls",
                   "thunderstorm with snow pellets and rain", "light rain and ice pellets",
                   "heavy thunderstorm with rain and snow pellets", "light ice pellets and snow", "light drizzle",
                   "light freezing rain", "light ice pellets, snow and rain", "thunderstorm",
                   "light freezing rain and ice pellets", "haze", "mist", "thunderstorm with rain", "light snow",
                   "rain, snow and ice pellets", "shallow fog", "light freezing rain, ice pellets and snow",
                   "nearby thunderstorm", "rain and snow", "light snow and ice pellets",
                   "light rain, snow and ice pellets", "unknown precipitation", "light thunderstorm with rain and snow",
                   "light thunderstorm with rain", "light freezing drizzle", "light rain, ice pellets and snow",
                   "light snow and rain", "freezing rain and ice pellets", "heavy ice pellets and snow", "blowing dust",
                   "heavy rain", "ice pellets", "freezing rain", "freezing fog", "light ice pellets, rain and snow",
                   "fog", "rain", "heavy snow", "nearby dust", "light snow, rain and ice pellets",
                   "light rain and snow", "light ice pellets and rain", "light ice pellets", "dep_delay_more_15"
                   ]]

In [6]:
dataset[["Quarter", "Month", "DayofMonth", "DayOfWeek"]] = \
    dataset[["Quarter", "Month", "DayofMonth", "DayOfWeek"]].astype(np.int8)

dataset[["AirlineID"]] = \
    dataset[["AirlineID"]].astype(np.int32)

dataset[["TIME_FROM_LAST_FACT_LAND", "LATE_ARRIVAL_BEFORE"]] = \
    dataset[["TIME_FROM_LAST_FACT_LAND", "LATE_ARRIVAL_BEFORE"]].astype(np.int32)

dataset[["FLIGHT_TIME_TO_MIDNIGHT"]] = \
    dataset[["FLIGHT_TIME_TO_MIDNIGHT"]].astype(np.int8)

dataset[["first_layer_coverage", "first_layer_CB", "first_layer_TCU", "clouds_hidden"]] = \
    dataset[["first_layer_coverage", "first_layer_CB", "first_layer_TCU", "clouds_hidden"]].astype(np.int8)

In [7]:
dataset.head()

Unnamed: 0,Quarter,Month,DayofMonth,DayOfWeek,AirlineID,Origin,TIME_FROM_LAST_FACT_LAND,LATE_ARRIVAL_BEFORE,FLIGHT_TIME_TO_MIDNIGHT,wind_speed_mps,...,"light ice pellets, rain and snow",fog,rain,heavy snow,nearby dust,"light snow, rain and ice pellets",light rain and snow,light ice pellets and rain,light ice pellets,dep_delay_more_15
0,1,1,1,7,20398,JFK,53,18,12,2.57222,...,0,0,0,0,0,0,0,0,0,False
1,1,1,1,7,20409,JFK,37,5,12,0.0,...,0,0,0,0,0,0,0,0,0,False
2,1,1,1,7,20409,JFK,171,28,12,0.0,...,0,0,0,0,0,0,0,0,0,False
3,1,1,1,7,20409,JFK,56,-3,11,0.0,...,0,0,0,0,0,0,0,0,0,False
4,1,1,1,7,20398,JFK,57,22,11,0.0,...,0,0,0,0,0,0,0,0,0,False


In [8]:
for column_name in dataset.columns:
    print(column_name, " : ", dataset[column_name].dtype)

Quarter  :  int8
Month  :  int8
DayofMonth  :  int8
DayOfWeek  :  int8
AirlineID  :  int32
Origin  :  object
TIME_FROM_LAST_FACT_LAND  :  int32
LATE_ARRIVAL_BEFORE  :  int32
FLIGHT_TIME_TO_MIDNIGHT  :  int8
wind_speed_mps  :  float64
wind_gust_mps  :  float64
wind_dir  :  float64
vis  :  float64
runway_visual_range_from  :  float64
first_layer_coverage  :  int8
first_layer_height  :  float64
first_layer_CB  :  int8
first_layer_TCU  :  int8
clouds_hidden  :  int8
vertical_visibility  :  float64
heavy thunderstorm with rain and hail  :  int8
patches of fog  :  int8
nearby fog  :  int8
ice pellets and rain  :  int8
dust  :  int8
rain and ice pellets  :  int8
blowing snow  :  int8
light freezing rain, snow and ice pellets  :  int8
snow pellets  :  int8
snow  :  int8
drizzle  :  int8
ice pellets and snow  :  int8
light freezing rain and snow  :  int8
nearby blowing dust  :  int8
light rain  :  int8
heavy rain and ice pellets  :  int8
smoke  :  int8
heavy thunderstorm with rain  :  int8
squa

#### Let's save changes

In [9]:
dataset.to_hdf("dataset_for_prediction.hdf5", key="data")

### ReEncoding AirlineID 

In [10]:
from sklearn import preprocessing
from scipy.stats import mode

airlines_label_encoder = preprocessing.LabelEncoder()
dataset["AirlineID"] = airlines_label_encoder.fit_transform(dataset["AirlineID"])

### Filling NaN values

In [11]:
dataset.wind_speed_mps.fillna(0, inplace=True)
dataset.wind_gust_mps.fillna(0, inplace=True)


#  Replacing NaN values of wind_dir with the most frequent for current airport
x = dataset[["Origin", "wind_dir"]]
x = x.groupby("Origin").agg({"wind_dir": lambda wind_dir: mode(wind_dir.dropna())[0][0]})\
    .rename(columns={"wind_dir": "airport_wind_dir_mode"})
x = dataset.join(x, on="Origin")["airport_wind_dir_mode"]
dataset.wind_dir.fillna(x, inplace=True)


max_vis = dataset.vis.max()
dataset.vis.fillna(max_vis, inplace=True)

dataset.runway_visual_range_from.fillna(max_vis, inplace=True)


#### Let's save changes

In [12]:
dataset.to_hdf("dataset_for_prediction_preprocessed.hdf5", key="data")

# Prediction

In [13]:
import lightgbm as lgb
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, recall_score, precision_score
from imblearn.over_sampling import SMOTE


In [21]:
dataset = pd.read_hdf("dataset_for_prediction_preprocessed.hdf5", key="data")
dataset.head()

Unnamed: 0,Quarter,Month,DayofMonth,DayOfWeek,AirlineID,Origin,TIME_FROM_LAST_FACT_LAND,LATE_ARRIVAL_BEFORE,FLIGHT_TIME_TO_MIDNIGHT,wind_speed_mps,...,"light ice pellets, rain and snow",fog,rain,heavy snow,nearby dust,"light snow, rain and ice pellets",light rain and snow,light ice pellets and rain,light ice pellets,dep_delay_more_15
0,1,1,1,7,11,JFK,53,18,12,2.57222,...,0,0,0,0,0,0,0,0,0,False
1,1,1,1,7,12,JFK,37,5,12,0.0,...,0,0,0,0,0,0,0,0,0,False
2,1,1,1,7,12,JFK,171,28,12,0.0,...,0,0,0,0,0,0,0,0,0,False
3,1,1,1,7,12,JFK,56,-3,11,0.0,...,0,0,0,0,0,0,0,0,0,False
4,1,1,1,7,11,JFK,57,22,11,0.0,...,0,0,0,0,0,0,0,0,0,False


In [22]:
dataset.shape

(2900144, 82)

In [23]:
(dataset.dep_delay_more_15 == 0).sum()

2465805

### Encoding Origin Airport code

In [15]:
airports_label_encoder = preprocessing.LabelEncoder()
dataset["Origin"] = airports_label_encoder.fit_transform(dataset["Origin"])

In [16]:
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]

### Oversampling using SMOTE

In [17]:
smote = SMOTE(n_jobs=2)
X, y = smote.fit_sample(X, y)


In [18]:
np.save("X_after_smote.npy", X)
np.save("y_after_smote.npy", y)

### Gradient Boosting n_estimators=100, num_leaves=31

In [19]:
X = np.load("X_after_smote.npy")
y = np.load("y_after_smote.npy")

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [32]:
gradient_boosting_classifier = lgb.LGBMClassifier(
                         n_estimators=100,
                         num_leaves=31,
                         objective="binary",
                         silent=False)

gradient_boosting_classifier.fit(X_train, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        learning_rate=0.1, max_depth=-1, min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
        n_jobs=-1, num_leaves=31, objective='binary', random_state=None,
        reg_alpha=0.0, reg_lambda=0.0, silent=False, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=1)

In [33]:
gradient_boosting_classifier.feature_importances_

array([136, 213,  52, 177, 526, 124, 336, 655, 486,  33,  26,  15,  24,
         9,  54,  52,   6,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   3,   0,   0,   0,   0,  23,   0,   0,   5,   0,
         0,   0,   0,   0,   0,   5,   0,   3,   0,   0,   0,  11,  10,
         0,   0,   0,   0,   0,   0,   0,   0,   0,  14,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   2,   0,   0,   0,
         0,   0,   0])

In [34]:
df1 = pd.DataFrame({"feature":dataset.columns[:-1], "importance": gradient_boosting_classifier.feature_importances_})

In [35]:
df1.sort_values(by="importance", ascending=False).head(10)

Unnamed: 0,feature,importance
7,LATE_ARRIVAL_BEFORE,655
4,AirlineID,526
8,FLIGHT_TIME_TO_MIDNIGHT,486
6,TIME_FROM_LAST_FACT_LAND,336
1,Month,213
3,DayOfWeek,177
0,Quarter,136
5,Origin,124
14,first_layer_coverage,54
2,DayofMonth,52


In [36]:
y_pred = gradient_boosting_classifier.predict(X_test)

print('accuracy score: %2.3f' % accuracy_score(y_test, y_pred))
print('precision score: %2.3f' % precision_score(y_test, y_pred))
print('recall score: %2.3f' % recall_score(y_test, y_pred))
print('f1 score: %2.3f' % f1_score(y_test, y_pred))

print("Confusion matrix:")
confusion_matrix(y_test, y_pred)

  if diff:


accuracy score: 0.924
precision score: 0.974
recall score: 0.871
f1 score: 0.920
Confusion matrix:


array([[481892,  11269],
       [ 63848, 429313]], dtype=int64)

### Gradient Boosting n_estimators=200, num_leaves=63

In [37]:
gradient_boosting_classifier = lgb.LGBMClassifier(
                         n_estimators=200,
                         num_leaves=63,
                         objective="binary",
                         silent=False)

gradient_boosting_classifier.fit(X_train, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        learning_rate=0.1, max_depth=-1, min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=200,
        n_jobs=-1, num_leaves=63, objective='binary', random_state=None,
        reg_alpha=0.0, reg_lambda=0.0, silent=False, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=1)

In [38]:
gradient_boosting_classifier.feature_importances_

array([ 378,  779,  768,  621, 1971,  563, 1520, 2343, 1268,  309,  140,
        374,  209,   59,  189,  467,   44,   24,    1,    2,    0,    7,
          0,    0,    0,    0,    5,    0,    0,   13,    0,    0,    8,
          1,   74,    0,    1,   25,    5,    0,    3,    0,    0,   14,
         20,    4,   17,    3,    1,   11,   36,   34,    0,    3,    0,
          1,    0,    7,    2,    0,    0,   44,    2,    0,    0,    0,
          0,    0,    0,    0,    1,    0,    0,    0,   24,    0,    0,
          0,    0,    5,    0])

In [41]:
df1 = pd.DataFrame({"feature":dataset.columns[:-1], "importance": gradient_boosting_classifier.feature_importances_})
df1.sort_values(by="importance", ascending=False).head(20)

Unnamed: 0,feature,importance
7,LATE_ARRIVAL_BEFORE,2343
4,AirlineID,1971
6,TIME_FROM_LAST_FACT_LAND,1520
8,FLIGHT_TIME_TO_MIDNIGHT,1268
1,Month,779
2,DayofMonth,768
3,DayOfWeek,621
5,Origin,563
15,first_layer_height,467
0,Quarter,378


In [40]:
y_pred = gradient_boosting_classifier.predict(X_test)

print('accuracy score: %2.3f' % accuracy_score(y_test, y_pred))
print('precision score: %2.3f' % precision_score(y_test, y_pred))
print('recall score: %2.3f' % recall_score(y_test, y_pred))
print('f1 score: %2.3f' % f1_score(y_test, y_pred))

print("Confusion matrix:")
confusion_matrix(y_test, y_pred)

  if diff:


accuracy score: 0.932
precision score: 0.981
recall score: 0.882
f1 score: 0.929
Confusion matrix:


array([[484661,   8500],
       [ 58369, 434792]], dtype=int64)

### Random Forest

In [49]:
from sklearn.ensemble import RandomForestClassifier

random_forest_classifier = RandomForestClassifier(n_estimators=100,
                             max_depth=10,
                             n_jobs=-1)


random_forest_classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [50]:
y_pred = random_forest_classifier.predict(X_test)


print('accuracy score: %2.3f' % accuracy_score(y_test, y_pred))
print('precision score: %2.3f' % precision_score(y_test, y_pred))
print('recall score: %2.3f' % recall_score(y_test, y_pred))
print('f1 score: %2.3f' % f1_score(y_test, y_pred))

print("Confusion matrix:")
confusion_matrix(y_test, y_pred)

accuracy score: 0.835
precision score: 0.901
recall score: 0.752
f1 score: 0.820
Confusion matrix:


array([[452371,  40790],
       [122103, 371058]], dtype=int64)