In [1]:
import pandas as pd
import numpy as np
%matplotlib inline

In [2]:
data = pd.read_csv("FlightDelays.csv")

In [3]:
data.head()

Unnamed: 0,schedtime,carrier,deptime,dest,distance,date,flightnumber,origin,weather,dayweek,daymonth,tailnu,delay
0,1455,OH,1455,JFK,184,1/1/2004,5935,BWI,0,4,1,N940CA,ontime
1,1640,DH,1640,JFK,213,1/1/2004,6155,DCA,0,4,1,N405FJ,ontime
2,1245,DH,1245,LGA,229,1/1/2004,7208,IAD,0,4,1,N695BR,ontime
3,1715,DH,1709,LGA,229,1/1/2004,7215,IAD,0,4,1,N662BR,ontime
4,1039,DH,1035,LGA,229,1/1/2004,7792,IAD,0,4,1,N698BR,ontime


In [4]:
data.describe()

Unnamed: 0,schedtime,deptime,distance,flightnumber,weather,dayweek,daymonth
count,2201.0,2201.0,2201.0,2201.0,2201.0,2201.0,2201.0
mean,1371.938664,1369.298955,211.871422,3815.086324,0.014539,3.905498,16.024989
std,432.697149,442.462754,13.316815,2409.750224,0.119725,1.903149,8.67739
min,600.0,10.0,169.0,746.0,0.0,1.0,1.0
25%,1000.0,1004.0,213.0,2156.0,0.0,2.0,8.0
50%,1455.0,1450.0,214.0,2385.0,0.0,4.0,16.0
75%,1710.0,1709.0,214.0,6155.0,0.0,5.0,23.0
max,2130.0,2330.0,229.0,7924.0,1.0,7.0,31.0


In [6]:
print(data.dtypes)
print(data.shape)

schedtime        int64
carrier         object
deptime          int64
dest            object
distance         int64
date            object
flightnumber     int64
origin          object
weather          int64
dayweek          int64
daymonth         int64
tailnu          object
delay           object
dtype: object
(2201, 13)


## Cleaning the Data

In [27]:
## Converting all Categorical Data into Category type 
data.carrier = data.carrier.astype("category")
data.origin = data.origin.astype("category")
data.weather = data.weather.astype("category")
data.dayweek = data.dayweek.astype("category")
data.daymonth = data.daymonth.astype("category")

In [8]:
## Dropping Variables which are considered Not Significant
X = data.drop(["tailnu","date","daymonth"],axis=1)

In [9]:
X.dtypes

schedtime          int64
carrier         category
deptime            int64
dest              object
distance           int64
flightnumber       int64
origin          category
weather         category
dayweek         category
delay             object
dtype: object

In [10]:
## creating Dummy variables 
X_dum = pd.get_dummies(X)

In [11]:
X_dum.head()

Unnamed: 0,schedtime,deptime,distance,flightnumber,carrier_CO,carrier_DH,carrier_DL,carrier_MQ,carrier_OH,carrier_RU,...,weather_1,dayweek_1,dayweek_2,dayweek_3,dayweek_4,dayweek_5,dayweek_6,dayweek_7,delay_delayed,delay_ontime
0,1455,1455,184,5935,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,1
1,1640,1640,213,6155,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
2,1245,1245,229,7208,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,1715,1709,229,7215,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
4,1039,1035,229,7792,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1


In [12]:
X_delayed = X_dum.drop(["delay_ontime"], axis=1)

In [13]:
X_delayed.head()

Unnamed: 0,schedtime,deptime,distance,flightnumber,carrier_CO,carrier_DH,carrier_DL,carrier_MQ,carrier_OH,carrier_RU,...,weather_0,weather_1,dayweek_1,dayweek_2,dayweek_3,dayweek_4,dayweek_5,dayweek_6,dayweek_7,delay_delayed
0,1455,1455,184,5935,0,0,0,0,1,0,...,1,0,0,0,0,1,0,0,0,0
1,1640,1640,213,6155,0,1,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
2,1245,1245,229,7208,0,1,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
3,1715,1709,229,7215,0,1,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
4,1039,1035,229,7792,0,1,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0


In [14]:
## Splitting Independent and dependent variables 
x = X_delayed.drop(["delay_delayed"],axis=1)
y = X_delayed["delay_delayed"]

# Building Classification Models 

#### All the models are checked for their acuuracy using cross validation score where you divide the whole data set into no of cross validations where no of cv is equal to number of equal split it is done 

#### Using Decision Tree Classifier

In [15]:
from sklearn.tree import DecisionTreeClassifier

In [16]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier


In [17]:
from sklearn.cross_validation import cross_val_score



In [18]:
### Decision Tree 
dtree = DecisionTreeClassifier(max_depth=5, min_samples_split=2 )

In [19]:
dtree.fit(x,y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [20]:
cross_val_score(dtree,x,y,cv=8)

array([0.85507246, 0.83333333, 0.83333333, 0.82246377, 0.84      ,
       0.85766423, 0.87956204, 0.83211679])

#### Using Random Forest Classifier 

In [21]:
## RandomForest
rforest = RandomForestClassifier(n_estimators=100, max_depth=10,min_samples_split=2)

In [23]:
rforest.fit(x,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [24]:
cross_val_score(rforest,x,y,cv=8)

array([0.87681159, 0.83695652, 0.81884058, 0.81884058, 0.86909091,
       0.86131387, 0.90145985, 0.83576642])

#### Using Gradient Boosting Classifier 

In [25]:
## Gradient Boosting Classifier
GBCl = GradientBoostingClassifier(learning_rate=0.01,n_estimators=100, max_depth=10,min_samples_split=2)

In [26]:
cross_val_score(GBCl,x,y,cv=8, n_jobs=-1)

array([0.91666667, 0.89492754, 0.85869565, 0.86594203, 0.89818182,
       0.90145985, 0.90145985, 0.87591241])