In [None]:
import pandas as pd
import numpy as np

#Importing libraries  for visualisation of data
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
matplotlib.rcParams.update({'font.size': 12})

#Importing sklearn libraries for modelling and evaluation

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsRegressor as knnr
from sklearn.metrics import mean_squared_error as mse



from random import randint #To generate random numbers in a given range

#Importing datetime module
from time import time
from datetime import date, timedelta #For creating additional time based features


from sklearn.preprocessing import MinMaxScaler ## Importing the MinMax Scaler

from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, roc_curve, roc_auc_score, auc, mean_squared_log_error


#importing all the important libraries
#Importing XGBoost module
import xgboost as xgb 
from xgboost import plot_importance
from xgboost import XGBRegressor #For modelling train data to predict Sales

pd.set_option('display.max_columns', None)  #To display all the columns in dataset

In [None]:
# Load the pre-processed flights table.
ppf = pd.read_csv('processedflights15july.csv')

# X = ppf.drop(["Arrival Delay (Minutes)", "Delayed?","Taxi-In time (Minutes)", "xa", "ya", "Departure Delay (Minutes)"],axis=1)
# Features: every column except the two arrival-based targets.
# NOTE(review): "Departure Delay (Minutes)" stays in X although it is also the
# target y2 below, so a model trained on these features to predict y2 would
# see its own target - confirm this is intended.
X = ppf.drop(["Arrival Delay (Minutes)", "Delayed?"],axis=1)
y = ppf["Arrival Delay (Minutes)"]      # regression target: arrival delay
y1 = ppf["Delayed?"]                    # classification target, also used as strata
y2 = ppf["Departure Delay (Minutes)"]   # regression target: departure delay

# NOTE(review): the scaler is fitted on all rows before splitting, so the
# test/validation rows influence the min/max used for scaling.
scaler = MinMaxScaler()
Xs = scaler.fit_transform(X)


def split_train_val_test(features, target, strata, test_size=0.2, val_size=0.2, seed=50):
    """Split into train/test, then split the train part again into fit/validation.

    Both stages are stratified on ``strata``. The strata are carried through
    the first split together with the data, so the second stage stratifies on
    labels that are guaranteed to be row-aligned with the data being split
    (instead of borrowing labels from a separate split call).

    Returns:
        (train_f, test_f, train_t, test_t, fit_f, val_f, fit_t, val_t)
        where train_* is the pre-validation training part and fit_*/val_*
        are its two sub-splits.
    """
    train_f, test_f, train_t, test_t, train_s, _ = train_test_split(
        features, target, strata,
        test_size=test_size, random_state=seed, stratify=strata)
    fit_f, val_f, fit_t, val_t = train_test_split(
        train_f, train_t,
        test_size=val_size, random_state=seed, stratify=train_s)
    return train_f, test_f, train_t, test_t, fit_f, val_f, fit_t, val_t


#for classification
(train_x, test_x, train_y, test_y,
 train_x1, val_x, train_y1, val_y) = split_train_val_test(Xs, y1, y1)

# The "training data" shape printed here is the 80% split, which still
# contains the rows held out as validation data (train_x1 is the remainder).
print('training data    ',train_x.shape,train_y.shape)
print('validation data  ',val_x.shape,val_y.shape)
print('test data        ',test_x.shape,test_y.shape)

#for arrival regression
(train_xr, test_xr, train_yr, test_yr,
 train_x1r, val_xr, train_y1r, val_yr) = split_train_val_test(Xs, y, y1)

print('training data    ',train_xr.shape,train_yr.shape)
print('validation data  ',val_xr.shape,val_yr.shape)
print('test data        ',test_xr.shape,test_yr.shape)

#for delay regression
(train_xr1, test_xr1, train_yr1, test_yr1,
 train_x1r1, val_xr1, train_y1r1, val_yr1) = split_train_val_test(Xs, y2, y1)

print('training data    ',train_xr1.shape,train_yr1.shape)
print('validation data  ',val_xr1.shape,val_yr1.shape)
print('test data        ',test_xr1.shape,test_yr1.shape)

RANDOM FOREST

Base model with classifier

In [None]:
#Importing random forest classifier 
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Baseline random forest classifier with default hyper-parameters;
# random_state is fixed so repeated runs build the same forest.
rf = RandomForestClassifier(random_state=50)

In [None]:
# Fit the baseline classifier on the 80% training split. train_x comes from
# the first train_test_split, so it still contains the rows that are later
# held out as val_x.
rf.fit(train_x,train_y)

In [None]:
# Mean accuracy on the same data the model was fitted on (optimistic by construction).
rf.score(train_x, train_y)

In [None]:
# Mean accuracy on the held-out test split.
rf.score(test_x, test_y)

In [None]:
# Predicted class labels for the test split; reused by the confusion matrix and report below.
y_pred_test = rf.predict(test_x)

In [None]:
# Raw confusion-matrix counts: rows are true labels, columns are predicted labels.
confusion_matrix(test_y, y_pred_test)

In [None]:
# Confusion matrix with every true-label row scaled to sum to 1
matrix = confusion_matrix(test_y, y_pred_test)
matrix = matrix / matrix.sum(axis=1, keepdims=True)

# Draw the normalised matrix as an annotated heatmap
plt.figure(figsize=(16, 7))
sns.set(font_scale=1.4)
heatmap_ax = sns.heatmap(
    matrix,
    annot=True,
    annot_kws={'size': 10},
    cmap=plt.cm.Greens,
    linewidths=0.2,
)

# Axis labels and title go on the axes the heatmap was drawn on
heatmap_ax.set(
    xlabel='Predicted label',
    ylabel='True label',
    title='Confusion Matrix for Random Forest Model',
)
plt.show()

In [None]:
# Per-class precision, recall and F1 of the baseline classifier on the test split.
print(classification_report(test_y, y_pred_test))

In [None]:
import pandas as pd
# One row per feature column of X, ranked from most to least important
rf_feature_importances = (
    pd.Series(rf.feature_importances_, index=X.columns, name='importance')
    .to_frame()
    .sort_values(by='importance', ascending=False)
)

In [None]:
# Display the classifier's feature importances, largest first.
rf_feature_importances

Base model with continuous predictions

In [None]:
#Importing random forest regressor
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Baseline random forest regressor with default hyper-parameters and a fixed seed.
rfr = RandomForestRegressor(random_state = 50)

In [None]:
# Fit the regressor on the arrival-delay target
# (train_yr is split from y = "Arrival Delay (Minutes)").
rfr.fit(train_xr, train_yr)

In [None]:
# R^2 on the data the regressor was fitted on (optimistic by construction).
rfr.score(train_xr, train_yr)

In [None]:
# R^2 on the held-out test split.
rfr.score(test_xr, test_yr)

In [None]:
# Predictions of the fitted regressor on the training and test features.
# NOTE(review): rfr was fitted on train_yr (arrival delay), so despite its
# name deptdelaytrain holds arrival-delay predictions - confirm intended.
deptdelaytrain = rfr.predict(train_xr1)
yr_pred_test = rfr.predict(test_xr)

In [None]:
# Append the regressor's predictions to the feature matrices as one extra column.
#
# Fixes over the previous version of this cell:
#   * deptdelaytest was never defined, so it is computed here from the
#     un-augmented test features;
#   * rfr.predict returns a 1-D array, which cannot be np.concatenate'd onto a
#     2-D matrix (and the test line passed the second array as `axis`);
#     np.column_stack adds it as a new column instead.
#
# NOTE(review): rfr is a model of the same target these matrices are used to
# predict, and the training-row predictions are in-sample, so this feature
# leaks the target - confirm this is the intended design.
deptdelaytest = rfr.predict(test_xr1)

# Guard so that re-running the cell does not append the column a second time.
if train_xr.shape[1] == Xs.shape[1]:
    train_xr = np.column_stack((train_xr, deptdelaytrain))
    test_xr = np.column_stack((test_xr, deptdelaytest))

In [None]:
# Per-row absolute error of the test predictions, then the mean to 2 decimals
errors = np.abs(yr_pred_test - test_yr)
print('Mean Absolute Error:', round(errors.mean(), 2), 'minutes')

In [None]:
# def smape(A, F):
#     return 100/len(A) * np.sum(2 * np.abs(F - A) / (np.abs(A) + np.abs(F)))

In [None]:
# # Calculate mean absolute percentage error (MAPE)
# mape = 100 * (errors / test_yr)
# # Calculate and display accuracy
# accuracy = 100 - np.mean(mape)
# print('Accuracy:', round(accuracy, 2), '%.')

In [None]:
# smape(test_yr,yr_pred_test)

In [None]:
# from sklearn.tree import export_graphviz
# import pydot
# # Pull out one tree from the forest
# tree = rfr.estimators_[5]
# # Export the image to a dot file
# export_graphviz(tree, out_file = 'tree.dot', rounded = True, precision = 1)
# # Use dot file to create a graph
# (graph, ) = pydot.graph_from_dot_file('tree.dot')
# # Write graph to a png file
# graph.write_png('tree.png')

In [None]:
import pandas as pd
# One row per feature column of X, ranked from most to least important
rfr_feature_importances = (
    pd.Series(rfr.feature_importances_, index=X.columns, name='importance')
    .to_frame()
    .sort_values(by='importance', ascending=False)
)

In [None]:
# Display the regressor's feature importances, largest first.
rfr_feature_importances

Classifier and regressor with GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyper-parameter grid for the grid search over the random forest classifier.
# 'auto' is intentionally not listed under max_features: for classifiers it
# was an alias of 'sqrt' (so it only duplicated candidates) and it is rejected
# by scikit-learn >= 1.3.
param_gridrfct1gsc = { 
    'n_estimators': [150, 200, 250],
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [4,8,12,16,20, None],
    'criterion' :['gini', 'entropy']
}


In [None]:
# Exhaustive 5-fold grid search over param_gridrfct1gsc using the classifier rf
# as the base estimator; return_train_score=True also records training-fold
# scores in cv_results_.
CV_rfc = GridSearchCV(estimator=rf, param_grid=param_gridrfct1gsc, cv= 5,return_train_score=True)
CV_rfc.fit(train_x, train_y)

In [None]:
# Parameter combination with the best mean cross-validated score.
print(CV_rfc.best_params_)

In [None]:
# Random forest classifier with hand-set hyper-parameters.
# max_features='sqrt' replaces 'auto': the two are equivalent for classifiers,
# but 'auto' is rejected by scikit-learn >= 1.3.
# NOTE(review): n_estimators=15 is not one of the values in the grid searched
# above (150/200/250) - confirm it is not a typo for 150.
rft1 = RandomForestClassifier(random_state=50, bootstrap=True, max_depth=None, max_features= 'sqrt', n_estimators= 15)

In [None]:
# Fit the hand-configured classifier on the 80% training split.
rft1.fit(train_x,train_y)

In [None]:
# Mean accuracy on the data rft1 was fitted on.
rft1.score(train_x, train_y)

In [None]:
# Mean accuracy on the held-out test split.
rft1.score(test_x, test_y)

In [None]:
# Predicted class labels of rft1 for the test split.
y_pred_testt1 = rft1.predict(test_x)

In [None]:
# Raw confusion-matrix counts for rft1: rows are true labels, columns are predicted labels.
confusion_matrix(test_y, y_pred_testt1)

In [None]:
# Confusion matrix with every true-label row scaled to sum to 1
matrix = confusion_matrix(test_y, y_pred_testt1)
matrix = matrix / matrix.sum(axis=1, keepdims=True)

# Draw the normalised matrix as an annotated heatmap
plt.figure(figsize=(16, 7))
sns.set(font_scale=1.4)
heatmap_ax = sns.heatmap(
    matrix,
    annot=True,
    annot_kws={'size': 10},
    cmap=plt.cm.Greens,
    linewidths=0.2,
)

# Axis labels and title go on the axes the heatmap was drawn on
heatmap_ax.set(
    xlabel='Predicted label',
    ylabel='True label',
    title='Confusion Matrix for Random Forest Model',
)
plt.show()

In [None]:
def evaluate(model, test_features, test_labels):
    """Print and return a MAPE-based "accuracy" (100 - MAPE) for ``model``.

    Args:
        model: fitted estimator exposing ``predict(test_features)``.
        test_features: feature matrix passed straight to ``model.predict``.
        test_labels: true target values (array-like, same length as the
            predictions).

    Returns:
        ``100 - MAPE`` as a float. The percentage error is averaged only over
        rows whose true label is non-zero, because the ratio is undefined for
        a zero label; if no such row exists the result is NaN.
    """
    predictions = np.asarray(model.predict(test_features), dtype=float)
    actual = np.asarray(test_labels, dtype=float)
    errors = np.abs(predictions - actual)

    # Zero labels would turn the ratio into inf/NaN and poison the mean.
    nonzero = actual != 0
    if nonzero.any():
        # Divide by |label| so negative targets do not produce negative
        # "percentage errors" that cancel out positive ones.
        mape = 100 * np.mean(errors[nonzero] / np.abs(actual[nonzero]))
    else:
        mape = float('nan')
    accuracy = 100 - mape

    mean_error = errors.mean() if errors.size else float('nan')
    print('Model Performance')
    # No unit in the message: the target's unit is not known to this helper.
    print('Average Error: {:0.4f}.'.format(mean_error))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy

In [None]:
# Best estimator found by the classifier grid search, scored on the test split.
# NOTE(review): evaluate() reports a percentage-error based figure, while
# test_y holds the "Delayed?" class labels - confirm this metric is intended
# for the classifier.
best_grid = CV_rfc.best_estimator_
grid_accuracy = evaluate(best_grid, test_x, test_y)

In [None]:
# Candidate values for the regressor grid search, keyed by estimator argument
param_gridrfrt1gsc = dict(
    max_depth=[90, 100, 110],
    max_features=[2, 3],
    min_samples_leaf=[3, 4, 5],
    min_samples_split=[8, 10, 12],
    n_estimators=[100, 200, 300],
)

In [None]:
# 3-fold grid search over param_gridrfrt1gsc with the regressor rfr as base
# estimator; verbose=2 logs progress for each candidate fit.
CV_rfr = GridSearchCV(estimator = rfr, param_grid = param_gridrfrt1gsc, cv = 3, verbose = 2)

In [None]:
# Run the regressor grid search on the arrival-delay training data.
CV_rfr.fit(train_xr, train_yr)

Classifier with RandomisedSearchCV

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from random import randint

In [None]:
# Search space for the randomized search over the random forest classifier.
# - min_samples_leaf lists its candidates explicitly. The previous
#   [randint(1,4)] was evaluated once while building the dict, so the search
#   only ever saw a single value, and that value changed from run to run.
# - 'auto' is not listed under max_features: for classifiers it was an alias
#   of 'sqrt' and it is rejected by scikit-learn >= 1.3.
param_gridrfct1rsc = {'max_depth': [6,9, None], 
         'n_estimators':[50, 70, 100, 150], 
          'max_features': ["log2","sqrt"],
          'criterion' : ['gini', 'entropy'],
          'bootstrap':[True, False],
          'min_samples_leaf': [1, 2, 3, 4]}


In [None]:
# Randomized search (default number of sampled candidates) with 3-fold CV over
# param_gridrfct1rsc. random_state is fixed, like everywhere else in this
# notebook, so the sampled candidates - and therefore best_params_ - are the
# same on every run.
CV_rfct1rsv = RandomizedSearchCV(rf, param_gridrfct1rsc, cv= 3,return_train_score=True,verbose=2, random_state=50)
CV_rfct1rsv.fit(train_x, train_y)

In [None]:
# Parameter combination with the best mean cross-validated score.
CV_rfct1rsv.best_params_

In [None]:
# Mean cross-validated score of the best parameter combination.
CV_rfct1rsv.best_score_

In [None]:
# Classifier with hand-copied hyper-parameters (presumably the randomized
# search's best_params_ - verify against the search output).
# max_features='sqrt' replaces 'auto': equivalent for classifiers, but 'auto'
# is rejected by scikit-learn >= 1.3.
# Note that this rebinds `rf`, which earlier cells use for the baseline model.
rf=RandomForestClassifier(bootstrap= False, criterion= 'entropy',max_depth= None,max_features= 'sqrt',min_samples_leaf= 3,n_estimators=100,random_state=50)

In [None]:
rf.fit(train_x,train_y)

In [None]:
#score on training data
rf.score(train_x, train_y)

In [None]:
# Mean accuracy of the tuned classifier on the held-out test split.
rf.score(test_x, test_y)

In [None]:
# Mean accuracy on the validation split. val_x/val_y were split off from
# train_x/train_y, so this is only an unbiased estimate if the model was not
# fitted on those rows.
rf.score(val_x, val_y)

In [None]:
# Predicted class labels of the tuned classifier for the test split.
y_pred_testrfct1rsv = rf.predict(test_x)

In [None]:
# Raw confusion-matrix counts for the tuned classifier: rows are true labels, columns are predicted labels.
confusion_matrix(test_y,y_pred_testrfct1rsv)

In [None]:
# Confusion matrix with every true-label row scaled to sum to 1
matrix = confusion_matrix(test_y, y_pred_testrfct1rsv)
matrix = matrix / matrix.sum(axis=1, keepdims=True)

# Draw the normalised matrix as an annotated heatmap
plt.figure(figsize=(16, 7))
sns.set(font_scale=1.4)
heatmap_ax = sns.heatmap(
    matrix,
    annot=True,
    annot_kws={'size': 10},
    cmap=plt.cm.Greens,
    linewidths=0.2,
)

# Axis labels and title go on the axes the heatmap was drawn on
heatmap_ax.set(
    xlabel='Predicted label',
    ylabel='True label',
    title='Confusion Matrix for Random Forest Model',
)
plt.show()

Continuous prediction with RandomisedSearchCV

In [None]:
# pprint is used at the end of this cell but is not among the imports at the
# top of the notebook, so it is imported here.
from pprint import pprint

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 1.0 (all features) replaces 'auto': that is what 'auto' meant for
# regressors, and the string is rejected by scikit-learn >= 1.3.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
param_gridrfrt1rsc = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(param_gridrfrt1rsc)

In [None]:
# Randomized search: sample n_iter=25 parameter combinations from
# param_gridrfrt1rsc and score each with 3-fold CV. random_state=50 fixes
# which combinations are drawn; n_jobs is not set, so the search does not
# request parallel workers.
CV_rfrt1rsv = RandomizedSearchCV(estimator = rfr, param_distributions = param_gridrfrt1rsc, n_iter = 25, cv = 3, verbose=2, random_state=50)
# Fit the random search model
CV_rfrt1rsv.fit(train_xr, train_yr)