In [None]:
import pandas as pd
import numpy as np

#Importing libraries  for visualisation of data
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
matplotlib.rcParams.update({'font.size': 12})

#Importing sklearn libraries for modelling and evaluation

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsRegressor as knnr
from sklearn.metrics import mean_squared_error as mse



from random import randint #To generate random numbers in a given range

#Importing datetime module
from time import time
from datetime import date, timedelta #For creating additional time based features


from sklearn.preprocessing import MinMaxScaler ## Importing the MinMax Scaler

from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, roc_curve, roc_auc_score, auc, mean_squared_log_error


#importing all the important libraries
#Importing XGBoost module
import xgboost as xgb 
from xgboost import plot_importance
from xgboost import XGBRegressor #For modelling train data to predict Sales

pd.set_option('display.max_columns', None)  #To display all the columns in dataset

In [None]:
# Load the pre-processed flights table.
ppf = pd.read_csv('processedflights15july.csv')

# Features: everything except the arrival-delay target and the label derived for classification.
X = ppf.drop(["Arrival Delay (Minutes)", "Delayed?"],axis=1)
y = ppf["Arrival Delay (Minutes)"]      # target for arrival-delay regression
y1 = ppf["Delayed?"]                    # target for classification
y2 = ppf["Departure Delay (Minutes)"]   # target for departure-delay regression

# Split row positions once (stratified on the class label) and reuse them for
# every target, so the three tasks share exactly the same train / validation /
# test rows without relying on separate calls happening to line up.
rows = np.arange(len(ppf))
train_rows, test_rows = train_test_split(rows, test_size=0.2, random_state=50, stratify=y1)
fit_rows, val_rows = train_test_split(train_rows, test_size=0.2, random_state=50,
                                      stratify=y1.iloc[train_rows])

# Learn the min/max on the inner training rows only; fitting the scaler on all
# rows would leak validation/test statistics into the features.
# Held-out rows can therefore scale slightly outside [0, 1].
scaler = MinMaxScaler()
scaler.fit(X.iloc[fit_rows])
Xs = scaler.transform(X)

# X still contains "Departure Delay (Minutes)", which is the target y2.
# MinMax scaling is per column, so dropping that column from Xs gives the
# leak-free feature matrix for the departure-delay task.
Xs_dep = Xs[:, X.columns != "Departure Delay (Minutes)"]

#for classification
train_x, test_x = Xs[train_rows], Xs[test_rows]
train_y, test_y = y1.iloc[train_rows], y1.iloc[test_rows]
train_x1, val_x = Xs[fit_rows], Xs[val_rows]
train_y1, val_y = y1.iloc[fit_rows], y1.iloc[val_rows]

# Report the three disjoint parts (train_x itself still includes the validation rows).
print('training data    ',train_x1.shape,train_y1.shape)
print('validation data  ',val_x.shape,val_y.shape)
print('test data        ',test_x.shape,test_y.shape)

#for arrival regression
train_xr, test_xr = Xs[train_rows], Xs[test_rows]
train_yr, test_yr = y.iloc[train_rows], y.iloc[test_rows]
train_x1r, val_xr = Xs[fit_rows], Xs[val_rows]
train_y1r, val_yr = y.iloc[fit_rows], y.iloc[val_rows]

print('training data    ',train_x1r.shape,train_y1r.shape)
print('validation data  ',val_xr.shape,val_yr.shape)
print('test data        ',test_xr.shape,test_yr.shape)

#for delay regression
train_xr1, test_xr1 = Xs_dep[train_rows], Xs_dep[test_rows]
train_yr1, test_yr1 = y2.iloc[train_rows], y2.iloc[test_rows]
train_x1r1, val_xr1 = Xs_dep[fit_rows], Xs_dep[val_rows]
train_y1r1, val_yr1 = y2.iloc[fit_rows], y2.iloc[val_rows]

print('training data    ',train_x1r1.shape,train_y1r1.shape)
print('validation data  ',val_xr1.shape,val_yr1.shape)
print('test data        ',test_xr1.shape,test_yr1.shape)

K-NEAREST NEIGHBOURS

Classification (Base Model)

In [None]:
# Baseline KNN classifier with 10 neighbours
clf = KNeighborsClassifier(n_neighbors = 10)

# Fit on the inner training split only: train_x still contains the validation
# rows, so fitting on it would score the model on rows it has already seen.
clf.fit(train_x1, train_y1)

# Predict on the validation split and compute F1 (true labels first, as f1_score expects)
test_predict = clf.predict(val_x)
k = f1_score(val_y, test_predict)
print('Validation F1 Score    ', k )

In [None]:
def Elbow(K, fit_x=None, fit_y=None, eval_x=None, eval_y=None):
    """Return the F1 error (1 - F1) of a KNN classifier for each neighbour count.

    Parameters
    ----------
    K : iterable of int
        Candidate values for ``n_neighbors``.
    fit_x, fit_y : array-like, optional
        Features / labels the classifier is fitted on. Default to the
        notebook-level ``train_x`` / ``train_y``.
    eval_x, eval_y : array-like, optional
        Features / labels the classifier is scored on. Default to the
        notebook-level ``test_x`` / ``test_y``.

    Returns
    -------
    list of float
        ``1 - f1_score`` for every entry of ``K``, in iteration order.

    Notes
    -----
    With the defaults, K is chosen by looking at the test split. To keep the
    test split untouched, pass the inner training split as ``fit_x``/``fit_y``
    and the validation split as ``eval_x``/``eval_y``.
    """
    # Resolve defaults at call time so the latest notebook-level splits are used.
    fit_x = train_x if fit_x is None else fit_x
    fit_y = train_y if fit_y is None else fit_y
    eval_x = test_x if eval_x is None else eval_x
    eval_y = test_y if eval_y is None else eval_y

    test_error = []
    # Train one model per candidate neighbour count
    for n_neighbors in K:
        model = KNeighborsClassifier(n_neighbors=n_neighbors)
        model.fit(fit_x, fit_y)
        # f1_score takes the true labels first, then the predictions
        score = f1_score(eval_y, model.predict(eval_x))
        test_error.append(1 - score)

    return test_error

In [None]:
# Candidate neighbour counts: the even values 6, 8, ..., 18.
k = range(6, 20, 2)
# One error value (1 - F1) per candidate, in the same order as k.
test = Elbow(k)

In [None]:
# Draw the classification error against the number of neighbours
ax = plt.gca()
ax.plot(k, test)
ax.set_xlabel('K Neighbors')
ax.set_ylabel('Test error')
ax.set_title('Elbow Curve for classification')

Regression (Base Model)

In [None]:
# Baseline KNN regressor (11 neighbours), fitted on the arrival-delay training split
reg = knnr(n_neighbors=11).fit(train_xr, train_yr)

# Predict on the test split and report the mean squared error
test_predict = reg.predict(test_xr)
k = mse(test_predict, test_yr)
print('Test MSE    ', k)

In [None]:
def Elbow(K, fit_x=None, fit_y=None, eval_x=None, eval_y=None):
    """Return the MSE of a KNN regressor for each neighbour count.

    Parameters
    ----------
    K : iterable of int
        Candidate values for ``n_neighbors``.
    fit_x, fit_y : array-like, optional
        Features / targets the regressor is fitted on. Default to the
        notebook-level arrival-delay split ``train_xr`` / ``train_yr``.
    eval_x, eval_y : array-like, optional
        Features / targets the regressor is scored on. Default to the
        notebook-level ``test_xr`` / ``test_yr``.

    Returns
    -------
    list of float
        Mean squared error for every entry of ``K``, in iteration order.
    """
    # Default to the regression splits: the classification split (train_y / test_y)
    # carries the "Delayed?" label, not the arrival-delay target.
    fit_x = train_xr if fit_x is None else fit_x
    fit_y = train_yr if fit_y is None else fit_y
    eval_x = test_xr if eval_x is None else eval_x
    eval_y = test_yr if eval_y is None else eval_y

    test_mse = []
    # Train one model per candidate neighbour count
    for n_neighbors in K:
        model = knnr(n_neighbors=n_neighbors)
        model.fit(fit_x, fit_y)
        # The error function takes the true values first, then the predictions
        test_mse.append(mse(eval_y, model.predict(eval_x)))

    return test_mse

In [None]:
# Candidate neighbour counts: the even values 6, 8, ..., 18.
k = range(6, 20, 2)
# One MSE value per candidate, in the same order as k.
test = Elbow(k)

In [None]:
# Draw the regression error against the number of neighbours
ax = plt.gca()
ax.plot(k, test)
ax.set_xlabel('K Neighbors')
ax.set_ylabel('Test error')
ax.set_title('Elbow Curve for regression')

Cross-Validation

In [None]:
# Manual 10-fold cross-validation over n_neighbors = 1..11 on the training split.
num_folds = 10
k_fold = KFold(num_folds)

# Labels as a plain array so the positional fold indices can index them directly
nparray_label_list = np.array(train_y)
best_hyperparam_setting = None
best_cross_val_score = -np.inf  # assumes that a higher score is better
cross_val_scorelist=[]
for k in range(1,12):
    fold_scores = []

    for proper_train_indices, val_indices in k_fold.split(train_x):
        proper_train_features = train_x[proper_train_indices]
        proper_train_labels = nparray_label_list[proper_train_indices]
        val_features = train_x[val_indices]
        val_labels = nparray_label_list[val_indices]

        classifier = KNeighborsClassifier(n_neighbors=k)
        classifier.fit(proper_train_features, proper_train_labels)
        predicted_val_labels = classifier.predict(val_features)
        fold_scores.append(f1_score(val_labels, predicted_val_labels, average='weighted'))

    # Do not name this `cross_val_score`: that would overwrite the function
    # imported from sklearn.model_selection.
    mean_fold_score = np.mean(fold_scores)
    cross_val_scorelist.append(mean_fold_score)
    print(mean_fold_score)
    if mean_fold_score > best_cross_val_score:  # assumes that a higher score is better
        best_cross_val_score = mean_fold_score
        best_hyperparam_setting = k

print('Best hyperparameter setting for knn is:', best_hyperparam_setting)

In [None]:
# Show the mean score per k, then plot it against k = 1..11
print(cross_val_scorelist)
ax = plt.gca()
ax.plot(range(1,12), cross_val_scorelist, color='blue')
ax.set_title('cross_val_score vs k')
ax.set_xlabel('k')
ax.set_ylabel('cross_val_score')

In [None]:
# ROC curve of the KNN classifier on the held-out test split.
# The model is refitted here with the selected neighbour count rather than
# reusing the last per-fold model left over from the cross-validation loop.
roc_model = KNeighborsClassifier(n_neighbors=best_hyperparam_setting)
roc_model.fit(train_x, train_y)

# calculate the fpr and tpr for all thresholds of the classification
# Score the scaled test features against the class labels.
probs = roc_model.predict_proba(test_x)
preds = probs[:,1]  # NOTE(review): assumes the second class is the positive "delayed" label - confirm
fpr, tpr, threshold = roc_curve(test_y, preds)
roc_auc = auc(fpr, tpr)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')  # chance-level diagonal for reference
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

In [None]:
# Final KNN classifier, using the neighbour count selected by the cross-validation search
finalclassifierknn = KNeighborsClassifier(n_neighbors = best_hyperparam_setting)
# Fit on the scaled training features and the "Delayed?" labels. The raw X is
# unscaled and y is the arrival delay in minutes, which is not a class label.
# The test split stays out of the fit so the model can still be evaluated on it.
finalclassifierknn.fit(train_x, train_y)