In [1]:
import sklearn
from sklearn.naive_bayes import *
import pandas as pd
import numpy as np
from sklearn import *
import os
from sklearn.metrics import *
from sklearn import metrics, preprocessing
from sklearn import svm, naive_bayes, neighbors, tree

In [2]:
TRAINING_LINE_NUMBER = 6000000  # input files lines to be read
YEARS = ['2015', '2016', '2017', '2018', '2019', '2020', '2021'] # years of data
INPUT_FILE_PATH = ".\\flights\\" #input datasets location
SKIP_FIRST_LINE = True  # Skipping the first line as it is Header

In [3]:
allyears = []
print ("Reading into Pandas frame...")
try:
    for year in YEARS:
        path = os.path.join(INPUT_FILE_PATH, '%d.csv' % int(year))
        raw_data = pd.read_csv(
            path, nrows=TRAINING_LINE_NUMBER, encoding = "ISO-8859-1", skiprows=0, usecols=[
                u'Year',
                u'Month',
                u'DayofMonth',
                u'DayOfWeek',
                u'UniqueCarrier',
                u'DepTime',
                u'Origin',
                u'Dest',
                u'DepDelay',
                u'Cancelled',
            ])
        print (int(year))
        print (len(raw_data))
        raw_data = raw_data[raw_data['Cancelled'] == 0] # Normalizing the data by removing the cancelled flights
        rows = np.random.choice(
            np.random.permutation(raw_data.index.values), len(raw_data) // 3, replace=False)# 33% sampling of training data  
        sampled_raw_data = raw_data.loc[rows]
        sampled_raw_data = raw_data
        allyears.append(sampled_raw_data)
        print
except Exception as e:
    print ("Reading CSV file got failed", e)


Reading into Pandas frame...
2015
1048575
2016
1048575
2017
1048575
2018
1048575
2019
1048575
2020
1048575
2021
1048575


In [4]:
# Building the master frame by clubbing it all years
raw_data_all = pd.concat(allyears, ignore_index=True)
allyears = []
raw_data = []

print ("Total length - ", len(raw_data_all))
del raw_data_all['Cancelled']  # As we already did the data normalization based on the cancelled flights we no 
                               #longer need this column
raw_data_all.fillna(0, inplace=True)

Total length -  7150557


In [5]:
# Converting to appropriate datatypes for numeric cols.
raw_data_all['Year'] = raw_data_all['Year'].astype('int')
raw_data_all['Month'] = raw_data_all['Month'].astype('int')
raw_data_all['DayofMonth'] = raw_data_all['DayofMonth'].astype('int')
raw_data_all['DayOfWeek'] = raw_data_all['DayOfWeek'].astype('int')
raw_data_all['DepTime'] = raw_data_all['DepTime'].astype('int')
raw_data_all['DepDelay'] = raw_data_all['DepDelay'].astype('int')

In [6]:
raw_data_all

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,DepDelay,Origin,Dest
0,2015,1,13,7,2231,US,-4,PIT,CLT
1,2015,1,14,1,2230,US,-5,PIT,CLT
2,2015,1,15,2,2230,US,-5,PIT,CLT
3,2015,1,16,3,2230,US,-5,PIT,CLT
4,2015,1,17,4,2227,US,-8,PIT,CLT
...,...,...,...,...,...,...,...,...,...
7150552,2021,2,13,3,649,9E,19,ATW,DTW
7150553,2021,2,14,4,626,9E,-4,ATW,DTW
7150554,2021,2,15,5,628,9E,-2,ATW,DTW
7150555,2021,2,16,6,625,9E,-5,ATW,DTW


In [7]:
# Since we dont have a classification label in the data, we are creating
# one. Threshold of 15mins was chosen.
print ("Calculating classification label...")
raw_data_all['label'] = 0
raw_data_all.label[raw_data_all.DepDelay >= 5] = 1
raw_data_all.label[raw_data_all.DepDelay < 5] = 0

del raw_data_all['DepDelay'] #delete the column as we already

print ("Dataframe shape - ", raw_data_all.shape)
print ("Columns -", raw_data_all.columns)

Calculating classification label...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw_data_all.label[raw_data_all.DepDelay >= 5] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw_data_all.label[raw_data_all.DepDelay < 5] = 0


Dataframe shape -  (7150557, 9)
Columns - Index(['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'UniqueCarrier',
       'Origin', 'Dest', 'label'],
      dtype='object')


In [8]:
# Converting categorical data to numeric for cols - UniqueCarrier, Dest, Origin
for col in set(raw_data_all.columns):
    if raw_data_all[col].dtype == np.dtype('object'):
        print ("Converting...", col)
        if col == 'UniqueCarrier':
            s = np.unique(raw_data_all[col].values)
            UniqueCarrier = pd.Series([x[0] for x in enumerate(s)], index=s)
        if col == 'Dest':
            s = np.unique(raw_data_all[col].values)
            Dest = pd.Series([x[0] for x in enumerate(s)], index=s)
        if col == 'Origin':
            s = np.unique(raw_data_all[col].values)
            Origin = pd.Series([x[0] for x in enumerate(s)], index=s)


Converting... UniqueCarrier
Converting... Dest
Converting... Origin


In [9]:
#to fetch the corresponding numeric value for the categorical variable.
def getDest(inDest):
    out = []
    for x, y in inDest.iteritems():
        out.append(Dest._get_value(y))
    return out

#to fetch the corresponding numeric value for the categorical variable.
def getOrigin(inOrign):
    out = []
    for x, y in inOrign.iteritems():
        out.append(Origin._get_value(y))
    return out

#to fetch the corresponding numeric value for the categorical variable.

def getCarrier(inCarrier):
    out = []
    for x, y in inCarrier.iteritems():
        out.append(UniqueCarrier._get_value(y))
    return out

In [10]:
# Conversion of unique Carrier
raw_data_all['UniqueCarrier'] = getCarrier(raw_data_all['UniqueCarrier'])
print ("UniqueCarrier completed.")

# Conversion of Destination
raw_data_all['Dest'] = getDest(raw_data_all['Dest'])
print ("Dest completed.")

# Conversion of Origin
raw_data_all['Origin'] = getOrigin(raw_data_all['Origin'])
print ("Origin completed.")   

UniqueCarrier completed.
Dest completed.
Origin completed.


In [11]:
# Lets build the crosss validation model

features = raw_data_all.columns[0:8]
target_names = ['Not Delayed', 'Delayed']

#lists for storing results of cross validation.
accuracy = {}
results = {}
matrix = {}
precision = {}
recall = {}

for year in YEARS:
    print ("Testing for the year - ", year)
    train = raw_data_all[raw_data_all['Year'] != int(year)]  # Testing using one year and the rest for training
    test = raw_data_all[raw_data_all['Year'] == int(year)]
    #to avoid memory issues we are taking only 50% of the data.
    rows = np.random.choice(np.random.permutation(
                            test.index.values), len(test) // 2, replace=False)
    output_data = test.loc[rows]
    output_data = test
    # lable data is stored to train targets
    trainTargets = np.array(train['label']).astype(int)
    # lable data is stored to test targets
    testTargets = np.array(output_data['label']).astype(int)
    print (train['Year'])
    print (test['Year'])
    print ("Model Started...")
    # defining the Naive Bayesian Classifier and fitting the data.
    gnb = GaussianNB()
    y_gnb = gnb.fit(train[features], trainTargets).predict(output_data[features])
    # lets store the predicted result in new column.
    output_data['pred_label'] = y_gnb
    output_data.to_csv(
        INPUT_FILE_PATH + "\dfTest" + year + ".csv", index=False)
# Calculating metrics using sklearn
    print ("\nCalculating metrics...")
    accuracy[int(year)] = accuracy_score(output_data['label'], y_gnb)
    print ("Accuracy score - ", accuracy[int(year)])
    precision[int(year)] = precision_score(
        output_data['label'], y_gnb, average='micro')
    print ("Precision Score - ", precision[int(year)])
    recall[int(year)] = recall_score(
        output_data['label'], y_gnb, average='micro')
    print ("Recall Score - ", recall[int(year)])
    print ("Confusion matrix")
    matrix[int(year)] = metrics.confusion_matrix(
        output_data['label'], y_gnb)
    print (matrix[int(year)])
    results[int(year)] = precision_recall_fscore_support(
        output_data['label'], y_gnb, average='micro')
    print ("Precision, recall, F-Score, Support - ", results[int(year)])
    print ("Classification report")
    print (classification_report(np.array(output_data['label']), y_gnb,
                                target_names=target_names))
    print
    train = []
    test = []

Testing for the year -  2015
1034620    2016
1034621    2016
1034622    2016
1034623    2016
1034624    2016
           ... 
7150552    2021
7150553    2021
7150554    2021
7150555    2021
7150556    2021
Name: Year, Length: 6115937, dtype: int32
0          2015
1          2015
2          2015
3          2015
4          2015
           ... 
1034615    2015
1034616    2015
1034617    2015
1034618    2015
1034619    2015
Name: Year, Length: 1034620, dtype: int32
Model Started...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output_data['pred_label'] = y_gnb



Calculating metrics...
Accuracy score -  0.737375074906729
Precision Score -  0.737375074906729
Recall Score -  0.737375074906729
Confusion matrix
[[762901      1]
 [271716      2]]
Precision, recall, F-Score, Support -  (0.737375074906729, 0.737375074906729, 0.737375074906729, None)
Classification report
              precision    recall  f1-score   support

 Not Delayed       0.74      1.00      0.85    762902
     Delayed       0.67      0.00      0.00    271718

    accuracy                           0.74   1034620
   macro avg       0.70      0.50      0.42   1034620
weighted avg       0.72      0.74      0.63   1034620

Testing for the year -  2016
0          2015
1          2015
2          2015
3          2015
4          2015
           ... 
7150552    2021
7150553    2021
7150554    2021
7150555    2021
7150556    2021
Name: Year, Length: 6130231, dtype: int32
1034620    2016
1034621    2016
1034622    2016
1034623    2016
1034624    2016
           ... 
2054941    2016
205494

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output_data['pred_label'] = y_gnb



Calculating metrics...
Accuracy score -  0.767912412307439
Precision Score -  0.767912412307439
Recall Score -  0.767912412307439
Confusion matrix
[[783055    776]
 [236029    466]]
Precision, recall, F-Score, Support -  (0.767912412307439, 0.767912412307439, 0.767912412307439, None)
Classification report
              precision    recall  f1-score   support

 Not Delayed       0.77      1.00      0.87    783831
     Delayed       0.38      0.00      0.00    236495

    accuracy                           0.77   1020326
   macro avg       0.57      0.50      0.44   1020326
weighted avg       0.68      0.77      0.67   1020326

Testing for the year -  2017
0          2015
1          2015
2          2015
3          2015
4          2015
           ... 
7150552    2021
7150553    2021
7150554    2021
7150555    2021
7150556    2021
Name: Year, Length: 6127525, dtype: int32
2054946    2017
2054947    2017
2054948    2017
2054949    2017
2054950    2017
           ... 
3077973    2017
307797

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output_data['pred_label'] = y_gnb



Calculating metrics...
Accuracy score -  0.726128801445116
Precision Score -  0.726128801445116
Recall Score -  0.726128801445116
Confusion matrix
[[741358   1753]
 [278426   1495]]
Precision, recall, F-Score, Support -  (0.726128801445116, 0.726128801445116, 0.726128801445116, None)
Classification report
              precision    recall  f1-score   support

 Not Delayed       0.73      1.00      0.84    743111
     Delayed       0.46      0.01      0.01    279921

    accuracy                           0.73   1023032
   macro avg       0.59      0.50      0.43   1023032
weighted avg       0.65      0.73      0.61   1023032

Testing for the year -  2018
0          2015
1          2015
2          2015
3          2015
4          2015
           ... 
7150552    2021
7150553    2021
7150554    2021
7150555    2021
7150556    2021
Name: Year, Length: 6135411, dtype: int32
3077978    2018
3077979    2018
3077980    2018
3077981    2018
3077982    2018
           ... 
4093119    2018
409312

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output_data['pred_label'] = y_gnb



Calculating metrics...
Accuracy score -  0.6909084998611037
Precision Score -  0.6909084998611037
Recall Score -  0.6909084998611037
Confusion matrix
[[698248   3700]
 [310073   3125]]
Precision, recall, F-Score, Support -  (0.6909084998611037, 0.6909084998611037, 0.6909084998611037, None)
Classification report
              precision    recall  f1-score   support

 Not Delayed       0.69      0.99      0.82    701948
     Delayed       0.46      0.01      0.02    313198

    accuracy                           0.69   1015146
   macro avg       0.58      0.50      0.42   1015146
weighted avg       0.62      0.69      0.57   1015146

Testing for the year -  2019
0          2015
1          2015
2          2015
3          2015
4          2015
           ... 
7150552    2021
7150553    2021
7150554    2021
7150555    2021
7150556    2021
Name: Year, Length: 6121675, dtype: int32
4093124    2019
4093125    2019
4093126    2019
4093127    2019
4093128    2019
           ... 
5122001    2019


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output_data['pred_label'] = y_gnb



Calculating metrics...
Accuracy score -  0.7069401544589176
Precision Score -  0.7069401544589176
Recall Score -  0.7069401544589176
Confusion matrix
[[715623  11896]
 [289628  11735]]
Precision, recall, F-Score, Support -  (0.7069401544589176, 0.7069401544589176, 0.7069401544589176, None)
Classification report
              precision    recall  f1-score   support

 Not Delayed       0.71      0.98      0.83    727519
     Delayed       0.50      0.04      0.07    301363

    accuracy                           0.71   1028882
   macro avg       0.60      0.51      0.45   1028882
weighted avg       0.65      0.71      0.61   1028882

Testing for the year -  2020
0          2015
1          2015
2          2015
3          2015
4          2015
           ... 
7150552    2021
7150553    2021
7150554    2021
7150555    2021
7150556    2021
Name: Year, Length: 6136615, dtype: int32
5122006    2020
5122007    2020
5122008    2020
5122009    2020
5122010    2020
           ... 
6135943    2020


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output_data['pred_label'] = y_gnb



Calculating metrics...
Accuracy score -  0.6533509806280833
Precision Score -  0.6533509806280833
Recall Score -  0.6533509806280833
Confusion matrix
[[642298  14235]
 [337247  20162]]
Precision, recall, F-Score, Support -  (0.6533509806280833, 0.6533509806280833, 0.6533509806280833, None)
Classification report
              precision    recall  f1-score   support

 Not Delayed       0.66      0.98      0.79    656533
     Delayed       0.59      0.06      0.10    357409

    accuracy                           0.65   1013942
   macro avg       0.62      0.52      0.44   1013942
weighted avg       0.63      0.65      0.54   1013942

Testing for the year -  2021
0          2015
1          2015
2          2015
3          2015
4          2015
           ... 
6135943    2020
6135944    2020
6135945    2020
6135946    2020
6135947    2020
Name: Year, Length: 6135948, dtype: int32
6135948    2021
6135949    2021
6135950    2021
6135951    2021
6135952    2021
           ... 
7150552    2021


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output_data['pred_label'] = y_gnb



Calculating metrics...
Accuracy score -  0.6630741497463555
Precision Score -  0.6630741497463555
Recall Score -  0.6630741497463555
Confusion matrix
[[606913  58980]
 [282868  65848]]
Precision, recall, F-Score, Support -  (0.6630741497463555, 0.6630741497463555, 0.6630741497463555, None)
Classification report
              precision    recall  f1-score   support

 Not Delayed       0.68      0.91      0.78    665893
     Delayed       0.53      0.19      0.28    348716

    accuracy                           0.66   1014609
   macro avg       0.60      0.55      0.53   1014609
weighted avg       0.63      0.66      0.61   1014609



In [12]:
print ("Accuracy\n", accuracy)
print ("\nPrecision\n", precision)
print ("\nRecall\n", recall)
print ("\nMetrics\n", results)
print ("\nMatrix\n", matrix)

Accuracy
 {2015: 0.737375074906729, 2016: 0.767912412307439, 2017: 0.726128801445116, 2018: 0.6909084998611037, 2019: 0.7069401544589176, 2020: 0.6533509806280833, 2021: 0.6630741497463555}

Precision
 {2015: 0.737375074906729, 2016: 0.767912412307439, 2017: 0.726128801445116, 2018: 0.6909084998611037, 2019: 0.7069401544589176, 2020: 0.6533509806280833, 2021: 0.6630741497463555}

Recall
 {2015: 0.737375074906729, 2016: 0.767912412307439, 2017: 0.726128801445116, 2018: 0.6909084998611037, 2019: 0.7069401544589176, 2020: 0.6533509806280833, 2021: 0.6630741497463555}

Metrics
 {2015: (0.737375074906729, 0.737375074906729, 0.737375074906729, None), 2016: (0.767912412307439, 0.767912412307439, 0.767912412307439, None), 2017: (0.726128801445116, 0.726128801445116, 0.726128801445116, None), 2018: (0.6909084998611037, 0.6909084998611037, 0.6909084998611037, None), 2019: (0.7069401544589176, 0.7069401544589176, 0.7069401544589176, None), 2020: (0.6533509806280833, 0.6533509806280833, 0.65335098

In [13]:
# Finding mean of metrics
print ("\nMean Cross validation Precision score", np.mean(pd.Series(precision)))
print ("\nMean Cross validation Recall score", np.mean(pd.Series(recall)))
print ("\nMean Cross validation Accuracy score", np.mean(pd.Series(accuracy)))



Mean Cross validation Precision score 0.7065271533362493

Mean Cross validation Recall score 0.7065271533362493

Mean Cross validation Accuracy score 0.7065271533362493
