In [2]:
# import common libs
from pyspark.sql import SparkSession
import pandas as pd
import numpy as np
from pyspark.sql.types import *

from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

# Import the evaluation submodule
import pyspark.ml.evaluation as evals

# Import LogisticRegression
from pyspark.ml.classification import LogisticRegression

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

spark = SparkSession \
        .builder \
        .master("local[*]") \
        .config("spark.executor.memory", "16g") \
        .config("spark.driver.memory", "16g") \
        .config("spark.memory.offHeap.enabled",True) \
        .config("spark.memory.offHeap.size","16g") \
        .appName("Airline") \
        .getOrCreate()


        # .config("spark.sql.shuffle.partitions",60) \

In [3]:
flights_df = spark.read.option("header",True).csv('Data_Dump/data/flights/2013.csv')
flights_df.createOrReplaceTempView("Flights")
flights_df=flights_df.select([when(col(c).contains('None') | \
                            col(c).contains('NULL') | \
                            (col(c) == '' ) | \
                            col(c).isNull() | \
                            isnan(c),0).otherwise(col(c)).alias(c) for c in flights_df.columns])
#checking if the file has any NaN values
flights_df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in flights_df.columns]).show()


+-------+----------+-----------------+------+----+------------+--------+---------+--------+----------+---------+-------+------------+--------+---------+---------+-----------------+--------+----------------+-------------------+--------+--------+-------------+-------------+---------+--------------+-------------------+-----------+
|FL_DATE|OP_CARRIER|OP_CARRIER_FL_NUM|ORIGIN|DEST|CRS_DEP_TIME|DEP_TIME|DEP_DELAY|TAXI_OUT|WHEELS_OFF|WHEELS_ON|TAXI_IN|CRS_ARR_TIME|ARR_TIME|ARR_DELAY|CANCELLED|CANCELLATION_CODE|DIVERTED|CRS_ELAPSED_TIME|ACTUAL_ELAPSED_TIME|AIR_TIME|DISTANCE|CARRIER_DELAY|WEATHER_DELAY|NAS_DELAY|SECURITY_DELAY|LATE_AIRCRAFT_DELAY|Unnamed: 27|
+-------+----------+-----------------+------+----+------------+--------+---------+--------+----------+---------+-------+------------+--------+---------+---------+-----------------+--------+----------------+-------------------+--------+--------+-------------+-------------+---------+--------------+-------------------+-----------+
|      0| 

In [4]:
from pyspark.sql.functions import *
 
df1 = flights_df.select(col('ORIGIN'), \
            col('DEST'), \
            col('OP_CARRIER'), \
            col('ARR_DELAY'),to_date(flights_df.FL_DATE, 'yyyy-MM-dd').alias('FL_DATE'))

In [8]:
fin=df1.withColumn('MONTH', month(df1.FL_DATE)).withColumn('DAY', month(df1.FL_DATE)).drop('FL_DATE')

In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder



le_dest = LabelEncoder()

fl_df=fin.toPandas()
le_carrier = LabelEncoder()
fl_df['OP_CARRIER']= le_carrier.fit_transform(fl_df['OP_CARRIER'])
le_origin = LabelEncoder()
fl_df['ORIGIN'] = le_origin.fit_transform(fl_df['ORIGIN'])
le_dest = LabelEncoder()
fl_df['DEST'] = le_dest.fit_transform(fl_df['DEST'])


In [8]:
# fl_df=fin.toPandas()
# df = pd.concat([fl_df,pd.get_dummies(fl_df['OP_CARRIER'],drop_first=True,prefix="OP_CARRIER")],axis=1)
# df = pd.concat([fl_df,pd.get_dummies(fl_df['ORIGIN'],drop_first=True,prefix="ORIGIN")],axis=1)
# df = pd.concat([fl_df,pd.get_dummies(fl_df['DEST'],drop_first=True,prefix="DEST")],axis=1)
# df = pd.concat([fl_df,pd.get_dummies(fl_df['DAY'],drop_first=True,prefix="DAY")],axis=1)
# df = pd.concat([fl_df,pd.get_dummies(fl_df['MONTH'],drop_first=True,prefix="MONTH")],axis=1)
# df.drop(['ORIGIN','DEST','OP_CARRIER','DAY','MONTH'],axis=1,inplace=True)


In [10]:
fl_df

Unnamed: 0,ORIGIN,DEST,OP_CARRIER,ARR_DELAY,MONTH,DAY
0,177,149,13,-27.0,1,1
1,177,149,13,-9.0,1,1
2,168,276,13,-26.0,1,1
3,239,176,13,14.0,1,1
4,177,237,13,-38.0,1,1
...,...,...,...,...,...,...
6369477,157,228,9,2.0,12,12
6369478,230,156,9,-9.0,12,12
6369479,67,181,9,61.0,12,12
6369480,182,67,9,-9.0,12,12


In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(fl_df.drop('ARR_DELAY',axis=1),fl_df['ARR_DELAY'], test_size=0.30)


In [11]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.preprocessing import LabelEncoder

Dtc = DecisionTreeRegressor(random_state = 2)
Dtc.fit(X_train,y_train)
predictedValues = Dtc.predict(X_test)
print ('MAE:' ,  mean_absolute_error(y_test, predictedValues))
print ('MSE:' , mean_squared_error(y_test, predictedValues))
print('RMSE:' , np.sqrt(mean_squared_error(y_test, predictedValues)))
print ('R2:' , r2_score(y_test, predictedValues))



MAE: 20.449672837161316
MSE: 1374.1367784471722
RMSE: 37.06935093102079
R2: 0.03374804607933446


In [28]:
X_test.head() 1307321

Unnamed: 0,ORIGIN,DEST,OP_CARRIER,MONTH,DAY
3708733,209,88,9,7,7
2023804,88,102,10,4,4
1307321,195,246,3,3,3
3337202,146,238,14,7,7
4040207,243,20,4,8,8


In [11]:
from sklearn.preprocessing import LabelEncoder

x1 = [m,d]
x2 = [c, o, dest]
x1.extend(x2)
df1 = pd.DataFrame(data = [x1], columns = ['MONTH','DAY', 'OP_CARRIER', 'ORIGIN', 'DEST'])
#df1['OP_CARRIER'] = le_carrier.fit(df1['OP_CARRIER'])
df1['OP_CARRIER'] = le_carrier.transform(df1['OP_CARRIER'])
df1['ORIGIN'] = le_origin.transform(df1['ORIGIN'])
df1['DEST'] = le_dest.transform(df1['DEST'])
x = df1.iloc[:, :5].values
ans = Dtc.predict(x)
output = ans

In [16]:
print("the flight is delayed for: ", str(output[0]))

the flight is delayed for:  26.045454545454547


In [37]:

from tkinter import *
 
top = Tk()  
top.geometry("250x250") 
def printValue():
    m = month.get()
    d=day.get()
    c=carrier.get()
    o=origin.get()
    dest=desti.get()
    x1 = [m,d]
    x2 = [c, o, dest]
    x1.extend(x2)
    df1 = pd.DataFrame(data = [x1], columns = ['MONTH','DAY', 'OP_CARRIER', 'ORIGIN', 'DEST'])
    #df1['OP_CARRIER'] = le_carrier.fit(df1['OP_CARRIER'])
    df1['OP_CARRIER'] = le_carrier.transform(df1['OP_CARRIER'])
    df1['ORIGIN'] = le_origin.transform(df1['ORIGIN'])
    df1['DEST'] = le_dest.transform(df1['DEST'])
    x = df1.iloc[:, :5].values
    ans = Dtc.predict(x)
    output = ans
    Label(top, text=f'{output}, is your delay!', bg='#ffbf00').grid(row=13, column=1, sticky=W) 
# the label for user_name

   # the label for user_name
# month_lbl = Label(top,text = "month in number").place(x = 40,y = 60) 
   
# # the label for user_password 
# day_lbl = Label(top,text = "day in number").place(x = 40, y = 100)
# carrier_lbl = Label(top,text = "Carrier").place(x = 40,y = 140) 
   
# # the label for user_password 
# origin_lbl = Label(top,text = "Origin").place(x = 40, y = 180)
# dest_lbl = Label(top,text = "Destination").place(x = 40, y = 220)
month = Entry(top,width = 30)
month.grid(row=1, column=1, sticky=W)
month.insert(0, 'month in numbers')
day = Entry(top,width = 30)
day.grid(row=3, column=1, sticky=W)
day.insert(0, 'day of month in numbers')
carrier = Entry(top,width = 30)
carrier.grid(row=5, column=1, sticky=W)
carrier.insert(0, 'carrier')
origin = Entry(top,width = 30)
origin.grid(row=7, column=1, sticky=W)
origin.insert(0, 'origin')
desti = Entry(top,width = 30)
desti.grid(row=9, column=1, sticky=W)
desti.insert(0, 'dest')

# day=Entry(top,width = 30).place(x = 110,y = 100)
# carrier=Entry(top,width = 30).place(x = 110,y = 140)
# origin=Entry(top,width = 30).place(x = 110,y = 180)
# destination=Entry(top,width = 30).place(x = 110,y = 220)
submit_button = Button(top,text = "Submit", bg="green", command=printValue).grid(row=11, column=1, sticky=W)
top.mainloop()