In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [2]:
spark = SparkSession.builder.getOrCreate()

In [3]:
df_test = spark.read.option('inferSchema', 'true').csv('PlaneDelayTesting.csv', header=True)
df_train = spark.read.option('inferSchema', 'true').csv('PlaneDelayTraining.csv', header=True)

In [4]:
df_test = df_test.select('Airline', 'TimeDeparture', 'IsDelayed')
df_train = df_train.select('Airline', 'TimeDeparture', 'IsDelayed')
df_train.show(5)

+-------+-------------+---------+
|Airline|TimeDeparture|IsDelayed|
+-------+-------------+---------+
|     DL|          650|        0|
|     DL|          650|        0|
|     DL|          650|        0|
|     DL|          650|        0|
|     DL|          650|        0|
+-------+-------------+---------+
only showing top 5 rows



In [5]:
df_test = df_test.na.drop()
df_train = df_train.na.drop()

In [6]:
def transform(df):
    df = df.withColumn("Airline", when(df["Airline"] == "DL", 0).\
                      when(df["Airline"] == "FL", 1).\
                      when(df["Airline"] == "EV", 2))
    return df

df_test = transform(df_test)
df_train = transform(df_train)

df_test.show()

cols = df_train.columns
cols.remove("IsDelayed")

assembler = VectorAssembler(inputCols=cols, outputCol="Features")
scaler = StandardScaler(inputCol="Features", outputCol="ScaledFeatures")

df_test = assembler.transform(df_test)
df_test = scaler.fit(df_test).transform(df_test)

df_train = assembler.transform(df_train)
df_train = scaler.fit(df_train).transform(df_train)

+-------+-------------+---------+
|Airline|TimeDeparture|IsDelayed|
+-------+-------------+---------+
|      0|          650|        0|
|      0|          515|        0|
|      0|          515|        0|
|      1|         1184|        0|
|      1|         1184|        0|
|      1|          775|        0|
|      1|          775|        0|
|      1|          700|        1|
|      0|         1230|        1|
|      0|         1135|        1|
|      0|         1135|        1|
|      0|          965|        1|
|      0|          965|        1|
|      1|          697|        0|
|      1|          697|        0|
|      1|          697|        0|
|      0|          870|        0|
|      0|          870|        0|
|      0|          487|        0|
|      0|          487|        0|
+-------+-------------+---------+
only showing top 20 rows



In [7]:
model = LogisticRegression(featuresCol="ScaledFeatures", labelCol="IsDelayed", maxIter=1000).fit(df_train)
prediction = model.transform(df_test)

In [8]:
evaluator = BinaryClassificationEvaluator(labelCol="IsDelayed")
accuracy = round(evaluator.evaluate(prediction) * 100, 2)
print(accuracy)

81.76
