In [1]:
import findspark

In [2]:
# Executed before the `import pyspark` cell that follows.
# NOTE(review): presumably this makes the local Spark installation importable
# from this kernel -- confirm against the findspark documentation.
findspark.init()

In [3]:
import pyspark

In [4]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler

In [5]:
# Build the SparkSession shared by every later cell (an already-running
# session is reused by getOrCreate()).
from pyspark.sql import SparkSession

appName = "Classification in Spark"

spark = (
    SparkSession.builder
    .appName(appName)
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [6]:
# Load the flights dataset with an explicit schema instead of letting Spark
# infer the column types.
import os

# Column layout of flights.csv: name, Spark type, nullable flag.
# NOTE(review): nullable=False may not actually be enforced by the CSV
# reader -- confirm before relying on the absence of nulls downstream.
flightSchema = StructType([
    StructField("DayOfMonth", IntegerType(), False),
    StructField("DayOfWeek", IntegerType(), False),
    StructField("Carrier", StringType(), False),
    StructField("OriginAirportID", IntegerType(), False),
    StructField("DestAirportID", IntegerType(), False),
    StructField("DepDelay", IntegerType(), False),
    StructField("ArrDelay", IntegerType(), False),
])

# Location of the input file. Set the FLIGHTS_CSV environment variable to run
# the notebook on another machine; without it the original author's path is
# used, so existing behaviour is unchanged.
flightsCsvPath = os.environ.get(
    "FLIGHTS_CSV",
    'C:/Users/aayushi srivastava/Documents/AayushiSrivastavaJobSearch/PySparkProjects/dataset/flights.csv',
)

# Read the CSV with the schema above; header=True treats the first line of
# the file as column names rather than data.
csv = spark.read.csv(flightsCsvPath, schema=flightSchema, header=True)

# Preview the first three rows.
csv.show(3)

+----------+---------+-------+---------------+-------------+--------+--------+
|DayOfMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|        19|        5|     DL|          11433|        13303|      -3|       1|
|        19|        5|     DL|          14869|        12478|       0|      -8|
|        19|        5|     DL|          14057|        14869|      -4|     -15|
+----------+---------+-------+---------------+-------------+--------+--------+
only showing top 3 rows



In [7]:
# Keep only the columns used as classification features and derive the binary
# target "Late": 1 when ArrDelay is greater than 15, otherwise 0.
featureColumns = [
    "DayOfMonth",
    "DayOfWeek",
    "OriginAirportID",
    "DestAirportID",
    "DepDelay",
]
lateFlag = (col("ArrDelay") > 15).cast("Int").alias("Late")

data = csv.select(*featureColumns, lateFlag)
data.show(3)

+----------+---------+---------------+-------------+--------+----+
|DayOfMonth|DayOfWeek|OriginAirportID|DestAirportID|DepDelay|Late|
+----------+---------+---------------+-------------+--------+----+
|        19|        5|          11433|        13303|      -3|   0|
|        19|        5|          14869|        12478|       0|   0|
|        19|        5|          14057|        14869|      -4|   0|
+----------+---------+---------------+-------------+--------+----+
only showing top 3 rows



In [8]:
# Divide data into 70% for training and 30% for testing.
# A fixed seed is passed so the split -- and therefore every metric computed
# from it in later cells -- can be repeated from run to run instead of
# changing on every execution.
dividedData = data.randomSplit([0.7, 0.3], seed=42)
trainingData, testingData = dividedData  # index 0 = training, index 1 = testing

# Row counts of each partition (each count() triggers a Spark job).
train_rows = trainingData.count()
test_rows = testingData.count()
print("Training Data is: ",train_rows,"Testing Data is: ",test_rows)

Training Data is:  1889829 Testing Data is:  812389


In [13]:
# Prepare the training set: pack the input columns into one "features" vector
# column and expose the "Late" column under the name "label".
assembler = VectorAssembler(
    inputCols=[
        "DayOfMonth",
        "DayOfWeek",
        "OriginAirportID",
        "DestAirportID",
        "DepDelay",
    ],
    outputCol="features",
)

assembledTraining = assembler.transform(trainingData)
trainingDataFinal = assembledTraining.select(
    col("features"),
    col("Late").alias("label"),
)

# Show two untruncated rows to verify the assembled vectors.
trainingDataFinal.show(n=2, truncate=False)

+------------------------------+-----+
|features                      |label|
+------------------------------+-----+
|[1.0,1.0,10140.0,10821.0,8.0] |0    |
|[1.0,1.0,10140.0,11259.0,-3.0]|0    |
+------------------------------+-----+
only showing top 2 rows



In [14]:
# Configure a logistic-regression estimator that reads the "features" vector
# and the "label" column, capped at 10 iterations with regParam 0.3.
classifier = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=10,
    regParam=0.3,
)

# Fit the estimator on the prepared training set.
model = classifier.fit(trainingDataFinal)
print("Classifier model is trained")

Classifier model is trained


In [15]:
# Prepare the test set with the same assembler used for training; the target
# column is exposed as "trueLabel" here rather than "label".
assembledTesting = assembler.transform(testingData)
testingDataFinal = assembledTesting.select(
    col("features"),
    col("Late").alias("trueLabel"),
)
testingDataFinal.show(3)

+--------------------+---------+
|            features|trueLabel|
+--------------------+---------+
|[1.0,1.0,10140.0,...|        0|
|[1.0,1.0,10140.0,...|        0|
|[1.0,1.0,10140.0,...|        0|
+--------------------+---------+
only showing top 3 rows



In [16]:
# Score the test set with the fitted logistic-regression model.
prediction = model.transform(testingDataFinal)

# Narrow view holding just the columns needed for evaluation.
reportColumns = ["features", "prediction", "probability", "trueLabel"]
predictionFinal = prediction.select(*reportColumns)

# Preview the narrow view first, then the model's full output.
predictionFinal.show(n=3, truncate=False)
prediction.show(n=3, truncate=False)

+------------------------------+----------+----------------------------------------+---------+
|features                      |prediction|probability                             |trueLabel|
+------------------------------+----------+----------------------------------------+---------+
|[1.0,1.0,10140.0,10397.0,-4.0]|0.0       |[0.8313282600755334,0.16867173992446666]|0        |
|[1.0,1.0,10140.0,10397.0,-2.0]|0.0       |[0.8273665955167756,0.17263340448322437]|0        |
|[1.0,1.0,10140.0,11292.0,-1.0]|0.0       |[0.8257590211382007,0.1742409788617994] |0        |
+------------------------------+----------+----------------------------------------+---------+
only showing top 3 rows

+------------------------------+---------+----------------------------------------+----------------------------------------+----------+
|features                      |trueLabel|rawPrediction                           |probability                             |prediction|
+------------------------------+------

In [17]:
# Model performance: accuracy = rows where the prediction equals the true
# label, divided by the total number of scored rows.
isCorrect = predictionFinal['prediction'] == predictionFinal['trueLabel']
correctPrediction = predictionFinal.where(isCorrect).count()
totalData = predictionFinal.count()
accuracy = correctPrediction / totalData
print("correct prediction:",correctPrediction, ",total data:",totalData,",accuracy:", accuracy)

correct prediction: 669997 ,total data: 812389 ,accuracy: 0.8247243623436555


In [18]:
# Second classification algorithm: a random forest trained on the same
# prepared training set as the logistic regression.
from pyspark.ml.classification import RandomForestClassifier

# The unfitted estimator gets its own name so that `model2` only ever refers
# to the fitted model; previously `model2` was bound to the estimator first
# and then rebound to the result of fit().
rfClassifier = RandomForestClassifier(
    featuresCol="features",
    labelCol="label",
    numTrees=3,
    maxDepth=5,
    seed=42,
)

model2 = rfClassifier.fit(trainingDataFinal)
print("Model is trained!")

Model is trained!


In [19]:
# Score the test set with the random-forest model and report its accuracy.
# Note: this rebinds `prediction` and `predictionFinal`, replacing the
# logistic-regression results held under those names.
prediction = model2.transform(testingDataFinal)

reportColumns = ["features", "prediction", "probability", "trueLabel"]
predictionFinal = prediction.select(*reportColumns)
predictionFinal.show(n=3, truncate=False)

# Accuracy = matching rows / all scored rows.
isCorrect = predictionFinal['prediction'] == predictionFinal['trueLabel']
correctPrediction = predictionFinal.where(isCorrect).count()
totalData = predictionFinal.count()
accuracy = correctPrediction / totalData
print("Correct Prediction:",correctPrediction,"total data: ",totalData,"accuracy:",accuracy)

+------------------------------+----------+---------------------------------------+---------+
|features                      |prediction|probability                            |trueLabel|
+------------------------------+----------+---------------------------------------+---------+
|[1.0,1.0,10140.0,10397.0,-4.0]|0.0       |[0.9299175529154877,0.0700824470845123]|0        |
|[1.0,1.0,10140.0,10397.0,-2.0]|0.0       |[0.9299175529154877,0.0700824470845123]|0        |
|[1.0,1.0,10140.0,11292.0,-1.0]|0.0       |[0.9299175529154877,0.0700824470845123]|0        |
+------------------------------+----------+---------------------------------------+---------+
only showing top 3 rows

Correct Prediction: 753204 total data:  812389 accuracy: 0.9271469702322409
