In [11]:
import findspark

In [12]:
# Run findspark's initialisation before the `import pyspark` in the next cell.
# NOTE(review): presumably this locates the local Spark installation and makes
# pyspark importable from it -- confirm against the findspark documentation.
findspark.init()

In [13]:
import pyspark

In [14]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession

In [15]:
# Create the SparkSession used by every later cell, or reuse one that is
# already running under this application name.
appName = "Regression in Spark"

# NOTE(review): "spark.some.config.option" looks like a placeholder setting --
# confirm whether it is needed at all.
spark = (
    SparkSession.builder
    .appName(appName)
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [16]:
# Define the schema applied to the flights CSV: one (column name, Spark type)
# pair per column, listed in the order the columns are read.
# NOTE(review): every field is declared non-nullable (third argument False);
# confirm whether the CSV reader actually enforces that.
# NOTE(review): "Career" presumably holds the airline carrier code (the sample
# rows show "DL") -- confirm before renaming the column.
flightSchema = StructType([
    StructField(column_name, column_type, False)
    for column_name, column_type in [
        ("DayOfMonth", IntegerType()),
        ("DayOfWeek", IntegerType()),
        ("Career", StringType()),
        ("OriginAirportID", IntegerType()),
        ("DestAirportID", IntegerType()),
        ("DepDelay", IntegerType()),
        ("ArrDelay", IntegerType()),
    ]
])

# Read the CSV with the schema above; header=True treats the first line of the
# file as a header row rather than as data.
flightDataFrame = spark.read.csv(
    'C:/Users/aayushi srivastava/Documents/AayushiSrivastavaJobSearch/PySparkProjects/dataset/flights.csv',
    schema=flightSchema,
    header=True,
)
# Preview the first three rows.
flightDataFrame.show(n=3)

+----------+---------+------+---------------+-------------+--------+--------+
|DayOfMonth|DayOfWeek|Career|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+------+---------------+-------------+--------+--------+
|        19|        5|    DL|          11433|        13303|      -3|       1|
|        19|        5|    DL|          14869|        12478|       0|      -8|
|        19|        5|    DL|          14057|        14869|      -4|     -15|
+----------+---------+------+---------------+-------------+--------+--------+
only showing top 3 rows



In [17]:
# Keep only the numeric columns used by the regression: five input features
# plus ArrDelay, which later cells turn into the label. "Career" is dropped.
data = flightDataFrame.select(
    "DayOfMonth",
    "DayOfWeek",
    "OriginAirportID",
    "DestAirportID",
    "DepDelay",
    "ArrDelay",
)
# Preview the first three rows of the reduced frame.
data.show(n=3)

+----------+---------+---------------+-------------+--------+--------+
|DayOfMonth|DayOfWeek|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+---------------+-------------+--------+--------+
|        19|        5|          11433|        13303|      -3|       1|
|        19|        5|          14869|        12478|       0|      -8|
|        19|        5|          14057|        14869|      -4|     -15|
+----------+---------+---------------+-------------+--------+--------+
only showing top 3 rows



In [18]:
#Divide data 70% for training, 30% for testing
# A fixed seed makes the split reproducible: without it every run (and every
# Restart & Run All) draws a different split, so the row counts and all of the
# downstream metrics change from run to run.
dividedData = data.randomSplit([0.7, 0.3], seed=42)
trainingData = dividedData[0]
testingData = dividedData[1]

# Each count() triggers a Spark job; the totals are printed as a sanity check
# that the split is roughly 70/30.
train_rows = trainingData.count()
test_rows = testingData.count()
print("Training Data rows:",train_rows,";Testing Data rows:",test_rows)

Training Data rows: 1890030 ;Testing Data rows: 812188


In [21]:
# Prepare the training data.
# The assembler packs the five input columns into one vector column named
# "features"; the same assembler is reused for the test split further down.
assembler = VectorAssembler(
    inputCols=[
        "DayOfMonth",
        "DayOfWeek",
        "OriginAirportID",
        "DestAirportID",
        "DepDelay",
    ],
    outputCol="features",
)

# Assemble the features and expose ArrDelay (cast to Int) as the "label" column.
trainingDataFinal = assembler.transform(trainingData).select(
    col("features"),
    col("ArrDelay").cast("Int").alias("label"),
)

# Show three rows without truncating the feature vectors.
trainingDataFinal.show(n=3, truncate=False)

+------------------------------+-----+
|features                      |label|
+------------------------------+-----+
|[1.0,1.0,10140.0,10397.0,-4.0]|-11  |
|[1.0,1.0,10140.0,10821.0,8.0] |-9   |
|[1.0,1.0,10140.0,11259.0,-3.0]|-11  |
+------------------------------+-----+
only showing top 3 rows



In [23]:
# Configure Spark's LinearRegression estimator: it reads the "features" and
# "label" columns, with maxIter=10 and regParam=0.3.
algoritma = LinearRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=10,
    regParam=0.3,
)

# Fit the estimator on the assembled training data to obtain the model.
model = algoritma.fit(trainingDataFinal)
print("Regression model is trained")

Regression model is trained


In [26]:
# Prepare the testing data with the same assembler used for training.
# ArrDelay (cast to Int) is exposed as "trueLabel" here rather than "label".
testingDataFinal = assembler.transform(testingData).select(
    col("features"),
    col("ArrDelay").cast("Int").alias("trueLabel"),
)

# Show two rows without truncating the feature vectors.
testingDataFinal.show(n=2, truncate=False)

+------------------------------+---------+
|features                      |trueLabel|
+------------------------------+---------+
|[1.0,1.0,10140.0,10397.0,-2.0]|-17      |
|[1.0,1.0,10140.0,11259.0,-2.0]|-14      |
+------------------------------+---------+
only showing top 2 rows



In [27]:
# Apply the trained model to the assembled test data; the displayed result
# gains a "prediction" column next to "features" and "trueLabel".
prediction = model.transform(testingDataFinal)

# Preview the first three predictions.
prediction.show(n=3)

+--------------------+---------+------------------+
|            features|trueLabel|        prediction|
+--------------------+---------+------------------+
|[1.0,1.0,10140.0,...|      -17|-5.560719461557764|
|[1.0,1.0,10140.0,...|      -14|-5.762820107840266|
|[1.0,1.0,10140.0,...|       -6|-4.773488417121978|
+--------------------+---------+------------------+
only showing top 3 rows



In [31]:
# Measure model performance on the test predictions.
from pyspark.ml.evaluation import RegressionEvaluator

# The evaluator compares the "prediction" column against "trueLabel" and is
# configured to report the "rmse" metric.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="trueLabel",
    predictionCol="prediction",
)

# Evaluate the predictions and print the resulting RMSE.
rmse = evaluator.evaluate(prediction)
print("Root Mean Square Error (RMSE):", rmse)

Root Mean Square Error (RMSE): 13.22474706360927
