In [1]:
#import modules
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

#create Spark session
#SparkSession is imported explicitly in the import block, so this cell does not
#depend on the kernel pre-defining it
appName = "Regression in Spark"
spark = (
    SparkSession.builder
    .appName(appName)
    #NOTE(review): this key/value pair looks like a placeholder copied from the
    #Spark docs (with a typo in the key) rather than a real setting - confirm
    #and remove if unused
    .config("spark.some.configq.option", "some-value")
    #reuses an already-running session if one exists, otherwise starts a new one
    .getOrCreate()
)

# Read data from flights.csv

In [3]:
#define our schema: one non-nullable field per column, built up field by field
flightSchema = (
    StructType()
    .add("DayofMonth", IntegerType(), False)
    .add("DayOfWeek", IntegerType(), False)
    .add("Carrier", StringType(), False)
    .add("OriginAirportID", IntegerType(), False)
    .add("DestAirportID", IntegerType(), False)
    .add("DepDelay", IntegerType(), False)
    .add("ArrDelay", IntegerType(), False)
)
#read csv data with our defined schema; the first line of the file is a header row
flightDataFrame = (
    spark.read
    .schema(flightSchema)
    .option("header", True)
    .csv('dataset/flights.csv')
)
#preview the first three rows
flightDataFrame.show(n=3)

+----------+---------+-------+---------------+-------------+--------+--------+
|DayofMonth|DayOfWeek|Carrier|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+-------+---------------+-------------+--------+--------+
|        19|        5|     DL|          11433|        13303|      -3|       1|
|        19|        5|     DL|          14869|        12478|       0|      -8|
|        19|        5|     DL|          14057|        14869|      -4|     -15|
+----------+---------+-------+---------------+-------------+--------+--------+
only showing top 3 rows



In [4]:
#keep only the integer columns needed for the regression:
#the input features plus ArrDelay (the string Carrier column is left out)
data = flightDataFrame.select([
    "DayofMonth",
    "DayOfWeek",
    "OriginAirportID",
    "DestAirportID",
    "DepDelay",
    "ArrDelay",
])
#preview the first three rows
data.show(n=3)

+----------+---------+---------------+-------------+--------+--------+
|DayofMonth|DayOfWeek|OriginAirportID|DestAirportID|DepDelay|ArrDelay|
+----------+---------+---------------+-------------+--------+--------+
|        19|        5|          11433|        13303|      -3|       1|
|        19|        5|          14869|        12478|       0|      -8|
|        19|        5|          14057|        14869|      -4|     -15|
+----------+---------+---------------+-------------+--------+--------+
only showing top 3 rows



# Divide data into Training and Testing

In [10]:
#divide data, 70% for training, 30% for testing
#the weights are approximate targets: rows are assigned randomly, so the actual
#proportions only come close to 70/30
#a fixed seed keeps the split - and every count/metric derived from it -
#repeatable across runs on the same input data
dividedData = data.randomSplit([0.7, 0.3], seed=42)
trainingData, testingData = dividedData #index 0 = data training, index 1 = data testing
#count() triggers a Spark job for each split
train_rows = trainingData.count()
test_rows = testingData.count()
print ("Training data rows:", train_rows, "; Testing data rows:", test_rows)

Training data rows: 1891066 ; Testing data rows: 811152


# Prepare Training Data

In [32]:
#define an assembler that packs the five input columns into a single
#vector column named "features"
assembler = VectorAssembler(
    outputCol="features",
    inputCols=["DayofMonth", "DayOfWeek", "OriginAirportID",
               "DestAirportID", "DepDelay"])

#assemble the training rows and expose ArrDelay as the integer "label" column
trainingDataFinal = (
    assembler.transform(trainingData)
    .select("features", col("ArrDelay").cast("Int").alias("label"))
)
#show the first three rows without truncating the feature vectors
trainingDataFinal.show(n=3, truncate=False)



+------------------------------+-----+
|features                      |label|
+------------------------------+-----+
|[1.0,1.0,10140.0,10397.0,-4.0]|-11  |
|[1.0,1.0,10140.0,10397.0,-2.0]|-17  |
|[1.0,1.0,10140.0,11259.0,-3.0]|-11  |
+------------------------------+-----+
only showing top 3 rows



# Testing on Short Data Frame

In [31]:
#tiny hand-made frame used to try out VectorAssembler on a few rows
df = spark.createDataFrame(
    data=[
        ("Rony", 27, 168, 1),
        ("Rony", 15, 165, 2),
        ("Rony", 27, 168, 3),
    ],
    schema=["name", "age", "height", "id"])

df.show()

#note: despite its name, the output vector is built from the id and age columns
assembler1 = VectorAssembler(
    outputCol="VectorNameAge",
    inputCols=["id", "age"])

#keep the assembled vector and expose height as the "label" column
dfdatafinal = (
    assembler1.transform(df)
    .select("VectorNameAge", "height")
    .withColumnRenamed("height", "label")
)
dfdatafinal.show(n=3, truncate=False)

+----+---+------+---+
|name|age|height| id|
+----+---+------+---+
|Rony| 27|   168|  1|
|Rony| 15|   165|  2|
|Rony| 27|   168|  3|
+----+---+------+---+

+-------------+-----+
|VectorNameAge|label|
+-------------+-----+
|[1.0,27.0]   |168  |
|[2.0,15.0]   |165  |
|[3.0,27.0]   |168  |
+-------------+-----+



# Train our regression model using training data


In [33]:

#configure the Spark linear regression estimator imported earlier:
#it reads the "features" vector column and the "label" column,
#with at most 10 iterations and a regularization parameter of 0.3
algoritma = LinearRegression(
    featuresCol="features",
    labelCol="label",
    regParam=0.3,
    maxIter=10,
)
#fit the estimator on the assembled flight training data
model = algoritma.fit(trainingDataFinal)
print ("Regression model flight is trained!")

Regression model flight is trained!


In [35]:
#same estimator settings as the flight model, but reading the small
#hand-made frame: features come from "VectorNameAge", target from "label"
algoritma1 = LinearRegression(
    featuresCol="VectorNameAge",
    labelCol="label",
    regParam=0.3,
    maxIter=10,
)
#fit the estimator on the small frame
model1 = algoritma1.fit(dfdatafinal)
print ("Regression model for rony is trained!")

Regression model for rony is trained!


In [36]:
#assemble the testing rows with the same assembler used for the training data;
#ArrDelay is kept as the integer "trueLabel" column so it can be compared
#against the model's predictions later
testingDataFinal = (
    assembler.transform(testingData)
    .select("features", col("ArrDelay").cast("Int").alias("trueLabel"))
)
#show the first two rows without truncating the feature vectors
testingDataFinal.show(n=2, truncate=False)

+------------------------------+---------+
|features                      |trueLabel|
+------------------------------+---------+
|[1.0,1.0,10140.0,10821.0,8.0] |-9       |
|[1.0,1.0,10140.0,11292.0,-4.0]|-8       |
+------------------------------+---------+
only showing top 2 rows



# Predict the testing data using our trained model

In [37]:
#run the trained flight model over the assembled testing data;
#the result carries a "prediction" column next to features and trueLabel
prediction = model.transform(testingDataFinal)
#preview the first three predictions
prediction.show(n=3)

+--------------------+---------+-------------------+
|            features|trueLabel|         prediction|
+--------------------+---------+-------------------+
|[1.0,1.0,10140.0,...|       -9|  4.317688006282429|
|[1.0,1.0,10140.0,...|       -8| -7.763666488904541|
|[1.0,1.0,10140.0,...|       -6|-4.7703921909570015|
+--------------------+---------+-------------------+
only showing top 3 rows



# Evaluate Regression Model

In [38]:
#define our evaluator: it compares the "trueLabel" column with the
#"prediction" column using the root-mean-square-error metric
#(RegressionEvaluator is imported in the notebook's import cell)
evaluator = RegressionEvaluator(
    labelCol="trueLabel", predictionCol="prediction", metricName="rmse")
#calculate RMSE of our trained model on the testing predictions
rmse = evaluator.evaluate(prediction)
print ("Root Mean Square Error (RMSE):", rmse)

Root Mean Square Error (RMSE): 13.241877738302986
