In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.functions import col
import numpy as np
import os

In [2]:
os.environ["SPARK_HOME"] = "C:/spark-2.4.4-bin-hadoop2.7"
os.environ["HADOOP_HOME"] = "C:/winutils"

In [3]:
# Creating spark session
spark = SparkSession.builder.appName("ICP7").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

In [4]:
# Load data and select feature and label columns
ICP = spark.read.format("csv").option("header", True).option("inferSchema", True).option("delimiter", ",").load("D:/Datasets/Regression/imports-85.csv")
# Loading the dataset
# ICP = spark.read.load("D:/Datasets/Regression/imports-85.csv", format="csv", header=True, delimiter=",")

In [5]:
import pandas as pd
pd.DataFrame(ICP.take(5), columns=ICP.columns).transpose()

Unnamed: 0,0,1,2,3,4
symboling,3,3,1,2,2
normalized-losses,?,?,?,164,164
make,alfa-romero,alfa-romero,alfa-romero,audi,audi
fuel-type,gas,gas,gas,gas,gas
aspiration,std,std,std,std,std
num-of-doors,two,two,two,four,four
body-style,convertible,convertible,hatchback,sedan,sedan
drive-wheels,rwd,rwd,rwd,fwd,4wd
engine-location,front,front,front,front,front
wheel-base,88.6,88.6,94.5,99.8,99.4


In [6]:
from pyspark.sql.types import *
ICP = ICP.withColumnRenamed("wheel-base", "label").select("length", "width", "height","label")
ICP = ICP.withColumn("label", ICP["label"].cast(IntegerType()))

In [7]:
import pandas as pd
pd.DataFrame(ICP.take(5), columns=ICP.columns).transpose()

Unnamed: 0,0,1,2,3,4
length,168.8,168.8,171.2,176.6,176.6
width,64.1,64.1,65.5,66.2,66.4
height,48.8,48.8,52.4,54.3,54.3
label,88.0,88.0,94.0,99.0,99.0


In [8]:
# Create vector assembler for feature columns
assembler = VectorAssembler(inputCols=ICP.columns[:2], outputCol="features")
ICP = assembler.transform(ICP)

In [9]:
ICP = ICP.select("label", "features")

In [10]:
from pyspark.ml.regression import LinearRegression
model_1 = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

In [11]:
# Fit the model
model = model_1.fit(ICP)

In [12]:
# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(model.coefficients))
print("Intercept: %s" % str(model.intercept))

Coefficients: [0.3337891635819007,0.5150505011624908]
Intercept: 6.2559533571945725


In [13]:
# Summarize the model over the training set and print out some metrics
trainingSummary = model.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

numIterations: 7
objectiveHistory: [0.5, 0.39634946019346834, 0.1536353728360829, 0.15116447772451408, 0.14653853943833373, 0.14653853141273573, 0.14653853141271792]
+--------------------+
|           residuals|
+--------------------+
|  -7.614301294335078|
|  -7.614301294335078|
| -3.1364659885591237|
|-0.29946282271511393|
| -0.4024729229476236|
| -0.5846202873387085|
| -2.3517309624286753|
| -2.3517309624286753|
| -2.3517309624286753|
| -1.7091113364223958|
|  2.3548500461959776|
|  2.3548500461959776|
|  2.3548500461959776|
|  2.3548500461959776|
| -0.7989838019444448|
| -0.7989838019444448|
| -2.9162222883000624|
|   1.470500884750379|
|   3.588850441301048|
|   2.949104166452699|
+--------------------+
only showing top 20 rows

RMSE: 2.837581
r2: 0.780117
