In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.functions import col
import numpy as np
import os

In [2]:
os.environ["SPARK_HOME"] = "C:/spark-2.4.4-bin-hadoop2.7"
os.environ["HADOOP_HOME"] = "C:/winutils"

In [3]:
# Creating spark session
spark = SparkSession.builder.appName("ICP7").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

In [4]:
# Load data and select feature and label columns
ICP = spark.read.format("csv").option("header", True).option("inferSchema", True).option("delimiter", ",").load("D:/Datasets/Regression/imports-85.csv")
# Loading the dataset
# ICP = spark.read.load("D:/Datasets/Regression/imports-85.csv", format="csv", header=True, delimiter=",")

In [5]:
import pandas as pd
pd.DataFrame(ICP.take(5), columns=ICP.columns).transpose()

Unnamed: 0,0,1,2,3,4
symboling,3,3,1,2,2
normalized-losses,?,?,?,164,164
make,alfa-romero,alfa-romero,alfa-romero,audi,audi
fuel-type,gas,gas,gas,gas,gas
aspiration,std,std,std,std,std
num-of-doors,two,two,two,four,four
body-style,convertible,convertible,hatchback,sedan,sedan
drive-wheels,rwd,rwd,rwd,fwd,4wd
engine-location,front,front,front,front,front
wheel-base,88.6,88.6,94.5,99.8,99.4


In [9]:
from pyspark.sql.functions import col, when

ICP = ICP.withColumn("label", when(col("num-of-doors") == "four", 1).otherwise(0)).select("length", "width", "height","label")

In [10]:
import pandas as pd
pd.DataFrame(ICP.take(5), columns=ICP.columns).transpose()

Unnamed: 0,0,1,2,3,4
length,168.8,168.8,171.2,176.6,176.6
width,64.1,64.1,65.5,66.2,66.4
height,48.8,48.8,52.4,54.3,54.3
label,0.0,0.0,0.0,1.0,1.0


In [11]:
# Create vector assembler for feature columns
assembler = VectorAssembler(inputCols=ICP.columns[:2], outputCol="features")
ICP = assembler.transform(ICP)

In [12]:
ICP = ICP.select("label", "features")

In [13]:
from pyspark.ml.classification import LogisticRegression
model_1 = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

In [14]:
# Fit the model
model = model_1.fit(ICP)

In [15]:
# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(model.coefficients))
print("Intercept: %s" % str(model.intercept))

Coefficients: [0.0,0.0]
Intercept: 0.22533894187764542


In [21]:
# Summarize the model over the training set and print out some metrics
trainingSummary = model.summary

numIterations: 1
objectiveHistory: [0.6868400366505755]
Multinomial intercepts: [0.22533894187764542]
