### Creating the Spark Session

In [1]:
from pyspark.sql import SparkSession
# Reuse the active Spark session if one exists; otherwise start a new one
# under the application name 'car-pricing'.
spark = (
    SparkSession.builder
    .appName('car-pricing')
    .getOrCreate()
)

### Read CSV File

In [2]:
# Load the car listings CSV: the first row supplies the column names and
# Spark infers each column's type from the data.
tmpData = spark.read.csv(
    'cars_normal.csv',
    header=True,
    inferSchema=True,
)

In [3]:
# Show the inferred column names and types to confirm the CSV was parsed as expected.
tmpData.printSchema()

root
 |-- title: double (nullable = true)
 |-- year: double (nullable = true)
 |-- mileage: double (nullable = true)
 |-- transmission: double (nullable = true)
 |-- fuel: double (nullable = true)
 |-- body_color: double (nullable = true)
 |-- body_type: double (nullable = true)
 |-- volume: double (nullable = true)
 |-- engine: double (nullable = true)
 |-- acceleration: double (nullable = true)
 |-- fuel_cons: double (nullable = true)
 |-- price: double (nullable = true)



In [4]:
# List the column names; the feature list passed to the assembler is chosen from these.
tmpData.columns


['title',
 'year',
 'mileage',
 'transmission',
 'fuel',
 'body_color',
 'body_type',
 'volume',
 'engine',
 'acceleration',
 'fuel_cons',
 'price']

### Split Data into Train (75%) and Test (25%)

In [9]:
# Hold out roughly 25% of the rows for testing (randomSplit weights are
# approximate, not exact). The split is random, so a fixed seed keeps the
# train/test membership, and every metric computed from it, repeatable
# across kernel restarts.
trainData, testData = tmpData.randomSplit([0.75, 0.25], seed=42)

### Assembling Data 

In [6]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [7]:
# Pack the numeric predictor columns into a single 'features' vector column.
# 'title' is not used as a predictor and 'price' is the regression label,
# so neither appears in the input list.
assembler = VectorAssembler(
    inputCols=[
        'year',
        'mileage',
        'transmission',
        'fuel',
        'body_color',
        'body_type',
        'volume',
        'engine',
        'acceleration',
        'fuel_cons',
    ],
    outputCol='features',
)

### Training Model

In [11]:
# Linear regression estimator predicting 'price' from the assembled
# 'features' vector ('features' is also the library default, spelled out
# here so the link to the assembler's output column is visible).
lr = LinearRegression(featuresCol='features', labelCol='price')

### Pipelining

In [12]:
from pyspark.ml import Pipeline

In [None]:
# Chain feature assembly and regression so both stages are fitted together.
pipeline = Pipeline(stages=[assembler, lr])

In [None]:
# Fit the full pipeline rather than the bare estimator: trainData only holds
# the raw columns, so LinearRegression cannot find its 'features' input
# unless the assembler stage runs first.
pipelineModel = pipeline.fit(trainData)

# The last pipeline stage is the fitted LinearRegressionModel; keep it under
# the original name so its coefficients, intercept and evaluate() remain
# directly accessible.
pricePredModel = pipelineModel.stages[-1]

In [None]:
# Report the learned weight for each feature and the bias term.
print(
    "Coefficients: {} Intercept: {}".format(
        pricePredModel.coefficients,
        pricePredModel.intercept,
    )
)

### Testing Model

In [None]:
# The regression model reads the assembled 'features' column, which the raw
# test split does not have, so run the assembler over it before evaluating.
testResult = pricePredModel.evaluate(assembler.transform(testData))

In [None]:
# Summarise held-out performance: RMSE, MSE and R squared on the test split.
print(
    " rmse : {} \n mse : {} \n r squared : {}".format(
        testResult.rootMeanSquaredError,
        testResult.meanSquaredError,
        testResult.r2,
    )
)

#### MSE < 0.1 => good
#### R squared >= 0.75 => good

In [None]:
# Produce per-row predictions for the test split. The raw split has no
# 'features' column, so assemble it first; the model then appends its
# prediction column to the assembled rows.
res = pricePredModel.transform(assembler.transform(testData))