### Creating the Spark Session

In [1]:
from pyspark.sql import SparkSession
# Obtain the SparkSession for this notebook via getOrCreate(), registered
# under the application name 'car-pricing'.
spark = (
    SparkSession.builder
    .appName('car-pricing')
    .getOrCreate()
)

### Read CSV File

In [2]:
# Load the car dataset: the first line of the file supplies the column names
# (header=True) and column types are inferred from the values (inferSchema=True).
tmpData = spark.read.csv(
    'cars_normal.csv',
    header=True,
    inferSchema=True,
)

In [3]:
# Print the column names and inferred types of the freshly loaded DataFrame.
tmpData.printSchema()

root
 |-- title: double (nullable = true)
 |-- year: double (nullable = true)
 |-- mileage: double (nullable = true)
 |-- transmission: double (nullable = true)
 |-- fuel: double (nullable = true)
 |-- body_color: double (nullable = true)
 |-- body_type: double (nullable = true)
 |-- volume: double (nullable = true)
 |-- engine: double (nullable = true)
 |-- acceleration: double (nullable = true)
 |-- fuel_cons: double (nullable = true)
 |-- price: double (nullable = true)



In [4]:
# Bare expression: the notebook displays the list of column names.
tmpData.columns


['title',
 'year',
 'mileage',
 'transmission',
 'fuel',
 'body_color',
 'body_type',
 'volume',
 'engine',
 'acceleration',
 'fuel_cons',
 'price']

In [5]:
# Sanity check: display every row whose price is greater than 1.
priceAboveOne = tmpData['price'] > 1
tmpData.filter(priceAboveOne).show()

+-----+----+-------+------------+----+----------+---------+------+------+------------+---------+-----+
|title|year|mileage|transmission|fuel|body_color|body_type|volume|engine|acceleration|fuel_cons|price|
+-----+----+-------+------------+----+----------+---------+------+------+------------+---------+-----+
+-----+----+-------+------------+----+----------+---------+------+------+------------+---------+-----+



### Assembling Data 

In [6]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [7]:
# Columns packed into the model's input vector: every column of tmpData
# except 'title' and 'price'.
featureColumns = [
    'year',
    'mileage',
    'transmission',
    'fuel',
    'body_color',
    'body_type',
    'volume',
    'engine',
    'acceleration',
    'fuel_cons',
]

# Combines the columns above into a single vector column named 'features'.
assembler = VectorAssembler(inputCols=featureColumns, outputCol='features')

In [8]:
# Run the assembler over tmpData; the result keeps the original columns and
# gains the 'features' vector column.
assembledData = assembler.transform(tmpData)

In [9]:
# Print the schema after assembly to confirm the 'features' column is present.
assembledData.printSchema()

root
 |-- title: double (nullable = true)
 |-- year: double (nullable = true)
 |-- mileage: double (nullable = true)
 |-- transmission: double (nullable = true)
 |-- fuel: double (nullable = true)
 |-- body_color: double (nullable = true)
 |-- body_type: double (nullable = true)
 |-- volume: double (nullable = true)
 |-- engine: double (nullable = true)
 |-- acceleration: double (nullable = true)
 |-- fuel_cons: double (nullable = true)
 |-- price: double (nullable = true)
 |-- features: vector (nullable = true)



In [10]:
# Display the first assembled row as a plain dict (column name -> value).
# first() retrieves only the one row that is shown, instead of collecting two
# rows and discarding the second.
assembledData.first().asDict()

{'title': 0.0,
 'year': 0.9858712715855573,
 'mileage': 0.5748713782832385,
 'transmission': 0.0,
 'fuel': 0.0,
 'body_color': 0.0,
 'body_type': 0.0,
 'volume': 0.5000000000000001,
 'engine': 0.0,
 'acceleration': 0.4444444444444444,
 'fuel_cons': 0.6984126984126984,
 'price': 0.08629213493412448,
 'features': SparseVector(10, {0: 0.9859, 1: 0.5749, 6: 0.5, 8: 0.4444, 9: 0.6984})}

In [11]:
# Keep only what the regression needs: the input vector and the 'price' label.
modelColumns = ['features', 'price']
finalData = assembledData.select(*modelColumns)

In [12]:
# Sanity check on the modelling frame: display every row whose price is greater than 1.
finalPriceAboveOne = finalData['price'] > 1
finalData.filter(finalPriceAboveOne).show()

+--------+-----+
|features|price|
+--------+-----+
+--------+-----+



In [13]:
# Print the schema of the two-column (features, price) modelling frame.
finalData.printSchema()

root
 |-- features: vector (nullable = true)
 |-- price: double (nullable = true)



### Split Data into Train (75%) and Test (25%)

In [14]:
# Fixed seed so the 75% / 25% split -- and therefore the fitted coefficients
# and the test metrics computed from it -- do not change between runs.
SPLIT_SEED = 42
trainData, testData = finalData.randomSplit([0.75, 0.25], seed=SPLIT_SEED)

In [15]:
# Summary statistics for each split: train first, then test.
for splitData in (trainData, testData):
    splitData.describe().show()

+-------+-------------------+
|summary|              price|
+-------+-------------------+
|  count|              14214|
|   mean|0.17933514678055693|
| stddev|0.29931355921072494|
|    min|                0.0|
|    max|                1.0|
+-------+-------------------+

+-------+-------------------+
|summary|              price|
+-------+-------------------+
|  count|               4786|
|   mean|0.17502286639398915|
| stddev| 0.2956995119883784|
|    min|                0.0|
|    max|                1.0|
+-------+-------------------+



In [16]:
# Sanity check on both splits (train first, then test): display every row
# whose price is greater than 1.
for splitData in (trainData, testData):
    splitData.filter(splitData['price'] > 1).show()

+--------+-----+
|features|price|
+--------+-----+
+--------+-----+

+--------+-----+
|features|price|
+--------+-----+
+--------+-----+



In [17]:
# Preview the rows of each split: train first, then test.
for splitData in (trainData, testData):
    splitData.show()

+--------------------+--------------------+
|            features|               price|
+--------------------+--------------------+
|(10,[0,1,6,8,9],[...| 0.08629213493412448|
|[0.0,1.0,0.5,1.0,...|1.123595505491730...|
|[0.0,1.0,0.5,1.0,...|1.123595505491730...|
|[0.0,1.0,0.5,1.0,...|1.123595505491730...|
|[0.0,1.0,0.5,1.0,...|1.123595505491730...|
|[0.0,1.0,0.5,1.0,...|1.123595505491730...|
|[0.0,1.0,0.5,1.0,...|1.123595505491730...|
|[0.0,1.0,0.5,1.0,...|1.123595505491730...|
|[0.0,1.0,0.5,1.0,...|1.123595505491730...|
|[0.0,1.0,0.5,1.0,...|1.123595505491730...|
|[0.0,1.0,0.5,1.0,...|1.123595505491730...|
|[0.0,1.0,0.5,1.0,...|1.123595505491730...|
|[0.0,1.0,0.5,1.0,...|1.123595505491730...|
|[0.0,1.0,0.5,1.0,...|1.123595505491730...|
|[0.0,1.0,0.5,1.0,...|1.123595505491730...|
|[0.0,1.0,0.5,1.0,...|1.123595505491730...|
|[0.0,1.0,0.5,1.0,...|1.123595505491730...|
|[0.0,1.0,0.5,1.0,...|1.123595505491730...|
|[0.0,1.0,0.5,1.0,...|1.123595505491730...|
|[0.0,1.0,0.5,1.0,...|1.12359550

### Training Model

In [18]:
# Linear regression with 'price' as the label; the input column is spelled
# out as 'features' (the estimator's default) to match the assembler output.
lr = LinearRegression(
    featuresCol='features',
    labelCol='price',
)

In [19]:
# Fit the regression on the training split only; testData is not used here.
pricePredModel = lr.fit(trainData)

In [20]:
# Report the fitted parameters: one coefficient per assembled feature, plus
# the intercept.
fittedCoefficients = pricePredModel.coefficients
fittedIntercept = pricePredModel.intercept
print("Coefficients: {} Intercept: {}".format(fittedCoefficients, fittedIntercept))

Coefficients: [-0.2602708910347227,0.16365599662019645,-0.6842252059399554,0.7362289983092618,-0.3491098403677365,0.22861798091110622,0.13207128900880155,-0.25432005928569407,-1.513205891354419,-0.5721277488707288] Intercept: 1.2548861730281973


### Testing Model

In [21]:
# Evaluate the fitted model on the test split; the returned summary exposes
# the metrics printed in the next cell.
testResult = pricePredModel.evaluate(testData)

In [22]:
# Report the test-split metrics, one per line: RMSE, MSE and R squared.
metricsTemplate = " rmse : {} \n mse : {} \n r squared : {}"
testMetrics = (
    testResult.rootMeanSquaredError,
    testResult.meanSquaredError,
    testResult.r2,
)
print(metricsTemplate.format(*testMetrics))

 rmse : 0.14527002599000338 
 mse : 0.02110338045113626 
 r squared : 0.7585976102505481


#### MSE < 0.1 => good
#### R squared >= 0.75 => good

### Our model is good; now we can use a Pipeline to evaluate it

In [23]:
from pyspark.ml import Pipeline


In [24]:
# Chain the feature assembler and the linear-regression estimator so both
# steps run as a single unit, in that order.
pipeline = Pipeline(stages=[assembler,lr])

In [25]:
# Fit the whole pipeline on tmpData, i.e. the full raw dataset rather than
# only the training split.
# NOTE(review): this cell leaves no held-out rows to score finalModel against --
# confirm that fitting on all data is intended.
finalModel = pipeline.fit(tmpData)