### Creating the Spark Session

In [23]:
from pyspark.sql import SparkSession
# Reuse the active Spark session if there is one; otherwise start a new one named 'car-pricing'.
spark = (
    SparkSession.builder
    .appName('car-pricing')
    .getOrCreate()
)

### Read CSV File

In [2]:
# Load the car dataset: the first row supplies the column names and
# Spark infers each column's type from the data.
tmpData = spark.read.csv(
    'cars_normal.csv',
    header=True,
    inferSchema=True,
)

In [3]:
# Show the inferred schema to check the column names and types picked up by inferSchema.
tmpData.printSchema()

root
 |-- title: double (nullable = true)
 |-- year: double (nullable = true)
 |-- mileage: double (nullable = true)
 |-- transmission: double (nullable = true)
 |-- fuel: double (nullable = true)
 |-- body_color: double (nullable = true)
 |-- body_type: double (nullable = true)
 |-- volume: double (nullable = true)
 |-- engine: double (nullable = true)
 |-- acceleration: double (nullable = true)
 |-- fuel_cons: double (nullable = true)
 |-- price: double (nullable = true)



In [24]:
# List the column names of the loaded DataFrame (handy for building the feature list).
tmpData.columns


['title',
 'year',
 'mileage',
 'transmission',
 'fuel',
 'body_color',
 'body_type',
 'volume',
 'engine',
 'acceleration',
 'fuel_cons',
 'price']

### Assembling Data 

In [7]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [8]:
# Pack the input columns into a single vector column named 'features'.
# 'price' is left out because it is the regression label used later in the notebook.
# NOTE(review): 'title' is also left out - presumably an encoded identifier; confirm this is intentional.
assembler = VectorAssembler(
    inputCols=[
        'year',
        'mileage',
        'transmission',
        'fuel',
        'body_color',
        'body_type',
        'volume',
        'engine',
        'acceleration',
        'fuel_cons',
    ],
    outputCol='features',
)

In [9]:
# Apply the assembler: returns the loaded data with the extra 'features' vector column appended.
assembledData = assembler.transform(tmpData)

In [10]:
# Confirm that the 'features' vector column was added alongside the original columns.
assembledData.printSchema()

root
 |-- title: double (nullable = true)
 |-- year: double (nullable = true)
 |-- mileage: double (nullable = true)
 |-- transmission: double (nullable = true)
 |-- fuel: double (nullable = true)
 |-- body_color: double (nullable = true)
 |-- body_type: double (nullable = true)
 |-- volume: double (nullable = true)
 |-- engine: double (nullable = true)
 |-- acceleration: double (nullable = true)
 |-- fuel_cons: double (nullable = true)
 |-- price: double (nullable = true)
 |-- features: vector (nullable = true)



In [11]:
# Inspect the first assembled row as a plain dict.
# first() fetches a single row instead of pulling two rows and discarding one.
assembledData.first().asDict()

{'title': 0.0,
 'year': 0.9858712715855573,
 'mileage': 0.5748713782832385,
 'transmission': 0.0,
 'fuel': 0.0,
 'body_color': 0.0,
 'body_type': 0.0,
 'volume': 0.5000000000000001,
 'engine': 0.0,
 'acceleration': 0.4444444444444444,
 'fuel_cons': 0.6984126984126984,
 'price': 0.08629213493412448,
 'features': SparseVector(10, {0: 0.9859, 1: 0.5749, 6: 0.5, 8: 0.4444, 9: 0.6984})}

In [12]:
# Keep only what the regressor needs: the assembled feature vector and the label column.
finalData = assembledData.select(['features', 'price'])

In [25]:
# Verify the modelling frame holds just the 'features' vector and the 'price' label.
finalData.printSchema()

root
 |-- features: vector (nullable = true)
 |-- price: double (nullable = true)



### Split Data into Train (75%) and Test (25%)

In [26]:
# Split into train/test with weights 0.75 / 0.25.
# A fixed seed keeps the split - and every metric derived from it - reproducible
# across kernel restarts; without it each run trains and tests on different rows.
# The weights are approximate: the resulting row counts will not be an exact 75/25 cut.
# NOTE: outputs recorded before the seed was added came from an unseeded split
# and may differ slightly after a re-run.
trainData, testData = finalData.randomSplit([0.75, 0.25], seed=42)

In [29]:
# Summary statistics (count / mean / stddev / min / max) for each split, train first, then test.
for split in (trainData, testData):
    split.describe().show()

+-------+-------------------+
|summary|              price|
+-------+-------------------+
|  count|              14320|
|   mean|0.17897957605871143|
| stddev|0.29957942356705575|
|    min|                0.0|
|    max|                1.0|
+-------+-------------------+

+-------+-------------------+
|summary|              price|
+-------+-------------------+
|  count|               4680|
|   mean|0.17601318071361352|
| stddev| 0.2948047018765891|
|    min|                0.0|
|    max|                1.0|
+-------+-------------------+



### Training Model

In [30]:
# Ordinary linear regression: inputs come from the assembled 'features' column
# (the estimator's default, spelled out here for clarity) and the target is 'price'.
lr = LinearRegression(
    featuresCol='features',
    labelCol='price',
)

In [31]:
# Fit the regression on the training split only; the test split is kept for evaluation.
pricePredModel = lr.fit(trainData)

In [33]:
# Report the learned weights (one per assembled feature, in inputCols order) and the intercept.
print(
    "Coefficients: {} Intercept: {}".format(
        pricePredModel.coefficients,
        pricePredModel.intercept,
    )
)

Coefficients: [-0.24706788583650463,0.1611698083423181,-0.6767217747726951,0.7346990171967308,-0.3510657917927858,0.227946748158977,0.11523089997827793,-0.2572015928428171,-1.4970789666932187,-0.5638001016769547] Intercept: 1.238735485879046


### Testing Model

In [34]:
# Score the fitted model on the held-out test split; the returned summary
# exposes the error metrics (RMSE, MSE, R^2) reported in the next cell.
testResult = pricePredModel.evaluate(testData)

In [42]:
# Report test-set error metrics: root mean squared error, mean squared error and R^2.
print(
    " rmse : {} \n mse : {} \n r squared : {}".format(
        testResult.rootMeanSquaredError,
        testResult.meanSquaredError,
        testResult.r2,
    )
)

 rmse : 0.1445305605959715 
 mse : 0.020889082946185786 
 r squared : 0.7595950694308807


#### MSE < 0.1 ⇒ good (note: `price` is scaled to [0, 1], so this threshold is relative to that range)
#### R² ≥ 0.75 ⇒ good