### Making Session

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('car-pricing').getOrCreate()

### Read CSV File

In [2]:
tmpData = spark.read.csv('cars_unnormal_with_label.csv',inferSchema=True,header=True)

In [3]:
tmpData.printSchema()

root
 |-- title: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- mileage: integer (nullable = true)
 |-- transmission: string (nullable = true)
 |-- fuel: string (nullable = true)
 |-- body_color: string (nullable = true)
 |-- body_type: string (nullable = true)
 |-- volume: double (nullable = true)
 |-- engine: string (nullable = true)
 |-- acceleration: double (nullable = true)
 |-- fuel_cons: double (nullable = true)
 |-- price: long (nullable = true)



In [6]:
tmpData.columns


['title',
 'year',
 'mileage',
 'transmission',
 'fuel',
 'body_color',
 'body_type',
 'volume',
 'engine',
 'acceleration',
 'fuel_cons',
 'price']

### give index to strings

In [10]:
from pyspark.ml.feature import StringIndexer

In [14]:
indexer = StringIndexer(inputCols=['transmission',
 'fuel',
 'body_color',
 'body_type',
 'engine',],
outputCols=['indexedtransmission',
 'indexedfuel',
 'indexedbody_color',
 'indexedbody_type',
'indexedengine',])

In [16]:
indexedData = indexer.fit(tmpData).transform(tmpData)

In [21]:
indexedData.columns

['title',
 'year',
 'mileage',
 'transmission',
 'fuel',
 'body_color',
 'body_type',
 'volume',
 'engine',
 'acceleration',
 'fuel_cons',
 'price',
 'indexedengine',
 'indexedtransmission',
 'indexedbody_color',
 'indexedfuel',
 'indexedbody_type']

### Assembling Data 

In [23]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [24]:
assembler = VectorAssembler(inputCols=['year',
 'mileage',
 'volume',
 'acceleration',
 'fuel_cons',
'indexedengine',
 'indexedtransmission',
 'indexedbody_color',
 'indexedfuel',
 'indexedbody_type'],outputCol='features')

In [26]:
assembledData = assembler.transform(indexedData)

In [27]:
assembledData.printSchema()

root
 |-- title: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- mileage: integer (nullable = true)
 |-- transmission: string (nullable = true)
 |-- fuel: string (nullable = true)
 |-- body_color: string (nullable = true)
 |-- body_type: string (nullable = true)
 |-- volume: double (nullable = true)
 |-- engine: string (nullable = true)
 |-- acceleration: double (nullable = true)
 |-- fuel_cons: double (nullable = true)
 |-- price: long (nullable = true)
 |-- indexedengine: double (nullable = false)
 |-- indexedtransmission: double (nullable = false)
 |-- indexedbody_color: double (nullable = false)
 |-- indexedfuel: double (nullable = false)
 |-- indexedbody_type: double (nullable = false)
 |-- features: vector (nullable = true)



In [29]:
assembledData.head(2)[0].asDict()

{'title': 'هیوندای، کوپه',
 'year': 2008,
 'mileage': 213000,
 'transmission': 'اتوماتیک',
 'fuel': 'بنزینی',
 'body_color': 'مشکی',
 'body_type': 'coupe',
 'volume': 2.7,
 'engine': '6 سیلندر ',
 'acceleration': 9.1,
 'fuel_cons': 10.5,
 'price': 768000000,
 'indexedengine': 3.0,
 'indexedtransmission': 0.0,
 'indexedbody_color': 1.0,
 'indexedfuel': 0.0,
 'indexedbody_type': 4.0,
 'features': DenseVector([2008.0, 213000.0, 2.7, 9.1, 10.5, 3.0, 0.0, 1.0, 0.0, 4.0])}

#### Normalizing Data

In [30]:
from pyspark.mllib.util import MLUtils
from pyspark.ml.feature import Normalizer

In [32]:
normalizer = Normalizer(inputCol='features',outputCol='normFeatures')

In [33]:
normedData = normalizer.transform(assembledData)

In [38]:
normedData.head(5)[4].asDict()

{'title': 'پژو، 206',
 'year': 1400,
 'mileage': 700,
 'transmission': 'دنده ای',
 'fuel': 'بنزینی',
 'body_color': 'خاکستری',
 'body_type': 'hatchback',
 'volume': 1.4,
 'engine': '4 سیلندر TU3 ',
 'acceleration': 14.1,
 'fuel_cons': 6.4,
 'price': 295000000,
 'indexedengine': 1.0,
 'indexedtransmission': 1.0,
 'indexedbody_color': 2.0,
 'indexedfuel': 0.0,
 'indexedbody_type': 2.0,
 'features': DenseVector([1400.0, 700.0, 1.4, 14.1, 6.4, 1.0, 1.0, 2.0, 0.0, 2.0]),
 'normFeatures': DenseVector([0.8944, 0.4472, 0.0009, 0.009, 0.0041, 0.0006, 0.0006, 0.0013, 0.0, 0.0013])}

In [45]:
finalData = normedData.select('normFeatures','price')
finalData = finalData.withColumnRenamed('normFeatures','features')

In [47]:
finalData.printSchema()

root
 |-- features: vector (nullable = true)
 |-- price: long (nullable = true)



Row(features=DenseVector([0.8944, 0.4472, 0.0009, 0.009, 0.0041, 0.0006, 0.0006, 0.0013, 0.0, 0.0013]), price=295000000)

### Split Data to Train(75%) and Test(25%)

In [26]:
trainData,testData = finalData.randomSplit([0.75,0.25])

In [54]:
trainData.describe().show()
testData.describe().show()

+-------+-------------------+
|summary|              price|
+-------+-------------------+
|  count|              14320|
|   mean|0.17897957605871143|
| stddev|0.29957942356705575|
|    min|                0.0|
|    max|                1.0|
+-------+-------------------+

+-------+-------------------+
|summary|              price|
+-------+-------------------+
|  count|               4680|
|   mean|0.17601318071361352|
| stddev| 0.2948047018765891|
|    min|                0.0|
|    max|                1.0|
+-------+-------------------+



In [56]:
trainData.filter(trainData['price']>1).show()
testData.filter(testData['price']>1).show()

+--------+-----+
|features|price|
+--------+-----+
+--------+-----+

+--------+-----+
|features|price|
+--------+-----+
+--------+-----+



In [46]:
trainData.show()
testData.show()

+--------------------+--------------------+
|            features|               price|
+--------------------+--------------------+
|(10,[0,1,6,8,9],[...| 0.08629213493412448|
|[0.0,1.0,0.5,1.0,...|1.123595505491730...|
|[0.0,1.0,0.5,1.0,...|1.123595505491730...|
|[0.0,1.0,0.5,1.0,...|1.123595505491730...|
|[0.0,1.0,0.5,1.0,...|1.123595505491730...|
|[0.0,1.0,0.5,1.0,...|1.123595505491730...|
|[0.0,1.0,0.5,1.0,...|1.123595505491730...|
|[0.0,1.0,0.5,1.0,...|1.123595505491730...|
|[0.0,1.0,0.5,1.0,...|1.123595505491730...|
|[0.0,1.0,0.5,1.0,...|1.123595505491730...|
|[0.0,1.0,0.5,1.0,...|1.123595505491730...|
|[0.0,1.0,0.5,1.0,...|1.123595505491730...|
|[0.0,1.0,0.5,1.0,...|1.123595505491730...|
|[0.0,1.0,0.5,1.0,...|1.123595505491730...|
|[0.0,1.0,0.5,1.0,...|1.123595505491730...|
|[0.0,1.0,0.5,1.0,...|1.123595505491730...|
|[0.0,1.0,0.5,1.0,...|1.123595505491730...|
|[0.0,1.0,0.5,1.0,...|1.123595505491730...|
|[0.0,1.0,0.5,1.0,...|1.123595505491730...|
|[0.0,1.0,0.5,1.0,...|1.12359550

### Training Model

In [30]:
lr = LinearRegression(labelCol='price')

In [31]:
pricePredModel = lr.fit(trainData)

In [33]:
print("Coefficients: {} Intercept: {}".format(pricePredModel.coefficients,pricePredModel.intercept))

Coefficients: [-0.24706788583650463,0.1611698083423181,-0.6767217747726951,0.7346990171967308,-0.3510657917927858,0.227946748158977,0.11523089997827793,-0.2572015928428171,-1.4970789666932187,-0.5638001016769547] Intercept: 1.238735485879046


### Testing Model

In [34]:
testResult = pricePredModel.evaluate(testData)

In [42]:
print(" rmse : {} \n mse : {} \n r squared : {}".format(testResult.rootMeanSquaredError,testResult.meanSquaredError,testResult.r2))

 rmse : 0.1445305605959715 
 mse : 0.020889082946185786 
 r squared : 0.7595950694308807


##### mse < 0.1 => good   
#### r squared >= 0.75 =>good

In [43]:
res = pricePredModel.transform(testData)

In [59]:
res.head(5)

[Row(features=DenseVector([0.0, 1.0, 0.5, 1.0, 0.8333, 0.2, 0.0, 0.0833, 1.0, 0.0476]), price=1.1235955054917308e-10, prediction=0.003917890516722089),
 Row(features=DenseVector([0.0, 1.0, 0.5, 1.0, 0.8333, 0.2, 0.0, 0.0833, 1.0, 0.0476]), price=1.1235955054917308e-10, prediction=0.003917890516722089),
 Row(features=DenseVector([0.0, 1.0, 0.5, 1.0, 0.8333, 0.2, 0.0, 0.0833, 1.0, 0.0476]), price=1.1235955054917308e-10, prediction=0.003917890516722089),
 Row(features=DenseVector([0.0, 1.0, 0.5, 1.0, 0.8333, 0.2, 0.0, 0.0833, 1.0, 0.0476]), price=1.1235955054917308e-10, prediction=0.003917890516722089),
 Row(features=DenseVector([0.0, 1.0, 0.5, 1.0, 0.8333, 0.2, 0.0, 0.0833, 1.0, 0.0476]), price=1.1235955054917308e-10, prediction=0.003917890516722089)]