In [47]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('carPricing').getOrCreate()


In [48]:
tmpData = spark.read.csv('cars.csv',inferSchema=True,header=True)

In [49]:
tmpData.printSchema()

root
 |-- _id: string (nullable = true)
 |-- body_color: string (nullable = true)
 |-- body_status: string (nullable = true)
 |-- body_type: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- engine: string (nullable = true)
 |-- fuel_consumption: double (nullable = true)
 |-- fuel_type: string (nullable = true)
 |-- mileage: integer (nullable = true)
 |-- model: string (nullable = true)
 |-- price: long (nullable = true)
 |-- volume: double (nullable = true)
 |-- year: integer (nullable = true)



In [50]:
tmpData.head(3)[2].asDict()

{'_id': '62c022d45ac6166d04f2a9c8',
 'body_color': 'سفید',
 'body_status': 'دور رنگ',
 'body_type': 'suv',
 'brand': 'nissan',
 'engine': '6 سیلندر P40 کاربراتوری ',
 'fuel_consumption': 17.0,
 'fuel_type': 'دوگانه سوز',
 'mileage': 500000,
 'model': 'patrol4doorir',
 'price': 210000000,
 'volume': 4.0,
 'year': 1990}

In [51]:
tmpData.count()

16777

### remove null data

In [52]:
tmpData.na.drop().count()

16777

In [53]:
tmpData.columns

['_id',
 'body_color',
 'body_status',
 'body_type',
 'brand',
 'engine',
 'fuel_consumption',
 'fuel_type',
 'mileage',
 'model',
 'price',
 'volume',
 'year']

In [54]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors

In [9]:
indexer = StringIndexer(inputCols=['body_color',
 'body_status',
 'body_type',
 'brand',
 'engine','fuel_type',
 'model'],
outputCols=['body_color_indexed',
 'body_status_indexed',
 'body_type_indexed',
 'brand_indexed',
 'engine_indexed','fuel_type_indexed',
 'model_indexed'])

In [55]:
indexedData = indexer.fit(tmpData).transform(tmpData)

In [57]:
indexedData.printSchema()

root
 |-- _id: string (nullable = true)
 |-- body_color: string (nullable = true)
 |-- body_status: string (nullable = true)
 |-- body_type: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- engine: string (nullable = true)
 |-- fuel_consumption: double (nullable = true)
 |-- fuel_type: string (nullable = true)
 |-- mileage: integer (nullable = true)
 |-- model: string (nullable = true)
 |-- price: long (nullable = true)
 |-- volume: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- body_status_indexed: double (nullable = false)
 |-- engine_indexed: double (nullable = false)
 |-- fuel_type_indexed: double (nullable = false)
 |-- model_indexed: double (nullable = false)
 |-- brand_indexed: double (nullable = false)
 |-- body_color_indexed: double (nullable = false)
 |-- body_type_indexed: double (nullable = false)



In [58]:
indexedData.columns

['_id',
 'body_color',
 'body_status',
 'body_type',
 'brand',
 'engine',
 'fuel_consumption',
 'fuel_type',
 'mileage',
 'model',
 'price',
 'volume',
 'year',
 'body_status_indexed',
 'engine_indexed',
 'fuel_type_indexed',
 'model_indexed',
 'brand_indexed',
 'body_color_indexed',
 'body_type_indexed']

In [59]:
assembler = VectorAssembler(inputCols=['body_status_indexed',
 'engine_indexed',
 'fuel_type_indexed',
 'model_indexed',
 'brand_indexed',
 'body_color_indexed',
 'body_type_indexed','fuel_consumption','mileage','volume',
 'year'],outputCol='features')

In [60]:
assembledData = assembler.transform(indexedData)

In [61]:
assembledData.printSchema()

root
 |-- _id: string (nullable = true)
 |-- body_color: string (nullable = true)
 |-- body_status: string (nullable = true)
 |-- body_type: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- engine: string (nullable = true)
 |-- fuel_consumption: double (nullable = true)
 |-- fuel_type: string (nullable = true)
 |-- mileage: integer (nullable = true)
 |-- model: string (nullable = true)
 |-- price: long (nullable = true)
 |-- volume: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- body_status_indexed: double (nullable = false)
 |-- engine_indexed: double (nullable = false)
 |-- fuel_type_indexed: double (nullable = false)
 |-- model_indexed: double (nullable = false)
 |-- brand_indexed: double (nullable = false)
 |-- body_color_indexed: double (nullable = false)
 |-- body_type_indexed: double (nullable = false)
 |-- features: vector (nullable = true)



In [62]:
assembledData.head(2)[0].asDict()

{'_id': '62c0216f5ac6166d04f28fb9',
 'body_color': 'قهوه ای',
 'body_status': 'کامل رنگ',
 'body_type': 'suv',
 'brand': 'nissan',
 'engine': '6 سیلندر P40 کاربراتوری ',
 'fuel_consumption': 17.0,
 'fuel_type': 'بنزینی',
 'mileage': 200000,
 'model': 'patrol4doorir',
 'price': 165000000,
 'volume': 4.0,
 'year': 1989,
 'body_status_indexed': 13.0,
 'engine_indexed': 61.0,
 'fuel_type_indexed': 0.0,
 'model_indexed': 122.0,
 'brand_indexed': 13.0,
 'body_color_indexed': 7.0,
 'body_type_indexed': 3.0,
 'features': DenseVector([13.0, 61.0, 0.0, 122.0, 13.0, 7.0, 3.0, 17.0, 200000.0, 4.0, 1989.0])}

In [63]:
assembledData.select('features').head(2)

[Row(features=DenseVector([13.0, 61.0, 0.0, 122.0, 13.0, 7.0, 3.0, 17.0, 200000.0, 4.0, 1989.0])),
 Row(features=DenseVector([9.0, 61.0, 1.0, 122.0, 13.0, 0.0, 3.0, 17.0, 240000.0, 4.0, 1990.0]))]

In [64]:
data = assembledData.select('features','price')

In [77]:
data.show()

+--------------------+---------+
|            features|    price|
+--------------------+---------+
|[13.0,61.0,0.0,12...|165000000|
|[9.0,61.0,1.0,122...|200000000|
|[9.0,61.0,1.0,122...|210000000|
|[0.0,40.0,0.0,85....|150000000|
|[13.0,61.0,0.0,12...|182000000|
|[3.0,61.0,0.0,122...|150000000|
|[0.0,61.0,0.0,122...|200000000|
|[9.0,61.0,0.0,122...|230000000|
|[9.0,40.0,0.0,85....|190000000|
|[9.0,40.0,0.0,85....|175000000|
|[3.0,40.0,0.0,85....|230000000|
|[3.0,61.0,0.0,122...|240000000|
|[3.0,61.0,1.0,122...|188000000|
|[9.0,61.0,1.0,122...|194000000|
|[3.0,40.0,0.0,85....|250000000|
|[2.0,40.0,0.0,85....|269000000|
|[9.0,61.0,1.0,122...|280000000|
|[3.0,61.0,1.0,122...|290000000|
|[9.0,61.0,1.0,122...|320000000|
|[9.0,12.0,0.0,8.0...| 73000000|
+--------------------+---------+
only showing top 20 rows



In [87]:
data.head(2)[0].asDict()

{'features': DenseVector([13.0, 61.0, 0.0, 122.0, 13.0, 7.0, 3.0, 17.0, 200000.0, 4.0, 1989.0]),
 'price': 165000000,
 'normFeatures': DenseVector([0.0001, 0.0003, 0.0, 0.0006, 0.0001, 0.0, 0.0, 0.0001, 0.989, 0.0, 0.0098])}

In [108]:
trainData,testData = data.randomSplit([0.7,0.3])

##### importing linear reg

In [104]:
from pyspark.ml.regression import LinearRegression

In [110]:
lr =  LinearRegression(labelCol='price')

In [111]:
Model = lr.fit(trainData)

In [112]:
testResult  = Model.evaluate(testData)

In [113]:
testResult.r2

0.4698047043571588

In [114]:
testResult.rootMeanSquaredError

684969289.4227656

In [115]:
result = Model.transform(testData)

In [116]:
result.show()

+--------------------+---------+--------------------+
|            features|    price|          prediction|
+--------------------+---------+--------------------+
|(11,[0,1,3,7,9,10...|335000000|3.4790162706365013E8|
|(11,[1,3,4,7,9,10...|755000000|  6.34573001149231E8|
|(11,[1,3,4,7,9,10...|760000000| 6.272942015324516E8|
|(11,[1,3,4,7,9,10...|760000000| 6.272942015324516E8|
|(11,[1,3,4,7,9,10...|760000000| 6.272942015324516E8|
|(11,[1,3,4,7,9,10...|205000000| 5.774254631557713E8|
|(11,[1,3,4,7,9,10...|208000000| 5.701466635389938E8|
|(11,[1,3,4,7,9,10...|208000000| 5.701466635389938E8|
|(11,[1,3,4,7,9,10...|178000000| 2.793628849753113E8|
|(11,[1,3,4,7,9,10...|180000000| 2.793628849753113E8|
|(11,[1,3,4,7,9,10...|182000000|2.7208408535853195E8|
|(11,[1,3,4,7,9,10...|183500000|2.7208408535853195E8|
|(11,[1,3,4,7,9,10...|360000000|4.9144578358324623E8|
|(11,[1,3,4,7,9,10...|425000000|4.6960938473291016E8|
|(11,[1,3,4,7,9,10...|430000000|4.6960938473291016E8|
|(11,[1,3,4,7,9,10...|433000

In [117]:
testResult.meanSquaredError

4.6918292745232845e+17