In [6]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from  pyspark.sql.functions import abs

# Obtain a Spark session for this notebook via the builder's getOrCreate().
spark = (
    SparkSession.builder
    .appName("PythonExample")
    .getOrCreate()
)

# Load every column of the vehicle listings table and preview the first rows.
hive_df = spark.sql("select * from automotive.vehicles")
hive_df.show()

# Columns used as model inputs; the feature-assembly step reads
# `features.columns` to obtain their names.
features = hive_df.select("year", "km_driven")




+--------------------+----+-------------+---------+------+-----------+------------+------------+----------+-------+----------+--------------------+-----+
|                name|year|selling_price|km_driven|  fuel|seller_type|transmission|       owner|   mileage| engine| max_power|              torque|seats|
+--------------------+----+-------------+---------+------+-----------+------------+------------+----------+-------+----------+--------------------+-----+
|Maruti Swift Dzir...|2014|       450000|   145500|Diesel| Individual|      Manual| First Owner| 23.4 kmpl|1248 CC|    74 bhp|      190Nm@ 2000rpm|    5|
|Skoda Rapid 1.5 T...|2014|       370000|   120000|Diesel| Individual|      Manual|Second Owner|21.14 kmpl|1498 CC|103.52 bhp| 250Nm@ 1500-2500rpm|    5|
|Honda City 2017-2...|2006|       158000|   140000|Petrol| Individual|      Manual| Third Owner| 17.7 kmpl|1497 CC|    78 bhp|12.7@ 2,700(kgm@ ...|    5|
|Hyundai i20 Sport...|2010|       225000|   127000|Diesel| Individual|      

In [7]:
# Pack the selected input columns into a single vector column named
# "features", which is the column the regression step is configured to read.
assembler = VectorAssembler(inputCols=features.columns, outputCol="features")

# Keep only the assembled vector and the price column, then preview the result.
assembled_df = assembler.transform(hive_df)
output = assembled_df.select("features", "selling_price")
output.show()



+-----------------+-------------+
|         features|selling_price|
+-----------------+-------------+
|[2014.0,145500.0]|       450000|
|[2014.0,120000.0]|       370000|
|[2006.0,140000.0]|       158000|
|[2010.0,127000.0]|       225000|
|[2007.0,120000.0]|       130000|
| [2017.0,45000.0]|       440000|
|[2007.0,175000.0]|        96000|
|  [2001.0,5000.0]|        45000|
| [2011.0,90000.0]|       350000|
|[2013.0,169000.0]|       200000|
| [2014.0,68000.0]|       500000|
|[2005.0,100000.0]|        92000|
|[2009.0,140000.0]|       280000|
| [2007.0,80000.0]|       200000|
| [2009.0,90000.0]|       180000|
| [2016.0,40000.0]|       400000|
| [2016.0,70000.0]|       778000|
| [2012.0,53000.0]|       500000|
| [2002.0,80000.0]|       150000|
|[2016.0,100000.0]|       680000|
+-----------------+-------------+
only showing top 20 rows



In [8]:
# split: roughly 75% train / 25% test (randomSplit weights are approximate).
# A fixed seed keeps the partition -- and therefore the coefficients and
# metrics printed below -- repeatable when the notebook is re-run on the same
# input data.
SPLIT_SEED = 42
train, test = output.randomSplit([0.75, 0.25], seed=SPLIT_SEED)

# train: fit a linear regression of selling_price on the assembled vector.
lin_reg = LinearRegression(featuresCol="features", labelCol="selling_price")
linear_model = lin_reg.fit(train)
print("Coefficients: " + str(linear_model.coefficients))
print("\nIntercept: " + str(linear_model.intercept))

# NOTE: `summary` describes the fit on the *training* split; these numbers are
# not a held-out estimate of model quality.
trainSummary = linear_model.summary

print("RMSE: %f" % trainSummary.rootMeanSquaredError)
print("\nr2: %f" % trainSummary.r2)



Coefficients: [77853.80321750963,-0.8838653304614231]

Intercept: -156080851.5431577
RMSE: 731195.986737

r2: 0.175157


In [5]:
# evaluate: score the held-out rows and compute a per-row percentage error.

predictions = linear_model.transform(test)

actual = predictions["selling_price"]
predicted = predictions["prediction"]
pct_error = ((actual - predicted) / actual) * 100

# `abs` here is the column function imported from pyspark.sql.functions.
# NOTE(review): despite its name, "Accuracy" holds the absolute percentage
# error, |(actual - predicted) / actual| * 100, so larger values are worse.
predictions = predictions.withColumn("Accuracy", abs(pct_error))
predictions.select("prediction", "selling_price", "Accuracy", "features").show()

+-------------------+-------------+------------------+-----------------+
|         prediction|selling_price|          Accuracy|         features|
+-------------------+-------------+------------------+-----------------+
| -875608.6070124805|        99000|  984.453138396445|[1994.0,100000.0]|
| -765469.8746379614|        55000|1491.7634084326571| [1995.0,70000.0]|
| -645803.1679921746|        40000|1714.5079199804363| [1996.0,32000.0]|
| -604742.2490843832|        50000|1309.4844981687665| [1997.0,60000.0]|
| -664292.0882798135|        57000|1265.4247162803745|[1997.0,110000.0]|
|-506513.48454892635|        35000| 1547.181384425504| [1998.0,40000.0]|
| -542243.3880662024|        40000|1455.6084701655059| [1998.0,70000.0]|
| -554153.3559052646|        45000| 1331.451902011699| [1998.0,80000.0]|
| -444014.6235307753|        40000|1210.0365588269383| [1999.0,50000.0]|
| -455924.5913698673|        55000| 928.9538024906678| [1999.0,60000.0]|
|-369605.79467353225|        45000| 921.34621038562