In [1]:
import pyspark

In [2]:
from pyspark.sql import SparkSession

In [3]:
spark=SparkSession.builder.appName('Machine_Learning').getOrCreate()

In [4]:
spark

In [6]:
data=spark.read.option('header',True).csv(r"G:\datasets\cars.csv")

In [7]:
data.show()

+--------+---------+------+------------+-------------+
|   brand|km_driven|  fuel|       owner|selling_price|
+--------+---------+------+------------+-------------+
|  Maruti|   145500|Diesel| First Owner|       450000|
|   Skoda|   120000|Diesel|Second Owner|       370000|
|   Honda|   140000|Petrol| Third Owner|       158000|
| Hyundai|   127000|Diesel| First Owner|       225000|
|  Maruti|   120000|Petrol| First Owner|       130000|
| Hyundai|    45000|Petrol| First Owner|       440000|
|  Maruti|   175000|   LPG| First Owner|        96000|
|  Maruti|     5000|Petrol|Second Owner|        45000|
|  Toyota|    90000|Diesel| First Owner|       350000|
|    Ford|   169000|Diesel| First Owner|       200000|
| Renault|    68000|Diesel|Second Owner|       500000|
|  Maruti|   100000|Petrol|Second Owner|        92000|
|  Maruti|   140000|Diesel|Second Owner|       280000|
|  Maruti|    80000|Petrol|Second Owner|       200000|
|  Maruti|    90000|Petrol|Second Owner|       180000|
|Mahindra|

In [8]:
data.dtypes

[('brand', 'string'),
 ('km_driven', 'string'),
 ('fuel', 'string'),
 ('owner', 'string'),
 ('selling_price', 'string')]

In [9]:
data=spark.read.option('header',True).csv(r"G:\datasets\cars.csv",inferSchema=True)

In [10]:
data.show()

+--------+---------+------+------------+-------------+
|   brand|km_driven|  fuel|       owner|selling_price|
+--------+---------+------+------------+-------------+
|  Maruti|   145500|Diesel| First Owner|       450000|
|   Skoda|   120000|Diesel|Second Owner|       370000|
|   Honda|   140000|Petrol| Third Owner|       158000|
| Hyundai|   127000|Diesel| First Owner|       225000|
|  Maruti|   120000|Petrol| First Owner|       130000|
| Hyundai|    45000|Petrol| First Owner|       440000|
|  Maruti|   175000|   LPG| First Owner|        96000|
|  Maruti|     5000|Petrol|Second Owner|        45000|
|  Toyota|    90000|Diesel| First Owner|       350000|
|    Ford|   169000|Diesel| First Owner|       200000|
| Renault|    68000|Diesel|Second Owner|       500000|
|  Maruti|   100000|Petrol|Second Owner|        92000|
|  Maruti|   140000|Diesel|Second Owner|       280000|
|  Maruti|    80000|Petrol|Second Owner|       200000|
|  Maruti|    90000|Petrol|Second Owner|       180000|
|Mahindra|

In [11]:
data.dtypes

[('brand', 'string'),
 ('km_driven', 'int'),
 ('fuel', 'string'),
 ('owner', 'string'),
 ('selling_price', 'int')]

In [12]:
data.printSchema()

root
 |-- brand: string (nullable = true)
 |-- km_driven: integer (nullable = true)
 |-- fuel: string (nullable = true)
 |-- owner: string (nullable = true)
 |-- selling_price: integer (nullable = true)



In [13]:
data.columns

['brand', 'km_driven', 'fuel', 'owner', 'selling_price']

In [16]:
from pyspark.ml.feature import StringIndexer 

In [17]:
ind=StringIndexer(inputCol='brand',outputCol='brand_index')

In [18]:
ind2=StringIndexer(inputCol='fuel',outputCol='fuel_index')
ind3=StringIndexer(inputCol='owner',outputCol='owner_index')

In [19]:
data1=ind.fit(data).transform(data)
data2=ind2.fit(data1).transform(data1)
data_ind=ind3.fit(data2).transform(data2)

In [20]:
data_ind.show()

+--------+---------+------+------------+-------------+-----------+----------+-----------+
|   brand|km_driven|  fuel|       owner|selling_price|brand_index|fuel_index|owner_index|
+--------+---------+------+------------+-------------+-----------+----------+-----------+
|  Maruti|   145500|Diesel| First Owner|       450000|        0.0|       0.0|        0.0|
|   Skoda|   120000|Diesel|Second Owner|       370000|       11.0|       0.0|        1.0|
|   Honda|   140000|Petrol| Third Owner|       158000|        5.0|       1.0|        2.0|
| Hyundai|   127000|Diesel| First Owner|       225000|        1.0|       0.0|        0.0|
|  Maruti|   120000|Petrol| First Owner|       130000|        0.0|       1.0|        0.0|
| Hyundai|    45000|Petrol| First Owner|       440000|        1.0|       1.0|        0.0|
|  Maruti|   175000|   LPG| First Owner|        96000|        0.0|       3.0|        0.0|
|  Maruti|     5000|Petrol|Second Owner|        45000|        0.0|       1.0|        1.0|
|  Toyota|

In [21]:
from pyspark.ml.feature import VectorAssembler
x=VectorAssembler(inputCols=['brand_index','km_driven','fuel_index','owner_index','selling_price'],outputCol='Independent Features')

In [22]:
df=x.transform(data_ind)

In [23]:
df.show()

+--------+---------+------+------------+-------------+-----------+----------+-----------+--------------------+
|   brand|km_driven|  fuel|       owner|selling_price|brand_index|fuel_index|owner_index|Independent Features|
+--------+---------+------+------------+-------------+-----------+----------+-----------+--------------------+
|  Maruti|   145500|Diesel| First Owner|       450000|        0.0|       0.0|        0.0|(5,[1,4],[145500....|
|   Skoda|   120000|Diesel|Second Owner|       370000|       11.0|       0.0|        1.0|[11.0,120000.0,0....|
|   Honda|   140000|Petrol| Third Owner|       158000|        5.0|       1.0|        2.0|[5.0,140000.0,1.0...|
| Hyundai|   127000|Diesel| First Owner|       225000|        1.0|       0.0|        0.0|[1.0,127000.0,0.0...|
|  Maruti|   120000|Petrol| First Owner|       130000|        0.0|       1.0|        0.0|[0.0,120000.0,1.0...|
| Hyundai|    45000|Petrol| First Owner|       440000|        1.0|       1.0|        0.0|[1.0,45000.0,1.0,...|
|

In [24]:
df.columns

['brand',
 'km_driven',
 'fuel',
 'owner',
 'selling_price',
 'brand_index',
 'fuel_index',
 'owner_index',
 'Independent Features']

In [25]:
df_final=df.select(['Independent Features','selling_price'])

In [27]:
df_final.show()

+--------------------+-------------+
|Independent Features|selling_price|
+--------------------+-------------+
|(5,[1,4],[145500....|       450000|
|[11.0,120000.0,0....|       370000|
|[5.0,140000.0,1.0...|       158000|
|[1.0,127000.0,0.0...|       225000|
|[0.0,120000.0,1.0...|       130000|
|[1.0,45000.0,1.0,...|       440000|
|[0.0,175000.0,3.0...|        96000|
|[0.0,5000.0,1.0,1...|        45000|
|[4.0,90000.0,0.0,...|       350000|
|[6.0,169000.0,0.0...|       200000|
|[8.0,68000.0,0.0,...|       500000|
|[0.0,100000.0,1.0...|        92000|
|[0.0,140000.0,0.0...|       280000|
|[0.0,80000.0,1.0,...|       200000|
|[0.0,90000.0,1.0,...|       180000|
|[2.0,40000.0,1.0,...|       400000|
|[0.0,70000.0,0.0,...|       778000|
|[1.0,53000.0,0.0,...|       500000|
|[0.0,80000.0,1.0,...|       150000|
|[1.0,100000.0,0.0...|       680000|
+--------------------+-------------+
only showing top 20 rows



In [29]:
from pyspark.ml.regression import LinearRegression

In [31]:
model=LinearRegression(featuresCol='Independent Features',labelCol='selling_price')

In [32]:
train,test=df_final.randomSplit([0.75,0.25])

In [33]:
model=model.fit(train)

In [36]:
model.coefficients

DenseVector([0.0, 0.0, 0.0, 0.0, 1.0])

In [37]:
model.intercept

0.0

In [38]:
pred=model.evaluate(test)

In [39]:
pred.predictions.show()

+--------------------+-------------+----------+
|Independent Features|selling_price|prediction|
+--------------------+-------------+----------+
|(5,[1,4],[5000.0,...|       750000|  750000.0|
|(5,[1,4],[10000.0...|       892000|  892000.0|
|(5,[1,4],[14000.0...|       750000|  750000.0|
|(5,[1,4],[14000.0...|      1200000| 1200000.0|
|(5,[1,4],[15000.0...|       570000|  570000.0|
|(5,[1,4],[15381.0...|       800000|  800000.0|
|(5,[1,4],[20000.0...|       755000|  755000.0|
|(5,[1,4],[20000.0...|       889000|  889000.0|
|(5,[1,4],[24000.0...|       750000|  750000.0|
|(5,[1,4],[24000.0...|       802999|  802999.0|
|(5,[1,4],[24019.0...|       450000|  450000.0|
|(5,[1,4],[25000.0...|       757000|  757000.0|
|(5,[1,4],[25000.0...|       950000|  950000.0|
|(5,[1,4],[29700.0...|       810000|  810000.0|
|(5,[1,4],[30000.0...|       370000|  370000.0|
|(5,[1,4],[30000.0...|       590000|  590000.0|
|(5,[1,4],[30000.0...|       660000|  660000.0|
|(5,[1,4],[30000.0...|       700000|  70

In [40]:
pred.meanAbsoluteError

0.0

In [41]:
pred.meanSquaredError

0.0