# Task 1 : 
### Installing Java, Spark3+ , and a compatible pyspark Python lib and making it work.


In [3]:
# Testing pyspark installation

import findspark
findspark.init()
findspark.find()
import pyspark
findspark.find()

'C:\\spark-3.2.0-bin-hadoop3.2'

In [7]:
# Initiate spark context
from pyspark import SparkContext, SparkConf 
from pyspark.sql import SparkSession
conf = pyspark.SparkConf().setAppName('SparkApp').setMaster('local')
sc = pyspark.SparkContext(conf=conf)
spark =SparkSession(sc)


In [4]:
#Example Test code
numeric_val = sc.parallelize([1,2,3,4])
numeric_val.map(lambda x: x*x*x).collect()


[1, 8, 27, 64]

In [174]:
# Stop the spark session
sc.stop()

# Task 2 
### Read "Car details v3.csv" data with spark

In [511]:
df = spark.read.option("header", "true").csv ('Car details v3.csv')
df.show(5)

+--------------------+----+-------------+---------+------+-----------+------------+------------+----------+-------+----------+--------------------+-----+
|                name|year|selling_price|km_driven|  fuel|seller_type|transmission|       owner|   mileage| engine| max_power|              torque|seats|
+--------------------+----+-------------+---------+------+-----------+------------+------------+----------+-------+----------+--------------------+-----+
|Maruti Swift Dzir...|2014|       450000|   145500|Diesel| Individual|      Manual| First Owner| 23.4 kmpl|1248 CC|    74 bhp|      190Nm@ 2000rpm|    5|
|Skoda Rapid 1.5 T...|2014|       370000|   120000|Diesel| Individual|      Manual|Second Owner|21.14 kmpl|1498 CC|103.52 bhp| 250Nm@ 1500-2500rpm|    5|
|Honda City 2017-2...|2006|       158000|   140000|Petrol| Individual|      Manual| Third Owner| 17.7 kmpl|1497 CC|    78 bhp|12.7@ 2,700(kgm@ ...|    5|
|Hyundai i20 Sport...|2010|       225000|   127000|Diesel| Individual|      

In [512]:
df.limit(10).toPandas()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5
5,Hyundai Xcent 1.2 VTVT E Plus,2017,440000,45000,Petrol,Individual,Manual,First Owner,20.14 kmpl,1197 CC,81.86 bhp,113.75nm@ 4000rpm,5
6,Maruti Wagon R LXI DUO BSIII,2007,96000,175000,LPG,Individual,Manual,First Owner,17.3 km/kg,1061 CC,57.5 bhp,"7.8@ 4,500(kgm@ rpm)",5
7,Maruti 800 DX BSII,2001,45000,5000,Petrol,Individual,Manual,Second Owner,16.1 kmpl,796 CC,37 bhp,59Nm@ 2500rpm,4
8,Toyota Etios VXD,2011,350000,90000,Diesel,Individual,Manual,First Owner,23.59 kmpl,1364 CC,67.1 bhp,170Nm@ 1800-2400rpm,5
9,Ford Figo Diesel Celebration Edition,2013,200000,169000,Diesel,Individual,Manual,First Owner,20.0 kmpl,1399 CC,68.1 bhp,160Nm@ 2000rpm,5


# Task 3
### Creating a model to predict the selling price from the other variables using Sparks'mlib 

In [513]:
spark = SparkSession.builder.appName('car_price_predictor').getOrCreate()

In [514]:
spark

In [515]:
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- year: string (nullable = true)
 |-- selling_price: string (nullable = true)
 |-- km_driven: string (nullable = true)
 |-- fuel: string (nullable = true)
 |-- seller_type: string (nullable = true)
 |-- transmission: string (nullable = true)
 |-- owner: string (nullable = true)
 |-- mileage: string (nullable = true)
 |-- engine: string (nullable = true)
 |-- max_power: string (nullable = true)
 |-- torque: string (nullable = true)
 |-- seats: string (nullable = true)



All the columns are string type.
The selling_price is our target and the remainder are our features we want to predict the target with.

Let's select some columns to see how the dataframe look like.

## Pre-processing the data

### Drop the name and the torque column

In [516]:
to_drop = ["name","torque"]
for x in to_drop:
    df = df.drop(x)
df.show(5)

+----+-------------+---------+------+-----------+------------+------------+----------+-------+----------+-----+
|year|selling_price|km_driven|  fuel|seller_type|transmission|       owner|   mileage| engine| max_power|seats|
+----+-------------+---------+------+-----------+------------+------------+----------+-------+----------+-----+
|2014|       450000|   145500|Diesel| Individual|      Manual| First Owner| 23.4 kmpl|1248 CC|    74 bhp|    5|
|2014|       370000|   120000|Diesel| Individual|      Manual|Second Owner|21.14 kmpl|1498 CC|103.52 bhp|    5|
|2006|       158000|   140000|Petrol| Individual|      Manual| Third Owner| 17.7 kmpl|1497 CC|    78 bhp|    5|
|2010|       225000|   127000|Diesel| Individual|      Manual| First Owner| 23.0 kmpl|1396 CC|    90 bhp|    5|
|2007|       130000|   120000|Petrol| Individual|      Manual| First Owner| 16.1 kmpl|1298 CC|  88.2 bhp|    5|
+----+-------------+---------+------+-----------+------------+------------+----------+-------+----------

### Replacing the year column by an age column (2021 - year )

In [517]:
df = df.withColumn('Age', ( 2021 - df['year'] ) ).drop('year')
df.show(5)

+-------------+---------+------+-----------+------------+------------+----------+-------+----------+-----+----+
|selling_price|km_driven|  fuel|seller_type|transmission|       owner|   mileage| engine| max_power|seats| Age|
+-------------+---------+------+-----------+------------+------------+----------+-------+----------+-----+----+
|       450000|   145500|Diesel| Individual|      Manual| First Owner| 23.4 kmpl|1248 CC|    74 bhp|    5| 7.0|
|       370000|   120000|Diesel| Individual|      Manual|Second Owner|21.14 kmpl|1498 CC|103.52 bhp|    5| 7.0|
|       158000|   140000|Petrol| Individual|      Manual| Third Owner| 17.7 kmpl|1497 CC|    78 bhp|    5|15.0|
|       225000|   127000|Diesel| Individual|      Manual| First Owner| 23.0 kmpl|1396 CC|    90 bhp|    5|11.0|
|       130000|   120000|Petrol| Individual|      Manual| First Owner| 16.1 kmpl|1298 CC|  88.2 bhp|    5|14.0|
+-------------+---------+------+-----------+------------+------------+----------+-------+----------+----

### removing units from mileage, engine and max_power columns 

In [518]:
df= df.withColumn("mileage_clean", regexp_extract("mileage", "[+-]?([0-9]*[.])?[0-9]+", 0)).drop('mileage')
df= df.withColumn("engine_clean", regexp_extract("engine", "[+-]?([0-9]*[.])?[0-9]+", 0)).drop('engine')
df= df.withColumn("mpower_clean", regexp_extract("max_power", "[+-]?([0-9]*[.])?[0-9]+", 0)).drop('max_power')
df.show(10)

+-------------+---------+------+-----------+------------+------------+-----+----+-------------+------------+------------+
|selling_price|km_driven|  fuel|seller_type|transmission|       owner|seats| Age|mileage_clean|engine_clean|mpower_clean|
+-------------+---------+------+-----------+------------+------------+-----+----+-------------+------------+------------+
|       450000|   145500|Diesel| Individual|      Manual| First Owner|    5| 7.0|         23.4|        1248|          74|
|       370000|   120000|Diesel| Individual|      Manual|Second Owner|    5| 7.0|        21.14|        1498|      103.52|
|       158000|   140000|Petrol| Individual|      Manual| Third Owner|    5|15.0|         17.7|        1497|          78|
|       225000|   127000|Diesel| Individual|      Manual| First Owner|    5|11.0|         23.0|        1396|          90|
|       130000|   120000|Petrol| Individual|      Manual| First Owner|    5|14.0|         16.1|        1298|        88.2|
|       440000|    45000

### Casting the numerical values (String  to Double)

In [519]:
df.printSchema()

root
 |-- selling_price: string (nullable = true)
 |-- km_driven: string (nullable = true)
 |-- fuel: string (nullable = true)
 |-- seller_type: string (nullable = true)
 |-- transmission: string (nullable = true)
 |-- owner: string (nullable = true)
 |-- seats: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- mileage_clean: string (nullable = true)
 |-- engine_clean: string (nullable = true)
 |-- mpower_clean: string (nullable = true)



In [520]:
from pyspark.sql.types import DoubleType

numCols=["Age","selling_price","km_driven","mileage_clean","engine_clean",
"mpower_clean","seats"]
for x in numCols:
    df = df.withColumn(x,df[x].cast(DoubleType()))

df.printSchema()


root
 |-- selling_price: double (nullable = true)
 |-- km_driven: double (nullable = true)
 |-- fuel: string (nullable = true)
 |-- seller_type: string (nullable = true)
 |-- transmission: string (nullable = true)
 |-- owner: string (nullable = true)
 |-- seats: double (nullable = true)
 |-- Age: double (nullable = true)
 |-- mileage_clean: double (nullable = true)
 |-- engine_clean: double (nullable = true)
 |-- mpower_clean: double (nullable = true)



### counting null values in the dataFrame

In [521]:
from pyspark.sql.functions import col,sum
df.select(*(sum(col(c).isNull().cast("int")).alias(c) for c in df.columns)).show()

+-------------+---------+----+-----------+------------+-----+-----+---+-------------+------------+------------+
|selling_price|km_driven|fuel|seller_type|transmission|owner|seats|Age|mileage_clean|engine_clean|mpower_clean|
+-------------+---------+----+-----------+------------+-----+-----+---+-------------+------------+------------+
|            0|        0|   0|          0|           0|    0|  221|  0|          221|         221|         216|
+-------------+---------+----+-----------+------------+-----+-----+---+-------------+------------+------------+



### Replacing the null values with corresponding strategy

mileage=mean
engine=mean
max_power=mean

In [522]:
from pyspark.sql.functions import avg

def fill_with_mean(this_df, exclude=set()):
    stats = this_df.agg(*(avg(c).alias(c) for c in this_df.columns if c not in exclude))
    return this_df.na.fill(stats.first().asDict())

df = fill_with_mean(df, ["year", "selling_price", "km_driven", "fuel" , "seller_type" , "transmission","owner"])

df.select(*(sum(col(c).isNull().cast("int")).alias(c) for c in df.columns)).show()

+-------------+---------+----+-----------+------------+-----+-----+---+-------------+------------+------------+
|selling_price|km_driven|fuel|seller_type|transmission|owner|seats|Age|mileage_clean|engine_clean|mpower_clean|
+-------------+---------+----+-----------+------------+-----+-----+---+-------------+------------+------------+
|            0|        0|   0|          0|           0|    0|    0|  0|            0|           0|           0|
+-------------+---------+----+-----------+------------+-----+-----+---+-------------+------------+------------+



###     Skipped - (seats is double datatype now)  -  Dealing with seats null values  by setting all null values to be 5 

In [523]:

#df= df.fillna("5", subset=['seats'])
#df.select(*(sum(col(c).isNull().cast("int")).alias(c) for c in df.columns)).show()

In [524]:
final_dataset = df
final_dataset.show(10)

+-------------+---------+------+-----------+------------+------------+-----+----+-------------+------------+------------+
|selling_price|km_driven|  fuel|seller_type|transmission|       owner|seats| Age|mileage_clean|engine_clean|mpower_clean|
+-------------+---------+------+-----------+------------+------------+-----+----+-------------+------------+------------+
|     450000.0| 145500.0|Diesel| Individual|      Manual| First Owner|  5.0| 7.0|         23.4|      1248.0|        74.0|
|     370000.0| 120000.0|Diesel| Individual|      Manual|Second Owner|  5.0| 7.0|        21.14|      1498.0|      103.52|
|     158000.0| 140000.0|Petrol| Individual|      Manual| Third Owner|  5.0|15.0|         17.7|      1497.0|        78.0|
|     225000.0| 127000.0|Diesel| Individual|      Manual| First Owner|  5.0|11.0|         23.0|      1396.0|        90.0|
|     130000.0| 120000.0|Petrol| Individual|      Manual| First Owner|  5.0|14.0|         16.1|      1298.0|        88.2|
|     440000.0|  45000.0

### spliting the data into train validation and test 

In [525]:

# Use randomSplit with weights and seed
weights = [.8, .1, .1]
seed = 42
trainData, validationData, testData = final_dataset.randomSplit(weights, seed)

print(f"the total dataset length : {final_dataset.count()} records")
print(f"train set length : {trainData.count()} records")
print(f"validation set length : {validationData.count()} records")
print(f"test set length : {testData.count()} records")

the total dataset length : 8128 records
train set length : 6577 records
validation set length : 741 records
test set length : 810 records


### checing the Dtypes

In [526]:
trainData.dtypes

[('selling_price', 'double'),
 ('km_driven', 'double'),
 ('fuel', 'string'),
 ('seller_type', 'string'),
 ('transmission', 'string'),
 ('owner', 'string'),
 ('seats', 'double'),
 ('Age', 'double'),
 ('mileage_clean', 'double'),
 ('engine_clean', 'double'),
 ('mpower_clean', 'double')]

In [527]:
catCols = [ x for (x , dataType) in trainData.dtypes if dataType=="string" ]
numCols = [ x for (x , dataType) in trainData.dtypes if (dataType=="double") & (x !="selling_price") ]
print(f" the catogical columns are:  {catCols}")
print(f" the numerical columns are:  {numCols}")

 the catogical columns are:  ['fuel', 'seller_type', 'transmission', 'owner']
 the numerical columns are:  ['km_driven', 'seats', 'Age', 'mileage_clean', 'engine_clean', 'mpower_clean']


### Dealing with categorical variables

Let's identify the unique value for the string columns

In [528]:

df.select('seller_type').distinct().collect()

[Row(seller_type='Individual'),
 Row(seller_type='Dealer'),
 Row(seller_type='Trustmark Dealer')]

In [529]:
df.select('fuel').distinct().collect()

[Row(fuel='Diesel'), Row(fuel='CNG'), Row(fuel='LPG'), Row(fuel='Petrol')]

In [530]:
df.select('transmission').distinct().collect() 

[Row(transmission='Automatic'), Row(transmission='Manual')]

In [531]:
df.select('owner').distinct().collect()

[Row(owner='Third Owner'),
 Row(owner='Fourth & Above Owner'),
 Row(owner='Second Owner'),
 Row(owner='First Owner'),
 Row(owner='Test Drive Car')]

In [532]:
df.select('seats').distinct().collect()

[Row(seats=8.0),
 Row(seats=7.0),
 Row(seats=4.0),
 Row(seats=14.0),
 Row(seats=2.0),
 Row(seats=10.0),
 Row(seats=5.41671936259011),
 Row(seats=6.0),
 Row(seats=5.0),
 Row(seats=9.0)]

### Using  One hot encoding

counting the distinct type of our categrical variables

In [533]:
import pyspark.sql.functions as F

In [534]:

trainData.agg(F.countDistinct('fuel')).show()
trainData.agg(F.countDistinct('seats')).show()

+-----------+
|count(fuel)|
+-----------+
|          4|
+-----------+

+------------+
|count(seats)|
+------------+
|          10|
+------------+



In [536]:
trainData.groupBy('seats').count().show()

+----------------+-----+
|           seats|count|
+----------------+-----+
|             8.0|  187|
|             7.0|  905|
|5.41671936259011|  181|
|             4.0|  114|
|            14.0|    1|
|             2.0|    2|
|            10.0|   15|
|             6.0|   52|
|             5.0| 5053|
|             9.0|   67|
+----------------+-----+



In [537]:

from pyspark.ml.feature import (OneHotEncoder, StringIndexer)
string_indexer = [ StringIndexer(inputCol = x, outputCol = x + "_StringIndexer", handleInvalid='skip')
                  for x in catCols]
string_indexer

[StringIndexer_9a1b4ddb232f,
 StringIndexer_c5fe0cb136d2,
 StringIndexer_6974936e1347,
 StringIndexer_5ffea936c283]

In [538]:
One_Hot_Encoder = [ 
    OneHotEncoder(
        inputCols=[f"{x}_StringIndexer" for x in catCols],
        outputCols=[f"{x}_OneHotEncoder" for x in catCols],
    )
]

In [539]:
One_Hot_Encoder

[OneHotEncoder_7becfd66504f]

## Vector Assembling

In [540]:
from pyspark.ml.feature import VectorAssembler

In [541]:
assemblerInput=[x for x in numCols]
assemblerInput+=[f"{x}_OneHotEncoder" for x in catCols]

In [542]:
assemblerInput

['km_driven',
 'seats',
 'Age',
 'mileage_clean',
 'engine_clean',
 'mpower_clean',
 'fuel_OneHotEncoder',
 'seller_type_OneHotEncoder',
 'transmission_OneHotEncoder',
 'owner_OneHotEncoder']

In [543]:
vector_assembler = VectorAssembler(inputCols = assemblerInput, outputCol="VectorAssembler_features")

# Scaling 

In [646]:
from pyspark.ml.feature import StandardScaler
# Let us create an object of StandardScaler class
scaler=StandardScaler().setInputCol("features").setOutputCol("scaled_features")


In [693]:
stages  = []
stages += string_indexer
stages += One_Hot_Encoder
stages += [vector_assembler]
stages += scaler

#stages  = [string_indexer, One_Hot_Encoder, [vector_assembler] , scaler ]

TypeError: 'StandardScaler' object is not iterable

In [694]:
stages

[StringIndexer_9a1b4ddb232f,
 StringIndexer_c5fe0cb136d2,
 StringIndexer_6974936e1347,
 StringIndexer_5ffea936c283,
 OneHotEncoder_7becfd66504f,
 VectorAssembler_94b547ae27db]

In [695]:
%%time
from pyspark.ml import Pipeline

pipeline = Pipeline().setStages(stages)
pp_tr = pipeline.fit(trainData)

pp_df = pp_tr.transform(validationData).drop('features')



Wall time: 2.52 s


In [692]:
pp_df.show(truncate=False)

+-------------+---------+------+-----------+------------+--------------------+----------------+----+------------------+-----------------+-----------------+------------------+-------------------------+--------------------------+-------------------+------------------+-------------------------+--------------------------+-------------------+-------------------------------------------------------------------------------------------------------------------------------+
|selling_price|km_driven|fuel  |seller_type|transmission|owner               |seats           |Age |mileage_clean     |engine_clean     |mpower_clean     |fuel_StringIndexer|seller_type_StringIndexer|transmission_StringIndexer|owner_StringIndexer|fuel_OneHotEncoder|seller_type_OneHotEncoder|transmission_OneHotEncoder|owner_OneHotEncoder|VectorAssembler_features                                                                                                       |
+-------------+---------+------+-----------+------------+-------

In [671]:
pp_df.printSchema()

root
 |-- selling_price: double (nullable = true)
 |-- km_driven: double (nullable = true)
 |-- fuel: string (nullable = true)
 |-- seller_type: string (nullable = true)
 |-- transmission: string (nullable = true)
 |-- owner: string (nullable = true)
 |-- seats: double (nullable = false)
 |-- Age: double (nullable = false)
 |-- mileage_clean: double (nullable = false)
 |-- engine_clean: double (nullable = false)
 |-- mpower_clean: double (nullable = false)
 |-- fuel_StringIndexer: double (nullable = false)
 |-- seller_type_StringIndexer: double (nullable = false)
 |-- transmission_StringIndexer: double (nullable = false)
 |-- owner_StringIndexer: double (nullable = false)
 |-- fuel_OneHotEncoder: vector (nullable = true)
 |-- seller_type_OneHotEncoder: vector (nullable = true)
 |-- transmission_OneHotEncoder: vector (nullable = true)
 |-- owner_OneHotEncoder: vector (nullable = true)
 |-- VectorAssembler_features: vector (nullable = true)



In [672]:
pp_df.limit(10).toPandas()

Unnamed: 0,selling_price,km_driven,fuel,seller_type,transmission,owner,seats,Age,mileage_clean,engine_clean,mpower_clean,fuel_StringIndexer,seller_type_StringIndexer,transmission_StringIndexer,owner_StringIndexer,fuel_OneHotEncoder,seller_type_OneHotEncoder,transmission_OneHotEncoder,owner_OneHotEncoder,VectorAssembler_features
0,30000.0,90000.0,Petrol,Individual,Manual,Third Owner,5.416719,21.0,19.418783,1458.625016,91.517919,1.0,0.0,0.0,2.0,"(0.0, 1.0, 0.0)","(1.0, 0.0)",(1.0),"(0.0, 0.0, 1.0, 0.0)","[90000.0, 5.41671936259011, 21.0, 19.418783356..."
1,40000.0,32000.0,Petrol,Individual,Manual,Second Owner,4.0,25.0,16.1,796.0,37.0,1.0,0.0,0.0,1.0,"(0.0, 1.0, 0.0)","(1.0, 0.0)",(1.0),"(0.0, 1.0, 0.0, 0.0)","[32000.0, 4.0, 25.0, 16.1, 796.0, 37.0, 0.0, 1..."
2,40000.0,80000.0,Petrol,Individual,Manual,First Owner,4.0,19.0,16.1,796.0,37.0,1.0,0.0,0.0,0.0,"(0.0, 1.0, 0.0)","(1.0, 0.0)",(1.0),"(1.0, 0.0, 0.0, 0.0)","[80000.0, 4.0, 19.0, 16.1, 796.0, 37.0, 0.0, 1..."
3,40000.0,100000.0,Petrol,Individual,Manual,Third Owner,4.0,22.0,16.1,796.0,37.0,1.0,0.0,0.0,2.0,"(0.0, 1.0, 0.0)","(1.0, 0.0)",(1.0),"(0.0, 0.0, 1.0, 0.0)","[100000.0, 4.0, 22.0, 16.1, 796.0, 37.0, 0.0, ..."
4,45000.0,10000.0,Petrol,Individual,Manual,Third Owner,4.0,10.0,26.0,624.0,35.0,1.0,0.0,0.0,2.0,"(0.0, 1.0, 0.0)","(1.0, 0.0)",(1.0),"(0.0, 0.0, 1.0, 0.0)","[10000.0, 4.0, 10.0, 26.0, 624.0, 35.0, 0.0, 1..."
5,45000.0,58000.0,Petrol,Individual,Manual,First Owner,4.0,18.0,16.1,796.0,37.0,1.0,0.0,0.0,0.0,"(0.0, 1.0, 0.0)","(1.0, 0.0)",(1.0),"(1.0, 0.0, 0.0, 0.0)","[58000.0, 4.0, 18.0, 16.1, 796.0, 37.0, 0.0, 1..."
6,45000.0,90000.0,Petrol,Individual,Manual,Fourth & Above Owner,4.0,18.0,16.1,796.0,37.0,1.0,0.0,0.0,3.0,"(0.0, 1.0, 0.0)","(1.0, 0.0)",(1.0),"(0.0, 0.0, 0.0, 1.0)","[90000.0, 4.0, 18.0, 16.1, 796.0, 37.0, 0.0, 1..."
7,45000.0,100000.0,Diesel,Individual,Manual,Second Owner,5.0,12.0,17.2,1396.0,53.5,0.0,0.0,0.0,1.0,"(1.0, 0.0, 0.0)","(1.0, 0.0)",(1.0),"(0.0, 1.0, 0.0, 0.0)","[100000.0, 5.0, 12.0, 17.2, 1396.0, 53.5, 1.0,..."
8,45957.0,90000.0,Petrol,Individual,Manual,Second Owner,5.0,21.0,19.7,796.0,46.3,1.0,0.0,0.0,1.0,"(0.0, 1.0, 0.0)","(1.0, 0.0)",(1.0),"(0.0, 1.0, 0.0, 0.0)","[90000.0, 5.0, 21.0, 19.7, 796.0, 46.3, 0.0, 1..."
9,46000.0,25000.0,Petrol,Individual,Manual,Second Owner,4.0,10.0,26.0,624.0,35.0,1.0,0.0,0.0,1.0,"(0.0, 1.0, 0.0)","(1.0, 0.0)",(1.0),"(0.0, 1.0, 0.0, 0.0)","[25000.0, 4.0, 10.0, 26.0, 624.0, 35.0, 0.0, 1..."


In [673]:
pp_df.select(assemblerInput).show(truncate=False)

+---------+----------------+----+------------------+-----------------+-----------------+------------------+-------------------------+--------------------------+-------------------+
|km_driven|seats           |Age |mileage_clean     |engine_clean     |mpower_clean     |fuel_OneHotEncoder|seller_type_OneHotEncoder|transmission_OneHotEncoder|owner_OneHotEncoder|
+---------+----------------+----+------------------+-----------------+-----------------+------------------+-------------------------+--------------------------+-------------------+
|90000.0  |5.41671936259011|21.0|19.418783356519516|1458.625015808777|91.51791898382159|(3,[1],[1.0])     |(2,[0],[1.0])            |(1,[0],[1.0])             |(4,[2],[1.0])      |
|32000.0  |4.0             |25.0|16.1              |796.0            |37.0             |(3,[1],[1.0])     |(2,[0],[1.0])            |(1,[0],[1.0])             |(4,[1],[1.0])      |
|80000.0  |4.0             |19.0|16.1              |796.0            |37.0             |(3,[1],

In [674]:
pp_df.select('VectorAssembler_features').show(truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------+
|VectorAssembler_features                                                                                                       |
+-------------------------------------------------------------------------------------------------------------------------------+
|[90000.0,5.41671936259011,21.0,19.418783356519516,1458.625015808777,91.51791898382159,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0] |
|[32000.0,4.0,25.0,16.1,796.0,37.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0]                                                     |
|[80000.0,4.0,19.0,16.1,796.0,37.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0]                                                     |
|[100000.0,4.0,22.0,16.1,796.0,37.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0]                                                    |
|[10000.0,4.0,10.0,26.0,624.0,35.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0]               

In [675]:
data = pp_df.select(
F.col("VectorAssembler_features").alias("features"),
F.col("selling_price").alias("label"))

#  TRAIN Random forest model

In [676]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator 

In [677]:
%%time
rf = RandomForestRegressor(labelCol = "label", featuresCol = "features")

Wall time: 8 ms


In [679]:
rfmodel=rf.fit(data)

# validation

In [680]:


pp_df2 = pp_tr.transform(validationData)
data2 = pp_df2.select(
F.col("VectorAssembler_features").alias("features"),
F.col("selling_price").alias("label"))



In [681]:
pred = rfmodel.transform(data)


In [682]:
pred.printSchema()

root
 |-- features: vector (nullable = true)
 |-- label: double (nullable = true)
 |-- prediction: double (nullable = false)



In [683]:
pred.show()

+--------------------+-------+------------------+
|            features|  label|        prediction|
+--------------------+-------+------------------+
|[90000.0,5.416719...|30000.0|220437.54441877134|
|[32000.0,4.0,25.0...|40000.0|140324.84323013446|
|[80000.0,4.0,19.0...|40000.0|156769.19937328473|
|[100000.0,4.0,22....|40000.0|139401.78269250004|
|[10000.0,4.0,10.0...|45000.0|193695.99138075553|
|[58000.0,4.0,18.0...|45000.0|157692.25991091915|
|[90000.0,4.0,18.0...|45000.0|139401.78269250004|
|[100000.0,5.0,12....|45000.0| 217132.3993061839|
|[90000.0,5.0,21.0...|45957.0|152035.65007856744|
|[25000.0,4.0,10.0...|46000.0|176243.09586095356|
|[80000.0,5.0,14.0...|50000.0| 207405.5009131139|
|[180000.0,5.0,21....|50000.0|144440.77078773815|
|[100000.0,4.0,14....|60000.0|143228.10124088713|
|[140000.0,5.41671...|60000.0|220437.54441877134|
|[80000.0,5.0,19.0...|65000.0|144440.77078773815|
|[20000.0,5.0,18.0...|70000.0| 362515.4618257702|
|[70000.0,5.416719...|70000.0|220437.54441877134|


In [684]:
model.featureImportances

SparseVector(23, {0: 0.0505, 1: 0.159, 2: 0.0237, 3: 0.1583, 4: 0.3751, 5: 0.0143, 6: 0.01, 8: 0.023, 9: 0.0374, 10: 0.1311, 11: 0.0043, 12: 0.0031, 14: 0.0, 15: 0.0006, 16: 0.0014, 17: 0.0002, 18: 0.008, 20: 0.0, 21: 0.0})

In [685]:
evaluator=RegressionEvaluator(predictionCol='prediction', labelCol='label', metricName='rmse')

In [686]:
rmse=evaluator.evaluate(pred)

In [687]:
print(f"the root mean square error is : {rmse}")

the root mean square error is : 191610.80631289067
