# Task 1
### Installing Java, Spark 3+, and a compatible PySpark Python library, and making it work


In [3]:
# Testing pyspark installation

# Let findspark locate the local Spark installation so that `import pyspark`
# below can succeed.
import findspark
findspark.init()

import pyspark

# Last expression of the cell: display the Spark home that findspark resolved.
# (The earlier duplicate call whose result was discarded has been removed.)
findspark.find()

'C:\\spark-3.2.0-bin-hadoop3.2'

In [7]:
# Initiate the Spark context and a session on top of it
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Application name and a local master, exactly as configured before.
conf = SparkConf().setAppName('SparkApp').setMaster('local')

# getOrCreate() reuses a context that is already running instead of raising
# "Cannot run multiple SparkContexts at once" when this cell is executed again.
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession(sc)


In [4]:
# Smoke test: distribute a small list, cube every element, and collect the
# result back on the driver.
numeric_val = sc.parallelize([1, 2, 3, 4])
cubed = numeric_val.map(lambda value: value * value * value)
cubed.collect()


[1, 8, 27, 64]

In [174]:
# Stop the SparkContext (`sc`), not just a session.
# NOTE(review): the `spark` session created on top of `sc` is presumably
# unusable after this call, so later cells need a live session again
# (e.g. SparkSession.builder.getOrCreate()) — confirm with Restart & Run All.
sc.stop()

# Task 2 
### Read "Car details v3.csv" data with spark

In [321]:
# `sc` was stopped at the end of Task 1, so obtain a live session before
# reading: getOrCreate() returns the active session or builds a new one.
spark = SparkSession.builder.appName('car_price_predictor').getOrCreate()

# Read the CSV using its first row as the header. No schema is inferred, so
# every column arrives as a string and is cast explicitly later on.
df = spark.read.option("header", "true").csv('Car details v3.csv')
df.show(5)

+--------------------+----+-------------+---------+------+-----------+------------+------------+----------+-------+----------+--------------------+-----+
|                name|year|selling_price|km_driven|  fuel|seller_type|transmission|       owner|   mileage| engine| max_power|              torque|seats|
+--------------------+----+-------------+---------+------+-----------+------------+------------+----------+-------+----------+--------------------+-----+
|Maruti Swift Dzir...|2014|       450000|   145500|Diesel| Individual|      Manual| First Owner| 23.4 kmpl|1248 CC|    74 bhp|      190Nm@ 2000rpm|    5|
|Skoda Rapid 1.5 T...|2014|       370000|   120000|Diesel| Individual|      Manual|Second Owner|21.14 kmpl|1498 CC|103.52 bhp| 250Nm@ 1500-2500rpm|    5|
|Honda City 2017-2...|2006|       158000|   140000|Petrol| Individual|      Manual| Third Owner| 17.7 kmpl|1497 CC|    78 bhp|12.7@ 2,700(kgm@ ...|    5|
|Hyundai i20 Sport...|2010|       225000|   127000|Diesel| Individual|      

In [322]:
# Bring only the first 10 rows to the driver as a pandas DataFrame for display.
df.limit(10).toPandas()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner,mileage,engine,max_power,torque,seats
0,Maruti Swift Dzire VDI,2014,450000,145500,Diesel,Individual,Manual,First Owner,23.4 kmpl,1248 CC,74 bhp,190Nm@ 2000rpm,5
1,Skoda Rapid 1.5 TDI Ambition,2014,370000,120000,Diesel,Individual,Manual,Second Owner,21.14 kmpl,1498 CC,103.52 bhp,250Nm@ 1500-2500rpm,5
2,Honda City 2017-2020 EXi,2006,158000,140000,Petrol,Individual,Manual,Third Owner,17.7 kmpl,1497 CC,78 bhp,"12.7@ 2,700(kgm@ rpm)",5
3,Hyundai i20 Sportz Diesel,2010,225000,127000,Diesel,Individual,Manual,First Owner,23.0 kmpl,1396 CC,90 bhp,22.4 kgm at 1750-2750rpm,5
4,Maruti Swift VXI BSIII,2007,130000,120000,Petrol,Individual,Manual,First Owner,16.1 kmpl,1298 CC,88.2 bhp,"11.5@ 4,500(kgm@ rpm)",5
5,Hyundai Xcent 1.2 VTVT E Plus,2017,440000,45000,Petrol,Individual,Manual,First Owner,20.14 kmpl,1197 CC,81.86 bhp,113.75nm@ 4000rpm,5
6,Maruti Wagon R LXI DUO BSIII,2007,96000,175000,LPG,Individual,Manual,First Owner,17.3 km/kg,1061 CC,57.5 bhp,"7.8@ 4,500(kgm@ rpm)",5
7,Maruti 800 DX BSII,2001,45000,5000,Petrol,Individual,Manual,Second Owner,16.1 kmpl,796 CC,37 bhp,59Nm@ 2500rpm,4
8,Toyota Etios VXD,2011,350000,90000,Diesel,Individual,Manual,First Owner,23.59 kmpl,1364 CC,67.1 bhp,170Nm@ 1800-2400rpm,5
9,Ford Figo Diesel Celebration Edition,2013,200000,169000,Diesel,Individual,Manual,First Owner,20.0 kmpl,1399 CC,68.1 bhp,160Nm@ 2000rpm,5


# Task 3
### Creating a model to predict the selling price from the other variables using Spark's MLlib

In [323]:
# Get (or create) the session used for the modelling task.
session_builder = SparkSession.builder.appName('car_price_predictor')
spark = session_builder.getOrCreate()

In [324]:
# Display the session object as the cell output.
spark

In [325]:
# Inspect the column types of the freshly loaded DataFrame.
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- year: string (nullable = true)
 |-- selling_price: string (nullable = true)
 |-- km_driven: string (nullable = true)
 |-- fuel: string (nullable = true)
 |-- seller_type: string (nullable = true)
 |-- transmission: string (nullable = true)
 |-- owner: string (nullable = true)
 |-- mileage: string (nullable = true)
 |-- engine: string (nullable = true)
 |-- max_power: string (nullable = true)
 |-- torque: string (nullable = true)
 |-- seats: string (nullable = true)



All the columns are of string type.
`selling_price` is our target; the remaining columns are the features we will use to predict it.

Let's select some columns to see what the DataFrame looks like.

## Pre-processing the data

### Drop the name and the torque column

In [326]:
# Columns that are not used as model features.
to_drop = ["name", "torque"]
# DataFrame.drop accepts several column names at once.
df = df.drop(*to_drop)
df.show(5)

+----+-------------+---------+------+-----------+------------+------------+----------+-------+----------+-----+
|year|selling_price|km_driven|  fuel|seller_type|transmission|       owner|   mileage| engine| max_power|seats|
+----+-------------+---------+------+-----------+------------+------------+----------+-------+----------+-----+
|2014|       450000|   145500|Diesel| Individual|      Manual| First Owner| 23.4 kmpl|1248 CC|    74 bhp|    5|
|2014|       370000|   120000|Diesel| Individual|      Manual|Second Owner|21.14 kmpl|1498 CC|103.52 bhp|    5|
|2006|       158000|   140000|Petrol| Individual|      Manual| Third Owner| 17.7 kmpl|1497 CC|    78 bhp|    5|
|2010|       225000|   127000|Diesel| Individual|      Manual| First Owner| 23.0 kmpl|1396 CC|    90 bhp|    5|
|2007|       130000|   120000|Petrol| Individual|      Manual| First Owner| 16.1 kmpl|1298 CC|  88.2 bhp|    5|
+----+-------------+---------+------+-----------+------------+------------+----------+-------+----------

### Replacing the year column by an age column (2021 - year )

In [327]:
# Age of the car relative to 2021. `year` is still a string column here, so
# the subtraction yields a double (e.g. 7.0), as the displayed output shows.
car_age = 2021 - df['year']
df = df.withColumn('Age', car_age)
df = df.drop('year')
df.show(5)

+-------------+---------+------+-----------+------------+------------+----------+-------+----------+-----+----+
|selling_price|km_driven|  fuel|seller_type|transmission|       owner|   mileage| engine| max_power|seats| Age|
+-------------+---------+------+-----------+------------+------------+----------+-------+----------+-----+----+
|       450000|   145500|Diesel| Individual|      Manual| First Owner| 23.4 kmpl|1248 CC|    74 bhp|    5| 7.0|
|       370000|   120000|Diesel| Individual|      Manual|Second Owner|21.14 kmpl|1498 CC|103.52 bhp|    5| 7.0|
|       158000|   140000|Petrol| Individual|      Manual| Third Owner| 17.7 kmpl|1497 CC|    78 bhp|    5|15.0|
|       225000|   127000|Diesel| Individual|      Manual| First Owner| 23.0 kmpl|1396 CC|    90 bhp|    5|11.0|
|       130000|   120000|Petrol| Individual|      Manual| First Owner| 16.1 kmpl|1298 CC|  88.2 bhp|    5|14.0|
+-------------+---------+------+-----------+------------+------------+----------+-------+----------+----

### Removing units from the mileage, engine and max_power columns

In [328]:
# regexp_extract is not imported by any earlier cell, so import it here to
# keep this cell runnable on a fresh kernel.
from pyspark.sql.functions import regexp_extract

# First (optionally signed) integer or decimal number found in the string,
# e.g. "23.4 kmpl" -> "23.4" and "1248 CC" -> "1248".
NUMBER_PATTERN = "[+-]?([0-9]*[.])?[0-9]+"

# raw column -> cleaned column holding only the numeric part (still a string).
unit_columns = {
    "mileage": "mileage_clean",
    "engine": "engine_clean",
    "max_power": "mpower_clean",
}
for raw_col, clean_col in unit_columns.items():
    # Group 0 is the whole match; the raw column is dropped once cleaned.
    df = df.withColumn(clean_col, regexp_extract(raw_col, NUMBER_PATTERN, 0)).drop(raw_col)
df.show(10)

+-------------+---------+------+-----------+------------+------------+-----+----+-------------+------------+------------+
|selling_price|km_driven|  fuel|seller_type|transmission|       owner|seats| Age|mileage_clean|engine_clean|mpower_clean|
+-------------+---------+------+-----------+------------+------------+-----+----+-------------+------------+------------+
|       450000|   145500|Diesel| Individual|      Manual| First Owner|    5| 7.0|         23.4|        1248|          74|
|       370000|   120000|Diesel| Individual|      Manual|Second Owner|    5| 7.0|        21.14|        1498|      103.52|
|       158000|   140000|Petrol| Individual|      Manual| Third Owner|    5|15.0|         17.7|        1497|          78|
|       225000|   127000|Diesel| Individual|      Manual| First Owner|    5|11.0|         23.0|        1396|          90|
|       130000|   120000|Petrol| Individual|      Manual| First Owner|    5|14.0|         16.1|        1298|        88.2|
|       440000|    45000

### Casting the numerical values (string to double)

In [329]:
# Schema before casting: only Age is numeric at this point.
df.printSchema()

root
 |-- selling_price: string (nullable = true)
 |-- km_driven: string (nullable = true)
 |-- fuel: string (nullable = true)
 |-- seller_type: string (nullable = true)
 |-- transmission: string (nullable = true)
 |-- owner: string (nullable = true)
 |-- seats: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- mileage_clean: string (nullable = true)
 |-- engine_clean: string (nullable = true)
 |-- mpower_clean: string (nullable = true)



In [330]:
from pyspark.sql.types import DoubleType

# Columns that hold numbers and must become doubles; `seats` is deliberately
# not listed and therefore stays a string.
numCols = [
    "Age",
    "selling_price",
    "km_driven",
    "mileage_clean",
    "engine_clean",
    "mpower_clean",
]
double_type = DoubleType()
for column_name in numCols:
    as_double = df[column_name].cast(double_type)
    df = df.withColumn(column_name, as_double)

df.printSchema()


root
 |-- selling_price: double (nullable = true)
 |-- km_driven: double (nullable = true)
 |-- fuel: string (nullable = true)
 |-- seller_type: string (nullable = true)
 |-- transmission: string (nullable = true)
 |-- owner: string (nullable = true)
 |-- seats: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- mileage_clean: double (nullable = true)
 |-- engine_clean: double (nullable = true)
 |-- mpower_clean: double (nullable = true)



### Counting null values in the DataFrame

In [331]:
# NOTE: this `sum` is Spark's column aggregate and shadows Python's built-in
# `sum` for the rest of the notebook.
from pyspark.sql.functions import col, sum

# One aggregate per column: isNull() cast to 0/1 and summed gives the null count.
null_counts = [sum(col(c).isNull().cast("int")).alias(c) for c in df.columns]
df.select(*null_counts).show()

+-------------+---------+----+-----------+------------+-----+-----+---+-------------+------------+------------+
|selling_price|km_driven|fuel|seller_type|transmission|owner|seats|Age|mileage_clean|engine_clean|mpower_clean|
+-------------+---------+----+-----------+------------+-----+-----+---+-------------+------------+------------+
|            0|        0|   0|          0|           0|    0|  221|  0|          221|         221|         216|
+-------------+---------+----+-----------+------------+-----+-----+---+-------------+------------+------------+



### Replacing the null values with the corresponding strategy

- mileage = mean
- engine = mean
- max_power = mean

In [332]:
from pyspark.sql.functions import avg

def fill_with_mean(this_df, exclude=()):
    """Fill nulls in every non-excluded column with that column's mean.

    Args:
        this_df: Spark DataFrame to impute.
        exclude: Iterable of column names to leave untouched. Names that are
            not present in ``this_df`` are simply ignored.

    Returns:
        A DataFrame in which nulls of the selected columns are replaced by the
        column mean. Columns whose mean is null are left unchanged. If there
        is nothing to impute, ``this_df`` is returned as is.
    """
    skipped = set(exclude)
    targets = [c for c in this_df.columns if c not in skipped]
    if not targets:
        # agg() needs at least one expression, so there is nothing to compute.
        return this_df

    stats = this_df.agg(*(avg(c).alias(c) for c in targets))
    # A null mean (e.g. an all-null column) must not be handed to na.fill,
    # which only accepts concrete replacement values.
    means = {c: m for c, m in stats.first().asDict().items() if m is not None}
    return this_df.na.fill(means) if means else this_df

# Everything listed here is left untouched; the remaining columns (Age and
# the three *_clean columns) are imputed with their mean.
columns_to_skip = [
    "year",
    "selling_price",
    "km_driven",
    "fuel",
    "seller_type",
    "transmission",
    "owner",
    "seats",
]
df = fill_with_mean(df, columns_to_skip)

# Re-count the nulls to confirm the imputation.
remaining_nulls = [sum(col(c).isNull().cast("int")).alias(c) for c in df.columns]
df.select(*remaining_nulls).show()

+-------------+---------+----+-----------+------------+-----+-----+---+-------------+------------+------------+
|selling_price|km_driven|fuel|seller_type|transmission|owner|seats|Age|mileage_clean|engine_clean|mpower_clean|
+-------------+---------+----+-----------+------------+-----+-----+---+-------------+------------+------------+
|            0|        0|   0|          0|           0|    0|  221|  0|            0|           0|           0|
+-------------+---------+----+-----------+------------+-----+-----+---+-------------+------------+------------+



### Dealing with null values in seats by setting them all to 5

In [333]:

# `seats` is still a string column, hence the string replacement value "5".
df = df.na.fill("5", subset=['seats'])

# Every column should now report zero nulls.
nulls_after_fill = [sum(col(c).isNull().cast("int")).alias(c) for c in df.columns]
df.select(*nulls_after_fill).show()

+-------------+---------+----+-----------+------------+-----+-----+---+-------------+------------+------------+
|selling_price|km_driven|fuel|seller_type|transmission|owner|seats|Age|mileage_clean|engine_clean|mpower_clean|
+-------------+---------+----+-----------+------------+-----+-----+---+-------------+------------+------------+
|            0|        0|   0|          0|           0|    0|    0|  0|            0|           0|           0|
+-------------+---------+----+-----------+------------+-----+-----+---+-------------+------------+------------+



In [334]:
# Name the cleaned DataFrame that the train/validation/test split starts from.
final_dataset = df
final_dataset.show(10)

+-------------+---------+------+-----------+------------+------------+-----+----+-------------+------------+------------+
|selling_price|km_driven|  fuel|seller_type|transmission|       owner|seats| Age|mileage_clean|engine_clean|mpower_clean|
+-------------+---------+------+-----------+------------+------------+-----+----+-------------+------------+------------+
|     450000.0| 145500.0|Diesel| Individual|      Manual| First Owner|    5| 7.0|         23.4|      1248.0|        74.0|
|     370000.0| 120000.0|Diesel| Individual|      Manual|Second Owner|    5| 7.0|        21.14|      1498.0|      103.52|
|     158000.0| 140000.0|Petrol| Individual|      Manual| Third Owner|    5|15.0|         17.7|      1497.0|        78.0|
|     225000.0| 127000.0|Diesel| Individual|      Manual| First Owner|    5|11.0|         23.0|      1396.0|        90.0|
|     130000.0| 120000.0|Petrol| Individual|      Manual| First Owner|    5|14.0|         16.1|      1298.0|        88.2|
|     440000.0|  45000.0

### Splitting the data into train, validation and test sets

In [397]:

# 80 / 10 / 10 random split; the seed is passed to randomSplit for repeatability.
weights = [.8, .1, .1]
seed = 42
trainData, validationData, testData = final_dataset.randomSplit(weights, seed)

# Report the size of the full dataset and of every split.
split_report = [
    ("the total dataset length", final_dataset),
    ("train set length", trainData),
    ("validation set length", validationData),
    ("test set length", testData),
]
for description, frame in split_report:
    print(f"{description} : {frame.count()} records")

the total dataset length : 8128 records
train set length : 6577 records
validation set length : 741 records
test set length : 810 records


### Checking the dtypes

In [400]:
# (column name, Spark type) pairs of the training split.
trainData.dtypes

[('selling_price', 'double'),
 ('km_driven', 'double'),
 ('fuel', 'string'),
 ('seller_type', 'string'),
 ('transmission', 'string'),
 ('owner', 'string'),
 ('seats', 'string'),
 ('Age', 'double'),
 ('mileage_clean', 'double'),
 ('engine_clean', 'double'),
 ('mpower_clean', 'double')]

In [337]:
# Partition the feature columns by Spark dtype: strings are categorical,
# doubles are numerical. The label (selling_price) is left out of both lists.
catCols = []
numCols = []
for col_name, dataType in trainData.dtypes:
    if dataType == "string":
        catCols.append(col_name)
    elif dataType == "double" and col_name != "selling_price":
        numCols.append(col_name)

print(f" the catogical columns are:  {catCols}")
print(f" the numerical columns are:  {numCols}")

 the catogical columns are:  ['fuel', 'seller_type', 'transmission', 'owner', 'seats']
 the numerical columns are:  ['km_driven', 'Age', 'mileage_clean', 'engine_clean', 'mpower_clean']


### Dealing with categorical variables

Let's identify the unique values of the string columns

In [338]:

# Distinct seller types in the cleaned data.
df.select('seller_type').distinct().collect()

[Row(seller_type='Individual'),
 Row(seller_type='Dealer'),
 Row(seller_type='Trustmark Dealer')]

In [339]:
# Distinct fuel types in the cleaned data.
df.select('fuel').distinct().collect()

[Row(fuel='Diesel'), Row(fuel='CNG'), Row(fuel='LPG'), Row(fuel='Petrol')]

In [340]:
# Distinct transmission types in the cleaned data.
df.select('transmission').distinct().collect() 

[Row(transmission='Automatic'), Row(transmission='Manual')]

In [341]:
# Distinct ownership categories in the cleaned data.
df.select('owner').distinct().collect()

[Row(owner='Third Owner'),
 Row(owner='Fourth & Above Owner'),
 Row(owner='Second Owner'),
 Row(owner='First Owner'),
 Row(owner='Test Drive Car')]

In [342]:
# Distinct seat counts; the values are strings because `seats` was never cast.
df.select('seats').distinct().collect()

[Row(seats='7'),
 Row(seats='8'),
 Row(seats='5'),
 Row(seats='6'),
 Row(seats='9'),
 Row(seats='10'),
 Row(seats='4'),
 Row(seats='14'),
 Row(seats='2')]

### Using  One hot encoding

Counting the distinct values of our categorical variables

In [343]:
import pyspark.sql.functions as F

In [344]:

# Number of distinct categories for two of the categorical columns
# (computed on the training split).
for categorical_col in ('fuel', 'seats'):
    distinct_count = trainData.agg(F.countDistinct(categorical_col))
    distinct_count.show()

+-----------+
|count(fuel)|
+-----------+
|          4|
+-----------+

+------------+
|count(seats)|
+------------+
|           9|
+------------+



In [345]:
# Frequency of each seat count in the training split.
trainData.groupBy('seats').count().show()

+-----+-----+
|seats|count|
+-----+-----+
|    7|  905|
|    8|  187|
|    5| 5233|
|    6|   52|
|    9|   67|
|   10|   16|
|    4|  114|
|   14|    1|
|    2|    2|
+-----+-----+



In [346]:

from pyspark.ml.feature import (OneHotEncoder, StringIndexer)

# One StringIndexer per categorical column, writing "<col>_StringIndexer".
# handleInvalid='skip' is passed through unchanged for every indexer.
string_indexer = []
for cat_col in catCols:
    indexer = StringIndexer(
        inputCol=cat_col,
        outputCol=cat_col + "_StringIndexer",
        handleInvalid='skip',
    )
    string_indexer.append(indexer)
string_indexer

[StringIndexer_ced32d8acac9,
 StringIndexer_be8309eabe77,
 StringIndexer_90aff99aca00,
 StringIndexer_30e7b34313ea,
 StringIndexer_04abca1574e1]

In [347]:
# A single multi-column encoder that turns every "<col>_StringIndexer" column
# into "<col>_OneHotEncoder". It is kept in a list so it can be appended to
# the pipeline stages like the indexers.
indexed_cols = [f"{x}_StringIndexer" for x in catCols]
encoded_cols = [f"{x}_OneHotEncoder" for x in catCols]
One_Hot_Encoder = [OneHotEncoder(inputCols=indexed_cols, outputCols=encoded_cols)]

In [348]:
# Display the encoder stage list.
One_Hot_Encoder

[OneHotEncoder_7fcb63bc0c54]

## Vector Assembling

In [349]:
from pyspark.ml.feature import VectorAssembler

In [350]:
# Assembler inputs: the numerical columns first, then the one-hot vectors.
one_hot_columns = [f"{x}_OneHotEncoder" for x in catCols]
assemblerInput = list(numCols) + one_hot_columns

In [351]:
# Display the ordered list of columns fed to the VectorAssembler.
assemblerInput

['km_driven',
 'Age',
 'mileage_clean',
 'engine_clean',
 'mpower_clean',
 'fuel_OneHotEncoder',
 'seller_type_OneHotEncoder',
 'transmission_OneHotEncoder',
 'owner_OneHotEncoder',
 'seats_OneHotEncoder']

In [352]:
# Concatenate all inputs into one feature vector column.
vector_assembler = VectorAssembler(
    inputCols=assemblerInput,
    outputCol="VectorAssembler_features",
)

In [353]:
# Pipeline order: index the strings, one-hot encode the indices, assemble the vector.
stages = [*string_indexer, *One_Hot_Encoder, vector_assembler]

In [354]:
# Display the pipeline stages in execution order.
stages

[StringIndexer_ced32d8acac9,
 StringIndexer_be8309eabe77,
 StringIndexer_90aff99aca00,
 StringIndexer_30e7b34313ea,
 StringIndexer_04abca1574e1,
 OneHotEncoder_7fcb63bc0c54,
 VectorAssembler_68c86ff0c317]

In [444]:
%%time
from pyspark.ml import Pipeline

# Fit the preprocessing stages on the training split only; `pp_tr` is the
# fitted pipeline model and `pp_df` the transformed training data.
pipeline = Pipeline(stages=stages)
pp_tr = pipeline.fit(trainData)
pp_df = pp_tr.transform(trainData)



Wall time: 2.69 s


In [445]:
# Show the transformed training rows without truncating the vector columns.
pp_df.show(truncate=False)

+-------------+---------+------+-----------+------------+--------------------+-----+----+------------------+-----------------+-----------------+------------------+-------------------------+--------------------------+-------------------+-------------------+------------------+-------------------------+--------------------------+-------------------+-------------------+------------------------------------------------------------------------------------------------------------------------+
|selling_price|km_driven|fuel  |seller_type|transmission|owner               |seats|Age |mileage_clean     |engine_clean     |mpower_clean     |fuel_StringIndexer|seller_type_StringIndexer|transmission_StringIndexer|owner_StringIndexer|seats_StringIndexer|fuel_OneHotEncoder|seller_type_OneHotEncoder|transmission_OneHotEncoder|owner_OneHotEncoder|seats_OneHotEncoder|VectorAssembler_features                                                                                                |
+-------------+-----

In [446]:
# Schema after preprocessing: the indexer, encoder and assembler columns are added.
pp_df.printSchema()

root
 |-- selling_price: double (nullable = true)
 |-- km_driven: double (nullable = true)
 |-- fuel: string (nullable = true)
 |-- seller_type: string (nullable = true)
 |-- transmission: string (nullable = true)
 |-- owner: string (nullable = true)
 |-- seats: string (nullable = false)
 |-- Age: double (nullable = false)
 |-- mileage_clean: double (nullable = false)
 |-- engine_clean: double (nullable = false)
 |-- mpower_clean: double (nullable = false)
 |-- fuel_StringIndexer: double (nullable = false)
 |-- seller_type_StringIndexer: double (nullable = false)
 |-- transmission_StringIndexer: double (nullable = false)
 |-- owner_StringIndexer: double (nullable = false)
 |-- seats_StringIndexer: double (nullable = false)
 |-- fuel_OneHotEncoder: vector (nullable = true)
 |-- seller_type_OneHotEncoder: vector (nullable = true)
 |-- transmission_OneHotEncoder: vector (nullable = true)
 |-- owner_OneHotEncoder: vector (nullable = true)
 |-- seats_OneHotEncoder: vector (nullable = true)


In [447]:
# First 10 transformed rows as a pandas DataFrame for a richer display.
pp_df.limit(10).toPandas()

Unnamed: 0,selling_price,km_driven,fuel,seller_type,transmission,owner,seats,Age,mileage_clean,engine_clean,...,seller_type_StringIndexer,transmission_StringIndexer,owner_StringIndexer,seats_StringIndexer,fuel_OneHotEncoder,seller_type_OneHotEncoder,transmission_OneHotEncoder,owner_OneHotEncoder,seats_OneHotEncoder,VectorAssembler_features
0,29999.0,80000.0,Petrol,Individual,Manual,Third Owner,4,24.0,16.1,796.0,...,0.0,0.0,2.0,3.0,"(0.0, 1.0, 0.0)","(1.0, 0.0)",(1.0),"(0.0, 0.0, 1.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(80000.0, 24.0, 16.1, 796.0, 37.0, 0.0, 1.0, 0..."
1,30000.0,10000.0,Petrol,Individual,Manual,First Owner,5,20.0,17.3,993.0,...,0.0,0.0,0.0,0.0,"(0.0, 1.0, 0.0)","(1.0, 0.0)",(1.0),"(1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(10000.0, 20.0, 17.3, 993.0, 60.0, 0.0, 1.0, 0..."
2,31000.0,56194.0,Petrol,Individual,Manual,Fourth & Above Owner,4,21.0,16.1,796.0,...,0.0,0.0,3.0,3.0,"(0.0, 1.0, 0.0)","(1.0, 0.0)",(1.0),"(0.0, 0.0, 0.0, 1.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(56194.0, 21.0, 16.1, 796.0, 37.0, 0.0, 1.0, 0..."
3,31504.0,110000.0,Petrol,Individual,Manual,Third Owner,4,17.0,16.1,796.0,...,0.0,0.0,2.0,3.0,"(0.0, 1.0, 0.0)","(1.0, 0.0)",(1.0),"(0.0, 0.0, 1.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(110000.0, 17.0, 16.1, 796.0, 37.0, 0.0, 1.0, ..."
4,33351.0,90000.0,Petrol,Individual,Manual,Third Owner,5,17.0,18.9,998.0,...,0.0,0.0,2.0,0.0,"(0.0, 1.0, 0.0)","(1.0, 0.0)",(1.0),"(0.0, 0.0, 1.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(90000.0, 17.0, 18.9, 998.0, 67.1, 0.0, 1.0, 0..."
5,35000.0,10000.0,Petrol,Individual,Manual,First Owner,4,14.0,16.1,796.0,...,0.0,0.0,0.0,3.0,"(0.0, 1.0, 0.0)","(1.0, 0.0)",(1.0),"(1.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(10000.0, 14.0, 16.1, 796.0, 37.0, 0.0, 1.0, 0..."
6,35000.0,40000.0,Petrol,Individual,Manual,Second Owner,4,23.0,16.1,796.0,...,0.0,0.0,1.0,3.0,"(0.0, 1.0, 0.0)","(1.0, 0.0)",(1.0),"(0.0, 1.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(40000.0, 23.0, 16.1, 796.0, 37.0, 0.0, 1.0, 0..."
7,35000.0,75000.0,Petrol,Individual,Manual,Second Owner,5,19.0,19.418783,1458.625016,...,0.0,0.0,1.0,0.0,"(0.0, 1.0, 0.0)","(1.0, 0.0)",(1.0),"(0.0, 1.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(75000.0, 19.0, 19.418783356519516, 1458.62501..."
8,35000.0,184000.0,Petrol,Individual,Manual,Second Owner,5,19.0,19.418783,1458.625016,...,0.0,0.0,1.0,0.0,"(0.0, 1.0, 0.0)","(1.0, 0.0)",(1.0),"(0.0, 1.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(184000.0, 19.0, 19.418783356519516, 1458.6250..."
9,39000.0,42108.0,Petrol,Individual,Manual,First Owner,4,20.0,16.1,796.0,...,0.0,0.0,0.0,3.0,"(0.0, 1.0, 0.0)","(1.0, 0.0)",(1.0),"(1.0, 0.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0)","(42108.0, 20.0, 16.1, 796.0, 37.0, 0.0, 1.0, 0..."


In [448]:
# Show only the columns that feed the VectorAssembler.
pp_df.select(assemblerInput).show(truncate=False)

+---------+----+------------------+-----------------+-----------------+------------------+-------------------------+--------------------------+-------------------+-------------------+
|km_driven|Age |mileage_clean     |engine_clean     |mpower_clean     |fuel_OneHotEncoder|seller_type_OneHotEncoder|transmission_OneHotEncoder|owner_OneHotEncoder|seats_OneHotEncoder|
+---------+----+------------------+-----------------+-----------------+------------------+-------------------------+--------------------------+-------------------+-------------------+
|80000.0  |24.0|16.1              |796.0            |37.0             |(3,[1],[1.0])     |(2,[0],[1.0])            |(1,[0],[1.0])             |(4,[2],[1.0])      |(8,[3],[1.0])      |
|10000.0  |20.0|17.3              |993.0            |60.0             |(3,[1],[1.0])     |(2,[0],[1.0])            |(1,[0],[1.0])             |(4,[0],[1.0])      |(8,[0],[1.0])      |
|56194.0  |21.0|16.1              |796.0            |37.0             |(3,[1],[1

In [449]:
# Show the assembled feature vectors (size 23 in the recorded output).
pp_df.select('VectorAssembler_features').show(truncate=False)

+------------------------------------------------------------------------------------------------------------------------+
|VectorAssembler_features                                                                                                |
+------------------------------------------------------------------------------------------------------------------------+
|(23,[0,1,2,3,4,6,8,10,13,18],[80000.0,24.0,16.1,796.0,37.0,1.0,1.0,1.0,1.0,1.0])                                        |
|(23,[0,1,2,3,4,6,8,10,11,15],[10000.0,20.0,17.3,993.0,60.0,1.0,1.0,1.0,1.0,1.0])                                        |
|(23,[0,1,2,3,4,6,8,10,14,18],[56194.0,21.0,16.1,796.0,37.0,1.0,1.0,1.0,1.0,1.0])                                        |
|(23,[0,1,2,3,4,6,8,10,13,18],[110000.0,17.0,16.1,796.0,37.0,1.0,1.0,1.0,1.0,1.0])                                       |
|(23,[0,1,2,3,4,6,8,10,13,15],[90000.0,17.0,18.9,998.0,67.1,1.0,1.0,1.0,1.0,1.0])                                        |
|(23,[0,1,2,3,4,

# Scaling

In [450]:
# Keep only what the model needs, under the conventional names:
# the assembled vector as "features" and the target as "label".
feature_column = pp_df["VectorAssembler_features"].alias("features")
label_column = pp_df["selling_price"].alias("label")
data = pp_df.select(feature_column, label_column)

In [451]:
from pyspark.ml.feature import StandardScaler
# StandardScaler that reads "features" and writes "scaled_features"
Scalerizer = StandardScaler(inputCol="features", outputCol="scaled_features")
# Fit on the training vectors and preview the scaled result.
scaler_preview = Scalerizer.fit(data).transform(data)
scaler_preview.show(5)

+--------------------+-------+--------------------+
|            features|  label|     scaled_features|
+--------------------+-------+--------------------+
|(23,[0,1,2,3,4,6,...|29999.0|(23,[0,1,2,3,4,6,...|
|(23,[0,1,2,3,4,6,...|30000.0|(23,[0,1,2,3,4,6,...|
|(23,[0,1,2,3,4,6,...|31000.0|(23,[0,1,2,3,4,6,...|
|(23,[0,1,2,3,4,6,...|31504.0|(23,[0,1,2,3,4,6,...|
|(23,[0,1,2,3,4,6,...|33351.0|(23,[0,1,2,3,4,6,...|
+--------------------+-------+--------------------+
only showing top 5 rows



In [452]:
# Fit the scaler on the training vectors, apply it, and keep only the label
# and the scaled vector.
fitted_scaler = Scalerizer.fit(data)
scaled_data = fitted_scaler.transform(data).drop('features')
scaled_data.show(5)

+-------+--------------------+
|  label|     scaled_features|
+-------+--------------------+
|29999.0|(23,[0,1,2,3,4,6,...|
|30000.0|(23,[0,1,2,3,4,6,...|
|31000.0|(23,[0,1,2,3,4,6,...|
|31504.0|(23,[0,1,2,3,4,6,...|
|33351.0|(23,[0,1,2,3,4,6,...|
+-------+--------------------+
only showing top 5 rows



# Train the random forest model

In [453]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator 

In [469]:
%%time
# Random forest trained on the unscaled assembled vectors (column "features").
# A fixed seed makes the forest's random sampling repeatable, so a re-run of
# the notebook reproduces the same model.
rf = RandomForestRegressor(labelCol="label", featuresCol="features", seed=42)

Wall time: 8 ms


In [470]:
# Fit the forest on `data` (unscaled "features" column); the variant trained
# on `scaled_data` is left disabled.
rfmodel=rf.fit(data)

# Validation

In [457]:
# Build the validation features with the transformers that were fitted on the
# TRAINING split. Re-fitting the pipeline on validationData lets the encoders
# learn a different category set, which produces vectors of a different size
# than the model was trained on (21 instead of 23) and makes prediction fail
# with "IndexOutOfBoundsException: Index 22 out of bounds [0, 21)".
# NOTE(review): assumes `pp` is the PipelineModel fitted on the training data
# (the one that produced `pp_df`) — confirm against the cell that defines it.
pp_df2 = pp.transform(validationData)
data2 = pp_df2.select(
    F.col("VectorAssembler_features").alias("features"),
    F.col("selling_price").alias("label"))

# Scale the validation rows with statistics learned from the training set
# `data`, never from the validation rows themselves.
scaled_data2 = Scalerizer.fit(data).transform(data2).drop('features')


In [458]:
# Sanity check: the vector size printed here (first number in each vector)
# has to equal the size of the training vectors shown earlier (23); a
# different size means the feature transformers were fitted on another split.
scaled_data2.show(5)

+-------+--------------------+
|  label|     scaled_features|
+-------+--------------------+
|30000.0|(21,[0,1,2,3,4,6,...|
|40000.0|(21,[0,1,2,3,4,6,...|
|40000.0|(21,[0,1,2,3,4,6,...|
|40000.0|(21,[0,1,2,3,4,6,...|
|45000.0|(21,[0,1,2,3,4,6,...|
+-------+--------------------+
only showing top 5 rows



In [474]:
# Score the validation rows using the unscaled "features" column of `data2`
# (scoring `scaled_data2` instead is left disabled).
pred = rfmodel.transform(data2)

In [475]:
# Inspect the result columns: transform() keeps "features" and "label" and
# appends a "prediction" column.
pred.printSchema()

root
 |-- features: vector (nullable = true)
 |-- label: double (nullable = true)
 |-- prediction: double (nullable = false)



In [476]:
# show() is the first action that actually runs the model on the rows, so a
# feature-vector size mismatch between training and validation data surfaces
# here (as an IndexOutOfBoundsException) rather than at transform().
pred.show()

Py4JJavaError: An error occurred while calling o12357.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 743.0 failed 1 times, most recent failure: Lost task 0.0 in stage 743.0 (TID 555) (DESKTOP-CRAL56H.mshome.net executor driver): org.apache.spark.SparkException: Failed to execute user defined function (RandomForestRegressionModel$$Lambda$4544/0x00000001017d8840: (struct<type:tinyint,size:int,indices:array<int>,values:array<double>>) => double)
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala:136)
	at org.apache.spark.sql.errors.QueryExecutionErrors.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:759)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:349)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:898)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:898)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: java.lang.IndexOutOfBoundsException: Index 22 out of bounds [0, 21)
	at org.apache.spark.ml.linalg.SparseVector.apply(Vectors.scala:650)
	at org.apache.spark.ml.tree.CategoricalSplit.shouldGoLeft(Split.scala:99)
	at org.apache.spark.ml.tree.InternalNode.predictImpl(Node.scala:180)
	at org.apache.spark.ml.regression.RandomForestRegressionModel.$anonfun$predict$1(RandomForestRegressor.scala:259)
	at org.apache.spark.ml.regression.RandomForestRegressionModel.$anonfun$predict$1$adapted(RandomForestRegressor.scala:259)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)
	at scala.collection.TraversableLike.map(TraversableLike.scala:286)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:279)
	at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:198)
	at org.apache.spark.ml.regression.RandomForestRegressionModel.predict(RandomForestRegressor.scala:259)
	at org.apache.spark.ml.regression.RandomForestRegressionModel.$anonfun$transform$1(RandomForestRegressor.scala:233)
	at org.apache.spark.ml.regression.RandomForestRegressionModel.$anonfun$transform$1$adapted(RandomForestRegressor.scala:233)
	... 17 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2403)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2352)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2351)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2351)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1109)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1109)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1109)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2591)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2533)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2522)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:898)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2214)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2235)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2254)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:476)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:429)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:48)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3715)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2728)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3706)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3704)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2728)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2935)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:287)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:326)
	at jdk.internal.reflect.GeneratedMethodAccessor71.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: org.apache.spark.SparkException: Failed to execute user defined function (RandomForestRegressionModel$$Lambda$4544/0x00000001017d8840: (struct<type:tinyint,size:int,indices:array<int>,values:array<double>>) => double)
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala:136)
	at org.apache.spark.sql.errors.QueryExecutionErrors.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:759)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:349)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:898)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:898)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more
Caused by: java.lang.IndexOutOfBoundsException: Index 22 out of bounds [0, 21)
	at org.apache.spark.ml.linalg.SparseVector.apply(Vectors.scala:650)
	at org.apache.spark.ml.tree.CategoricalSplit.shouldGoLeft(Split.scala:99)
	at org.apache.spark.ml.tree.InternalNode.predictImpl(Node.scala:180)
	at org.apache.spark.ml.regression.RandomForestRegressionModel.$anonfun$predict$1(RandomForestRegressor.scala:259)
	at org.apache.spark.ml.regression.RandomForestRegressionModel.$anonfun$predict$1$adapted(RandomForestRegressor.scala:259)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)
	at scala.collection.TraversableLike.map(TraversableLike.scala:286)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:279)
	at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:198)
	at org.apache.spark.ml.regression.RandomForestRegressionModel.predict(RandomForestRegressor.scala:259)
	at org.apache.spark.ml.regression.RandomForestRegressionModel.$anonfun$transform$1(RandomForestRegressor.scala:233)
	at org.apache.spark.ml.regression.RandomForestRegressionModel.$anonfun$transform$1$adapted(RandomForestRegressor.scala:233)
	... 17 more


In [477]:
# Show the actual selling price ("label") next to the model output.
pred.select("label","prediction").show()

Py4JJavaError: An error occurred while calling o12366.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 744.0 failed 1 times, most recent failure: Lost task 0.0 in stage 744.0 (TID 556) (DESKTOP-CRAL56H.mshome.net executor driver): org.apache.spark.SparkException: Failed to execute user defined function (RandomForestRegressionModel$$Lambda$4544/0x00000001017d8840: (struct<type:tinyint,size:int,indices:array<int>,values:array<double>>) => double)
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala:136)
	at org.apache.spark.sql.errors.QueryExecutionErrors.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:759)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:349)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:898)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:898)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: java.lang.IndexOutOfBoundsException: Index 22 out of bounds [0, 21)
	at org.apache.spark.ml.linalg.SparseVector.apply(Vectors.scala:650)
	at org.apache.spark.ml.tree.CategoricalSplit.shouldGoLeft(Split.scala:99)
	at org.apache.spark.ml.tree.InternalNode.predictImpl(Node.scala:180)
	at org.apache.spark.ml.regression.RandomForestRegressionModel.$anonfun$predict$1(RandomForestRegressor.scala:259)
	at org.apache.spark.ml.regression.RandomForestRegressionModel.$anonfun$predict$1$adapted(RandomForestRegressor.scala:259)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)
	at scala.collection.TraversableLike.map(TraversableLike.scala:286)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:279)
	at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:198)
	at org.apache.spark.ml.regression.RandomForestRegressionModel.predict(RandomForestRegressor.scala:259)
	at org.apache.spark.ml.regression.RandomForestRegressionModel.$anonfun$transform$1(RandomForestRegressor.scala:233)
	at org.apache.spark.ml.regression.RandomForestRegressionModel.$anonfun$transform$1$adapted(RandomForestRegressor.scala:233)
	... 17 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2403)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2352)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2351)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2351)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1109)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1109)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1109)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2591)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2533)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2522)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:898)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2214)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2235)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2254)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:476)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:429)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:48)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3715)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2728)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3706)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3704)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2728)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2935)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:287)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:326)
	at jdk.internal.reflect.GeneratedMethodAccessor71.invoke(Unknown Source)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: org.apache.spark.SparkException: Failed to execute user defined function (RandomForestRegressionModel$$Lambda$4544/0x00000001017d8840: (struct<type:tinyint,size:int,indices:array<int>,values:array<double>>) => double)
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala:136)
	at org.apache.spark.sql.errors.QueryExecutionErrors.failedExecuteUserDefinedFunctionError(QueryExecutionErrors.scala)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:759)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:349)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:898)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:898)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more
Caused by: java.lang.IndexOutOfBoundsException: Index 22 out of bounds [0, 21)
	at org.apache.spark.ml.linalg.SparseVector.apply(Vectors.scala:650)
	at org.apache.spark.ml.tree.CategoricalSplit.shouldGoLeft(Split.scala:99)
	at org.apache.spark.ml.tree.InternalNode.predictImpl(Node.scala:180)
	at org.apache.spark.ml.regression.RandomForestRegressionModel.$anonfun$predict$1(RandomForestRegressor.scala:259)
	at org.apache.spark.ml.regression.RandomForestRegressionModel.$anonfun$predict$1$adapted(RandomForestRegressor.scala:259)
	at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
	at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)
	at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)
	at scala.collection.TraversableLike.map(TraversableLike.scala:286)
	at scala.collection.TraversableLike.map$(TraversableLike.scala:279)
	at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:198)
	at org.apache.spark.ml.regression.RandomForestRegressionModel.predict(RandomForestRegressor.scala:259)
	at org.apache.spark.ml.regression.RandomForestRegressionModel.$anonfun$transform$1(RandomForestRegressor.scala:233)
	at org.apache.spark.ml.regression.RandomForestRegressionModel.$anonfun$transform$1$adapted(RandomForestRegressor.scala:233)
	... 17 more
