### Random Forest Regression

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
# Start (or reuse) a Spark session for this notebook, then load the car-sales
# CSV with a header row and let Spark infer the column types.
spark = (
    SparkSession.builder
    .appName("Random Forest Regression")
    .getOrCreate()
)
data = spark.read.csv(
    "Data/Car_Sales.csv",
    header=True,
    inferSchema=True,
)
data.show()

In [None]:
#A function to perform One Hot Encoding
from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType
def OneHotEncoding(df, col_name = ""):
    """
        One-hot encode a single column of a Spark DataFrame.

        For every distinct non-null value ``v`` found in ``col_name`` a new
        integer column named ``<col_name>_<v>`` is appended. It holds 1 on
        rows where ``col_name`` equals ``v`` and 0 on every other row
        (including rows where ``col_name`` is null). The source column itself
        is kept; the caller decides whether to drop it.

        1. df : the DataFrame to encode
        2. col_name : name of the column whose distinct values are encoded

        Returns the DataFrame with the indicator columns added.
    """
    # Imported locally so the function does not rely on a module-level `col`
    # name, which is easy to rebind by accident in a notebook.
    from pyspark.sql import functions as F

    # Collect the distinct values on the driver (no pandas round-trip needed).
    distinct_values = [row[0] for row in df.select(col_name).distinct().collect()]

    for distinct_value in distinct_values:
        # A null category cannot be named or compared with `==`; rows that are
        # null simply get 0 in every indicator column.
        if distinct_value is None:
            continue
        # str() keeps the name construction working for non-string columns.
        new_column_name = col_name + '_' + str(distinct_value)
        # Native column expression instead of a Python UDF: same 1/0 result,
        # evaluated inside Spark, and no closure over the loop variable.
        indicator = F.when(F.col(col_name) == distinct_value, 1).otherwise(0)
        df = df.withColumn(new_column_name, indicator)

    return df

In [None]:
# Show the column names of the freshly loaded DataFrame.
data.columns

In [None]:
# One-hot encode each categorical column in turn; every pass adds the
# indicator columns for one source column to `data`.
cols = [
    'Brand',
    'Body',
    'Engine Type',
    'Registration',
]
for item in cols:
    data = OneHotEncoding(data, item)

# Confirm the new indicator columns are present.
data.printSchema()

In [None]:
# Cast the numeric columns to double.
# NOTE: the list is deliberately NOT named `col` -- that name is the
# `pyspark.sql.functions.col` helper imported earlier in this notebook, and
# rebinding it to a list makes any later `col(...)` call fail.
numeric_cols = ['Price', "EngineV"]
for numeric_col in numeric_cols:
    data = data.withColumn(numeric_col, data[numeric_col].cast('double'))

data.printSchema()

In [None]:
# Drop the unwanted columns: the original categorical columns (now replaced
# by their indicator columns) plus 'Year' and 'Model'.
col_to_drop = ['Brand', 'Body', 'Engine Type', 'Year', 'Model', 'Registration']
data = data.drop(*col_to_drop)

data.printSchema()

In [None]:
# Remove every row that contains at least one null value.
data = data.dropna()

In [None]:
# Show the remaining column names; the feature list passed to the
# VectorAssembler refers to columns by these names.
data.columns

In [None]:
from pyspark.ml.feature import VectorAssembler
# Input columns for the model: the two numeric columns followed by the
# indicator columns generated by the one-hot encoding step. The names must
# match the columns actually present in `data`.
feature_columns = [
    'Mileage',
    'EngineV',
    # Brand indicators
    'Brand_Volkswagen',
    'Brand_Mitsubishi',
    'Brand_Audi',
    'Brand_Mercedes-Benz',
    'Brand_Renault',
    'Brand_BMW',
    'Brand_Toyota',
    # Body indicators
    'Body_van',
    'Body_crossover',
    'Body_other',
    'Body_sedan',
    'Body_hatch',
    'Body_vagon',
    # Engine type indicators
    'Engine Type_Diesel',
    'Engine Type_Other',
    'Engine Type_Gas',
    'Engine Type_Petrol',
    # Registration indicators
    'Registration_no',
    'Registration_yes',
]

# Pack the input columns into a single vector column named 'features'.
featureassembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol='features',
)

output = featureassembler.transform(data)

In [None]:
# Keep only the assembled feature vector and the label column.
finalized_data = output.select(["features", "Price"])

In [None]:
# 70/30 train/test split with a fixed seed.
split_weights = [0.7, 0.3]
train, test = finalized_data.randomSplit(split_weights, seed=1234)

* Model Training

In [None]:
# Random forest regressor reading the 'features' vector and predicting 'Price'.
rf = RandomForestRegressor(
    featuresCol="features",
    labelCol="Price",
)

In [None]:

# Fit the random forest on the training split.
model = rf.fit(train)

In [None]:
# Score the held-out split, then show the predictions next to the true price
# and the feature vector.
predictions = model.transform(test)
preview = predictions.select("prediction", "Price", "features")
preview.show()

* Model Evaluations

In [None]:
# Compute RMSE between the 'prediction' and 'Price' columns of the test output.
evaluator = RegressionEvaluator(
    labelCol="Price",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

In [None]:
# `model` is the object returned directly by `rf.fit(train)`; no Pipeline was
# used in this notebook, so there is no `stages` list to index into
# (`model.stages[1]` raises AttributeError). The fitted model itself is the
# random forest model.
rfModel = model
print(rfModel)