In [4]:
#Code Snippet 32
# Fits a Spark ML random-forest regressor that predicts car price from four
# numeric columns, then reports R^2 on a held-out split and the per-feature
# importances of the fitted forest.
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.sql import SparkSession

# One fixed seed for both the train/test split and the forest, so that the
# printed R^2 and feature importances can be reproduced between runs.
# NOTE(review): repeatability also presumes the input file and how Spark
# partitions it stay the same - confirm if the data source changes.
SEED = 42
# Order matters: index i of the importances vector printed at the end
# refers to FEATURE_COLUMNS[i].
FEATURE_COLUMNS = ['horsepower', 'peak-rpm', 'city-mileage', 'highway-mileage']
LABEL_COLUMN = 'price'

#Step 1 - Importing the data and essential libraries
spark = SparkSession.builder.appName('SparkRFRegression').getOrCreate()
data = spark.read.csv('car-performance-price.csv', header=True, inferSchema=True)
print("Initial Data")
data.show(3)

#Step 2 - Data pre-processing and converting data to spark accepted format
# Remove every row that has a null in any column before assembling features.
data = data.na.drop()
# Pack the individual feature columns into the single vector column that the
# regressor reads through featuresCol.
assembler = VectorAssembler(inputCols=FEATURE_COLUMNS, outputCol='features')
assembler_data = assembler.transform(data)
final_data = assembler_data.select('features', LABEL_COLUMN)
print("Consolidated Data with features and labels")
final_data.show(3)

#Step 3 - Training our Random Forest model
# Splitting the data into 80 and 20 percent (weights are approximate targets,
# the split is sampled row by row).
train_data, test_data = final_data.randomSplit([0.8, 0.2], seed=SEED)
rf = RandomForestRegressor(
    labelCol=LABEL_COLUMN,
    featuresCol='features',
    numTrees=120,
    seed=SEED,
)
rf_model = rf.fit(train_data)
# transform() appends a 'prediction' column to the held-out rows.
rf_predictions = rf_model.transform(test_data)

#Step 4 - Evaluating our Trained Model
regression_evaluator_r2 = RegressionEvaluator(
    predictionCol='prediction',
    labelCol=LABEL_COLUMN,
    metricName="r2",
)
R2 = regression_evaluator_r2.evaluate(rf_predictions)
print("The R Square value is {}".format(R2))
print("\nDetemining which feature played a major role in Decision Making")
print(rf_model.featureImportances)

Initial Data
+----------+--------+------------+---------------+-----+
|horsepower|peak-rpm|city-mileage|highway-mileage|price|
+----------+--------+------------+---------------+-----+
|       111|    5000|          21|             27|13495|
|       111|    5000|          21|             27|16500|
|       154|    5000|          19|             26|16500|
+----------+--------+------------+---------------+-----+
only showing top 3 rows

Consolidated Data with features and labels
+--------------------+-----+
|            features|price|
+--------------------+-----+
|[111.0,5000.0,21....|13495|
|[111.0,5000.0,21....|16500|
|[154.0,5000.0,19....|16500|
+--------------------+-----+
only showing top 3 rows

The R Square value is 0.8613056895771833

Detemining which feature played a major role in Decision Making
(4,[0,1,2,3],[0.38295168531302315,0.0618069348638923,0.25407192370453,0.3011694561185545])
