### Decision Tree Regression 

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
# Start (or reuse) the Spark session used by the rest of the notebook.
spark = SparkSession.builder.appName("Decision Tree Regression").getOrCreate()
# Use a forward slash in the path: a backslash followed by "r" inside a normal
# string literal is the carriage-return escape, which would corrupt the file name.
data = spark.read.csv("Data/real_estate_price_size_year_view.csv", header=True, inferSchema=True)
data.show()

In [None]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType


def OneHotEncoding(df, col_name=""):
    """Append one 0/1 indicator column per distinct value of ``col_name``.

    Every new column is named ``<col_name>_<value>`` and holds 1 on the rows
    whose ``col_name`` equals that value and 0 on all other rows. A null
    category (if present) gets its own ``<col_name>_None`` column.

    Args:
        df: Spark DataFrame that contains the column ``col_name``.
        col_name: Name of the categorical column to expand.

    Returns:
        A new DataFrame with all original columns plus the integer
        indicator columns, added in sorted order of the category values.
    """
    # Imported locally so the function carries the names it needs.
    from pyspark.sql.functions import col, when
    from pyspark.sql.types import IntegerType

    # Gather the distinct values directly on the driver; a pandas round-trip
    # is unnecessary for a single column of categories.
    distinct_values = [row[0] for row in df.select(col_name).distinct().collect()]
    # distinct() does not guarantee an order, so sort to get a reproducible
    # column order (nulls last, everything else by its string form).
    distinct_values.sort(key=lambda value: (value is None, str(value)))

    source = col(col_name)
    # For each of the gathered values create a new column.
    for distinct_value in distinct_values:
        if distinct_value is None:
            # "== None" would evaluate to null on every row; test for null explicitly.
            matches = source.isNull()
        else:
            matches = source == distinct_value
        # str() keeps the name construction working for non-string categories.
        new_column_name = col_name + '_' + str(distinct_value)
        # Native column expression instead of a Python UDF: the comparison is
        # evaluated by Spark itself rather than row-by-row in a Python worker.
        indicator = when(matches, 1).otherwise(0).cast(IntegerType())
        df = df.withColumn(new_column_name, indicator)

    return df


In [None]:
# Expand the categorical "view" column into per-value indicator columns,
# then print the resulting schema.
data = OneHotEncoding(df=data, col_name="view")
data.printSchema()

In [None]:
# List the column names after encoding (shown as the cell's output when run
# in a notebook); the names are needed to pick the assembler inputs.
data.columns

In [None]:
from pyspark.ml.feature import VectorAssembler
# Pack the numeric inputs and the "view" indicator columns into a single
# vector column named "feature", then print the resulting schema.
featureassembler = VectorAssembler(
    outputCol="feature",
    inputCols=["size", "year", "view_Sea view", "view_No sea view"],
)
finalized_data = featureassembler.transform(data)
finalized_data.printSchema()

In [None]:
# Split the assembled rows into training and test sets with weights
# 0.7 / 0.3, seeded with 1234.
train, test = finalized_data.randomSplit(weights=[0.7, 0.3], seed=1234)

In [None]:
# Configure a decision tree regressor that reads the "feature" vector column
# and uses "price" as the label column.
dt = DecisionTreeRegressor(featuresCol = "feature", labelCol = "price")

In [None]:
# Fit the regressor on the training split.
model = dt.fit(train)

In [None]:
# Apply the fitted model to the held-out test split.
predictions = model.transform(test)

In [None]:
#Select example rows to display
# The label column is "price" and the assembled vector column is "feature";
# there are no "label" / "features" columns in this DataFrame.
predictions.select("prediction", "price", "feature").show()

In [None]:
#Select (predictions, true label) and compute test errors
# The true values live in the "price" column (the label the regressor was
# trained on); there is no "label" column in the predictions DataFrame.
evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g " % rmse)

# The regressor was fitted directly rather than inside a Pipeline, so the
# fitted model is the tree model itself and has no `stages` attribute.
treeModel = model
#Summary Only
print(treeModel)