In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
# Start (or reuse) the Spark session used by every later cell.
spark = (
    SparkSession.builder
    .appName("Gradient- Boosted Tree")
    .getOrCreate()
)

# NOTE: absolute, machine-specific location of the input CSV.
csv_path = r"C:\Users\Abdul Majeed Ahmed\Desktop\GitHub\Data\real_estate_price_size_year_view.csv"

# The first row is treated as the header; column types are inferred by Spark.
data = spark.read.csv(csv_path, header=True, inferSchema=True)
data.show()

In [None]:
# Helper that one-hot encodes a single DataFrame column
from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType


def OneHotEncoding(df, col_name=""):
    """
    One-hot encode a single column of a Spark DataFrame.

    For every distinct value found in ``col_name`` a new integer column named
    ``<col_name>_<value>`` is appended. It holds 1 on rows where ``col_name``
    equals that value and 0 on every other row. The source column itself is
    left in place.

    Args:
        df: The Spark DataFrame to encode.
        col_name: Name of the column whose distinct values become indicator
            columns.

    Returns:
        A new DataFrame with one extra indicator column per distinct value.
        If the column contains nulls, they are flagged in ``<col_name>_None``.
    """
    # Imported locally so the function works regardless of which names the
    # surrounding import cell provides.
    from pyspark.sql.functions import when

    source = col(col_name)

    # Gather the distinct values on the driver. collect() keeps nulls as None
    # and avoids a round trip through pandas.
    distinct_values = [
        row[0] for row in df.select(col_name).distinct().collect()
    ]

    # Append one 0/1 indicator column per distinct value.
    for distinct_value in distinct_values:
        if distinct_value is None:
            # `source == None` evaluates to null in Spark, so test for null
            # explicitly.
            matches = source.isNull()
        else:
            matches = source == distinct_value

        # str() keeps the name buildable for non-string categories
        # (numbers, booleans, None); string categories are unchanged.
        new_column_name = col_name + '_' + str(distinct_value)

        # Native column expression instead of a Python UDF. A null comparison
        # result falls through to otherwise(0), matching the old behaviour.
        df = df.withColumn(new_column_name, when(matches, 1).otherwise(0))

    return df

In [None]:
# Add one indicator column per distinct value of "view", then inspect the schema.
encoded_column = "view"
data = OneHotEncoding(df=data, col_name=encoded_column)
data.printSchema()

In [None]:
# The raw "view" column is no longer needed once its indicator columns exist;
# afterwards discard rows with missing values.
data = data.drop("view").na.drop()

In [None]:
from pyspark.ml.feature import VectorAssembler
# Columns that are packed into the single vector column "features".
feature_columns = ['size', 'year', 'view_Sea view', 'view_No sea view']

featureassembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol='features',
)
output = featureassembler.transform(data)
output.printSchema()

In [None]:
# 70% / 30% random split with a fixed seed.
train, test = output.randomSplit(weights=[0.7, 0.3], seed=122)

In [None]:
# Gradient-boosted tree regressor predicting "price" from the assembled vector.
gbt = GBTRegressor(
    featuresCol="features",
    labelCol="price",
    maxIter=10,
)
model = gbt.fit(train)

In [None]:
# Score the held-out rows; transform() adds a "prediction" column.
predictions = model.transform(test)

# select() alone is lazy and only displays the DataFrame's schema repr;
# show() materialises a few rows so predictions can be compared with prices.
predictions.select("prediction", "price", "features").show(5)

In [None]:
# Root-mean-squared error between "prediction" and the true "price".
evaluator = RegressionEvaluator(
    labelCol="price",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print("RMSE on test data = %g" % rmse)

In [None]:
# `model` is the fitted model returned directly by gbt.fit(train); no Pipeline
# wraps it, so there is no `stages` list to index into.
gbtModel = model
print(gbtModel)