## Factorization Machine Regression

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import FMRegressor
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
# Start (or reuse) the Spark session and load the CSV with a header row and
# inferred column types.
spark = (
    SparkSession.builder
    .appName("Factorization Machine Regression")
    .getOrCreate()
)
data = spark.read.options(header=True, inferSchema=True).csv(
    "Data/real_estate_price_size_year_view.csv"
)
data.show(n=10)

In [None]:
#A function to perform One Hot Encoding
from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType


def OneHotEncoding(df, col_name=""):
    """
    Add one 0/1 indicator column per distinct value of a DataFrame column.

    For every distinct value ``v`` found in ``df[col_name]`` a new integer
    column named ``"<col_name>_<v>"`` is appended. It holds 1 on rows whose
    value equals ``v`` and 0 on every other row. The source column itself is
    kept, so the caller drops it afterwards if it is no longer needed.

    Args:
        df: Spark DataFrame to encode.
        col_name: Name of the column whose values are expanded.

    Returns:
        A new DataFrame with the indicator columns appended.
    """
    # The distinct values are pulled to the driver, so this is intended for
    # low-cardinality categorical columns only.
    distinct_rows = df.select(col_name).distinct().collect()

    for row in distinct_rows:
        distinct_value = row[0]
        # str() keeps the column name buildable when the category is not a
        # string (e.g. an integer code), where plain `+` would raise TypeError.
        new_column_name = col_name + '_' + str(distinct_value)
        # A native column expression replaces the per-row Python UDF: it runs
        # inside the JVM and does not capture the loop variable in a closure.
        # eqNullSafe yields false (-> 0) for null cells instead of null, which
        # matches what the former `1 if item == value else 0` lambda produced.
        indicator = col(col_name).eqNullSafe(distinct_value).cast(IntegerType())
        df = df.withColumn(new_column_name, indicator)

    return df


In [None]:
#Encoding data: drop incomplete rows, expand "view" into dummy variables,
#then discard the original categorical column
data = OneHotEncoding(data.na.drop(), "view").drop("view")
data.printSchema()

In [None]:
#Assembling feature vectors
from pyspark.ml.feature import VectorAssembler
# Pack the numeric columns and the dummy "view" columns into a single
# vector column named "features".
featureassembler = VectorAssembler(
    inputCols=[
        'size',
        'year',
        'view_Sea view',
        'view_No sea view',
    ],
    outputCol="features",
)
finalized_data = featureassembler.transform(data)
finalized_data.printSchema()

In [None]:
#Splitting data: 60% train / 40% test, fixed seed for a repeatable split
train, test = finalized_data.randomSplit(weights=[0.6, 0.4], seed=444)

In [None]:
#Training regressor: fit a factorization machine on the training split
fm = FMRegressor(
    featuresCol="features",
    labelCol="price",
    stepSize=0.0001,
)
model = fm.fit(train)

In [None]:
#Model predictions: score the held-out split and preview the results
predictions = model.transform(test)
predictions.select(
    "prediction",
    "price",
    "features",
).show()

In [None]:
#Evaluate model
# RegressionEvaluator's keyword is camelCase `predictionCol`; the lowercase
# spelling is rejected as an unexpected keyword argument.
evaluator = RegressionEvaluator(labelCol = "price", predictionCol = "prediction", metricName = "rmse")
rmse = evaluator.evaluate(predictions)
# The `%` operator is required to apply the value to the format string.
print("RMSE on test data = %g " % rmse)

In [None]:
#Useful data info
# fm.fit(train) returns the fitted FM model directly, which has no `stages`
# attribute. Only a fitted Pipeline exposes `stages`, in which case the
# regressor is taken from the final stage.
fmModel = model.stages[-1] if hasattr(model, "stages") else model
print("Factors: " + str(fmModel.factors))
print("linear: " + str(fmModel.linear))
print("intercept: " + str(fmModel.intercept))