### Linear Regression

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline

In [None]:
# Start (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("Linear Regression")
    .getOrCreate()
)

# Load the CSV: the first row supplies column names and Spark infers the
# column types from the data.
data = spark.read.csv(
    r"C:\Users\Abdul Majeed Ahmed\Desktop\GitHub\Data\real_estate_price_size_year_view.csv",
    header=True,
    inferSchema=True,
)

# Preview the loaded rows.
data.show()

In [None]:
# Inspect the column names and the types Spark inferred when reading the CSV.
data.printSchema()

In [None]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType
def OneHotEncoding(df, col_name = ""):
    """Append one 0/1 indicator column per distinct value of ``col_name``.

    For every distinct value ``v`` found in ``df[col_name]`` a new integer
    column named ``"<col_name>_<v>"`` is added, holding 1 on rows where the
    source column equals ``v`` and 0 everywhere else. The source column is
    kept.

    Args:
        df: Spark DataFrame to extend.
        col_name: Name of the categorical column to encode.

    Returns:
        A new DataFrame with the indicator columns appended.
    """
    # Bring the distinct categories to the driver; this assumes the column
    # has a small number of distinct values.
    distinct_values = [row[0] for row in df.select(col_name).distinct().collect()]

    for distinct_value in distinct_values:
        # f-string formatting (rather than `+`) also works for non-string
        # categories such as integers.
        new_column_name = f"{col_name}_{distinct_value}"
        # Native column expression instead of a Python UDF: it avoids
        # per-row Python serialization and the lambda-in-loop closure.
        # eqNullSafe never yields NULL, so rows with a missing value get 0.
        indicator = col(col_name).eqNullSafe(distinct_value).cast(IntegerType())
        df = df.withColumn(new_column_name, indicator)

    return df

In [None]:
# List the column names before encoding.
data.columns

In [None]:
# One-hot encode the `view` column, then list the resulting column names.
data = OneHotEncoding(data, col_name = "view")
data.columns

In [None]:
# Re-check the schema now that the `view` indicator columns have been added.
data.printSchema()

In [None]:
# Check the number of columns in the dataset after encoding.
len(data.columns)

##### Feature Assembling

In [None]:
from pyspark.ml.feature import VectorAssembler

# Model inputs: `size`, `year`, and the `view_*` indicator columns created by
# OneHotEncoding.
feature_columns = ['size', 'year', 'view_Sea view', 'view_No sea view']

featureassembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol='features',
)

# Add the assembled `features` vector column, then list all column names.
output = featureassembler.transform(data)
output.columns

In [None]:
# 70/30 train/test split; the fixed seed makes the split repeatable.
train, test = output.randomSplit([0.7, 0.3], seed=123445)

In [None]:
# Linear regression of `price` on the assembled `features` vector, with
# elastic-net regularization.
lr = LinearRegression(
    featuresCol="features",
    labelCol="price",
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)

# Fit on the training split only; the test split is kept for scoring.
lrModel = lr.fit(train)

In [None]:
# Summarize the model over the training set and print out some metrics.
trainingSummary = lrModel.summary
# totalIterations is an attribute of the training summary; reading it from
# the `train` DataFrame raises AttributeError.
print("numIterations: %d" % trainingSummary.totalIterations)

In [None]:
# Objective values recorded in the training summary's history.
print(f"objectiveHistory: {trainingSummary.objectiveHistory}")

In [None]:
# Display the model's residuals from the training summary.
trainingSummary.residuals.show()

In [None]:
# Root mean squared error reported by the training summary.
print("RMSE: %F " % trainingSummary.rootMeanSquaredError)

In [None]:
# R-squared reported by the training summary.
print("r2: %f" % trainingSummary.r2)

In [None]:
# Apply the fitted model to the held-out test split and preview the result.
prediction = lrModel.transform(test)
prediction.show()