In [1]:
# Code Snippet 27
# Step 1 - Import the essential libraries and load the data set
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression

# getOrCreate() hands back the session built with this application name.
spark = (
    SparkSession.builder
    .appName('MultiVariableLinearReg')
    .getOrCreate()
)

# header=True takes the column names from the first CSV row;
# inferSchema=True asks Spark to work out each column's type from the data.
data = spark.read.csv(
    'multi_variable_regression.csv',
    header=True,
    inferSchema=True,
)
print("Initial Data")
data.show(n=3)
# Vectors / VectorAssembler are used to put the features into the vector
# format Spark ML expects; StringIndexer turns a string column into numbers.
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StringIndexer, VectorAssembler

# Step 2 - Pre-processing: convert the string column into a numeric feature
# Configure the indexer to read 'area' and write its index to 'area_feature'.
string_index_object = StringIndexer(
    inputCol='area',
    outputCol='area_feature',
)
# fit() is run on the full data set, then transform() adds the indexed column.
string_indexed_df_object = string_index_object.fit(data)
final_data = string_indexed_df_object.transform(data)

print("Data after converting the string column locality into spark accepted feature")
final_data.show(n=3)

# Only the column names are printed here; a sample row is printed in step 3.
print("Columns present in our Data and a sample row value\n")
print(final_data.columns)
# Step 3 - Pre-processing: pack the model inputs into a single vector column
# The order of inputCols fixes the order of the values inside 'house_features'
# (and therefore the order of the fitted coefficients).
assembler_object = VectorAssembler(
    inputCols=[
        'house_size',
        'bedrooms',
        'floors',
        'house_age',
        'area_feature',
    ],
    outputCol='house_features',
)
feature_vector_dataframe = assembler_object.transform(final_data)

# Show one assembled row and the resulting schema for inspection.
print(feature_vector_dataframe.head(1))
feature_vector_dataframe.printSchema()

# Keep only the two columns the regression uses: feature vector and label.
formatted_data = feature_vector_dataframe.select(
    'house_features',
    'price_sold',
)
print("Consolidated Data with accepted features and labels")
formatted_data.show(n=3)
# Step 4 - Training our Linear Regression model with multiple variables
# Split the rows into roughly 60 percent training and 40 percent testing data.
# The weights are proportions, not exact row counts. Without a seed every run
# draws a different split, so the coefficients, RMSE and R square printed
# further down would change from run to run; a fixed seed keeps them
# reproducible.
train_data, test_data = formatted_data.randomSplit([0.6, 0.4], seed=42)
# Define the regression: inputs come from 'house_features', label is 'price_sold'
lireg = LinearRegression(featuresCol='house_features', labelCol='price_sold')
# Fit the model on the training split only; the testing split stays unseen
lireg_model = lireg.fit(train_data)
# Step 5 - Evaluating the trained model
# Score the fitted model against the testing split.
test_results = lireg_model.evaluate(test_data)

print("Residuals info - distance between data points and fitted regression line")
test_results.residuals.show(n=4)

# Pull the two summary metrics out before printing them.
root_mean_squared_error = test_results.rootMeanSquaredError
r_squared = test_results.r2
print("Root Mean Square Error {}".format(root_mean_squared_error))
print("R square value {}".format(r_squared))
# Step 6 - Predicting on data that carries no label
# Selecting only the feature vector from the testing data drops the label,
# so the rows stand in for new, unlabeled houses.
unlabeled_data = test_data.select('house_features')

# transform() returns the input rows together with the model's predictions.
predictions = lireg_model.transform(unlabeled_data)

print("\nPredictions for Novel Data")
predictions.show(n=4)
# Checking our model manually with a new, hand-written house
print("Coeffecients are {}".format(lireg_model.coefficients))
print("\nIntercept is {}".format(lireg_model.intercept))
new_house_size = 1750
new_house_number_of_bedrooms = 3
new_house_number_of_floors = 2
new_house_age = 5
# The model is fitted on five inputs (see the VectorAssembler inputCols), the
# last one being the indexed area. Leaving that term out silently prices the
# house as if its area index were 0, so it is spelled out here instead.
# 0.0 is the index shown for 'Ave Avenue' in the sample output; use the index
# of another area to price a house there (indices depend on the fitted data).
new_house_area_feature = 0.0
# Mimic the hypothesis function: intercept + sum of coefficient * feature,
# with the features taken in the same order as the VectorAssembler inputCols.
new_price = (
    lireg_model.intercept
    + lireg_model.coefficients[0] * new_house_size
    + lireg_model.coefficients[1] * new_house_number_of_bedrooms
    + lireg_model.coefficients[2] * new_house_number_of_floors
    + lireg_model.coefficients[3] * new_house_age
    + lireg_model.coefficients[4] * new_house_area_feature
)
print(
    "\nPredicted house price for the house of size {}, having {} bedrooms ,{} floors and the age of the house being {} is {}".format(
        new_house_size,
        new_house_number_of_bedrooms,
        new_house_number_of_floors,
        new_house_age,
        new_price,
    )
)

Initial Data
+----------+--------+------+---------+----------+----------+
|house_size|bedrooms|floors|house_age|      area|price_sold|
+----------+--------+------+---------+----------+----------+
|      1490|       2|     2|       10|Ave Avenue|        60|
|      2500|       3|     2|       20|Ave Avenue|        95|
|      1200|       2|     1|        5|   MG Road|        55|
+----------+--------+------+---------+----------+----------+
only showing top 3 rows

Data after converting the string column locality into spark accepted feature
+----------+--------+------+---------+----------+----------+------------+
|house_size|bedrooms|floors|house_age|      area|price_sold|area_feature|
+----------+--------+------+---------+----------+----------+------------+
|      1490|       2|     2|       10|Ave Avenue|        60|         0.0|
|      2500|       3|     2|       20|Ave Avenue|        95|         0.0|
|      1200|       2|     1|        5|   MG Road|        55|         1.0|
+----------+--