In [2]:
import findspark

In [3]:
findspark.init('/home/bowen/spark-2.4.4-bin-hadoop2.7/')

In [4]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression

In [5]:
spark = SparkSession.builder.appName('lr_house_price').getOrCreate()
df = spark.read.csv('home_data.csv', inferSchema=True, header=True)

In [6]:
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- date: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- bedrooms: integer (nullable = true)
 |-- bathrooms: double (nullable = true)
 |-- sqft_living: integer (nullable = true)
 |-- sqft_lot: integer (nullable = true)
 |-- floors: double (nullable = true)
 |-- waterfront: integer (nullable = true)
 |-- view: integer (nullable = true)
 |-- condition: integer (nullable = true)
 |-- grade: integer (nullable = true)
 |-- sqft_above: integer (nullable = true)
 |-- sqft_basement: integer (nullable = true)
 |-- yr_built: integer (nullable = true)
 |-- yr_renovated: integer (nullable = true)
 |-- zipcode: integer (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- sqft_living15: integer (nullable = true)
 |-- sqft_lot15: integer (nullable = true)



In [7]:
df.head(1)

[Row(id=7129300520, date='20141013T000000', price=221900, bedrooms=3, bathrooms=1.0, sqft_living=1180, sqft_lot=5650, floors=1.0, waterfront=0, view=0, condition=3, grade=7, sqft_above=1180, sqft_basement=0, yr_built=1955, yr_renovated=0, zipcode=98178, lat=47.5112, long=-122.257, sqft_living15=1340, sqft_lot15=5650)]

In [118]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler

In [142]:
#prepare data for Spark ml lib
selected_features = ["bedrooms", "bathrooms", "sqft_living", "sqft_lot", "floors", "waterfront", "view", "condition", 
                     "grade", "sqft_above", "sqft_basement", "yr_built", "yr_renovated", "sqft_living15", "sqft_lot15"]
assembler = VectorAssembler(inputCols=selected_features, outputCol='features')
house_df = assembler.transform(df)
house_df = house_df.select(['features', 'price'])

In [143]:
house_df.head(1)

[Row(features=DenseVector([3.0, 1.0, 1180.0, 5650.0, 1.0, 0.0, 0.0, 3.0, 7.0, 1180.0, 0.0, 1955.0, 0.0, 1340.0, 5650.0]), price=221900)]

In [155]:
#split dataset for training
train_set, test_set = house_df.randomSplit([0.8, 0.2])
lr = LinearRegression(labelCol='price', maxIter=100, regParam=0, elasticNetParam=0.8)
lr_model = lr.fit(train_set)
print ("Coefficient: ", lr_model.coefficients)
print ("Intercept: ", lr_model.intercept)

Coefficient:  [-41706.85390612283,46463.29881921014,-13709.010017204391,-0.03720132857305679,24756.158279555963,578796.5221890734,44720.1301479482,20366.365727344688,121343.31499211612,13873.487810413231,13873.36113393493,-3538.809587299982,13.255873178823396,22.02267713335554,-0.5241106157735834]
Intercept:  6129142.857330269


In [157]:
training_summary = lr_model.summary
print ("RMSE: ", round(training_summary.rootMeanSquaredError, 3))
print ("R2: ", round(training_summary.r2, 3))

RMSE:  217840.212
R2:  0.655


In [159]:
test_results = lr_model.evaluate(test_set)

In [160]:
print ("RMSE: ", test_results.rootMeanSquaredError)
print ("R2: ", test_results.r2)

RMSE:  208949.04141699386
R2:  0.6450752037535077


In [192]:
opt_selected_features = ["bedrooms", "bathrooms", "sqft_living", "grade", "sqft_above", "sqft_living15", "waterfront"]

#Lasso
opt_selected_features = ["sqft_living", "sqft_lot", "sqft_basement", "yr_built", "yr_renovated", "sqft_living15", "sqft_lot15"]

#random forests
opt_selected_features = ['sqft_lot', 'sqft_living', 'sqft_above', 'yr_built', 'bathrooms', 'sqft_basement',
                        'bedrooms']
opt_assembler = VectorAssembler(inputCols=selected_features, outputCol='features')
opt_house_df = opt_assembler.transform(df)
opt_house_df = opt_house_df.select(['features', 'price'])

scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                       withStd=True, withMean=False)
scalerModel = scaler.fit(opt_house_df)
scaledData = scalerModel.transform(opt_house_df)

#train the model
opt_train_set, opt_test_set = scaledData.randomSplit([0.8, 0.2])
opt_lr = LinearRegression(featuresCol='scaledFeatures', labelCol='price', maxIter=1000, regParam=0.3, elasticNetParam=0.8)
opt_lr_model = opt_lr.fit(opt_train_set)
#print ("Coefficient: ", opt_lr_model.coefficients)
#print ("Intercept: ", opt_lr_model.intercept)
opt_training_summary = opt_lr_model.summary
print ('---trainning set---')
print ("RMSE: ", round(opt_training_summary.rootMeanSquaredError, 3))
print ("R2: ", round(opt_training_summary.r2, 3))

#test the model
print ('---testing set---')
opt_test_results = lr_model.evaluate(opt_test_set)
print ("RMSE: ", round(opt_test_results.rootMeanSquaredError,3))
print ("R2: ", round(opt_test_results.r2, 3))

---trainning set---
RMSE:  219631.831
R2:  0.649
---testing set---
RMSE:  200889.37
R2:  0.674


In [None]:
opt_selected_features = ['sqft_lot', 'sqft_living', 'floors', 'view', 'bathrooms', 'waterfront','bedrooms']
opt_assembler = VectorAssembler(inputCols=selected_features, outputCol='features')
opt_house_df = opt_assembler.transform(df)
opt_house_df = opt_house_df.select(['features', 'price'])

scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                       withStd=True, withMean=False)
scalerModel = scaler.fit(opt_house_df)
scaledData = scalerModel.transform(opt_house_df)

#train the model
opt_train_set, opt_test_set = scaledData.randomSplit([0.8, 0.2])
opt_lr = LinearRegression(featuresCol='scaledFeatures', labelCol='price', maxIter=1000, regParam=0.3, elasticNetParam=0.8)
opt_lr_model = opt_lr.fit(opt_train_set)
#print ("Coefficient: ", opt_lr_model.coefficients)
#print ("Intercept: ", opt_lr_model.intercept)
opt_training_summary = opt_lr_model.summary
print ('---trainning set---')
print ("RMSE: ", round(opt_training_summary.rootMeanSquaredError, 3))
print ("R2: ", round(opt_training_summary.r2, 3))

#test the model
print ('---testing set---')
opt_test_results = lr_model.evaluate(opt_test_set)
print ("RMSE: ", round(opt_test_results.rootMeanSquaredError,3))
print ("R2: ", round(opt_test_results.r2, 3))