Based on this tutorial: https://towardsdatascience.com/building-a-linear-regression-with-pyspark-and-mllib-d065c3ba246a

In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext



Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
3,application_1675261607133_0004,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [2]:
# Alternative data source kept for reference (OpenML "house_prices"); not used below.
#from sklearn.datasets import fetch_openml
#housing = fetch_openml(name="house_prices", as_frame=True)

# Load the California housing dataset. With as_frame=True the returned object
# exposes the full table (features plus target) as `housing.frame`.
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing(as_frame=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
# Peek at the first five rows of the pandas frame before handing it to Spark.
housing.frame.head(n=5)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

   MedInc  HouseAge  AveRooms  ...  Latitude  Longitude  MedHouseVal
0  8.3252      41.0  6.984127  ...     37.88    -122.23        4.526
1  8.3014      21.0  6.238137  ...     37.86    -122.22        3.585
2  7.2574      52.0  8.288136  ...     37.85    -122.24        3.521
3  5.6431      52.0  5.817352  ...     37.85    -122.25        3.413
4  3.8462      52.0  6.281853  ...     37.85    -122.25        3.422

[5 rows x 9 columns]

In [4]:
# Turn the local pandas frame into a Spark DataFrame using the session
# handle (`spark`) that the notebook kernel provides.
housing_pdf = housing.frame
house_df = spark.createDataFrame(housing_pdf)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
# Mark the DataFrame for caching (it is reused by several later cells) and
# print its column names and types. cache() returns the same DataFrame, so
# the two calls can be chained.
house_df.cache().printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- MedInc: double (nullable = true)
 |-- HouseAge: double (nullable = true)
 |-- AveRooms: double (nullable = true)
 |-- AveBedrms: double (nullable = true)
 |-- Population: double (nullable = true)
 |-- AveOccup: double (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longitude: double (nullable = true)
 |-- MedHouseVal: double (nullable = true)

In [6]:
# Compute summary statistics in Spark, pull the small result back as a pandas
# frame, and flip it so each original column becomes a row.
summary_pdf = house_df.describe().toPandas()
summary_pdf.transpose()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

                 0                    1  ...                   3                   4
summary      count                 mean  ...                 min                 max
MedInc       20640    3.870671002906976  ...              0.4999             15.0001
HouseAge     20640   28.639486434108527  ...                 1.0                52.0
AveRooms     20640    5.428999742190379  ...  0.8461538461538461   141.9090909090909
AveBedrms    20640   1.0966751496062077  ...  0.3333333333333333   34.06666666666667
Population   20640   1425.4767441860465  ...                 3.0             35682.0
AveOccup     20640   3.0706551594363747  ...  0.6923076923076923  1243.3333333333333
Latitude     20640     35.6318614341085  ...               32.54               41.95
Longitude    20640  -119.56970445736431  ...             -124.35             -114.31
MedHouseVal  20640   2.0685581690891497  ...             0.14999             5.00001

[10 rows x 5 columns]

In [7]:
from pyspark.ml.feature import VectorAssembler

# Columns packed into the single `features` vector column that the
# regression cell reads.
feature_cols = ['AveOccup', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population']
vectorAssembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

# Keep only the assembled feature vector and the label column.
# NOTE(review): the label here is MedInc, not the MedHouseVal column that is
# also present in the frame -- confirm this is the intended target.
vhouse_df = vectorAssembler.transform(house_df).select('features', 'MedInc')
vhouse_df.show(3)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+------+
|            features|MedInc|
+--------------------+------+
|[2.55555555555555...|8.3252|
|[2.10984182776801...|8.3014|
|[2.80225988700564...|7.2574|
+--------------------+------+
only showing top 3 rows

In [8]:
# Split the assembled data 70/30 into training and test sets.
# A fixed seed is passed so that re-running the notebook yields the same
# split; without it every run trains and evaluates on different rows and the
# reported metrics cannot be reproduced.
SPLIT_SEED = 42
splits = vhouse_df.randomSplit([0.7, 0.3], seed=SPLIT_SEED)
train_df, test_df = splits

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [9]:
from pyspark.ml.regression import LinearRegression

# Elastic-net regularised linear regression of MedInc on the assembled
# feature vector, fitted on the training split.
lr = LinearRegression(
    featuresCol='features',
    labelCol='MedInc',
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)
lr_model = lr.fit(train_df)

# Report the fitted parameters, one per line.
for title, value in (
    ("Coefficients: ", lr_model.coefficients),
    ("Intercept: ", lr_model.intercept),
):
    print(title + str(value))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Coefficients: [0.0,0.0,0.26530481182337584,-0.8915929879900834,0.0]
Intercept: 3.4061680398132093

In [10]:
# Goodness-of-fit metrics taken from the fitted model's summary object.
trainingSummary = lr_model.summary
train_rmse = trainingSummary.rootMeanSquaredError
train_r2 = trainingSummary.r2
print("RMSE: %f" % train_rmse)
print("r2: %f" % train_r2)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

RMSE: 1.681297
r2: 0.211191

In [11]:
# Apply the fitted model to the held-out test split; the result carries the
# original columns plus a `prediction` column.
lr_predictions = lr_model.transform(test_df)
# Bare expression: displays the DataFrame's schema summary as the cell output.
lr_predictions

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[features: vector, MedInc: double, prediction: double]

In [12]:

# Show a handful of test rows with the prediction next to the actual label.
preview_cols = ["prediction", "MedInc", "features"]
lr_predictions.select(preview_cols).show(n=5)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+------------------+------+--------------------+
|        prediction|MedInc|            features|
+------------------+------+--------------------+
| 3.064081733724244|2.0096|[1.49332759362892...|
| 2.962378031447533| 0.977|[1.53157894736842...|
| 3.625078756165531|2.3958|[1.53170731707317...|
|3.7499359335872535|2.4896|[1.57843137254901...|
|3.5395165817801675|3.1691|[1.62559241706161...|
+------------------+------+--------------------+
only showing top 5 rows

In [13]:
from pyspark.ml.evaluation import RegressionEvaluator

# Score the test-set predictions against the MedInc label using R squared.
lr_evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="MedInc",
    metricName="r2",
)
test_r2 = lr_evaluator.evaluate(lr_predictions)
print("R Squared (R2) on test data = %g" % test_r2)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

R Squared (R2) on test data = 0.223587