In [1]:
import pyspark
sc = pyspark.SparkContext(appName="mlib-reg")
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('mlib-reg').getOrCreate()

### Using DataFrames - ml

In [19]:
df = spark.read.csv("Resources/boston.csv",inferSchema=True,header=True)
df.show(5)

+-------+----+----+---+-----+-----+----+------+---+-----+----+------+----+----+
|      0|   1|   2|  3|    4|    5|   6|     7|  8|    9|  10|    11|  12|  13|
+-------+----+----+---+-----+-----+----+------+---+-----+----+------+----+----+
|0.00632|18.0|2.31|0.0|0.538|6.575|65.2|  4.09|1.0|296.0|15.3| 396.9|4.98|24.0|
|0.02731| 0.0|7.07|0.0|0.469|6.421|78.9|4.9671|2.0|242.0|17.8| 396.9|9.14|21.6|
|0.02729| 0.0|7.07|0.0|0.469|7.185|61.1|4.9671|2.0|242.0|17.8|392.83|4.03|34.7|
|0.03237| 0.0|2.18|0.0|0.458|6.998|45.8|6.0622|3.0|222.0|18.7|394.63|2.94|33.4|
|0.06905| 0.0|2.18|0.0|0.458|7.147|54.2|6.0622|3.0|222.0|18.7| 396.9|5.33|36.2|
+-------+----+----+---+-----+-----+----+------+---+-----+----+------+----+----+
only showing top 5 rows



In [31]:
df = df.na.drop()

In [36]:
from pyspark.ml.feature import VectorAssembler
vectorAssembler = VectorAssembler(inputCols=['0','1','2','3','4','5','6','7','8','9','10','11','12'],
                                  outputCol='features')
v_boston = vectorAssembler.transform(df)
v_boston = v_boston.select(['features','13'])
v_boston.show(5)

+--------------------+----+
|            features|  13|
+--------------------+----+
|[0.00632,18.0,2.3...|24.0|
|[0.02731,0.0,7.07...|21.6|
|[0.02729,0.0,7.07...|34.7|
|[0.03237,0.0,2.18...|33.4|
|[0.06905,0.0,2.18...|36.2|
+--------------------+----+
only showing top 5 rows



In [38]:
(train_df,test_df) = v_boston.randomSplit([0.7,0.3])

In [41]:
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol='features',labelCol='13')
lr_model = lr.fit(train_df)
training_summary = lr_model.summary

In [43]:
print("MAE: " + str(training_summary.meanAbsoluteError))

MAE: 3.334699364240313


In [44]:
predictions = lr_model.transform(test_df)
predictions.show(5)

+--------------------+----+------------------+
|            features|  13|        prediction|
+--------------------+----+------------------+
|[0.01301,35.0,1.5...|32.7|29.995315349159245|
|[0.01381,80.0,0.4...|50.0| 41.09489863381874|
|[0.01432,100.0,1....|31.6| 32.92170894032721|
|[0.02055,85.0,0.7...|24.7|24.590997592138343|
|[0.02729,0.0,7.07...|34.7|30.683213879398213|
+--------------------+----+------------------+
only showing top 5 rows



In [47]:
from pyspark.ml.evaluation import RegressionEvaluator
lr_evaluator = RegressionEvaluator(predictionCol="prediction",labelCol="13",metricName="mae")
lr_evaluator.evaluate(predictions)

3.1919906014237633

### Using RDDs- mllib

In [49]:
df = spark.read.csv("Resources/boston.csv",inferSchema=True,header=True)
df.show(5)

+-------+----+----+---+-----+-----+----+------+---+-----+----+------+----+----+
|      0|   1|   2|  3|    4|    5|   6|     7|  8|    9|  10|    11|  12|  13|
+-------+----+----+---+-----+-----+----+------+---+-----+----+------+----+----+
|0.00632|18.0|2.31|0.0|0.538|6.575|65.2|  4.09|1.0|296.0|15.3| 396.9|4.98|24.0|
|0.02731| 0.0|7.07|0.0|0.469|6.421|78.9|4.9671|2.0|242.0|17.8| 396.9|9.14|21.6|
|0.02729| 0.0|7.07|0.0|0.469|7.185|61.1|4.9671|2.0|242.0|17.8|392.83|4.03|34.7|
|0.03237| 0.0|2.18|0.0|0.458|6.998|45.8|6.0622|3.0|222.0|18.7|394.63|2.94|33.4|
|0.06905| 0.0|2.18|0.0|0.458|7.147|54.2|6.0622|3.0|222.0|18.7| 396.9|5.33|36.2|
+-------+----+----+---+-----+-----+----+------+---+-----+----+------+----+----+
only showing top 5 rows



In [50]:
import numpy as np
df_pd = df.toPandas()

df_pd.replace('?',np.NaN,inplace=True)
df_pd.dropna(inplace=True)
df_pd.to_csv("Resources/boston_new.csv",header=False,index=False)

In [51]:
from pyspark.mllib.regression import LabeledPoint
points = sc.textFile("Resources/boston_new.csv")
def prepareLabelledPoints(row):
    values = [float(s) for s in row.strip().split(',')]
    return LabeledPoint(float(values[13]),values[:13])
pointrdd = points.map(prepareLabelledPoints)

In [52]:
pointrdd.take(5)

[LabeledPoint(24.0, [0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98]),
 LabeledPoint(21.6, [0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14]),
 LabeledPoint(34.7, [0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03]),
 LabeledPoint(33.4, [0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94]),
 LabeledPoint(36.2, [0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33])]

#### Train-test split

In [53]:
(trainData,testData) = pointrdd.randomSplit([0.7,0.3])

In [54]:
from pyspark.mllib.regression import LinearRegressionWithSGD
trained_model = LinearRegressionWithSGD.train(trainData,100,step=0.01,initialWeights=np.random.rand(13))

In [55]:
print("Weights : {}".format(trained_model.weights))

Weights : [2.06740410926e+269,4.3468025625e+269,5.43075930541e+269,3.08127200446e+267,2.53470152037e+268,2.77940879411e+269,3.21135468607e+270,1.59404065165e+269,5.02518404161e+269,1.98949404805e+271,8.35987817719e+269,1.60940010232e+271,6.01260635242e+269]


In [56]:
predictions = trained_model.predict(testData.map(lambda x: x.features))
predictions.take(5)

[1.2513852241057594e+274,
 1.1483475144266899e+274,
 1.0937954264995798e+274,
 1.2876862920940184e+274,
 1.2687858031293608e+274]

In [57]:
pred = predictions.collect()
labels = testData.map(lambda x: x.label).collect()
total_samples = len(pred)
mae = 0
for p,l in zip(pred,labels):
    mae += abs(p-l)
mae/=total_samples
print("Mean absolute error: {}".format(mae))

Mean absolute error: 1.3733044483531845e+274
