In [0]:
from pyspark.ml.regression import LinearRegression

In [0]:
# Read the sample regression dataset (LIBSVM text format) into a DataFrame
# that the linear-regression cells below train on.
training = (
    spark.read.format("libsvm")
    .load("dbfs:/FileStore/shared_uploads/sejalarya2000@gmail.com/sample_linear_regression_data.txt")
)


In [0]:
# Configure (but do not yet train) an elastic-net regularised linear regression.
lr = LinearRegression(
    maxIter=10,           # upper bound on optimiser iterations
    regParam=0.3,         # overall regularisation strength
    elasticNetParam=0.8,  # L1/L2 mixing: 0 = pure L2 (ridge), 1 = pure L1 (lasso)
)

In [0]:
# Fit the model: run the configured LinearRegression estimator on the
# `training` DataFrame and keep the fitted model that `fit` returns.
lrModel = lr.fit(training)

In [0]:
# Report the learned weights and bias of the fitted linear model.
# The format argument is passed as an explicit 1-tuple so `%` formatting
# never misinterprets the value as a sequence of arguments.
print("Coefficients: %s" % (str(lrModel.coefficients),))
print("Intercept: %s" % (str(lrModel.intercept),))

Coefficients: [0.0,0.3229251667740594,-0.3438548034562219,1.915601702345841,0.05288058680386255,0.765962720459771,0.0,-0.15105392669186676,-0.21587930360904645,0.2202536918881343]
Intercept: 0.15989368442397356


In [0]:
# Grab the training summary exposed by the fitted model; the cells below read
# the iteration count, objective history, residuals, RMSE and r2 from it.
trainingSummary = lrModel.summary

In [0]:
# Number of optimiser iterations actually performed during fitting.
print("numIterations: %d" % (trainingSummary.totalIterations,))

numIterations: 6


In [0]:
# Objective value recorded per iteration; a flattening tail indicates convergence.
print("objectiveHistory: %s" % (str(trainingSummary.objectiveHistory),))

objectiveHistory: [0.49999999999999994, 0.4967620357443381, 0.49363616643404634, 0.4936351537897608, 0.4936351214177871, 0.49363512062528014, 0.4936351206216114]


In [0]:
# Preview the residuals on the training data (first 20 rows, the default).
trainingSummary.residuals.show(n=20)

+--------------------+
|           residuals|
+--------------------+
|  -9.889232683103197|
|  0.5533794340053553|
|  -5.204019455758822|
| -20.566686715507508|
|    -9.4497405180564|
|  -6.909112502719487|
|  -10.00431602969873|
|  2.0623978070504845|
|  3.1117508432954772|
|  -15.89360822941938|
|  -5.036284254673026|
|  6.4832158769943335|
|  12.429497299109002|
|  -20.32003219007654|
|    -2.0049838218725|
| -17.867901734183793|
|   7.646455887420495|
| -2.2653482182417406|
|-0.10308920436195645|
|  -1.380034070385301|
+--------------------+
only showing top 20 rows



In [0]:
# Root-mean-squared error of the linear model on its own training data.
print("RMSE: %f" % (trainingSummary.rootMeanSquaredError,))

RMSE: 10.189077


In [0]:
# Coefficient of determination (r2) of the linear model on the training data.
print("r2: %f" % (trainingSummary.r2,))

r2: 0.022861


In [0]:
# Decision Tree Regression

In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator


In [0]:
# Read the LIBSVM-formatted sample file into a DataFrame for the
# decision-tree pipeline below.
data = (
    spark.read
    .format("libsvm")
    .load("dbfs:/FileStore/shared_uploads/sejalarya2000@gmail.com/sample_linear_regression_data.txt")
)

In [0]:
# Detect categorical features automatically and index them.
# maxCategories=4 means any feature with more than 4 distinct values is
# treated as continuous. The indexer is fitted on the full `data` DataFrame.
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
).fit(data)

In [0]:
# Split the data into training and test sets (30% held out for testing).
# A fixed seed keeps the partition, and therefore every downstream metric,
# reproducible across "Restart & Run All"; without it each run evaluates on a
# different random test set.
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=42)

In [0]:
# Configure the decision-tree regressor. Nothing is trained here; fitting
# happens when the pipeline is fitted. It consumes the indexer's output column.
dt = DecisionTreeRegressor(
    featuresCol="indexedFeatures",
)

In [0]:
# Assemble the two-stage pipeline: feature indexing followed by the tree.
pipeline = Pipeline(
    stages=[
        featureIndexer,  # stage 0: fitted VectorIndexer -> "indexedFeatures"
        dt,              # stage 1: decision-tree regressor
    ]
)


In [0]:
# Train the pipeline on the training split only (testData stays unseen).
# Fitting also runs the indexer stage before the decision tree is trained.
model = pipeline.fit(trainingData)

In [0]:
# Make predictions on the held-out test split; the resulting DataFrame is
# read below through its "prediction", "label" and "features" columns.
predictions = model.transform(testData)


In [0]:
# Display a handful of example rows: predicted value next to the true label.
(
    predictions
    .select("prediction", "label", "features")
    .show(n=5)
)

+-------------------+-------------------+--------------------+
|         prediction|              label|            features|
+-------------------+-------------------+--------------------+
|-1.6668250062854244|-28.571478869743427|(10,[0,1,2,3,4,5,...|
|-1.6668250062854244|-26.736207182601724|(10,[0,1,2,3,4,5,...|
|  5.362515535476564|-22.949825936196074|(10,[0,1,2,3,4,5,...|
|  7.129843768862276|-20.212077258958672|(10,[0,1,2,3,4,5,...|
| 1.4599636103415599|-17.803626188664516|(10,[0,1,2,3,4,5,...|
+-------------------+-------------------+--------------------+
only showing top 5 rows



In [0]:
# Score the test-set predictions with root-mean-squared error.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="label",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % (rmse,))


Root Mean Squared Error (RMSE) on test data = 12.5759


In [0]:
# Pull the fitted tree out of the pipeline model: the pipeline stages were
# declared as [featureIndexer, dt], so index 1 is the decision-tree stage.
treeModel = model.stages[1]

In [0]:
# Print only the model's one-line summary (uid, depth, node and feature
# counts), not the full tree structure.
print(treeModel)

DecisionTreeRegressionModel: uid=DecisionTreeRegressor_317b8b79f8e2, depth=5, numNodes=51, numFeatures=10
