In [1]:
# resource: http://spark.apache.org/docs/latest/ml-classification-regression.html#linear-regression
from pyspark.sql import SparkSession
# Obtain the Spark session used by the cells below (application name 'lrex').
spark = (
    SparkSession.builder
    .appName('lrex')
    .getOrCreate()
)

In [2]:
from pyspark.ml.regression import LinearRegression

In [3]:
training = spark.read.format('libsvm').load('FileStore/tables/sample_linear_regression_data.txt')

In [4]:
training.show()

In [5]:
lr = LinearRegression(featuresCol = 'features', labelCol = 'label', predictionCol = 'prediction')

In [6]:
lrModel = lr.fit(training)

In [7]:
# Coefficients of the fitted model.
lrModel.coefficients

In [8]:
# Intercept term of the fitted model.
lrModel.intercept

In [9]:
# Summary object of the fitted model; the metric cells below read from it.
training_summary = lrModel.summary

In [10]:
# evaluate metrics
# r2 reported by the model summary
training_summary.r2

In [11]:
# root mean squared error reported by the model summary
training_summary.rootMeanSquaredError

In [12]:
# train-test split
all_data = spark.read.format('libsvm').load('FileStore/tables/sample_linear_regression_data.txt')

In [13]:
# randomly split the train/test dataset
split_object = all_data.randomSplit([0.7,0.3])
split_object

In [14]:
train_data, test_data = all_data.randomSplit([0.7,0.3])

In [15]:
# Summary statistics of the training split.
train_data.describe().show()

In [16]:
# Summary statistics of the test split, for comparison with the training split.
test_data.describe().show()

In [17]:
# fit on training data
# Only the training split is used here; the test split stays unseen by the model.
correct_model = lr.fit(train_data)

In [18]:
# evaluate on testing data
# evaluate() returns a summary object; the cells below read from it.
test_results = correct_model.evaluate(test_data)

In [19]:
# Residuals of the model on the test split.
test_results.residuals.show()

In [20]:
# Root mean squared error on the test split.
test_results.rootMeanSquaredError

In [21]:
# deploy a model
# Simulate unlabeled input by keeping only the 'features' column of the test split.
unlabeled_data = test_data.select('features')
unlabeled_data.show()

In [22]:
# Apply the fitted model to the unlabeled rows.
predictions = correct_model.transform(unlabeled_data)

In [23]:
# Display the result of the transform.
predictions.show()

In [24]:
# a more realistic model
# explore data from an e-commerce company's website and mobile app
# NOTE(review): when run after the cells above, getOrCreate() presumably returns the
# session that already exists rather than starting a new 'lr_example' application -- confirm.
spark = (
    SparkSession.builder
    .appName('lr_example')
    .getOrCreate()
)

In [25]:
from pyspark.ml.regression import LinearRegression

In [26]:
data = spark.read.csv('FileStore/tables/Ecommerce_Customers.csv', inferSchema = True, header = True)
data.printSchema()

In [27]:
for item in data.head(1)[0]:
  print(item)

In [28]:
# setup the dataframe for MLlib
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [29]:
# convert all the numerical values into one list
data.columns

In [30]:
# take all the x's to generate one vector
assembler = VectorAssembler(inputCols = ['Avg Session Length', 'Time on App', 'Time on Website', 'Length of Membership'],
                         outputCol = 'features')

In [31]:
# Run the assembler over the raw data.
output = assembler.transform(data)

In [32]:
# Print the schema of the assembled DataFrame.
output.printSchema()

In [33]:
# Inspect the first record of the assembled DataFrame.
output.head(1)
# a new 'features' column has been added by the assembler

In [34]:
# Keep only the assembled feature vector and the column used as the label.
final_data = output.select('features', 'Yearly Amount Spent')

In [35]:
# Print the leading rows of the modelling DataFrame.
final_data.show()

In [36]:
# Split with weights 0.7 / 0.3. The fixed seed keeps the split, and the metrics
# computed from it, repeatable across reruns on the same input.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [37]:
# Summary statistics of the training split.
train_data.describe().show()

In [38]:
# Summary statistics of the test split, for comparison with the training split.
test_data.describe().show()

In [39]:
lr = LinearRegression(labelCol = 'Yearly Amount Spent')

In [40]:
lr_model = lr.fit(train_data)

In [41]:
test_results = lr_model.evaluate(test_data)

In [42]:
# Residuals of the model on the test split.
test_results.residuals.show()

In [43]:
# r2 on the test split.
test_results.r2

In [44]:
# Root mean squared error on the test split.
test_results.rootMeanSquaredError

In [45]:
# Summary statistics of final_data, used to put the RMSE above into context.
final_data.describe().show()
# Compare the RMSE with the distribution of the original label values (the y's).
# NOTE(review): the original note says the RMSE is small relative to that distribution -- confirm against the output.
# RMSE is expressed in the same unit as the label, which is why the two can be compared directly.

In [46]:
# deploy the model (i.e. predict with unlabeled data)
# Simulate unlabeled input by keeping only the 'features' column of the test split.
unlabeled_data = test_data.select('features')
# Apply the fitted model to the unlabeled rows.
predictions = lr_model.transform(unlabeled_data)
predictions.show()