In [1]:
from pyspark.sql import SparkSession
# Attach to an existing Spark session if one is already running; otherwise
# create one registered under the application name 'lr_example'.
spark = (
    SparkSession.builder
    .appName('lr_example')
    .getOrCreate()
)

In [2]:
from pyspark.ml.regression import LinearRegression

In [4]:
# Load the e-commerce customer CSV. The first row supplies the column names
# and Spark infers each column's type from the values it reads.
data = spark.read.csv(
    './data/Ecommerce_Customers.csv',
    header=True,
    inferSchema=True,
)

In [6]:
# Show the column names and the types Spark inferred, to confirm which
# columns are numeric before building the feature vector.
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [12]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [14]:
# Preview the candidate feature columns: the slice drops the first three
# columns (Email, Address, Avatar - all strings per the schema) and the last
# one (Yearly Amount Spent, used as the regression label further down).
data.columns[3:-1]

['Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership']

In [15]:
# Name the predictor columns explicitly instead of slicing data.columns by
# position: a positional slice silently selects different features (or even
# the label) if the CSV's column order or column count ever changes.
feature_cols = [
    'Avg Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership',
]

# Pack the numeric predictors into a single vector column named 'features',
# the column the LinearRegression estimator is configured to read.
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
output = assembler.transform(data)

In [20]:
# Keep only what the model needs: the assembled feature vector and the label.
final_data = output.select('features', 'Yearly Amount Spent')

In [23]:
# Roughly 70% of rows go to training and 30% to testing; the fixed seed lets
# the same split be reproduced on a re-run.
train_data, test_data = final_data.randomSplit(weights=[0.7, 0.3], seed=1405)

In [28]:
# Configure the estimator: which column holds the feature vector, which one
# is the target, and what to call the column of predictions it produces.
lr = LinearRegression(
    featuresCol='features',
    labelCol='Yearly Amount Spent',
    predictionCol='Predicted Expenditure',
)

In [29]:
# Fit the linear regression on the training split only; the test split is
# held back for evaluation.
lr_model = lr.fit(train_data)

In [30]:
# Score the fitted model against the held-out test split. The returned
# summary object exposes the metrics read in the following cells
# (rootMeanSquaredError, r2).
test_results = lr_model.evaluate(test_data)

In [32]:
# Root mean squared error of the predictions on the test split.
test_results.rootMeanSquaredError

10.446341163822199

In [33]:
# R-squared (coefficient of determination) on the test split.
test_results.r2

0.9809066268099499

In [34]:
# Summary statistics (count, mean, stddev, min, max) for the modelling data,
# giving the scale of the label against which the RMSE can be judged. Only
# the label column shows up in the printed table.
summary_stats = final_data.describe()
summary_stats.show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                500|
|   mean|  499.3140382585909|
| stddev|   79.3147815497068|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+

