In [7]:
import os

import findspark

# Locate the Spark installation. Prefer the SPARK_HOME environment variable so the
# notebook is not tied to one user's machine; fall back to the original local
# path when SPARK_HOME is not set.
SPARK_HOME = os.environ.get(
    'SPARK_HOME',
    '/home/aforestier10/Downloads/spark-3.5.3-bin-hadoop3',
)
findspark.init(SPARK_HOME)

# pyspark is imported only after findspark.init() has run (same order as before)
import pyspark
from pyspark.sql import SparkSession

# Get (or create) the Spark session for this notebook, named 'LinReg'
spark = SparkSession.builder.appName('LinReg').getOrCreate()
# Restrict Spark logging to errors to keep cell output readable
spark.sparkContext.setLogLevel("ERROR")

In [2]:
from pyspark.ml.regression import LinearRegression

In [3]:
# Read the customer CSV: the first line is the header and column types are inferred
data = spark.read.csv(
    'Ecommerce_Customers.csv',
    header=True,
    inferSchema=True,
)

                                                                                

In [4]:
# Show the column names and the types inferred while reading the CSV
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [6]:
# Print each field of the first row on its own line to eyeball the raw values
first_row = data.head(1)[0]
for value in first_row:
    print(value)

mstephenson@fernandez.com
835 Frank TunnelWrightmouth, MI 82180-9605
Violet
34.49726772511229
12.65565114916675
39.57766801952616
4.0826206329529615
587.9510539684005


In [8]:
# Setup data - imports
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [12]:
# Build an assembler that packs the numeric predictor columns into a single
# vector column named 'features' (the string columns are left out)
features = [
    'Avg Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership',
]
assembler = VectorAssembler(
    inputCols=features,
    outputCol='features',
)

In [14]:
# Transform data: run the assembler over the raw frame, which adds the 'features'
# vector column next to the original columns, then peek at the first row
output = assembler.transform(data)
output.head(1)

[Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005, features=DenseVector([34.4973, 12.6557, 39.5777, 4.0826]))]

In [15]:
# Keep only the assembled feature vector and the label column used for modelling
final_data = output.select('features', 'Yearly Amount Spent')

In [16]:
# Split data into roughly 70% training rows and 30% test rows.
# The weights are sampling probabilities, so the split sizes are approximate.
# A fixed seed makes the split repeatable for the same input data, so the
# metrics computed from it do not change on every run.
train_data, test_data = final_data.randomSplit([.7, .3], seed=42)

In [17]:
# Summary statistics of the training split
train_data.describe().show()



+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                354|
|   mean|  498.5686419113297|
| stddev|  81.07980014987743|
|    min| 256.67058229005585|
|    max|  744.2218671047146|
+-------+-------------------+



                                                                                

In [18]:
# Summary statistics of the test split, for comparison with the training split
test_data.describe().show()

[Stage 8:>                                                          (0 + 1) / 1]

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                146|
|   mean| 501.12136912797735|
| stddev|  75.10242033635794|
|    min|  275.9184206503857|
|    max|  765.5184619388373|
+-------+-------------------+



                                                                                

In [19]:
# Configure the linear regression estimator: which column holds the feature
# vector, which holds the label, and where predictions are written
lr = LinearRegression(
    featuresCol='features',
    labelCol='Yearly Amount Spent',
    predictionCol='prediction',
)

In [20]:
# Fit model on the training split only; the test split is held out for evaluation
lr_model = lr.fit(train_data)

                                                                                

In [21]:
# Results: evaluate the fitted model on the held-out test split; the returned
# summary exposes the residuals, RMSE and R^2 inspected in the cells below
test_results = lr_model.evaluate(test_data)

In [22]:
# See residuals on the test split (show() prints the first 20 rows)
test_results.residuals.show()

+-------------------+
|          residuals|
+-------------------+
| 11.064818010774957|
|  5.534266631760033|
| -5.833387800921912|
|-0.5280723853314839|
| 4.3380042075854135|
|-3.4010669843090113|
| -26.19535813525465|
| 2.5675045114929276|
| -1.988382033865605|
| -5.319961950053937|
| -17.03767626913242|
|  11.64379213088057|
| 12.290278611485803|
| -13.54160767728331|
| -5.751358343606739|
| -3.687884176613693|
|  5.556793664737597|
|0.45599357608119817|
|  4.794327422939318|
| 0.8684816672741817|
+-------------------+
only showing top 20 rows



In [24]:
# See RMSE (root mean squared error) on the test split
test_results.rootMeanSquaredError

9.985459531660368

In [25]:
# See R^2 on the test split
test_results.r2

0.9822002829443173

In [26]:
# Summary statistics of the full dataset's label, to put the RMSE into context
final_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                500|
|   mean|  499.3140382585909|
| stddev|   79.3147815497068|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



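* Pretty good! The mean yearly amount spent is about 500 (standard deviation about 79), and the RMSE on the test split is only about 10.
* An R^2 of about 0.98 says the model explains roughly 98% of the variance in the test data.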

### How to deploy

In [27]:
# Simulate new, unlabeled input by keeping only the feature vectors of the test split
unlabeled_data = test_data.select('features')

In [28]:
# Apply the fitted model to the unlabeled rows; the result carries a 'prediction' column
y_pred = lr_model.transform(unlabeled_data)

In [29]:
# Display the feature vectors alongside their predictions (first 20 rows)
y_pred.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[30.7377203726281...| 450.7159241854549|
|[31.0472221394875...|386.96313255726136|
|[31.0613251567161...| 493.3888458588235|
|[31.2606468698795...|421.85470364228286|
|[31.3662121671876...| 426.2508783488995|
|[31.5171218025062...| 279.3194876346947|
|[31.6739155032749...|501.92042604513586|
|[31.7366356860502...|  494.365941744039|
|[31.8627411090001...| 558.2865232079123|
|[31.8745516945853...|397.60520619632143|
|[31.9048571310136...|490.98753369194856|
|[31.9096268275227...| 551.8022435423586|
|[31.9262720263601...| 379.9146548328406|
|[31.9365486184489...| 440.7409925726115|
|[31.9453957483445...| 662.7712822812587|
|[31.9673209478824...|449.43772541626595|
|[31.9764800614612...| 325.0376523693626|
|[32.0047530203648...|463.28998754454824|
|[32.0215955013870...| 516.7778473348881|
|[32.0444861274404...| 447.3613475192758|
+--------------------+------------

The table above shows the predicted yearly amount spent for each row of feature inputs.