#### Dependencies
___

In [6]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

#### Create Session
____

In [2]:
# Reuse the active Spark session if one exists; otherwise start a new one
# registered under the application name 'lr'.
spark = (
    SparkSession.builder
    .appName('lr')
    .getOrCreate()
)

#### Load Data
___

In [4]:
# Read the customer CSV: the first row supplies the column names and Spark
# infers each column's type from the values.
data = spark.read.csv(
    'resources/Ecommerce_Customers.csv',
    header=True,
    inferSchema=True,
)

# Preview the loaded rows.
data.show()

+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|               Email|             Address|          Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|          Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616|  4.0826206329529615|  587.9510539684005|
|   hduke@hotmail.com|4547 Archer Commo...|       DarkGreen| 31.92627202636016|11.109460728682564|37.268958868297744|    2.66403418213262|  392.2049334443264|
|    pallen@yahoo.com|24645 Valerie Uni...|          Bisque|33.000914755642675|11.330278057777512|37.110597442120856|   4.104543202376424| 487.54750486747207|
|riverarebecca@gma...|1414 David Throug...|   

### Feature Selection
___________

In [7]:
# List the available column names to decide which ones can serve as model features.
data.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [13]:
# Numeric columns used as predictors; the label 'Yearly Amount Spent' and the
# string columns (Email, Address, Avatar) are deliberately left out.
feature_columns = [
    'Avg Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership',
]

# Pack the predictors into a single vector column named 'features'.
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
output = assembler.transform(data)

# Confirm that the 'features' vector column was appended to the schema.
output.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)
 |-- features: vector (nullable = true)



In [17]:
# Inspect the first five assembled feature vectors.
output.select('features').head(5)

[Row(features=DenseVector([34.4973, 12.6557, 39.5777, 4.0826])),
 Row(features=DenseVector([31.9263, 11.1095, 37.269, 2.664])),
 Row(features=DenseVector([33.0009, 11.3303, 37.1106, 4.1045])),
 Row(features=DenseVector([34.3056, 13.7175, 36.7213, 3.1202])),
 Row(features=DenseVector([33.3307, 12.7952, 37.5367, 4.4463]))]

In [22]:
# Keep only what the regression needs: the feature vector and the label.
final_data = output.select('features', 'Yearly Amount Spent')

# Preview the first five rows of the modelling dataset.
final_data.show(5)

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[34.4972677251122...|  587.9510539684005|
|[31.9262720263601...|  392.2049334443264|
|[33.0009147556426...| 487.54750486747207|
|[34.3055566297555...|  581.8523440352177|
|[33.3306725236463...|  599.4060920457634|
+--------------------+-------------------+
only showing top 5 rows



#### Split Training & Test Data
_____

In [23]:
# Hold out roughly 30% of the rows for testing (randomSplit weights are
# approximate, not exact row counts). The fixed seed makes the split, and so
# every statistic and metric computed from it, repeatable on Restart & Run All
# instead of changing on each execution.
train, test = final_data.randomSplit([0.7, 0.3], seed=42)

In [24]:
# Summary statistics (count, mean, stddev, min, max) of the label in the training split.
train.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                355|
|   mean| 499.24019973376795|
| stddev|  78.12277611021025|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [25]:
# Summary statistics (count, mean, stddev, min, max) of the label in the test split.
test.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                145|
|   mean|  499.4948153366056|
| stddev|  82.43677094334411|
|    min|  282.4712457199145|
|    max|  708.9351848669818|
+-------+-------------------+



#### Create Linear Regression Model
____

In [26]:
# Configure the estimator: read inputs from 'features', learn 'Yearly Amount Spent',
# and write its output to a column named 'predictions'.
regressor = LinearRegression(
    featuresCol='features',
    labelCol='Yearly Amount Spent',
    predictionCol='predictions',
)

# Fit on the training split only; the test split is kept aside for evaluation.
model = regressor.fit(train)

#### Evaluate Model 
_____

In [27]:
# Score the fitted model against the held-out test split.
results = model.evaluate(test)
# Print the per-row residuals for the test split (show() displays the first 20 rows).
results.residuals.show()

+-------------------+
|          residuals|
+-------------------+
|-11.877294121485761|
| -5.393333369253071|
| -5.051462870701641|
| -4.133221184184947|
|-7.6734898118429555|
| -4.754059299467826|
| 19.761556882079333|
|  3.812467983661577|
| -4.183811246969299|
| -5.573188713104059|
|-14.885496537539439|
|-2.2744677806703066|
|  7.003938773604148|
| -4.585489235774048|
|  7.742478828154333|
|-2.2612935304812254|
| 12.269148914218306|
|0.30396598137616593|
| -9.081970863337574|
|   3.94790503213693|
+-------------------+
only showing top 20 rows



#### Root Mean Squared Error (RMSE) & R2
_________

In [32]:
# In the recorded run the RMSE is about 9.57, i.e. predictions are off by
# roughly US$ 10, and R2 is about 0.986, i.e. the model explains roughly 98%
# of the variance in yearly amount spent.

print(f'Root Mean Square : {results.rootMeanSquaredError} - R2 : {results.r2}')

Root Mean Square : 9.574764032489666 - R2 : 0.9864162485828267


In [33]:
# Context for the error: the label averages about US$ 500 with a standard
# deviation of about US$ 80, so an RMSE near US$ 10 is small by comparison.

final_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                500|
|   mean|  499.3140382585909|
| stddev|   79.3147815497068|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



#### Model Deployment (only features)
_____

In [36]:
# Simulate unseen production input by keeping only the feature vectors of the
# test split (the label column is dropped).
unlabeled_data = test.select('features')  # features only, no 'Yearly Amount Spent'

# Predict the yearly amount spent for each feature vector.
predictions = model.transform(unlabeled_data)  # adds the 'predictions' column

# Preview the first five predictions.
predictions.show(n=5)

+--------------------+------------------+
|            features|       predictions|
+--------------------+------------------+
|[30.3931845423455...|331.80616392467937|
|[30.4925366965402...| 287.8645790891676|
|[30.8364326747734...|472.55336329769125|
|[30.8794843441274...| 494.3398211690396|
|[31.1280900496166...| 564.9261765588976|
+--------------------+------------------+
only showing top 5 rows

