In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.context import SparkContext
from pyspark.sql import SparkSession

In [3]:
sc = SparkContext(appName='LR_ECom').getOrCreate()

In [4]:
spark = SparkSession(sparkContext=sc)

In [5]:
from pyspark.ml.regression import LinearRegression

In [6]:
df_ecom = spark.read \
    .option('header', True) \
    .option('inferSchema', True) \
    .csv('data/Ecommerce_Customers.csv')
df_ecom.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [7]:
df_ecom.show()

+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|               Email|             Address|          Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|          Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616|  4.0826206329529615|  587.9510539684005|
|   hduke@hotmail.com|4547 Archer Commo...|       DarkGreen| 31.92627202636016|11.109460728682564|37.268958868297744|    2.66403418213262|  392.2049334443264|
|    pallen@yahoo.com|24645 Valerie Uni...|          Bisque|33.000914755642675|11.330278057777512|37.110597442120856|   4.104543202376424| 487.54750486747207|
|riverarebecca@gma...|1414 David Throug...|   

In [8]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [9]:
df_ecom.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [10]:
assembler = VectorAssembler(
    inputCols=['Avg Session Length', 'Time on App', 'Time on Website', 'Length of Membership', 'Yearly Amount Spent'], 
    outputCol='features'
)

In [11]:
output = assembler.transform(df_ecom)

In [12]:
output.show()

+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+--------------------+
|               Email|             Address|          Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|            features|
+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+--------------------+
|mstephenson@ferna...|835 Frank TunnelW...|          Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616|  4.0826206329529615|  587.9510539684005|[34.4972677251122...|
|   hduke@hotmail.com|4547 Archer Commo...|       DarkGreen| 31.92627202636016|11.109460728682564|37.268958868297744|    2.66403418213262|  392.2049334443264|[31.9262720263601...|
|    pallen@yahoo.com|24645 Valerie Uni...|          Bisque|33.000914755642675|11.330278057777512|37

In [14]:
output.select('features').show(truncate=False)

+-----------------------------------------------------------------------------------------------+
|features                                                                                       |
+-----------------------------------------------------------------------------------------------+
|[34.49726772511229,12.65565114916675,39.57766801952616,4.0826206329529615,587.9510539684005]   |
|[31.92627202636016,11.109460728682564,37.268958868297744,2.66403418213262,392.2049334443264]   |
|[33.000914755642675,11.330278057777512,37.110597442120856,4.104543202376424,487.54750486747207]|
|[34.30555662975554,13.717513665142507,36.72128267790313,3.120178782748092,581.8523440352177]   |
|[33.33067252364639,12.795188551078114,37.53665330059473,4.446308318351434,599.4060920457634]   |
|[33.871037879341976,12.026925339755056,34.47687762925054,5.493507201364199,637.102447915074]   |
|[32.02159550138701,11.366348309710526,36.68377615286961,4.685017246570912,521.5721747578274]   |
|[32.739142938380326

In [15]:
output.head(1)

[Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005, features=DenseVector([34.4973, 12.6557, 39.5777, 4.0826, 587.9511]))]

In [17]:
df_ecom_vec = output.select(['features', 'Yearly Amount Spent'])
df_ecom_vec.show()

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[34.4972677251122...|  587.9510539684005|
|[31.9262720263601...|  392.2049334443264|
|[33.0009147556426...| 487.54750486747207|
|[34.3055566297555...|  581.8523440352177|
|[33.3306725236463...|  599.4060920457634|
|[33.8710378793419...|   637.102447915074|
|[32.0215955013870...|  521.5721747578274|
|[32.7391429383803...|  549.9041461052942|
|[33.9877728956856...|  570.2004089636196|
|[31.9365486184489...|  427.1993848953282|
|[33.9925727749537...|  492.6060127179966|
|[33.8793608248049...|  522.3374046069357|
|[29.5324289670579...|  408.6403510726275|
|[33.1903340437226...|  573.4158673313865|
|[32.3879758531538...|  470.4527333009554|
|[30.7377203726281...|  461.7807421962299|
|[32.1253868972878...| 457.84769594494855|
|[32.3388993230671...| 407.70454754954415|
|[32.1878120459321...|  452.3156754800354|
|[32.6178560628234...|   605.061038804892|
+----------

In [18]:
df_ecom_vec_train, df_ecom_vec_test = df_ecom_vec.randomSplit([0.7, 0.3])

In [19]:
df_ecom_vec.count(), df_ecom_vec_train.count(), df_ecom_vec_test.count()

(500, 343, 157)

In [20]:
lr_regressor = LinearRegression(labelCol='Yearly Amount Spent')

In [21]:
mod_lr_ecom = lr_regressor.fit(df_ecom_vec_train)

In [28]:
mod_lr_ecom_summary = mod_lr_ecom.evaluate(df_ecom_vec_test)

In [29]:
mod_lr_ecom_summary.residuals.show()

+--------------------+
|           residuals|
+--------------------+
|3.353761712787673...|
|-7.95807864051312...|
|4.320099833421409...|
|-2.04636307898908...|
|-3.24007487506605...|
|-9.89075488178059...|
|5.059064278611913...|
|7.617018127348274...|
|1.421085471520200...|
|-4.14956957683898...|
|-2.95585778076201...|
|-1.19371179607696...|
|-2.84217094304040...|
|-1.59161572810262...|
|7.617018127348274...|
|-4.20641299569979...|
|-5.79802872380241...|
|-3.92219590139575...|
|-1.13686837721616...|
|-2.84217094304040...|
+--------------------+
only showing top 20 rows





In [30]:
mod_lr_ecom_summary.rootMeanSquaredError

4.629885147656847e-12

In [31]:
mod_lr_ecom_summary.r2

1.0

In [32]:
df_ecom.select('Yearly Amount Spent').describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                500|
|   mean|  499.3140382585909|
| stddev|   79.3147815497068|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



### Deployment of Model

In [33]:
df_ecom_vec_test_unlabeled = df_ecom_vec_test.select('features')

In [34]:
df_ecom_vec_test_unlabeled.show()

+--------------------+
|            features|
+--------------------+
|[29.5324289670579...|
|[30.5743636841713...|
|[30.7377203726281...|
|[30.8364326747734...|
|[31.0613251567161...|
|[31.1239743499119...|
|[31.1695067987115...|
|[31.3123495994443...|
|[31.3584771924370...|
|[31.4474464941278...|
|[31.5147378578019...|
|[31.5171218025062...|
|[31.5257524169682...|
|[31.5761319713222...|
|[31.6005122003032...|
|[31.7207699002873...|
|[31.8093003166791...|
|[31.8279790554652...|
|[31.8530748017465...|
|[31.9120759292006...|
+--------------------+
only showing top 20 rows



In [35]:
df_ecom_prediction = mod_lr_ecom.transform(df_ecom_vec_test_unlabeled)

In [36]:
df_ecom_prediction.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[29.5324289670579...| 408.6403510726241|
|[30.5743636841713...|442.06441375806645|
|[30.7377203726281...|461.78074219622556|
|[30.8364326747734...|467.50190042699165|
|[31.0613251567161...|487.55545805790484|
|[31.1239743499119...|486.94705383977566|
|[31.1695067987115...|427.35653080228775|
|[31.3123495994443...|  463.591418027933|
|[31.3584771924370...|  495.175950449474|
|[31.4474464941278...|418.60274209522817|
|[31.5147378578019...|489.81248799646437|
|[31.5171218025062...| 275.9184206503869|
|[31.5257524169682...| 443.9656268098847|
|[31.5761319713222...| 541.2265839893299|
|[31.6005122003032...| 479.1728514910893|
|[31.7207699002873...| 538.7749334780272|
|[31.8093003166791...|  536.771899362847|
|[31.8279790554652...|440.00274754694544|
|[31.8530748017465...| 459.2851234623531|
|[31.9120759292006...|  387.534716305708|
+--------------------+------------