In [1]:
from pyspark.sql import SparkSession
# Build the Spark session used throughout this notebook; getOrCreate() reuses
# an already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName('LR')
    .getOrCreate()
)

In [4]:
# Load the customer CSV: the first row supplies the column names, and Spark
# infers each column's type from the data rather than reading all as strings.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('Ecommerce_Customers.csv')
)

In [7]:
# Check the inferred schema: which columns are strings and which are numeric.
df.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [11]:
# Summary statistics (count, mean, stddev, min, max) for every column,
# converted to a pandas DataFrame so the notebook renders it as a table.
df.describe().toPandas()

Unnamed: 0,summary,Email,Address,Avatar,Avg Session Length,Time on App,Time on Website,Length of Membership,Yearly Amount Spent
0,count,500,500,500,500.0,500.0,500.0,500.0,500.0
1,mean,,,,33.05319351819619,12.052487937166134,37.06044542094859,3.533461555915055,499.3140382585909
2,stddev,,,,0.9925631110845354,0.9942156084725424,1.0104889067564031,0.9992775024112583,79.3147815497068
3,min,aaron04@yahoo.com,"0001 Mack MillNorth Jennifer, NE 42021-5936",AliceBlue,29.532428967057943,8.508152176032603,33.91384724758464,0.2699010899842742,256.67058229005585
4,max,zscott@wright.com,Unit 7502 Box 8345DPO AE 53747,YellowGreen,36.13966248879052,15.126994288792469,40.005181638101895,6.922689335035808,765.5184619388373


In [13]:
from pyspark.ml.feature import VectorAssembler

In [14]:
# Print the schema again to have the exact column names at hand when choosing
# the feature columns for the assembler.
df.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [15]:
# The four numeric customer-behaviour columns are the model inputs; the
# assembler packs them, in this order, into a single vector column 'features'.
feature_columns = [
    'Avg Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
output = assembler.transform(df)

In [17]:
# Peek at the first assembled feature vector to confirm the assembler output.
output.select('features').take(1)

[Row(features=DenseVector([34.4973, 12.6557, 39.5777, 4.0826]))]

In [18]:
# Keep only what the regression needs: the feature vector and the label.
final_data = output.select('features', 'Yearly Amount Spent')

In [19]:
# Hold out roughly 30% of the rows for evaluation. randomSplit samples rows at
# random, so the weights are only approximate; the fixed seed keeps the same
# train/test partition (and therefore the same metrics) on every
# Restart & Run All instead of producing a new split each time.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [20]:
# Label statistics for the full dataset, as a baseline for comparing the splits.
# Only 'Yearly Amount Spent' appears; the vector column is not summarized.
final_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                500|
|   mean|  499.3140382585909|
| stddev|   79.3147815497068|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [21]:
# Label statistics for the training split (row count and distribution check).
train_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                359|
|   mean|  504.3351435040122|
| stddev|   79.5026957944929|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [22]:
# Label statistics for the test split; should look similar to the training split.
test_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                141|
|   mean| 486.52980575429063|
| stddev|  77.65436288487135|
|    min|   266.086340948469|
|    max|  689.2356997616951|
+-------+-------------------+



In [23]:
from pyspark.ml.regression import LinearRegression

In [24]:
# Linear regression estimator predicting yearly spend. featuresCol is spelled
# out for readability; 'features' is also the estimator's default.
lr = LinearRegression(
    featuresCol='features',
    labelCol='Yearly Amount Spent',
)

In [25]:
# Fit the regression on the training split only; the test split stays unseen.
lr_model = lr.fit(train_data)

In [26]:
# Evaluate the fitted model on the held-out test split; the returned summary
# object exposes the residuals and metrics read in the following cells.
test_results = lr_model.evaluate(test_data)

In [28]:
# Inspect the per-row residuals on the test split (show() prints the first 20).
test_results.residuals.show()

+-------------------+
|          residuals|
+-------------------+
|0.08412594943450813|
| -16.70524856184022|
|  -5.09318872239777|
|  5.958368289121836|
| -22.66050660683652|
|  21.00885367281512|
|  2.375883640778625|
| 3.7615027952396076|
| -7.093072713659353|
|-18.508846031100347|
| -2.296569131809406|
| -6.448732232768748|
|-1.7608239882226258|
| -4.224803353993423|
| -3.182833024521358|
| 7.6319263961156025|
| 1.0324634554949625|
| 7.4623228363412295|
|-10.569900092709645|
| -5.824705464716715|
+-------------------+
only showing top 20 rows



In [29]:
# Root-mean-squared error on the test split, in the same units as
# 'Yearly Amount Spent'; compare it with the label's stddev from describe().
test_results.rootMeanSquaredError

9.611853137067396

In [30]:
# Coefficient of determination (R^2) on the test split.
test_results.r2

0.9845697270635556

In [31]:
# Simulate new, unlabeled customers: keep only the feature vectors of the
# test split and leave the label column behind.
unlabeled_data = test_data.select('features')

In [33]:
# Confirm the frame now holds only the 'features' column.
unlabeled_data.show()

+--------------------+
|            features|
+--------------------+
|[30.5743636841713...|
|[30.8162006488763...|
|[30.8364326747734...|
|[30.9716756438877...|
|[31.1239743499119...|
|[31.2834474760581...|
|[31.3091926408918...|
|[31.3662121671876...|
|[31.4474464941278...|
|[31.5702008293202...|
|[31.5761319713222...|
|[31.7207699002873...|
|[31.7216523605090...|
|[31.7656188210424...|
|[31.8186165667690...|
|[31.8209982016720...|
|[31.8293464559211...|
|[31.8512531286083...|
|[31.8648325480987...|
|[31.8745516945853...|
+--------------------+
only showing top 20 rows



In [34]:
# Score the unlabeled rows; transform() adds a 'prediction' column next to
# 'features'.
predictions = lr_model.transform(unlabeled_data)

In [36]:
# Show the first 20 feature vectors alongside their predicted yearly spend.
predictions.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[30.5743636841713...|441.98028780863115|
|[30.8162006488763...| 282.7915895103092|
|[30.8364326747734...| 472.5950891493874|
|[30.9716756438877...| 488.6802414677709|
|[31.1239743499119...| 509.6075604466023|
|[31.2834474760581...| 570.7722357528523|
|[31.3091926408918...|  430.344834199155|
|[31.3662121671876...| 426.8273797612453|
|[31.4474464941278...|425.69581480888337|
|[31.5702008293202...| 564.4543381725052|
|[31.5761319713222...| 543.5231531211377|
|[31.7207699002873...| 545.2236657107917|
|[31.7216523605090...|349.53775062009527|
|[31.7656188210424...|500.77888498960056|
|[31.8186165667690...|  449.601506394657|
|[31.8209982016720...|417.04335461709775|
|[31.8293464559211...|   384.11987453248|
|[31.8512531286083...|465.52992383045716|
|[31.8648325480987...| 450.4611805695233|
|[31.8745516945853...| 398.1099497109842|
+--------------------+------------