In [1]:
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session for this notebook; 'Customer' is the app name.
spark = (
    SparkSession.builder
    .appName('Customer')
    .getOrCreate()
)

In [3]:
from pyspark.ml.regression import LinearRegression

In [4]:
# Load the customer CSV: the first row is treated as the header and column
# types are inferred from the data.
csv_reader = spark.read.options(header=True, inferSchema=True)
dataset = csv_reader.csv('assets/Ecommerce_Customers.csv')

In [7]:
# Preview the first 20 rows of the loaded data.
dataset.show(n=20)

+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+
|               Email|             Address|Avg Session Length|Time on App|Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|       34.49726773|12.65565115|    39.57766802|         4.082620633|         587.951054|
|   hduke@hotmail.com|4547 Archer Commo...|       31.92627203|11.10946073|    37.26895887|         2.664034182|        392.2049334|
|    pallen@yahoo.com|24645 Valerie Uni...|       33.00091476|11.33027806|    37.11059744|         4.104543202|        487.5475049|
|riverarebecca@gma...|1414 David Throug...|       34.30555663|13.71751367|    36.72128268|         3.120178783|         581.852344|
|mstephens@davidso...|14023 Rodriguez P...|       33.33067252|12.79518855|  

In [8]:
# Print the column names and types of the loaded data.
dataset.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [9]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [10]:
# Columns that are packed into a single vector column for the regression.
feature_columns = [
    'Avg Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership',
]
# NOTE: the output column keeps its existing spelling ('Independant') because
# later cells select the column by this exact string.
featureassembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol='Independant Features',
)

In [11]:
# Apply the assembler to the loaded data; the result carries the extra
# 'Independant Features' column configured on the assembler.
output = featureassembler.transform(dataset)

In [12]:
# Preview the assembled frame, including the new feature-vector column.
output.show()

+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+--------------------+
|               Email|             Address|Avg Session Length|Time on App|Time on Website|Length of Membership|Yearly Amount Spent|Independant Features|
+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+--------------------+
|mstephenson@ferna...|835 Frank TunnelW...|       34.49726773|12.65565115|    39.57766802|         4.082620633|         587.951054|[34.49726773,12.6...|
|   hduke@hotmail.com|4547 Archer Commo...|       31.92627203|11.10946073|    37.26895887|         2.664034182|        392.2049334|[31.92627203,11.1...|
|    pallen@yahoo.com|24645 Valerie Uni...|       33.00091476|11.33027806|    37.11059744|         4.104543202|        487.5475049|[33.00091476,11.3...|
|riverarebecca@gma...|1414 David Throug...|       34.30555663|13.71751367|    36.7

In [13]:
# Show only the assembled feature vectors.
features_only = output.select('Independant Features')
features_only.show()

+--------------------+
|Independant Features|
+--------------------+
|[34.49726773,12.6...|
|[31.92627203,11.1...|
|[33.00091476,11.3...|
|[34.30555663,13.7...|
|[33.33067252,12.7...|
|[33.87103788,12.0...|
|[32.0215955,11.36...|
|[32.73914294,12.3...|
|[33.9877729,13.38...|
|[31.93654862,11.8...|
|[33.99257277,13.3...|
|[33.87936082,11.5...|
|[29.53242897,10.9...|
|[33.19033404,12.9...|
|[32.38797585,13.1...|
|[30.73772037,12.6...|
|[32.1253869,11.73...|
|[32.33889932,12.0...|
|[32.18781205,14.7...|
|[32.61785606,13.9...|
+--------------------+
only showing top 20 rows



In [14]:
# Keep just what the model needs: the feature vector and the label column.
model_columns = ('Independant Features', 'Yearly Amount Spent')
finalized_data = output.select(*model_columns)

In [16]:
# 75/25 train/test split. A fixed seed keeps the split -- and therefore the
# fitted coefficients and predictions derived from it -- repeatable across
# runs on the same input data, instead of changing on every execution.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

In [17]:
# Configure the estimator, then fit it on the training split.
# `regressor` ends up bound to the fitted model, as the following cells expect.
linear_regression = LinearRegression(
    featuresCol='Independant Features',
    labelCol='Yearly Amount Spent',
)
regressor = linear_regression.fit(train_data)

In [18]:
# Display the fitted model's coefficients and intercept as a tuple.
regressor.coefficients, regressor.intercept

(DenseVector([25.9136, 39.1934, 0.0782, 61.5851]), -1050.2070260245644)

In [19]:
# Evaluate the fitted model on the held-out test split; the result exposes
# a `predictions` frame that the next cell displays.
pred_result = regressor.evaluate(test_data)

In [20]:
# Preview the test rows alongside the model's `prediction` column.
pred_result.predictions.show()

+--------------------+-------------------+------------------+
|Independant Features|Yearly Amount Spent|        prediction|
+--------------------+-------------------+------------------+
|[30.4925367,11.56...|        282.4712457|287.21231525419944|
|[30.97167564,11.7...|        494.6386098| 487.5899386626136|
|[31.06132516,12.3...|        487.5554581|493.70982657397053|
|[31.12809005,13.2...|        557.2526867| 564.7004498531273|
|[31.1695068,13.97...|        427.3565308| 417.8523041539165|
|[31.3123496,11.68...|         463.591418| 443.5606132944556|
|[31.38958548,10.9...|        410.0696111| 408.2533790938048|
|[31.44744649,10.1...|        418.6027421| 424.6158117101172|
|[31.65480968,13.0...|        475.2634237| 468.5013090339892|
|[31.66104982,11.3...|        416.3583536| 416.8072543369906|
|[31.81861657,11.2...|        446.4186734| 448.4275475696627|
|[31.8530748,12.14...|        459.2851235| 461.3508444681968|
|[31.86274111,14.0...|        556.2981412| 558.8537556739186|
|[31.885