In [1]:
!pip install pyspark



In [2]:
from pyspark.sql import SparkSession
# Obtain the Spark session for this notebook, registered under the app name 'Customers'.
spark = (
    SparkSession.builder
    .appName('Customers')
    .getOrCreate()
)

In [3]:
# Bare expression: lets the notebook render the session object as the cell output.
spark

In [5]:
from pyspark.ml.regression import LinearRegression

In [6]:
# Read the customer CSV; the first row supplies the column names and
# Spark infers each column's type from the data.
dataset = spark.read.csv(
    "/content/Ecommerce_Customers.csv",
    header=True,
    inferSchema=True,
)

In [8]:
# Sanity check: confirm the loader returned a Spark DataFrame.
type(dataset)

pyspark.sql.dataframe.DataFrame

In [9]:
# List (column name, type) pairs to verify what inferSchema decided for each column.
dataset.dtypes

[('Email', 'string'),
 ('Address', 'string'),
 ('Avg Session Length', 'double'),
 ('Time on App', 'double'),
 ('Time on Website', 'double'),
 ('Length of Membership', 'double'),
 ('Yearly Amount Spent', 'double')]

In [10]:
# Preview the two string columns (these are not used as model features later on).
dataset.select('Email', 'Address').show()

+--------------------+--------------------+
|               Email|             Address|
+--------------------+--------------------+
|mstephenson@ferna...|835 Frank TunnelW...|
|   hduke@hotmail.com|4547 Archer Commo...|
|    pallen@yahoo.com|24645 Valerie Uni...|
|riverarebecca@gma...|1414 David Throug...|
|mstephens@davidso...|14023 Rodriguez P...|
|alvareznancy@luca...|645 Martha Park A...|
|katherine20@yahoo...|68388 Reyes Light...|
|  awatkins@yahoo.com|Unit 6538 Box 898...|
|vchurch@walter-ma...|860 Lee KeyWest D...|
|    bonnie69@lin.biz|PSC 2734, Box 525...|
|andrew06@peterson...|26104 Alexander G...|
|ryanwerner@freema...|Unit 2413 Box 034...|
|   knelson@gmail.com|6705 Miller Orcha...|
|wrightpeter@yahoo...|05302 Dunlap Ferr...|
|taylormason@gmail...|7773 Powell Sprin...|
| jstark@anderson.com|49558 Ramirez Roa...|
| wjennings@gmail.com|6362 Wilson Mount...|
|rebecca45@hale-ba...|8982 Burton RowWi...|
|alejandro75@hotma...|64475 Andre Club ...|
|samuel46@love-wes...|544 Alexan

In [11]:
# Bare expression: shows the DataFrame's column/type summary without triggering a row preview.
dataset

DataFrame[Email: string, Address: string, Avg Session Length: double, Time on App: double, Time on Website: double, Length of Membership: double, Yearly Amount Spent: double]

In [12]:
# Print a tabular preview of the loaded rows (long string values are truncated in the output).
dataset.show()

+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+
|               Email|             Address|Avg Session Length|Time on App|Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|       34.49726773|12.65565115|    39.57766802|         4.082620633|         587.951054|
|   hduke@hotmail.com|4547 Archer Commo...|       31.92627203|11.10946073|    37.26895887|         2.664034182|        392.2049334|
|    pallen@yahoo.com|24645 Valerie Uni...|       33.00091476|11.33027806|    37.11059744|         4.104543202|        487.5475049|
|riverarebecca@gma...|1414 David Throug...|       34.30555663|13.71751367|    36.72128268|         3.120178783|         581.852344|
|mstephens@davidso...|14023 Rodriguez P...|       33.33067252|12.79518855|  

In [13]:
# Pull the first 10 rows back to the driver as a list of Row objects.
dataset.take(10)

[Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avg Session Length=34.49726773, Time on App=12.65565115, Time on Website=39.57766802, Length of Membership=4.082620633, Yearly Amount Spent=587.951054),
 Row(Email='hduke@hotmail.com', Address='4547 Archer CommonDiazchester, CA 06566-8576', Avg Session Length=31.92627203, Time on App=11.10946073, Time on Website=37.26895887, Length of Membership=2.664034182, Yearly Amount Spent=392.2049334),
 Row(Email='pallen@yahoo.com', Address='24645 Valerie Unions Suite 582Cobbborough, DC 99414-7564', Avg Session Length=33.00091476, Time on App=11.33027806, Time on Website=37.11059744, Length of Membership=4.104543202, Yearly Amount Spent=487.5475049),
 Row(Email='riverarebecca@gmail.com', Address='1414 David ThroughwayPort Jason, OH 22070-1220', Avg Session Length=34.30555663, Time on App=13.71751367, Time on Website=36.72128268, Length of Membership=3.120178783, Yearly Amount Spent=581.852344),
 Row(Emai

In [14]:
# Print the schema tree (column name, type, nullability) for the loaded data.
dataset.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [15]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [16]:
# Combine the four numeric customer-behaviour columns into a single vector
# column, which is what the regression below consumes as its features.
featureassembler = VectorAssembler(
    inputCols=[
        "Avg Session Length",
        "Time on App",
        "Time on Website",
        "Length of Membership",
    ],
    outputCol="Independent Features",
)

In [17]:
# Apply the assembler: `output` is `dataset` plus the "Independent Features" vector column.
output=featureassembler.transform(dataset)

In [18]:
# Preview the assembled frame to check the new vector column sits alongside the originals.
output.show()

+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+--------------------+
|               Email|             Address|Avg Session Length|Time on App|Time on Website|Length of Membership|Yearly Amount Spent|Independent Features|
+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+--------------------+
|mstephenson@ferna...|835 Frank TunnelW...|       34.49726773|12.65565115|    39.57766802|         4.082620633|         587.951054|[34.49726773,12.6...|
|   hduke@hotmail.com|4547 Archer Commo...|       31.92627203|11.10946073|    37.26895887|         2.664034182|        392.2049334|[31.92627203,11.1...|
|    pallen@yahoo.com|24645 Valerie Uni...|       33.00091476|11.33027806|    37.11059744|         4.104543202|        487.5475049|[33.00091476,11.3...|
|riverarebecca@gma...|1414 David Throug...|       34.30555663|13.71751367|    36.7

In [19]:
# Look at the assembled feature vectors on their own.
output.select(["Independent Features"]).show()

+--------------------+
|Independent Features|
+--------------------+
|[34.49726773,12.6...|
|[31.92627203,11.1...|
|[33.00091476,11.3...|
|[34.30555663,13.7...|
|[33.33067252,12.7...|
|[33.87103788,12.0...|
|[32.0215955,11.36...|
|[32.73914294,12.3...|
|[33.9877729,13.38...|
|[31.93654862,11.8...|
|[33.99257277,13.3...|
|[33.87936082,11.5...|
|[29.53242897,10.9...|
|[33.19033404,12.9...|
|[32.38797585,13.1...|
|[30.73772037,12.6...|
|[32.1253869,11.73...|
|[32.33889932,12.0...|
|[32.18781205,14.7...|
|[32.61785606,13.9...|
+--------------------+
only showing top 20 rows



In [20]:
# List the column names available after assembly.
output.columns

['Email',
 'Address',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent',
 'Independent Features']

In [21]:
# Keep only what the model needs: the feature vector and the regression target.
finalized_data = output.select(
    ["Independent Features", "Yearly Amount Spent"]
)

In [22]:
# Preview the modelling frame (feature vector + "Yearly Amount Spent" label).
finalized_data.show()

+--------------------+-------------------+
|Independent Features|Yearly Amount Spent|
+--------------------+-------------------+
|[34.49726773,12.6...|         587.951054|
|[31.92627203,11.1...|        392.2049334|
|[33.00091476,11.3...|        487.5475049|
|[34.30555663,13.7...|         581.852344|
|[33.33067252,12.7...|         599.406092|
|[33.87103788,12.0...|        637.1024479|
|[32.0215955,11.36...|        521.5721748|
|[32.73914294,12.3...|        549.9041461|
|[33.9877729,13.38...|         570.200409|
|[31.93654862,11.8...|        427.1993849|
|[33.99257277,13.3...|        492.6060127|
|[33.87936082,11.5...|        522.3374046|
|[29.53242897,10.9...|        408.6403511|
|[33.19033404,12.9...|        573.4158673|
|[32.38797585,13.1...|        470.4527333|
|[30.73772037,12.6...|        461.7807422|
|[32.1253869,11.73...|        457.8476959|
|[32.33889932,12.0...|        407.7045475|
|[32.18781205,14.7...|        452.3156755|
|[32.61785606,13.9...|        605.0610388|
+----------

In [23]:
# Randomly split the rows into roughly 75% training / 25% test data.
# The fixed seed keeps the split repeatable, so the row counts, fitted
# coefficients and error metrics in the cells below can be reproduced on re-run.
train_data,test_data=finalized_data.randomSplit([0.75,0.25],seed=42)

In [24]:
# Number of rows that landed in the training split.
train_data.count()

371

In [25]:
# Number of rows that landed in the held-out test split.
test_data.count()

129

In [26]:
# Configure a linear regression over the assembled feature vector and fit it
# on the training split; `regressor` ends up bound to the fitted model.
regressor = LinearRegression(
    featuresCol='Independent Features',
    labelCol='Yearly Amount Spent',
).fit(train_data)

In [27]:
# Fitted coefficients; one value per input column given to the VectorAssembler.
regressor.coefficients

DenseVector([25.7461, 38.7417, 0.6197, 61.8607])

In [28]:
# Fitted intercept term of the linear model.
regressor.intercept

-1060.3177002417435

In [29]:
# Evaluate the fitted model on the held-out test split; the result exposes
# the predictions frame and the error metrics read in the following cells.
pred_results=regressor.evaluate(test_data)

In [30]:
# Show up to 40 test rows with the actual label next to the model's prediction.
pred_results.predictions.show(40)

+--------------------+-------------------+------------------+
|Independent Features|Yearly Amount Spent|        prediction|
+--------------------+-------------------+------------------+
|[30.73772037,12.6...|        461.7807422| 450.7820313354307|
|[30.97167564,11.7...|        494.6386098| 487.7478527969274|
|[31.06132516,12.3...|        487.5554581|493.52645424555953|
|[31.06621816,11.7...|        448.9332932|461.73355783107104|
|[31.12397435,12.3...|        486.9470538| 508.2472622160801|
|[31.12809005,13.2...|        557.2526867| 564.8989607807507|
|[31.1695068,13.97...|        427.3565308| 416.5622690271264|
|[31.28344748,12.7...|        591.7810894| 569.7292310946609|
|[31.3123496,11.68...|         463.591418| 444.8841309095378|
|[31.38958548,10.9...|        410.0696111|  409.482350316935|
|[31.57020083,13.3...|        545.9454921| 563.5906634180401|
|[31.6005122,12.22...|        479.1728515| 460.8528601619648|
|[31.60983957,12.7...|        444.5455497|426.89591906209057|
|[31.625

In [31]:
# Test-split error metrics as a (mean absolute error, mean squared error) tuple.
pred_results.meanAbsoluteError,pred_results.meanSquaredError

(8.678809773032123, 114.7037925117276)