In [9]:
import findspark
findspark.init('C:/spark/spark-2.4.5-bin-hadoop2.7')
from pyspark.sql import SparkSession

In [10]:
import pyspark

In [11]:
from pyspark.sql import SparkSession

In [12]:
spark = SparkSession.builder.appName('lr_example').getOrCreate()

In [13]:
df = spark.read.csv("Ecommerce Customers.csv", inferSchema=True, header=True)

In [14]:
df.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg. Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [15]:
for item in df.head(5)[3]:
    print(item)

Diazchester
 CA 06566-8576"
DarkGreen
31.926272026360156
11.109460728682564
37.268958868297744
2.66403418213262
392.2049334443264


In [16]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [17]:
df.columns

['Email',
 'Address',
 'Avatar',
 'Avg. Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [22]:
from pyspark.sql.functions import isnan, isnull, mean

amount_mean = df.select(mean('Yearly Amount Spent')).collect()[0][0]
df = df.fillna(amount_mean, subset=['Yearly Amount Spent'])

In [30]:
null_col = df.select(isnull('Yearly Amount Spent').alias('Yearly Amount Spent'))
null_col.filter(null_col['Yearly Amount Spent'] == True).show()

+-------------------+
|Yearly Amount Spent|
+-------------------+
+-------------------+



In [31]:
input_features = ['Time on App', 'Time on Website', 'Length of Membership']
assembler = VectorAssembler(inputCols=input_features, outputCol='features')

In [32]:
output = assembler.setHandleInvalid('skip').transform(df)

In [33]:
output.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg. Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = false)
 |-- features: vector (nullable = true)



In [34]:
output.head(1)

[Row(Email='Wrightmouth', Address=' MI 82180-9605"', Avatar='Violet', Avg. Session Length=34.49726772511229, Time on App=12.655651149166752, Time on Website=39.57766801952616, Length of Membership=4.082620632952961, Yearly Amount Spent=587.9510539684005, features=DenseVector([12.6557, 39.5777, 4.0826]))]

In [35]:
final_data = output.select('features', 'Yearly Amount Spent')

In [36]:
final_data.show()

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[12.6556511491667...|  587.9510539684005|
|[11.1094607286825...|  392.2049334443264|
|[11.3302780577775...| 487.54750486747207|
|[13.7175136651425...|  581.8523440352178|
|[12.7951885510781...|  599.4060920457634|
|[12.0269253397550...|   637.102447915074|
|[11.3663483097105...|  521.5721747578274|
|[37.3733588585475...| 499.91985771641924|
|[13.3862352756764...|  570.2004089636195|
|[37.1451682235281...| 499.91985771641924|
|[13.3389754476621...|  492.6060127179966|
|[37.0879260709838...| 499.91985771641924|
|[10.9612984001540...| 408.64035107262754|
|[12.9592260916093...|  573.4158673313865|
|[13.1487256920565...| 470.45273330095546|
|[12.6366060520001...|  461.7807421962299|
|[11.7338616908573...| 457.84769594494855|
|[12.0131946940144...|  407.7045475495441|
|[14.7153875441565...| 452.31567548003545|
|[13.9895925558252...|   605.061038804892|
+----------

In [37]:
train_data, test_data = final_data.randomSplit([0.7, 0.3])

In [38]:
train_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                355|
|   mean|  502.3222112969249|
| stddev|  76.53461675559163|
|    min|  275.9184206503857|
|    max|  725.5848140556805|
+-------+-------------------+



In [39]:
test_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                145|
|   mean|  494.0382334331119|
| stddev|   69.6462535891278|
|    min|   266.086340948469|
|    max|  765.5184619388372|
+-------+-------------------+



In [40]:
from pyspark.ml.regression import LinearRegression

In [41]:
lr = LinearRegression(labelCol='Yearly Amount Spent')

In [42]:
lr_model = lr.fit(train_data)

In [43]:
test_results = lr_model.evaluate(test_data)

In [55]:
print("RMSE: ", test_results.rootMeanSquaredError)
print("r2 error: ", test_results.r2)

RMSE:  63.436634104303025
r2 error:  0.16460811209707826


In [56]:
unlabeled_data = test_data.select('features')

In [57]:
unlabeled_data.show()

+--------------------+
|            features|
+--------------------+
|[9.84612490873623...|
|[9.98451439654646...|
|[10.0125833662230...|
|[10.1631790600525...|
|[10.3147179218938...|
|[10.3201162550591...|
|[10.3984577130856...|
|[10.5373075378408...|
|[10.6077238708590...|
|[10.6279492261562...|
|[10.6517937834741...|
|[10.6746534726919...|
|[10.7191497406283...|
|[10.7485336554157...|
|[10.8616041991289...|
|[10.8691638141303...|
|[10.9025562270197...|
|[10.9523533804586...|
|[10.9567909677916...|
|[10.9612984001540...|
+--------------------+
only showing top 20 rows



In [60]:
test_predictions = lr_model.transform(unlabeled_data).select('prediction')

In [61]:
test_predictions.show()

+------------------+
|        prediction|
+------------------+
|441.28007514452554|
| 430.6104466305561|
|468.13260447073594|
|462.64420208150113|
| 450.9483863077504|
|462.76418936557934|
| 452.6838795870792|
| 441.8738783660932|
|460.29860059804423|
|479.37579797992385|
| 466.4223387830973|
| 479.9702505622487|
|476.78853211461455|
| 447.2293194989802|
| 463.1713153742362|
|448.97198664767643|
|456.61137341104904|
|481.56513918798953|
| 476.2356074108079|
| 478.3737111145649|
+------------------+
only showing top 20 rows

