In [1]:
import pandas as pd
import numpy as np

import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
# setup spark
# getOrCreate() reuses an already-running session/context instead of constructing a
# second one, so this cell can be re-run without hitting Spark's
# "cannot run multiple SparkContexts" error.
ss = SparkSession.builder.getOrCreate()
sc = ss.sparkContext  # kept under its original name for any code that needs the raw context

In [3]:
# get the data
# Read the sales CSV with Spark's built-in CSV reader: the first row is the header
# and column types are inferred from the data.
fil = './Exercise Files/cogsley_sales.csv'
sales = ss.read.csv(fil, header=True, inferSchema=True)
sales.printSchema()
# show() prints the rows itself and returns None, so it must not be wrapped in
# display() (that only adds a stray "None" to the cell output).
sales.show()

root
 |-- RowID: integer (nullable = true)
 |-- OrderID: integer (nullable = true)
 |-- OrderDate: string (nullable = true)
 |-- OrderMonthYear: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Quote: integer (nullable = true)
 |-- DiscountPct: double (nullable = true)
 |-- Rate: integer (nullable = true)
 |-- SaleAmount: double (nullable = true)
 |-- CustomerName: string (nullable = true)
 |-- CompanyName: string (nullable = true)
 |-- Sector: string (nullable = true)
 |-- Industry: string (nullable = true)
 |-- City: string (nullable = true)
 |-- ZipCode: integer (nullable = true)
 |-- State: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- ProjectCompleteDate: string (nullable = true)
 |-- DaystoComplete: integer (nullable = true)
 |-- ProductKey: string (nullable = true)
 |-- ProductCategory: string (nullable = true)
 |-- ProductSubCategory: string (nullable = true)
 |-- Consultant: string (nullable = true)
 |-- Manager: string (nullable = t

None

In [4]:
# drop nulls & summarize by month-year
# Rows containing a null in any column are removed. The cleaned frame gets its own
# name so `sales` keeps pointing at the raw load and this cell stays re-runnable.
sales_clean = sales.dropna()

# Total SaleAmount per OrderMonthYear, in ascending OrderMonthYear order.
summary = (
    sales_clean
    .groupBy('OrderMonthYear')
    .agg(F.sum('SaleAmount').alias('SaleAmount'))
    .orderBy('OrderMonthYear')
)

# Strip the hyphens from the OrderMonthYear string and cast it to an integer so it
# can be used as a numeric feature. Done with column expressions, so no RDD round
# trip through Python is needed.
# NOTE(review): assumes values look like 'YYYY-MM-DD'; a value that is not numeric
# after removing '-' becomes null here rather than raising.
results = summary.withColumn(
    'OrderMonthYear',
    F.regexp_replace('OrderMonthYear', '-', '').cast('long'),
)
results.show()

+--------------+-----------------+
|OrderMonthYear|       SaleAmount|
+--------------+-----------------+
|      20090101|741024.2000000001|
|      20090201|544241.1499999998|
|      20090301|        563502.15|
|      20090401|619011.4000000001|
|      20090501|641158.6999999998|
|      20090601|        558288.55|
|      20090701|673657.1000000002|
|      20090801|        662651.85|
|      20090901|650729.3500000001|
|      20091001|        571600.35|
|      20091101|566817.6499999999|
|      20091201|560466.6499999999|
|      20100101|         577707.6|
|      20100201|         585992.3|
|      20100301|528872.8000000002|
|      20100401|489686.8499999999|
|      20100501|        745586.95|
|      20100601|609012.1999999998|
|      20100701|581447.4500000001|
|      20100801|619166.9999999999|
+--------------+-----------------+
only showing top 20 rows



In [5]:
# Build the model input: pack the predictor column(s) into a single vector column
# named 'features'. SaleAmount is left untouched; it is supplied as the label
# column when the regression is configured.
feature_cols = ['OrderMonthYear']
va = VectorAssembler(outputCol='features', inputCols=feature_cols)
data = va.transform(results)
data.show()

+--------------+-----------------+-------------+
|OrderMonthYear|       SaleAmount|     features|
+--------------+-----------------+-------------+
|      20090101|741024.2000000001|[2.0090101E7]|
|      20090201|544241.1499999998|[2.0090201E7]|
|      20090301|        563502.15|[2.0090301E7]|
|      20090401|619011.4000000001|[2.0090401E7]|
|      20090501|641158.6999999998|[2.0090501E7]|
|      20090601|        558288.55|[2.0090601E7]|
|      20090701|673657.1000000002|[2.0090701E7]|
|      20090801|        662651.85|[2.0090801E7]|
|      20090901|650729.3500000001|[2.0090901E7]|
|      20091001|        571600.35|[2.0091001E7]|
|      20091101|566817.6499999999|[2.0091101E7]|
|      20091201|560466.6499999999|[2.0091201E7]|
|      20100101|         577707.6|[2.0100101E7]|
|      20100201|         585992.3|[2.0100201E7]|
|      20100301|528872.8000000002|[2.0100301E7]|
|      20100401|489686.8499999999|[2.0100401E7]|
|      20100501|        745586.95|[2.0100501E7]|
|      20100601|6090

In [6]:
# fit the linear regression models
lr = LinearRegression(labelCol='SaleAmount')


def _fit_and_predict(reg_param):
    """Fit `lr` on `data` with regParam overridden to `reg_param`.

    Returns a (fitted model, predictions on `data`) pair.
    """
    fitted = lr.fit(data, {lr.regParam: reg_param})
    return fitted, fitted.transform(data)


# first with regularization set to 0
modelA, predsA = _fit_and_predict(0.0)
predsA.show()

# second with regularization set to 100
modelB, predsB = _fit_and_predict(100.0)
predsB.show()

+--------------+-----------------+-------------+-----------------+
|OrderMonthYear|       SaleAmount|     features|       prediction|
+--------------+-----------------+-------------+-----------------+
|      20090101|741024.2000000001|[2.0090101E7]|607367.4184934841|
|      20090201|544241.1499999998|[2.0090201E7]|607347.2708129259|
|      20090301|        563502.15|[2.0090301E7]|607327.1231323676|
|      20090401|619011.4000000001|[2.0090401E7]|607306.9754518094|
|      20090501|641158.6999999998|[2.0090501E7]|607286.8277712511|
|      20090601|        558288.55|[2.0090601E7]|607266.6800906928|
|      20090701|673657.1000000002|[2.0090701E7]|607246.5324101346|
|      20090801|        662651.85|[2.0090801E7]|607226.3847295763|
|      20090901|650729.3500000001|[2.0090901E7]|607206.2370490176|
|      20091001|        571600.35|[2.0091001E7]|607186.0893684593|
|      20091101|566817.6499999999|[2.0091101E7]| 607165.941687901|
|      20091201|560466.6499999999|[2.0091201E7]|607145.7940073

In [7]:
# evaluate the models
# One RMSE evaluator (label = SaleAmount) is applied to each model's predictions in
# turn; note both sets of predictions were made on the same frame used for fitting.
evalor = RegressionEvaluator(labelCol='SaleAmount', metricName='rmse')
RMSEA, RMSEB = (evalor.evaluate(preds) for preds in (predsA, predsB))
report = 'RMSE: Model A = %0.3f, Model B = %0.3f' % (RMSEA, RMSEB)
print(report)

RMSE: Model A = 69171.467, Model B = 69171.467


In [8]:
# Shut down the Spark session created during setup now that the analysis is finished.
ss.stop()