In [1]:
import sys, time, pickle
from pyspark import SparkContext, SparkConf
from pyspark.sql import *
from IPython.core.display import display, HTML
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.evaluation import ClusteringEvaluator
import pandas as pd
import numpy as np

from sklearn.feature_selection import mutual_info_regression

import matplotlib.pyplot as plt

# Enlarge the default matplotlib canvas so the plots further down stay legible.
plt.rcParams['figure.figsize'] = [30, 20]

# Executor sizing, applied as Spark system properties before the context is built.
executor_properties = (
    ('spark.executor.memory', '2g'),     # memory per executor
    ('spark.executor.cores', '6'),       # cores per executor
    ('spark.executor.instances', '3'),   # per worker (computer)
)

# The SparkConf is created first and the system properties afterwards,
# in the same order as the calls were originally issued.
conf = SparkConf().setAppName('Steam Random Forest Regressor').setMaster('spark://sparkmaster:7077')
for property_name, property_value in executor_properties:
    SparkContext.setSystemProperty(property_name, property_value)

# Background reading on executor sizing and dynamic allocation:
# https://spark.apache.org/docs/3.0.0-preview/configuration.html#dynamic-allocation
# https://stackoverflow.com/questions/26168254/how-to-set-amount-of-spark-executors
# https://blog.cloudera.com/how-to-tune-your-apache-spark-jobs-part-2/

# Alternative (disabled): dynamic allocation instead of the fixed sizing above.
# SparkContext.setSystemProperty("spark.shuffle.service.enabled", "True") # required for dynamic allocation below
# SparkContext.setSystemProperty("spark.dynamicAllocation.enabled", "True")
# SparkContext.setSystemProperty("spark.executor.cores", "4")
# SparkContext.setSystemProperty("spark.dynamicAllocation.minExecutors", "1")
# SparkContext.setSystemProperty("spark.dynamicAllocation.maxExecutors", "5")
# SparkContext.setSystemProperty('spark.executor.memory', '2g') # memory per executor

# One context for the whole notebook, plus the SQL entry point used to build DataFrames.
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

In [3]:
# Show the effective Spark configuration as (key, value) pairs.
# Uses the public getConf() accessor instead of reaching into the private
# `_conf` attribute of the context.
sc.getConf().getAll()

[('spark.executor.memory', '2g'),
 ('spark.driver.host', 'jupyterlab'),
 ('spark.app.id', 'app-20210419213454-0005'),
 ('spark.driver.port', '35941'),
 ('spark.executor.instances', '3'),
 ('spark.app.name', 'Steam Random Forest Regressor'),
 ('spark.executor.id', 'driver'),
 ('spark.master', 'spark://sparkmaster:7077'),
 ('spark.executor.cores', '6'),
 ('spark.rdd.compress', 'True'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.app.startTime', '1618889694183'),
 ('spark.submit.pyFiles', ''),
 ('spark.submit.deployMode', 'client'),
 ('spark.ui.showConsoleProgress', 'true')]

In [None]:
# Load the pickled prediction objects written by the previous step.
# NOTE(review): unpickling can execute arbitrary code — only load files that
# this project produced itself.
def _load_pickle(path):
    """Return the object unpickled from `path`, closing the file afterwards."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


predictions = _load_pickle("predictions.p")
gbt_predictions = _load_pickle("gbt_predictions.p")
linear_predictions = _load_pickle("linear_predictions.p")

# Random Forest Results

In [None]:
# Score the random-forest predictions with each metric in turn, printing every
# score as soon as it is computed. After the loop `evaluator` is the r2 evaluator.
rf_scores = {}
for metric_key, printed_prefix in (("rmse", "RMSE: "), ("mae", "MAE: "), ("r2", "r2: ")):
    evaluator = RegressionEvaluator(
        labelCol="days_until_discount",
        predictionCol="prediction",
        metricName=metric_key,
    )
    rf_scores[metric_key] = evaluator.evaluate(predictions)
    print(printed_prefix + str(rf_scores[metric_key]))

# Keep the individual names: the plotting cell formats them into its title.
rmse = rf_scores["rmse"]
mae = rf_scores["mae"]
r2 = rf_scores["r2"]

# Run the fitted model over `df` and collect the result to pandas for plotting.
# NOTE(review): `cvModel` and `df` are not defined anywhere in the visible
# notebook — confirm they exist when running on a fresh kernel.
spark_frame = sqlContext.createDataFrame(df)
rfPred = cvModel.transform(spark_frame)
rfResult = rfPred.toPandas()

In [None]:
# Show the collected prediction table (a pandas DataFrame) as this cell's output.
rfResult

In [None]:
# Actual vs. predicted days for the random-forest model, drawn as blue dots on
# the current axes (the same axes/figure the pyplot shortcuts operate on).
current_axes = plt.gca()
current_axes.plot(rfResult.days_until_discount, rfResult.prediction, 'bo')
current_axes.set_xlabel('Actual days until %d%% off' % (percentage_discount_predict))
current_axes.set_ylabel('Predicted days')

# The headline carries the three scores computed in the evaluation cell.
plt.gcf().suptitle("Random Forest Model Performance (RMSE: %f, MAE: %f, R2: %f)" % (rmse, mae, r2))
plt.show()

In [None]:
# Pull the fitted regressor out of the best pipeline held by `cvModel`.
# NOTE(review): stages[1] presumably is the regressor that follows a feature
# assembler at stages[0] — confirm against the training notebook.
bestPipeline = cvModel.bestModel
bestModel = bestPipeline.stages[1]

importances = bestModel.featureImportances

# featureImportances is a Spark ML vector; hand matplotlib a plain NumPy array
# instead of relying on the vector being iterated implicitly.
importance_values = importances.toArray()

x_values = list(range(len(importance_values)))

# One bar per feature, labelled with the names from `feature_list`.
# NOTE(review): `feature_list` is not defined in the visible notebook — its
# order must match the assembler's input columns; confirm.
plt.bar(x_values, importance_values, orientation = 'vertical')
plt.xticks(x_values, feature_list, rotation=90)
plt.ylabel('Importance')
plt.xlabel('Feature')
plt.title('Feature Importance')
# Render explicitly, like the other plotting cells, so the cell does not end
# on the Text object returned by plt.title.
plt.show()

In [None]:
# Report the settings of the best random-forest model selected above.
print("Best hyperparameters")
# getNumTrees is read as an attribute (no call parentheses), exactly as before;
# NOTE(review): PySpark exposes it as a property on tree-ensemble models — confirm
# for the installed version before adding parentheses.
print('numTrees - ', bestModel.getNumTrees)
for printed_label, param_name in (('maxDepth - ', 'maxDepth'),):
    print(printed_label, bestModel.getOrDefault(param_name))

# Gradient Boosted Tree Results

In [None]:
# Score the gradient-boosted-tree predictions with each metric in turn, printing
# every score as soon as it is computed. `evaluator` ends up as the r2 evaluator.
gbt_scores = {}
for metric_key, printed_prefix in (("rmse", "RMSE: "), ("mae", "MAE: "), ("r2", "r2: ")):
    evaluator = RegressionEvaluator(
        labelCol="days_until_discount",
        predictionCol="prediction",
        metricName=metric_key,
    )
    gbt_scores[metric_key] = evaluator.evaluate(gbt_predictions)
    print(printed_prefix + str(gbt_scores[metric_key]))

# Keep the individual names: the plotting cell formats them into its title.
rmse = gbt_scores["rmse"]
mae = gbt_scores["mae"]
r2 = gbt_scores["r2"]

# Run the GBT model over `df` and collect to pandas for plotting. The rfPred /
# rfResult names are reused from the random-forest section because the plotting
# cell below reads `rfResult`.
# NOTE(review): `gbt_model` and `df` are not defined in the visible notebook —
# confirm they exist when running on a fresh kernel.
spark_frame = sqlContext.createDataFrame(df)
rfPred = gbt_model.transform(spark_frame)
rfResult = rfPred.toPandas()

In [None]:
# Actual vs. predicted days for the GBT model, drawn as blue dots on the
# current axes (the same axes/figure the pyplot shortcuts operate on).
current_axes = plt.gca()
current_axes.plot(rfResult.days_until_discount, rfResult.prediction, 'bo')
current_axes.set_xlabel('Actual days until %d%% off' % (percentage_discount_predict))
current_axes.set_ylabel('Predicted days')

# The headline carries the three scores computed in the evaluation cell.
plt.gcf().suptitle("GBT Model Performance (RMSE: %f, MAE: %f, R2: %f)" % (rmse, mae, r2))
plt.show()

In [None]:
# Pull the fitted regressor out of the best pipeline held by `gbt_model`.
# NOTE(review): stages[1] presumably is the regressor that follows a feature
# assembler at stages[0] — confirm against the training notebook.
bestPipeline = gbt_model.bestModel
bestModel = bestPipeline.stages[1]

importances = bestModel.featureImportances

# featureImportances is a Spark ML vector; hand matplotlib a plain NumPy array
# instead of relying on the vector being iterated implicitly.
importance_values = importances.toArray()

x_values = list(range(len(importance_values)))

# One bar per feature, labelled with the names from `feature_list`.
# NOTE(review): `feature_list` is not defined in the visible notebook — its
# order must match the assembler's input columns; confirm.
plt.bar(x_values, importance_values, orientation = 'vertical')
plt.xticks(x_values, feature_list, rotation=90)
plt.ylabel('Importance')
plt.xlabel('Feature')
plt.title('Feature Importance')
# Render explicitly, like the other plotting cells, so the cell does not end
# on the Text object returned by plt.title.
plt.show()

In [None]:
# Report the settings of the best gradient-boosted-tree model selected above.
print("Best hyperparameters")
# getNumTrees is read as an attribute (no call parentheses), exactly as before;
# NOTE(review): PySpark exposes it as a property on tree-ensemble models — confirm
# for the installed version before adding parentheses.
print('numTrees - ', bestModel.getNumTrees)
for printed_label, param_name in (('maxDepth - ', 'maxDepth'), ('maxIter - ', 'maxIter')):
    print(printed_label, bestModel.getOrDefault(param_name))

# Linear Regression Results

In [None]:
# Score the linear-regression predictions with each metric in turn, printing
# every score as soon as it is computed. `evaluator` ends up as the r2 evaluator.
linear_scores = {}
for metric_key, printed_prefix in (("rmse", "RMSE: "), ("mae", "MAE: "), ("r2", "r2: ")):
    evaluator = RegressionEvaluator(
        labelCol="days_until_discount",
        predictionCol="prediction",
        metricName=metric_key,
    )
    linear_scores[metric_key] = evaluator.evaluate(linear_predictions)
    print(printed_prefix + str(linear_scores[metric_key]))

# Keep the individual names: the plotting cell formats them into its title.
rmse = linear_scores["rmse"]
mae = linear_scores["mae"]
r2 = linear_scores["r2"]

# Fix: this section previously transformed with `gbt_model` (copy-paste from
# the GBT section), so the "Linear Model Performance" plot showed GBT
# predictions. Use the linear model, as the best-model cell of this section does.
# The rfPred / rfResult names are kept because the plotting cell reads `rfResult`.
# NOTE(review): `linear_model` and `df` are not defined in the visible notebook —
# confirm they exist when running on a fresh kernel.
rfPred = linear_model.transform(sqlContext.createDataFrame(df))

rfResult = rfPred.toPandas()

In [None]:
# Actual vs. predicted days for the linear-regression section, drawn as blue
# dots on the current axes (the same axes/figure the pyplot shortcuts operate on).
current_axes = plt.gca()
current_axes.plot(rfResult.days_until_discount, rfResult.prediction, 'bo')
current_axes.set_xlabel('Actual days until %d%% off' % (percentage_discount_predict))
current_axes.set_ylabel('Predicted days')

# The headline carries the three scores computed in the evaluation cell.
plt.gcf().suptitle("Linear Model Performance (RMSE: %f, MAE: %f, R2: %f)" % (rmse, mae, r2))
plt.show()

In [None]:
# Take the best pipeline held by `linear_model` and keep its second stage; the
# next cell queries that stage for regParam / fitIntercept / elasticNetParam.
# NOTE(review): presumably stages[0] is the feature assembler and stages[1] the
# regressor — confirm against the training notebook.
bestPipeline = linear_model.bestModel
bestModel = bestPipeline.stages[1]

In [None]:
# Report the settings of the best linear-regression model selected above.
print("Best hyperparameters")
reported_params = (
    ('regParam - ', 'regParam'),
    ('fitIntercept - ', 'fitIntercept'),
    ('elasticNetParam - ', 'elasticNetParam'),
)
# Each value is looked up (explicitly set or default) right before it is printed.
for printed_label, param_name in reported_params:
    print(printed_label, bestModel.getOrDefault(param_name))

In [None]:
# Stop the SparkContext created in the first cell now that the analysis is done.
sc.stop()