In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from datetime import datetime

In [0]:
# Create (or reuse) the Spark session for this notebook.
# The MongoDB Spark connector is pulled in via spark.jars.packages, and the
# network timeout / executor heartbeat are raised for long-running jobs
# (the heartbeat interval is kept below the network timeout).
spark = (
    SparkSession.builder
    .appName('gp17')
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:2.4.0")
    .config("spark.network.timeout", "7200s")
    .config("spark.executor.heartbeatInterval", "1200s")
    .getOrCreate()
)

In [0]:
# Connection settings for the MongoDB cluster.
#
# Credentials are read from the environment instead of being hard-coded, so
# that secrets are never saved or committed with the notebook:
#   MONGO_USER      - database user (defaults to the project account name)
#   MONGO_PASSWORD  - database password (no default; must be provided)
#   MONGO_ADDRESS   - cluster host (defaults to the project cluster)
# NOTE(review): the password that used to be embedded here has been exposed
# and should be rotated.
import os
import warnings
from urllib.parse import quote_plus

database = 'msds697'
collection = 'msds697collection'
user_name = os.environ.get('MONGO_USER', 'armantav')
password = os.environ.get('MONGO_PASSWORD', '')
address = os.environ.get('MONGO_ADDRESS', 'cluster0.3itij.mongodb.net')

if not password:
    # Warn here so the later authentication failure is easy to diagnose.
    warnings.warn(
        "MONGO_PASSWORD is not set; the MongoDB read below will fail to authenticate."
    )

# Percent-encode the credentials: characters such as '@', ':' or '/' inside a
# user name or password would otherwise corrupt the URI.
connection_string = (
    f"mongodb+srv://{quote_plus(user_name)}:{quote_plus(password)}"
    f"@{address}/{database}.{collection}"
)

## Read in Data, remove null rows

In [0]:
# Load the collection through the MongoDB Spark connector.
# Timing note from an earlier run: the read took about 10.86 seconds.
df = (
    spark.read
    .format("mongo")
    .option("uri", connection_string)
    .load()
)

In [0]:
# Preview the first three rows of the raw data.
df.show(n=3)

In [0]:
# Discard every row that contains at least one null value.
dfdropna = df.dropna()

In [0]:
# Optional sanity check (left disabled): row count after dropping nulls.
# dfdropna.count()

In [0]:
# Remove the date/time, Mongo id and session columns.
dfdropna = dfdropna.drop('date', 'event_time', '_id', 'user_session')

In [0]:
# Remove the user and product identifier columns.
dfdropna = dfdropna.drop('user_id', 'product_id')

In [0]:
# Preview the first three rows after the column drops.
dfdropna.show(n=3)

In [0]:
# Keep a single copy of each fully identical row.
dfdropna = dfdropna.dropDuplicates()

## Data/Feature Encoding and Preprocessing

In [0]:
from pyspark.ml.feature import StringIndexer
def indexStringColumns(df, cols):
    """Replace each column named in ``cols`` with its StringIndexer encoding.

    The columns are processed one after another. For each one, a
    StringIndexer is fitted on the current DataFrame and writes its output
    to a temporary ``<column>-num`` column; the original column is then
    dropped and the temporary column is renamed back to ``<column>``.

    Args:
        df: DataFrame to transform.
        cols: Iterable of column names to index.

    Returns:
        The DataFrame with every listed column replaced by its indexed
        version under the original name. When ``cols`` is empty, ``df`` is
        returned as is.
    """
    indexed = df
    for name in cols:
        tmp_name = name + "-num"
        # Fit on the DataFrame as it stands after the previous columns.
        model = StringIndexer(inputCol=name, outputCol=tmp_name).fit(indexed)
        # Swap the original column for its indexed counterpart.
        indexed = (
            model.transform(indexed)
            .drop(name)
            .withColumnRenamed(tmp_name, name)
        )
    return indexed

In [0]:
# Encode the listed columns as numeric indices.
dfnumeric = indexStringColumns(
    df=dfdropna,
    cols=["brand", "category_code", "event_type", "category_id"],
)

In [0]:
# Print the schema to inspect the column types after indexing.
dfnumeric.printSchema()

In [0]:
from pyspark.ml.feature import VectorAssembler
# Columns that are packed into the single "features" vector.
input_cols = [
    'brand',
    'category_code',
    'category_id',
    'event_type',
    'hospitalizedCurrently',
    'negative',
    'positive',
]
va = VectorAssembler(inputCols=input_cols, outputCol="features")
# Keep only the feature vector and the target; "price" becomes "label".
dfva = (
    va.transform(dfnumeric)
    .select("features", "price")
    .withColumnRenamed("price", "label")
)

In [0]:
# 70/30 train/validation split.
# A fixed seed makes the sampling repeatable, so the scores of the three
# models can be compared across notebook runs. Without it every run
# produced a different split.
# NOTE(review): repeatability also assumes the input rows arrive in the
# same partitions on each run — confirm for the Mongo source.
splits = dfva.randomSplit([0.7, 0.3], seed=42)
# Cache both parts: each is reused by all three models below.
train, valid = (part.cache() for part in splits)

All of us worked on all three models, but each of us proposed one of them: Arman H. chose Linear Regression, Arman T. chose Gradient Boosting, and Neset chose Random Forest.
We included Linear Regression alongside the other two models to show the superiority of Gradient Boosting and Random Forest on big data. In all three models, our analytical goal is to predict the prices of e-commerce goods.

## Random Forest Section

Neset's model. Here we try to predict prices with a random forest during the period when COVID-19 cases started rising. We want to see whether the model still performs well while prices are changing due to COVID-19.

In [0]:
from pyspark.ml.regression import RandomForestRegressor
# Random forest regressor trained on the training split.
# maxBins=4200: presumably chosen to exceed the largest number of distinct
# values among the indexed categorical features — TODO confirm.
# seed is fixed so that the forest's random sampling, and therefore the
# reported score, is repeatable between runs.
rf = RandomForestRegressor(maxDepth=4, maxBins=4200, seed=42)
rfmodel = rf.fit(train)

In [0]:
# Score the validation split with the fitted random forest.
valpredicts = rfmodel.transform(dataset=valid)

In [0]:
from pyspark.ml.evaluation import RegressionEvaluator
# R^2 of the random forest predictions on the validation split.
r2_score = RegressionEvaluator(metricName="r2").evaluate(valpredicts)
print(r2_score)

## Linear Regression Section

Arman H's linear regression model. We were primarily interested in quantifying the difference between a simple model like linear regression and the random forest and gradient-boosted tree models. We wanted to see whether linear regression can handle a dataset whose columns have a large number of categories.

In [0]:
from pyspark.ml.regression import LinearRegression
# Linear regression baseline with regParam set to 0.0.
linear = LinearRegression(regParam=0.0)
linearmodel = linear.fit(dataset=train)

In [0]:
# Score the validation split with the fitted linear model.
linearpredicts = linearmodel.transform(dataset=valid)

In [0]:
# R^2 of the linear regression predictions on the validation split.
r2_score_linear = RegressionEvaluator(metricName="r2").evaluate(linearpredicts)
print(r2_score_linear)

## Gradient Boosting Trees Section

Arman T's gradient-boosted trees model. We expected this model to perform best, and it did: GBT achieved an $R^2$ score of 0.647.

In [0]:
from pyspark.ml.regression import GBTRegressor
# Gradient-boosted trees regressor: 6 boosting iterations, same maxBins as
# the random forest above.
gbt = GBTRegressor(
    featuresCol='features',
    maxIter=6,
    maxBins=4200,
)
gbtmodel = gbt.fit(dataset=train)

In [0]:
# Score the validation split with the fitted GBT model.
gbt_predict = gbtmodel.transform(dataset=valid)

In [0]:
# R^2 of the GBT predictions on the validation split.
r2_score_gbt = RegressionEvaluator(metricName="r2").evaluate(gbt_predict)
print(r2_score_gbt)

In [0]:
# Preview two rows of the validation split (features vector and label).
valid.show(n=2)