## Model Training

### Spark Session Setup

In [1]:
# findspark has to be initialised before the first pyspark import in the
# cells below, so that the pyspark package can be located and imported.
import findspark

findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Attach to an already-running session when there is one; otherwise start a
# new session registered under the application name 'modelTraining'.
spark = (
    SparkSession.builder
    .appName('modelTraining')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/23 17:24:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/03/23 17:24:30 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
# TODO: the data location is expected to change if/when the dataset is hosted
# on Hadoop; for now the prepared data is read from a local parquet file.
# NOTE(review): SparkFiles is not referenced in this cell - confirm whether a
# later cell relies on it before removing the import.
from pyspark import SparkFiles
df = spark.read.parquet('prepped_data.parquet')

                                                                                

### Train Test Split

In [5]:
# 70% of the rows go to training and 30% to testing; the split is seeded (101)
# so the random assignment can be repeated.
train_data, test_data = df.randomSplit(weights=[0.7, 0.3], seed=101)

### Model Importing and Setup

In [13]:
# The two classifiers being compared.
from pyspark.ml.classification import NaiveBayes, RandomForestClassifier

# Both estimators keep the library's default settings; only the random forest
# is given an explicit seed (101).
nb = NaiveBayes()
rf = RandomForestClassifier(seed=101)

In [14]:
# Evaluator shared by the cross-validation searches and the final scoring.
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Despite the "acc" in the variable name, this evaluator does not report
# accuracy: its default metric is the area under the ROC curve, which is
# spelled out here so the choice is visible to the reader.
acc_evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC")

### Hyperparameter Tuning

In [15]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Both searches use 5-fold cross-validation, fit up to 4 candidate models in
# parallel, and rank candidates with acc_evaluator. The fold assignment is
# seeded with the same value (101) used for the train/test split and the
# random forest, so the selected hyperparameters are repeatable across kernel
# restarts instead of depending on an unseeded random split of the folds.

#naive bayes: 7 candidate values for the smoothing parameter
grid = (
    ParamGridBuilder()
    .addGrid(nb.smoothing, [0.0, 0.25, 0.5, 0.75, 1.0, 1.25, 1.5])
    .build()
)
nb_cv = CrossValidator(
    estimator=nb,
    estimatorParamMaps=grid,
    evaluator=acc_evaluator,
    numFolds=5,
    parallelism=4,
    seed=101,
)

#random forest: numTrees in {10, 20, 30, 40, 50} x maxDepth in 1..10 (50 candidates)
# Rebinding `grid` here does not affect nb_cv, which was already constructed
# with the naive bayes grid above.
grid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, range(10, 60, 10))
    .addGrid(rf.maxDepth, range(1, 11))
    .build()
)
rf_cv = CrossValidator(
    estimator=rf,
    estimatorParamMaps=grid,
    evaluator=acc_evaluator,
    numFolds=5,
    parallelism=4,
    seed=101,
)

In [17]:
#run predictions
nb_cv_model = nb_cv.fit(train_data)
nb_cv_pred = nb_cv_model.transform(test_data)

rf_cv_model = rf_cv.fit(train_data)
rf_cv_pred = rf_cv_model.transform(test_data)

25/03/23 17:45:22 WARN DAGScheduler: Broadcasting large task binary with size 1098.0 KiB
25/03/23 17:45:27 WARN DAGScheduler: Broadcasting large task binary with size 1057.1 KiB
25/03/23 17:45:28 WARN DAGScheduler: Broadcasting large task binary with size 1057.1 KiB
25/03/23 17:45:29 WARN DAGScheduler: Broadcasting large task binary with size 1423.7 KiB
25/03/23 17:45:34 WARN DAGScheduler: Broadcasting large task binary with size 1280.9 KiB
25/03/23 17:45:35 WARN DAGScheduler: Broadcasting large task binary with size 1280.9 KiB
25/03/23 17:45:35 WARN DAGScheduler: Broadcasting large task binary with size 1709.2 KiB
25/03/23 17:45:35 WARN DAGScheduler: Broadcasting large task binary with size 1123.0 KiB
25/03/23 17:45:49 WARN DAGScheduler: Broadcasting large task binary with size 1081.6 KiB
25/03/23 17:45:54 WARN DAGScheduler: Broadcasting large task binary with size 1011.4 KiB
25/03/23 17:45:55 WARN DAGScheduler: Broadcasting large task binary with size 1011.4 KiB
25/03/23 17:45:56 WAR

In [18]:
#results
print(nb_cv_model.bestModel)

nb_accuracy = acc_evaluator.evaluate(nb_cv_pred)
print(f"Naive Bayes Accuracy: {nb_accuracy}")

print(rf_cv_model.bestModel)

rf_accuracy = acc_evaluator.evaluate(rf_cv_pred)
print(f"Naive Bayes Accuracy: {rf_accuracy}")

NaiveBayesModel: uid=NaiveBayes_9cc25f474b1f, modelType=multinomial, numClasses=2, numFeatures=4
Naive Bayes Accuracy: 0.44037054587969937
RandomForestClassificationModel: uid=RandomForestClassifier_961a2d1fef5f, numTrees=40, numClasses=2, numFeatures=4
Naive Bayes Accuracy: 0.9919906665359078
