In [1]:
__author__ = "Yasaman Emami"
__email__ = ['emami.yasamann@gmail.com','yasaman.emami@sjsu.edu']

## Create Spark Session and read data into spark dataframe

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType
  
spark = SparkSession.builder.getOrCreate()

schema = StructType([
      StructField("reviewerID",StringType(),True),
      StructField("asin",StringType(),True),
      StructField("reviewerName",StringType(),True),
      StructField("helpful",StringType(),True),
      StructField("reviewText",StringType(),True),
      StructField("overall",StringType(),True),
      StructField("summary",StringType(),True),
      StructField("unixReviewTime",StringType(),True),
      StructField("reviewTime",StringType(),True)
  ])

df = spark.read.schema(schema).json('../data/Software.json')
df.show(5,truncate=70)

21/12/25 20:07:07 WARN Utils: Your hostname, YasamanEms-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.0.0.250 instead (on interface en0)
21/12/25 20:07:08 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
21/12/25 20:07:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
[Stage 0:>                                                          (0 + 1) / 1]

+--------------+----------+-------------------+-------+----------------------------------------------------------------------+-------+-----------------------------+--------------+-----------+
|    reviewerID|      asin|       reviewerName|helpful|                                                            reviewText|overall|                      summary|unixReviewTime| reviewTime|
+--------------+----------+-------------------+-------+----------------------------------------------------------------------+-------+-----------------------------+--------------+-----------+
|A240ORQ2LF9LUI|0077613252|         Michelle W|   null|The materials arrived early and were in excellent condition.  Howev...|    4.0|               Material Great|    1394496000|03 11, 2014|
|A1YCCU0YRLS0FE|0077613252|Rosalind White Ames|   null|I am really enjoying this book with the worksheets that make you re...|    4.0|                       Health|    1393113600|02 23, 2014|
|A1BJHRQDYVAY2J|0077613252|     Allan R.

                                                                                

In [2]:
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import rand 

df = df.selectExpr("cast(reviewText as string) reviewText",
                   "cast(overall as int) overall")
df.show()

+--------------------+-------+
|          reviewText|overall|
+--------------------+-------+
|The materials arr...|      4|
|I am really enjoy...|      4|
|IF YOU ARE TAKING...|      1|
|This book was mis...|      3|
|I have used Learn...|      5|
|Strong backgroung...|      4|
|If you live on Ma...|      3|
|i got this book o...|      5|
|I was very happy ...|      5|
|Recieved in a tim...|      5|
|Maybe it's just m...|      2|
|This was the text...|      5|
|Not worth the pri...|      2|
|This book served ...|      3|
|I love how this b...|      4|
|Great on the deli...|      5|
|The book was deli...|      5|
|Required to buy t...|      2|
|Didn't help me mu...|      1|
|Disappointing tex...|      1|
+--------------------+-------+
only showing top 20 rows



## Cleaning data

In [3]:
df = df.na.drop("any")
df.count()

                                                                                

459370

## Create a table from data to work with sql

In [4]:
from pyspark.sql import SparkSession
  
# creating sparksession and giving app name
spark = SparkSession.builder.appName('sparkdf').getOrCreate()

# creating a temporary view of
# Dataframe and storing it into df
df.createOrReplaceTempView("df")

# using the SQL query to count all
# distinct records and display the
# count on the screen
spark.sql("select count((overall)),overall from df group by overall").show()

                                                                                

+--------------+-------+
|count(overall)|overall|
+--------------+-------+
|        102542|      1|
|         39394|      3|
|        212399|      5|
|         73590|      4|
|         31445|      2|
+--------------+-------+



In [5]:
# filtering against review scores more than 5 or less than 1
df = df.filter("overall<6 and overall!=3")
df = df.filter("overall>0")
df.count()

                                                                                

419976

In [6]:
from pyspark.ml.feature import Bucketizer
# map review scores into two categories
bucketizer = Bucketizer(splits=[ 1, 4, 5 ],inputCol="overall", outputCol="label")
df = bucketizer.setHandleInvalid("keep").transform(df)

df.show()

+--------------------+-------+-----+
|          reviewText|overall|label|
+--------------------+-------+-----+
|The materials arr...|      4|  1.0|
|I am really enjoy...|      4|  1.0|
|IF YOU ARE TAKING...|      1|  0.0|
|I have used Learn...|      5|  1.0|
|Strong backgroung...|      4|  1.0|
|i got this book o...|      5|  1.0|
|I was very happy ...|      5|  1.0|
|Recieved in a tim...|      5|  1.0|
|Maybe it's just m...|      2|  0.0|
|This was the text...|      5|  1.0|
|Not worth the pri...|      2|  0.0|
|I love how this b...|      4|  1.0|
|Great on the deli...|      5|  1.0|
|The book was deli...|      5|  1.0|
|Required to buy t...|      2|  0.0|
|Didn't help me mu...|      1|  0.0|
|Disappointing tex...|      1|  0.0|
|This book provide...|      4|  1.0|
|I've been using D...|      4|  1.0|
|The demo is done ...|      4|  1.0|
+--------------------+-------+-----+
only showing top 20 rows



In [7]:
from pyspark.sql import SparkSession
  
# creating sparksession and giving app name

# creating a temporary view of
# Dataframe and storing it into df
df.createOrReplaceTempView("df")

# using the SQL query to count all
# distinct records and display the
# count on the screen
spark.sql("select count((overall)),overall from df group by overall").show()

                                                                                

+--------------+-------+
|count(overall)|overall|
+--------------+-------+
|        102542|      1|
|        212399|      5|
|         73590|      4|
|         31445|      2|
+--------------+-------+



In [8]:
df.show()

+--------------------+-------+-----+
|          reviewText|overall|label|
+--------------------+-------+-----+
|The materials arr...|      4|  1.0|
|I am really enjoy...|      4|  1.0|
|IF YOU ARE TAKING...|      1|  0.0|
|I have used Learn...|      5|  1.0|
|Strong backgroung...|      4|  1.0|
|i got this book o...|      5|  1.0|
|I was very happy ...|      5|  1.0|
|Recieved in a tim...|      5|  1.0|
|Maybe it's just m...|      2|  0.0|
|This was the text...|      5|  1.0|
|Not worth the pri...|      2|  0.0|
|I love how this b...|      4|  1.0|
|Great on the deli...|      5|  1.0|
|The book was deli...|      5|  1.0|
|Required to buy t...|      2|  0.0|
|Didn't help me mu...|      1|  0.0|
|Disappointing tex...|      1|  0.0|
|This book provide...|      4|  1.0|
|I've been using D...|      4|  1.0|
|The demo is done ...|      4|  1.0|
+--------------------+-------+-----+
only showing top 20 rows



In [9]:
#keeping reviewText and label column
df = df["reviewText", "label"]

In [10]:
#shuffling rows in df
df = df.orderBy(rand())

In [11]:
#check how data is spread among two categories
df.createOrReplaceTempView("df")

# using the SQL query to count all
# distinct records and display the
# count on the screen
spark.sql("select count((label)),label from df group by label").show()

                                                                                

+------------+-----+
|count(label)|label|
+------------+-----+
|      133987|  0.0|
|      285989|  1.0|
+------------+-----+



In [12]:
df.show()

+--------------------+-----+
|          reviewText|label|
+--------------------+-----+
|One of my favorit...|  1.0|
|We love this prod...|  1.0|
|I've spent many m...|  1.0|
|Description of pr...|  0.0|
|I'm not a novice ...|  0.0|
|I have used the s...|  0.0|
|Happy with the pu...|  1.0|
|                Okay|  1.0|
|I bought YNAB Pro...|  1.0|
|This is the ONE P...|  0.0|
|I returned a prev...|  1.0|
|I was a little le...|  1.0|
|I can't get anyth...|  0.0|
|Downloaded two we...|  0.0|
|download was easy...|  1.0|
|I've been using T...|  1.0|
|Best security sof...|  1.0|
|I always wondered...|  1.0|
|        resource hog|  0.0|
|Great product. Ma...|  1.0|
+--------------------+-----+
only showing top 20 rows



                                                                                

In [13]:
import pyspark.sql.functions as sq
from pyspark.sql.functions import lower, col
#replace regex

#convert txt to lower case
df = df.select("*", lower(col('reviewText')).alias("lower_text"))
#remove new lines
df = df.withColumn("no_line_text", sq.regexp_replace("lower_text", r"\n", " "))
#only keep words contains a-z
df = df.withColumn("removed_punc_text", sq.regexp_replace("no_line_text", r"[^a-z]", " "))
#replace multi space with one 
df = df.withColumn("removed_xtra_space", sq.regexp_replace("removed_punc_text", r" +", ' '))
#remove single character from string
df = df.withColumn("final_txt", sq.regexp_replace("removed_xtra_space",r"\b[a-zA-Z]\b", ""))
df.show()

+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|          reviewText|label|          lower_text|        no_line_text|   removed_punc_text|  removed_xtra_space|           final_txt|
+--------------------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|One of my favorit...|  1.0|one of my favorit...|one of my favorit...|one of my favorit...|one of my favorit...|one of my favorit...|
|We love this prod...|  1.0|we love this prod...|we love this prod...|we love this prod...|we love this prod...|we love this prod...|
|I've spent many m...|  1.0|i've spent many m...|i've spent many m...|i ve spent many m...|i ve spent many m...| ve spent many ma...|
|Description of pr...|  0.0|description of pr...|description of pr...|description of pr...|description of pr...|description of pr...|
|I'm not a novice ...|  0.0|i'm not a novice ...|i'm not a nov

                                                                                

In [14]:
#dropping duplicates
df = df.dropDuplicates()
#df.show()

In [15]:
df.count()

                                                                                

384534

## Creating Pipelines

In [16]:
#from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
import pyspark.ml.feature as ft
from pyspark.ml.classification import LogisticRegression

# regular expression tokenizer
regex_tokenizer = ft.RegexTokenizer(inputCol="final_txt", outputCol="words", pattern="\\W")

# stop words
stopwords_remover = ft.StopWordsRemover(inputCol="words", outputCol="filtered")

# word ngrams
##ngram = ft.NGram(n=4, inputCol="filtered", outputCol="nGrams")

# bag of words count
count_vectors = ft.CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)

word2vec = ft.Word2Vec(vectorSize=10, seed=42, inputCol="filtered", outputCol="features")

#### Create pipeline with word2vec as word representor

In [17]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

# create pipeline using word2vec for word representation
pipeline_w = Pipeline(stages=[regex_tokenizer, stopwords_remover, word2vec])
# fit the pipeline to training documents.
pipeline_fit_w = pipeline_w.fit(df)
dataset_w = pipeline_fit_w.transform(df)
dataset_w = dataset_w["final_txt", "features", "label"]

dataset_w.show(5, truncate=50)

21/12/25 20:08:41 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
21/12/25 20:08:41 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS

+--------------------------------------------------+--------------------------------------------------+-----+
|                                         final_txt|                                          features|label|
+--------------------------------------------------+--------------------------------------------------+-----+
| ve been using turbo tax basic for many years i...|[0.13613224560818213,-0.058240396341054955,-0.3...|  1.0|
| was surprised at how simple it was to install ...|[0.03974814601242543,-0.17528762798756362,-0.10...|  1.0|
|game would not play on any of my computers so  ...|[0.23392875492572784,-0.4474961130569378,-0.008...|  0.0|
|this product was for my father in law he just w...|[0.09535474683109083,-0.04171163343677395,-0.25...|  0.0|
|works well as always hey it  quickbooks what do...|[0.04957500086165965,-0.15980561831966045,-0.19...|  1.0|
+--------------------------------------------------+--------------------------------------------------+-----+
only showi

                                                                                

#### Create pipeline with countVectorizer as word representor

In [18]:
# create pipeline using countVectorizer for word representation
pipeline = Pipeline(stages=[regex_tokenizer, stopwords_remover, count_vectors])
pipeline_fit = pipeline.fit(df)
dataset = pipeline_fit.transform(df)
dataset = dataset["final_txt", "features", "label"]
dataset.show(5, truncate=50)



+--------------------------------------------------+--------------------------------------------------+-----+
|                                         final_txt|                                          features|label|
+--------------------------------------------------+--------------------------------------------------+-----+
| ve been using turbo tax basic for many years i...|(10000,[14,15,22,28,29,33,41,57,89,118,132,157,...|  1.0|
| was surprised at how simple it was to install ...|(10000,[5,31,39,77,102,130,133,153,155,201,254,...|  1.0|
|game would not play on any of my computers so  ...|(10000,[7,27,82,174,210,938],[1.0,1.0,1.0,1.0,1...|  0.0|
|this product was for my father in law he just w...|(10000,[0,1,3,4,13,26,67,123,133,148,172,192,62...|  0.0|
|works well as always hey it  quickbooks what do...|(10000,[1,21,30,100,273,351,546,644,2354,4494],...|  1.0|
+--------------------------------------------------+--------------------------------------------------+-----+
only showi

                                                                                

### Create test/train data sets

In [19]:
# set seed for reproducibility
(train_data_w, test_data_w) = dataset_w.randomSplit([0.7, 0.3], seed = 100)
(train_data, test_data) = dataset.randomSplit([0.7, 0.3], seed = 100)

print("Training Dataset Count: " + str(train_data.count()))
print("Test Dataset Count: " + str(test_data.count()))

                                                                                

Training Dataset Count: 269301




Test Dataset Count: 115233


                                                                                

## Logistic Regression model with word2vec

In [20]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
# create LR model 
lrModel = lr.fit(train_data_w)
# test LR model 
lrw_predictions = lrModel.transform(test_data_w)
# show predictions
lrw_predictions.filter(lrw_predictions['prediction'] == 0) \
    .select("final_txt","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)



+------------------------------+------------------------------+-----+----------+
|                     final_txt|                   probability|label|prediction|
+------------------------------+------------------------------+-----+----------+
|                          blah|[0.9939376021691604,0.00606...|  0.0|       0.0|
|mac software was not used i...|[0.9351647240608097,0.06483...|  0.0|       0.0|
|                       refund |[0.9212227348392006,0.07877...|  1.0|       0.0|
|          tech support stinks |[0.9209642567895252,0.07903...|  0.0|       0.0|
|                        cancel|[0.9130237982797599,0.08697...|  0.0|       0.0|
|                 buyer beware |[0.9129220637322322,0.08707...|  0.0|       0.0|
|               code is invalid|[0.9055228303740475,0.09447...|  0.0|       0.0|
| returned it phone support ...|[0.9053859749214284,0.09461...|  0.0|       0.0|
|   misinformed by tech support|[0.9051060201172153,0.09489...|  0.0|       0.0|
|                     waste 

                                                                                

In [21]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# LR model evaluation
lrw_evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
lrw_ev = lrw_evaluator.evaluate(lrw_predictions)
print("Logistic Regression Accuracy: \n" + str(lrw_ev))



Logistic Regression Accuracy: 
0.7141590435834239


                                                                                

#### Logistic Regression with word2vec percision and recall

In [22]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
import numpy as np

y_true = np.array(lrw_predictions.select("label").collect())
y_pred = np.array(lrw_predictions.select("prediction").collect())

print("Logistic Regression model with word2vec Recall score: {}".format(recall_score(y_true,y_pred)))
print("Logistic Regression model with word2ve Precision score: {}".format(precision_score(y_true,y_pred)))

                                                                                

Logistic Regression model with word2vec Recall score: 0.9764888848229805
Logistic Regression model with word2ve Precision score: 0.7382982717903619


## Logistic Regression model with countVectorizer

In [23]:
# create LR model 
lrModel = lr.fit(train_data)
# test LR model 
lr_predictions = lrModel.transform(test_data)
# LR model evaluation
lr_evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
lr_ev = lr_evaluator.evaluate(lr_predictions)

print("Logistic Regression Accuracy: \n" + str(lr_ev))

# calculate recall and percision
y_true = np.array(lr_predictions.select("label").collect())
y_pred = np.array(lr_predictions.select("prediction").collect())

print("Logistic Regression model with countVectorizer Recall score: {}".format(recall_score(y_true,y_pred)))
print("Logistic Regression model with countVectorizer Precision score: {}".format(precision_score(y_true,y_pred)))

                                                                                

Logistic Regression Accuracy: 
0.8208813364340677


                                                                                

Logistic Regression model with countVectorizer Recall score: 0.9776781630225962
Logistic Regression model with countVectorizer Precision score: 0.811351040638591


## Naive Bayes model with countVec

In [24]:
from pyspark.ml.classification import NaiveBayes

nb = NaiveBayes(smoothing=1)
# build Naive Bayes model
nb_model = nb.fit(train_data)
# predict labels of test data
nb_predictions = nb_model.transform(test_data)
nb_predictions.filter(nb_predictions['prediction'] == 0) \
    .select("final_txt","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)



+------------------------------+----------------------------+-----+----------+
|                     final_txt|                 probability|label|prediction|
+------------------------------+----------------------------+-----+----------+
| have been  loyal quicken u...|[1.0,1.0898014663275989E-16]|  0.0|       0.0|
|if  could avg zero stars as...|[1.0,1.0888935461683305E-16]|  0.0|       0.0|
| thought this would be  goo...|[1.0,1.0851742224007874E-16]|  0.0|       0.0|
|product update won  load he...|  [1.0,1.07942202561331E-16]|  0.0|       0.0|
| paid my money and  took my...|[1.0,1.0699416463158971E-16]|  0.0|       0.0|
| ve been  faithful customer...|[1.0,1.0674598409110685E-16]|  0.0|       0.0|
| should have read the other...|[1.0,1.0551504607407336E-16]|  0.0|       0.0|
| have worked with wp before...|[1.0,1.0535035743565268E-16]|  1.0|       0.0|
|this company is more intere...|[1.0,1.0522193051645666E-16]|  0.0|       0.0|
|buyer beware if you have is...|[1.0,1.0390488505407

                                                                                

In [25]:
# evaluate nb model
nb_evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
nb_ev = nb_evaluator.evaluate(nb_predictions)

print("Naive Bayes model Accuracy: \n" + str(nb_ev))



Naive Bayes model Accuracy: 
0.86355321646359


                                                                                

In [26]:
# calculate precision and recall for Naive Bayes model
y_true = np.array(nb_predictions.select("label").collect())
y_pred = np.array(nb_predictions.select("prediction").collect())

print("Naive Bayes model with countVec Recall score: {}".format(recall_score(y_true,y_pred)))
print("Naive Bayes model with countVec Precision score: {}".format(precision_score(y_true,y_pred)))

                                                                                

Naive Bayes model with countVec Recall score: 0.8998523203993883
Naive Bayes model with countVec Precision score: 0.895428831523506


## Random Forest Classifier with word2vec

In [27]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol="label", \
                            featuresCol="features", \
                            numTrees = 100, \
                            maxDepth = 4, \
                            maxBins = 32)

# train Random Forest model with word2vec pipeline with Training Data
rf_model_w = rf.fit(train_data_w)
# predict test data by Random Forest model with word2vec pipeline
rf_predictions_w = rf_model_w.transform(test_data_w)
rf_predictions_w.filter(rf_predictions_w['prediction'] == 0) \
    .select("final_txt","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)



+------------------------------+------------------------------+-----+----------+
|                     final_txt|                   probability|label|prediction|
+------------------------------+------------------------------+-----+----------+
|during the download of this...|[0.7569473221640844,0.24305...|  0.0|       0.0|
|it won  download no matter ...|[0.7569473221640844,0.24305...|  0.0|       0.0|
|erased all my data no one w...|[0.7569473221640844,0.24305...|  0.0|       0.0|
|when  purchased it  was ver...|[0.7569473221640844,0.24305...|  0.0|       0.0|
|beware this product appears...|[0.7569473221640844,0.24305...|  0.0|       0.0|
|terrible it would not downl...|[0.7569473221640844,0.24305...|  0.0|       0.0|
| have used email pdf daily ...|[0.7569473221640844,0.24305...|  0.0|       0.0|
| have  mac and this softwar...|[0.7569473221640844,0.24305...|  0.0|       0.0|
|finally had to upgrade from...|[0.7569473221640844,0.24305...|  0.0|       0.0|
| purchased ftm about  month

                                                                                

In [28]:
# evaluate Random Forest model with word2vec
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
rf_ev_w = evaluator.evaluate(rf_predictions_w)
print("Random Forest with word2vec model Accuracy: \n" + str(rf_ev_w))



Random Forest with word2vec model Accuracy: 
0.7702472368451992


                                                                                

In [29]:
# calculate recall and percision for Random Forest word2vec pipeline 
y_true = np.array(rf_predictions_w.select("label").collect())
y_pred = np.array(rf_predictions_w.select("prediction").collect())

print("Random Forest model with w2v Recall score: {}".format(recall_score(y_true,y_pred)))
print("Random Forest model with w2v Precision score: {}".format(precision_score(y_true,y_pred)))

                                                                                

Random Forest model with w2v Recall score: 0.9240430231190454
Random Forest model with w2v Precision score: 0.787474801474601


## Random Forest Classifier with countVec

In [30]:
# build Random Forest Model with countVec pipeline
rf_model = rf.fit(train_data)
# test Random Forest Model with countVec pipeline
rf_predictions = rf_model.transform(test_data)

# evaluate Random Forest Model with countVec
rf_ev = evaluator.evaluate(rf_predictions)

print("Random Forest model with countVec Accuracy: \n" + str(rf_ev))

# calculate recall and percision
y_true = np.array(rf_predictions.select("label").collect())
y_pred = np.array(rf_predictions.select("prediction").collect())

print("Random Forest model with countVec Recall score: {}".format(recall_score(y_true,y_pred)))
print("Random Forest model with countVec Precision score: {}".format(precision_score(y_true,y_pred)))

21/12/25 20:26:05 WARN MemoryStore: Not enough space to cache rdd_851_0 in memory! (computed 36.7 MiB so far)
21/12/25 20:26:05 WARN MemoryStore: Not enough space to cache rdd_851_1 in memory! (computed 36.7 MiB so far)
21/12/25 20:26:05 WARN MemoryStore: Not enough space to cache rdd_851_3 in memory! (computed 36.7 MiB so far)
21/12/25 20:26:05 WARN MemoryStore: Not enough space to cache rdd_851_6 in memory! (computed 36.7 MiB so far)
21/12/25 20:26:05 WARN MemoryStore: Not enough space to cache rdd_851_5 in memory! (computed 36.7 MiB so far)
21/12/25 20:26:05 WARN BlockManager: Persisting block rdd_851_3 to disk instead.
21/12/25 20:26:05 WARN BlockManager: Persisting block rdd_851_6 to disk instead.
21/12/25 20:26:05 WARN BlockManager: Persisting block rdd_851_0 to disk instead.
21/12/25 20:26:05 WARN BlockManager: Persisting block rdd_851_5 to disk instead.
21/12/25 20:26:05 WARN BlockManager: Persisting block rdd_851_1 to disk instead.
21/12/25 20:26:05 WARN MemoryStore: Not enoug

Random Forest model with countVec Accuracy: 
0.5374366721410685


                                                                                

Random Forest model with countVec Recall score: 0.9997124821934995
Random Forest model with countVec Precision score: 0.6661528681279445
