In [None]:
# !apt update
# !apt-get install openjdk-11-jdk-headless -qq > /dev/null
# !wget -q http://archive.apache.org/dist/spark/spark-3.3.0/spark-3.3.0-bin-hadoop3.tgz
# !tar -xvf spark-3.3.0-bin-hadoop3.tgz
# !pip install -q findspark
# import os
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
# os.environ["SPARK_HOME"] = "/content/spark-3.3.0-bin-hadoop3"

In [None]:
import findspark
findspark.init()

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# from google.colab import drive
# drive.mount('/content/gdrive', force_remount=True)

In [None]:
# %cd '/content/gdrive/My Drive/LDS9/Practice/Chapter11/'

In [None]:
from pyspark.sql import SparkSession

In [None]:
spark = SparkSession.builder \
        .appName("Hasaki Sentiment Analysis") \
        .config("spark.driver.memory", "16g") \
        .getOrCreate()

### Read the Data

In [None]:
data = spark.read.csv("data/Danh_gia.csv", inferSchema=True, header=True)

In [None]:
data.show(5)

+---+-------------+--------------------+--------------+-------------+------+-----------+
| id|ma_khach_hang|  noi_dung_binh_luan|ngay_binh_luan|gio_binh_luan|so_sao|ma_san_pham|
+---+-------------+--------------------+--------------+-------------+------+-----------+
|  1|          443|SỬ DỤNG DỄ DÀNG, ...|    29/04/2023|        17:06|     5|  308500015|
|  2|         1030|Sử dụng dễ dãng,r...|    30/04/2023|        15:04|     5|  308500015|
|  3|          689|Mình rất thích ha...|    30/04/2023|        18:34|     5|  422216594|
|  4|         2519|Sản phẩm có khả n...|    17/07/2022|        13:48|     5|  204100075|
|  5|          402|Sữa rửa mặt tốt,s...|    15/04/2023|        23:04|     5|  422208977|
+---+-------------+--------------------+--------------+-------------+------+-----------+
only showing top 5 rows



In [None]:
data.printSchema()

root
 |-- id: integer (nullable = true)
 |-- ma_khach_hang: integer (nullable = true)
 |-- noi_dung_binh_luan: string (nullable = true)
 |-- ngay_binh_luan: string (nullable = true)
 |-- gio_binh_luan: string (nullable = true)
 |-- so_sao: string (nullable = true)
 |-- ma_san_pham: string (nullable = true)



In [None]:
from pyspark.sql.functions import *

In [None]:
data = data.withColumn('sentiment', when(data.so_sao >=4, "positive")
                               .when(data.so_sao <= 2, "negative")
                               .otherwise("neutral"))

In [None]:
data = data.select("noi_dung_binh_luan", "so_sao", "sentiment")

In [None]:
data.groupBy("sentiment").count().show()

+---------+-----+
|sentiment|count|
+---------+-----+
| positive|19512|
|  neutral| 1002|
| negative| 1061|
+---------+-----+



### Clean and Prepare the Data
- ** Create a new length feature: **

In [None]:
# Đếm số lượt bình luận sản phẩm
print("Num rows of training dataset: ", data.count())

Num rows of training dataset:  21575


In [None]:
# check data NULL
data.select([count(when(col(c).isNull(), c)).alias(c)
           for c in data.columns]).toPandas().T

Unnamed: 0,0
noi_dung_binh_luan,901
so_sao,0
sentiment,0


In [None]:
data = data.dropna()
print("Num rows of training dataset after drop Null: ", data.count())

Num rows of training dataset after drop Null:  20674


In [None]:
# Sau khi drop giá trị NULL
data.groupBy("sentiment").count().show()

+---------+-----+
|sentiment|count|
+---------+-----+
| positive|18634|
|  neutral|  995|
| negative| 1045|
+---------+-----+



In [None]:
from pyspark.sql.functions import length

data = data.withColumn('length',length(data['noi_dung_binh_luan']))
data.show(10)

+--------------------+------+---------+------+
|  noi_dung_binh_luan|so_sao|sentiment|length|
+--------------------+------+---------+------+
|SỬ DỤNG DỄ DÀNG, ...|     5| positive|    48|
|Sử dụng dễ dãng,r...|     5| positive|    45|
|Mình rất thích ha...|     5| positive|    41|
|Sản phẩm có khả n...|     5| positive|   378|
|Sữa rửa mặt tốt,s...|     5| positive|    44|
|Sau 77 49 dòng sr...|     5| positive|   221|
|Đó giờ mình sài b...|     5| positive|   177|
|Rất ok mình xài 2...|     5| positive|    26|
|Mik bị kich ứng, ...|     4| positive|    81|
|nhân viên tư vấn ...|     5| positive|   156|
+--------------------+------+---------+------+
only showing top 10 rows



In [None]:
data.groupby('sentiment').mean().show()
# Không có sự chênh lệch quá lớn về số lượng từ của các đánh giá

+---------+-----------------+
|sentiment|      avg(length)|
+---------+-----------------+
| positive|96.85091767736395|
|  neutral|101.3678391959799|
| negative|76.14449760765551|
+---------+-----------------+



In [None]:
data = data.drop("length")

In [None]:
# !pip install pyspark underthesea pyvi

### Feature Transformations

In [None]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, RegexTokenizer
from pyspark.ml.feature import CountVectorizer, IDF, StringIndexer
from underthesea import word_tokenize

In [None]:
data = data.withColumn("lower_noi_dung_binh_luan", lower(data["noi_dung_binh_luan"]))

In [None]:
# Tokenization Vietnamese text with underthesea
def tokenize_vietnamese(text):
    return word_tokenize(text, format="text").split()

tokenizer_udf = udf(tokenize_vietnamese, ArrayType(StringType()))

data = data.withColumn("token_text", tokenizer_udf(col("lower_noi_dung_binh_luan")))
data.show()

+--------------------+------+---------+------------------------+--------------------+
|  noi_dung_binh_luan|so_sao|sentiment|lower_noi_dung_binh_luan|          token_text|
+--------------------+------+---------+------------------------+--------------------+
|SỬ DỤNG DỄ DÀNG, ...|     5| positive|    sử dụng dễ dàng, ...|[sử_dụng, dễ_dàng...|
|Sử dụng dễ dãng,r...|     5| positive|    sử dụng dễ dãng,r...|[sử_dụng, dễ, dãn...|
|Mình rất thích ha...|     5| positive|    mình rất thích ha...|[mình, rất, thích...|
|Sản phẩm có khả n...|     5| positive|    sản phẩm có khả n...|[sản_phẩm, có, kh...|
|Sữa rửa mặt tốt,s...|     5| positive|    sữa rửa mặt tốt,s...|[sữa, rửa, mặt, t...|
|Sau 77 49 dòng sr...|     5| positive|    sau 77 49 dòng sr...|[sau, 77, 49, dòn...|
|Đó giờ mình sài b...|     5| positive|    đó giờ mình sài b...|[đó, giờ, mình, s...|
|Rất ok mình xài 2...|     5| positive|    rất ok mình xài 2...|[rất, ok, mình, x...|
|Mik bị kich ứng, ...|     4| positive|    mik bị kich

In [None]:
# Vietnamese stopwords
sc = spark.sparkContext

stopwords_path = "files/vietnamese-stopwords.txt"
stopwords_rdd = sc.textFile(stopwords_path)

# Chuyển đổi sang list
vn_stopwords_rdd = stopwords_rdd.map(lambda word: word.strip()).filter(lambda word: word)
vietnamese_stopwords = stopwords_rdd.collect()
# print(vietnamese_stopwords)

vietnamese_stopwords_remover = StopWordsRemover(inputCol="token_text", outputCol="stop_tokens")
vietnamese_stopwords_remover.setStopWords(vietnamese_stopwords) #1

count_vec = CountVectorizer(inputCol='stop_tokens',outputCol='c_vec')  #2

idf = IDF(inputCol="c_vec", outputCol="tf_idf")  #3

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector

clean_up = VectorAssembler(inputCols=['tf_idf'],outputCol='features') #4

In [None]:
class_to_num = StringIndexer(inputCol='sentiment',outputCol='label') #5

### Pipeline

In [None]:
from pyspark.ml import Pipeline

In [None]:
data_prep_pipe = Pipeline(stages=[vietnamese_stopwords_remover,
                                  count_vec, idf, clean_up, class_to_num])

In [None]:
cleaner = data_prep_pipe.fit(data)

In [None]:
clean_data = cleaner.transform(data)

In [None]:
clean_data.show(20)

+--------------------+------+---------+------------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|  noi_dung_binh_luan|so_sao|sentiment|lower_noi_dung_binh_luan|          token_text|         stop_tokens|               c_vec|              tf_idf|            features|label|
+--------------------+------+---------+------------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|SỬ DỤNG DỄ DÀNG, ...|     5| positive|    sử dụng dễ dàng, ...|[sử_dụng, dễ_dàng...|[dễ_dàng, ,, thoả...|(7227,[0,1,22,27,...|(7227,[0,1,22,27,...|(7227,[0,1,22,27,...|  0.0|
|Sử dụng dễ dãng,r...|     5| positive|    sử dụng dễ dãng,r...|[sử_dụng, dễ, dãn...|[dãng, ,, thoải_m...|(7227,[0,22,28,30...|(7227,[0,22,28,30...|(7227,[0,22,28,30...|  0.0|
|Mình rất thích ha...|     5| positive|    mình rất thích ha...|[mình, rất, thích...|[thích, va_sp, tẩ...|(7227,[14,18,2

In [None]:
# 0:positive, 1:negative, 2:neutral
clean_data.groupBy("label").count().show()

+-----+-----+
|label|count|
+-----+-----+
|  0.0|18634|
|  1.0| 1045|
|  2.0|  995|
+-----+-----+



In [None]:
clean_data = clean_data.select(['label','features'])

In [None]:
(training,testing) = clean_data.randomSplit([0.7,0.3])

In [None]:
training.groupBy("label").count().show()

+-----+-----+
|label|count|
+-----+-----+
|  0.0|13056|
|  1.0|  726|
|  2.0|  719|
+-----+-----+



In [None]:
testing.groupBy("label").count().show()

+-----+-----+
|label|count|
+-----+-----+
|  0.0| 5578|
|  1.0|  319|
|  2.0|  276|
+-----+-----+



### Modeling

###
- Logistic Regression

In [None]:
import time
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import LogisticRegression

In [None]:
lg = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)

In [None]:
# Measure training time
start_time = time.time()
predictor_lg = lg.fit(training)
end_time = time.time()

# Calculate training time
training_time_log = end_time - start_time

# Print training time
print(f"Training time: {training_time_log:.2f} seconds")

Training time: 49.50 seconds


In [None]:
test_results_lg = predictor_lg.transform(testing)

In [None]:
test_results_lg.groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|       0.0| 5917|
|       1.0|  143|
|       2.0|  113|
+----------+-----+



In [None]:
# Create a confusion matrix
test_results_lg.groupBy('label', 'prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  2.0|       0.0|  167|
|  1.0|       1.0|  140|
|  1.0|       0.0|  174|
|  2.0|       2.0|  106|
|  2.0|       1.0|    3|
|  1.0|       2.0|    5|
|  0.0|       0.0| 5576|
|  0.0|       2.0|    2|
+-----+----------+-----+



In [None]:
acc_eval = MulticlassClassificationEvaluator()
acc_lg = acc_eval.evaluate(test_results_lg)
print("Accuracy of model at predicting Logistic Regression: {}".format(acc_lg))

Accuracy of model at predicting Logistic Regression: 0.9323355753318717


In [None]:
print("Before resampling data")
# Multiclass evaluator for precision, recall, F1-score, and accuracy
multi_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

accuracy = multi_evaluator.evaluate(test_results_lg, {multi_evaluator.metricName: "accuracy"})
precision = multi_evaluator.evaluate(test_results_lg, {multi_evaluator.metricName: "weightedPrecision"})
recall = multi_evaluator.evaluate(test_results_lg, {multi_evaluator.metricName: "weightedRecall"})
f1_score = multi_evaluator.evaluate(test_results_lg, {multi_evaluator.metricName: "f1"})

# Display metrics
print(f"LR predicting Accuracy: {accuracy:.2f}")
print(f"LR predicting Precision: {precision:.2f}")
print(f"LR predicting Recall: {recall:.2f}")
print(f"LR predicting F1 Score: {f1_score:.2f}")

Before resampling data
LR predicting Accuracy: 0.94
LR predicting Precision: 0.94
LR predicting Recall: 0.94
LR predicting F1 Score: 0.93


###
- Naive Bayes

In [None]:
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes()

In [None]:
# Measure training time
start_time = time.time()
predictor_nb = nb.fit(training)
end_time = time.time()

# Calculate training time
training_time_nb = end_time - start_time

# Print training time
print(f"Training time: {training_time_nb:.2f} seconds")

Training time: 24.81 seconds


In [None]:
test_results_nb = predictor_nb.transform(testing)

In [None]:
test_results_nb.groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|       0.0| 5121|
|       1.0|  571|
|       2.0|  481|
+----------+-----+



In [None]:
test_results_nb.groupBy('label', 'prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  2.0|       0.0|   58|
|  1.0|       1.0|  268|
|  0.0|       1.0|  260|
|  1.0|       0.0|   21|
|  2.0|       2.0|  175|
|  2.0|       1.0|   43|
|  1.0|       2.0|   30|
|  0.0|       0.0| 5042|
|  0.0|       2.0|  276|
+-----+----------+-----+



In [None]:
acc_eval = MulticlassClassificationEvaluator()
acc_nb = acc_eval.evaluate(test_results_nb)
print("Accuracy of model at predicting: {}".format(acc_nb))

Accuracy of model at predicting: 0.9034652839799525


In [None]:
print("Before resampling data")
# Multiclass evaluator for precision, recall, F1-score, and accuracy
multi_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

accuracy = multi_evaluator.evaluate(test_results_nb, {multi_evaluator.metricName: "accuracy"})
precision = multi_evaluator.evaluate(test_results_nb, {multi_evaluator.metricName: "weightedPrecision"})
recall = multi_evaluator.evaluate(test_results_nb, {multi_evaluator.metricName: "weightedRecall"})
f1_score = multi_evaluator.evaluate(test_results_nb, {multi_evaluator.metricName: "f1"})

# Display metrics
print(f"NB predicting Accuracy: {accuracy:.2f}")
print(f"NB predicting Precision: {precision:.2f}")
print(f"NB predicting Recall: {recall:.2f}")
print(f"NB predicting F1 Score: {f1_score:.2f}")

Before resampling data
NB predicting Accuracy: 0.89
NB predicting Precision: 0.93
NB predicting Recall: 0.89
NB predicting F1 Score: 0.90


###
- RandomForest

In [None]:
from pyspark.ml.classification import RandomForestClassifier

In [None]:
rf = RandomForestClassifier(labelCol="label", \
                            featuresCol="features", \
                            numTrees = 50, \
                            maxDepth = 5, \
                            maxBins = 64)

In [None]:
# Measure training time
start_time = time.time()
predictor_rf = rf.fit(training)
end_time = time.time()

# Calculate training time
training_time_rf = end_time - start_time

# Print training time
print(f"Training time: {training_time_rf:.2f} seconds")

Training time: 100.09 seconds


In [None]:
test_results_rf = predictor_rf.transform(testing)

In [None]:
test_results_rf.groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|       0.0| 6173|
+----------+-----+



In [None]:
# Create a confusion matrix
test_results_rf.groupBy('label', 'prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  2.0|       0.0|  276|
|  1.0|       0.0|  319|
|  0.0|       0.0| 5578|
+-----+----------+-----+



In [None]:
acc_eval = MulticlassClassificationEvaluator()
acc_rf = acc_eval.evaluate(test_results_rf)
print("Accuracy of model at predicting: {}".format(acc_rf))

Accuracy of model at predicting: 0.8578590007463993


In [None]:
print("Before resampling data")
# Multiclass evaluator for precision, recall, F1-score, and accuracy
multi_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

accuracy = multi_evaluator.evaluate(test_results_rf, {multi_evaluator.metricName: "accuracy"})
precision = multi_evaluator.evaluate(test_results_rf, {multi_evaluator.metricName: "weightedPrecision"})
recall = multi_evaluator.evaluate(test_results_rf, {multi_evaluator.metricName: "weightedRecall"})
f1_score = multi_evaluator.evaluate(test_results_rf, {multi_evaluator.metricName: "f1"})

# Display metrics
print(f"Random Forest predicting Accuracy: {accuracy:.2f}")
print(f"Random Forest predicting Precision: {precision:.2f}")
print(f"Random Forest predicting Recall: {recall:.2f}")
print(f"Random Forest predicting F1 Score: {f1_score:.2f}")

Before resampling data
Random Forest predicting Accuracy: 0.90
Random Forest predicting Precision: 0.82
Random Forest predicting Recall: 0.90
Random Forest predicting F1 Score: 0.86


## Need to resample data

In [None]:
positive_df = training.filter(col("label") == 0)
negative_df = training.filter(col("label") == 1)
neutral_df = training.filter(col("label") == 2)
ratio_1 = int(positive_df.count()/negative_df.count())
ratio_2 = int(positive_df.count()/neutral_df.count())
print("ratio like/neutral: {}".format(ratio_1))
print("ratio like/not_like: {}".format(ratio_2))

ratio like/neutral: 17
ratio like/not_like: 18


In [None]:
# ratio1 = (ratio_1 -1)/2
# ratio2 = ratio_2/2

In [None]:
# resample negative
a1 = range(ratio_1)
# duplicate the minority rows
oversampled_negative_df = negative_df.withColumn("dummy",
                                                explode(array([lit(x) for x in a1])))\
                                                .drop('dummy')
# combine both oversampled minority rows and previous majority rows
combined_df = positive_df.unionAll(oversampled_negative_df)
combined_df.show(5)

+-----+------------+
|label|    features|
+-----+------------+
|  0.0|(7227,[],[])|
|  0.0|(7227,[],[])|
|  0.0|(7227,[],[])|
|  0.0|(7227,[],[])|
|  0.0|(7227,[],[])|
+-----+------------+
only showing top 5 rows



In [None]:
combined_df.groupBy("label").count().show()

+-----+-----+
|label|count|
+-----+-----+
|  0.0|13056|
|  1.0|12342|
+-----+-----+



In [None]:
# resample neutral
a2 = range(ratio_2)
# duplicate the minority rows
oversampled_neutral_df = neutral_df.withColumn("dummy",
                                                explode(array([lit(x) for x in a2])))\
                                                .drop('dummy')
# combine both oversampled minority rows and previous majority rows
combined_df = combined_df.unionAll(oversampled_neutral_df)
combined_df.show(5)

+-----+------------+
|label|    features|
+-----+------------+
|  0.0|(7227,[],[])|
|  0.0|(7227,[],[])|
|  0.0|(7227,[],[])|
|  0.0|(7227,[],[])|
|  0.0|(7227,[],[])|
+-----+------------+
only showing top 5 rows



In [None]:
combined_df.groupBy("label").count().show()

+-----+-----+
|label|count|
+-----+-----+
|  0.0|13056|
|  1.0|12342|
|  2.0|12942|
+-----+-----+



### Logistic Regression

In [None]:
predictor_lg1 = lg.fit(combined_df)

In [None]:
test_results_lg1 = predictor_lg1.transform(testing)

In [None]:
test_results_lg1.groupBy('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|  0.0| 5578|
|  1.0|  319|
|  2.0|  276|
+-----+-----+



In [None]:
test_results_lg1.groupBy('label', 'prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  2.0|       0.0|   50|
|  1.0|       1.0|  278|
|  0.0|       1.0|  143|
|  1.0|       0.0|   24|
|  2.0|       2.0|  206|
|  2.0|       1.0|   20|
|  1.0|       2.0|   17|
|  0.0|       0.0| 5238|
|  0.0|       2.0|  197|
+-----+----------+-----+



In [None]:
acc_eval = MulticlassClassificationEvaluator()
acc_lg1 = acc_eval.evaluate(test_results_lg1)
print("Accuracy of model at predicting: {}".format(acc_lg1))

Accuracy of model at predicting: 0.9335326091378984


In [None]:
print("After resampling data")
# Multiclass evaluator for precision, recall, F1-score, and accuracy
multi_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

accuracy = multi_evaluator.evaluate(test_results_lg1, {multi_evaluator.metricName: "accuracy"})
precision = multi_evaluator.evaluate(test_results_lg1, {multi_evaluator.metricName: "weightedPrecision"})
recall = multi_evaluator.evaluate(test_results_lg1, {multi_evaluator.metricName: "weightedRecall"})
f1_score = multi_evaluator.evaluate(test_results_lg1, {multi_evaluator.metricName: "f1"})

# Display metrics
print(f"Logistic Regression predicting Accuracy: {accuracy:.2f}")
print(f"Logistic Regression predicting Precision: {precision:.2f}")
print(f"Logistic Regression predicting Recall: {recall:.2f}")
print(f"Logistic Regression predicting F1 Score: {f1_score:.2f}")

After resampling data
Logistic Regression predicting Accuracy: 0.93
Logistic Regression predicting Precision: 0.95
Logistic Regression predicting Recall: 0.93
Logistic Regression predicting F1 Score: 0.93


### Random Forest

In [None]:
predictor_rf1 = rf.fit(combined_df)

In [None]:
test_result_rf1 = predictor_rf1.transform(testing)

In [None]:
test_result_rf1.groupBy('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|  0.0| 5578|
|  1.0|  319|
|  2.0|  276|
+-----+-----+



In [None]:
test_result_rf1.groupBy('label', 'prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  2.0|       0.0|  109|
|  1.0|       1.0|  193|
|  0.0|       1.0|  225|
|  1.0|       0.0|   87|
|  2.0|       2.0|  133|
|  2.0|       1.0|   34|
|  1.0|       2.0|   39|
|  0.0|       0.0| 4896|
|  0.0|       2.0|  457|
+-----+----------+-----+



In [None]:
acc_eval = MulticlassClassificationEvaluator()
acc_rf1 = acc_eval.evaluate(test_result_rf1)
print("Accuracy of model at predicting: {}".format(acc_rf1))

Accuracy of model at predicting: 0.8682705066055129


In [None]:
print("After resampling data")
# Multiclass evaluator for precision, recall, F1-score, and accuracy
multi_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

accuracy = multi_evaluator.evaluate(test_result_rf1, {multi_evaluator.metricName: "accuracy"})
precision = multi_evaluator.evaluate(test_result_rf1, {multi_evaluator.metricName: "weightedPrecision"})
recall = multi_evaluator.evaluate(test_result_rf1, {multi_evaluator.metricName: "weightedRecall"})
f1_score = multi_evaluator.evaluate(test_result_rf1, {multi_evaluator.metricName: "f1"})

# Display metrics
print(f"Random Forest predicting Accuracy: {accuracy:.2f}")
print(f"Random Forest predicting Precision: {precision:.2f}")
print(f"Random Forest predicting Recall: {recall:.2f}")
print(f"Random Forest predicting F1 Score: {f1_score:.2f}")

After resampling data
Random Forest predicting Accuracy: 0.85
Random Forest predicting Precision: 0.90
Random Forest predicting Recall: 0.85
Random Forest predicting F1 Score: 0.87


### Naive Bayer

In [None]:
from pyspark.ml.classification import NaiveBayes

nb = NaiveBayes()

In [None]:
predictor_nb1 = nb.fit(combined_df)

In [None]:
test_results_nb1 = predictor_nb1.transform(testing)

In [None]:
test_results_nb1.groupBy('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|  0.0| 5578|
|  1.0|  319|
|  2.0|  276|
+-----+-----+



In [None]:
test_results_nb1.groupBy('label', 'prediction').count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  2.0|       0.0|   50|
|  1.0|       1.0|  268|
|  0.0|       1.0|  231|
|  1.0|       0.0|   27|
|  2.0|       2.0|  186|
|  2.0|       1.0|   40|
|  1.0|       2.0|   24|
|  0.0|       0.0| 5113|
|  0.0|       2.0|  234|
+-----+----------+-----+



In [None]:
acc_eval = MulticlassClassificationEvaluator()
acc_nb1 = acc_eval.evaluate(test_results_nb1)
print("Accuracy of model at predicting: {}".format(acc_nb1))

Accuracy of model at predicting: 0.9135132338313691


In [None]:
print("After resampling data")
# Multiclass evaluator for precision, recall, F1-score, and accuracy
multi_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

accuracy = multi_evaluator.evaluate(test_results_nb1, {multi_evaluator.metricName: "accuracy"})
precision = multi_evaluator.evaluate(test_results_nb1, {multi_evaluator.metricName: "weightedPrecision"})
recall = multi_evaluator.evaluate(test_results_nb1, {multi_evaluator.metricName: "weightedRecall"})
f1_score = multi_evaluator.evaluate(test_results_nb1, {multi_evaluator.metricName: "f1"})

# Display metrics
print(f"Naive Bayes predicting Accuracy: {accuracy:.2f}")
print(f"Naive Bayes predicting Precision: {precision:.2f}")
print(f"Naive Bayes predicting Recall: {recall:.2f}")
print(f"Naive Bayes predicting F1 Score: {f1_score:.2f}")

After resampling data
Naive Bayes predicting Accuracy: 0.90
Naive Bayes predicting Precision: 0.93
Naive Bayes predicting Recall: 0.90
Naive Bayes predicting F1 Score: 0.91


In [None]:
print(f"Training time Logistic Regression: {training_time_log:.2f} seconds")
print(f"Training time Naive Bayes: {training_time_nb:.2f} seconds")
print(f"Training time Random Forest: {training_time_rf:.2f} seconds")
print("----------------")
print("Accuracy of model at Logistic Regression predicting: {}".format(acc_lg))
print("Accuracy of model at Naive Bayes predicting: {}".format(acc_nb))
print("Accuracy of model at Random Forest predicting: {}".format(acc_rf))
print("----------------")

print("After resampling data")
print("Accuracy of model at Logistic Regression predicting: {}".format(acc_lg1))
print("Accuracy of model at Naive Bayes predicting: {}".format(acc_nb1))
print("Accuracy of model at Random Forest predicting: {}".format(acc_rf1))

Training time Logistic Regression: 49.50 seconds
Training time Naive Bayes: 24.81 seconds
Training time Random Forest: 100.09 seconds
----------------
Accuracy of model at Logistic Regression predicting: 0.9323355753318717
Accuracy of model at Naive Bayes predicting: 0.9034652839799525
Accuracy of model at Random Forest predicting: 0.8578590007463993
----------------
After resampling data
Accuracy of model at Logistic Regression predicting: 0.9335326091378984
Accuracy of model at Naive Bayes predicting: 0.9135132338313691
Accuracy of model at Random Forest predicting: 0.8682705066055129
