In [378]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml.classification import NaiveBayes, LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType
from emoji import replace_emoji
import re
import logging
from time import time
from datetime import datetime
import os

In [379]:
log_dir = "logs"
os.makedirs(log_dir, exist_ok=True)
timestamp = datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
log_file = os.path.join(log_dir, f"SentimentAnalysis_{timestamp}.log")
open(log_file, "w")

logging.basicConfig(filename=log_file,
                    format='%(asctime)s - %(message)s',
                    filemode="w",
                    level=logging.INFO)
logger = logging.getLogger()

In [380]:
spark = SparkSession.builder \
    .appName("Sentiment Analysis") \
    .master("local[*]") \
    .config("spark.driver.memory", "4G") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .getOrCreate()

In [381]:
train_df = spark.read.option("header", "true").csv("./dataset/train.csv")
train_df = train_df.withColumn("label", (col("target") / 4).cast("int"))

In [382]:
def clean_text(text):
    if text is None: return ""

    text = replace_emoji(text, "")
    text = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", text)

    return text.lower().strip()

clean_udf = udf(clean_text, StringType())

train_df = train_df.withColumn("clean_text", clean_udf(col("text")))
train_df = train_df.cache()

In [383]:
tokenizer = RegexTokenizer() \
    .setInputCol("clean_text") \
    .setOutputCol("tokens") \
    .setPattern("\\W+") \
    .setGaps(True)

remover = StopWordsRemover() \
    .setInputCol("tokens") \
    .setOutputCol("filtered_text")

hashingTF = HashingTF() \
    .setInputCol("filtered_text") \
    .setOutputCol("rawFeatures") \
    .setNumFeatures(5000)

idf = IDF() \
    .setInputCol("rawFeatures") \
    .setOutputCol("features")

In [384]:
preprocessing_start = time()

preprocessing_pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf])
model = preprocessing_pipeline.fit(train_df)
train_df = model.transform(train_df)

preprocessing_time = time() - preprocessing_start
logger.info(f"Preprocessing Time: {preprocessing_time:.2f}s")

INFO:root:Preprocessing Time: 26.28s                                            


In [385]:
test_df = spark.read.option("header", "true").csv("./dataset/test.csv")
test_df = test_df.filter(test_df.target != "2")
test_df = test_df.withColumn("clean_text", clean_udf(col("text")))
test_df = test_df.withColumn("label", (col("target") / 4).cast("int"))
test_df = model.transform(test_df)
test_df = test_df.cache()

In [386]:
nb_start = time()

nb = NaiveBayes(featuresCol="features", labelCol="label", modelType="multinomial")
nb_model = nb.fit(train_df)
predictions = nb_model.transform(test_df)
accuracy_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = accuracy_evaluator.evaluate(predictions)
f1_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
f1 = f1_evaluator.evaluate(predictions)

nb_time = time() - nb_start
logger.info(f"Naive Bayes Time: {nb_time:.2f}s - Accuracy: {accuracy:.2f} - F1 Score: {f1:.2f}")

INFO:root:Naive Bayes Time: 13.00s - Accuracy: 0.81 - F1 Score: 0.80            


In [387]:
lr_start = time()

lr = LogisticRegression(featuresCol="features", labelCol="label")
lr_model = lr.fit(train_df)
predictions = lr_model.transform(test_df)
accuracy_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = accuracy_evaluator.evaluate(predictions)
f1_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
f1 = f1_evaluator.evaluate(predictions)

lr_time = time() - lr_start
logger.info(f"Logistic Regression Time: {lr_time:.2f}s - Accuracy: {accuracy:.2f} - F1 Score: {f1:.2f}")

INFO:root:Logistic Regression Time: 24.09s - Accuracy: 0.81 - F1 Score: 0.81    


In [388]:
rf_start = time()

rf = RandomForestClassifier(featuresCol="features", labelCol="label", numTrees=20, maxDepth=5)
rfModel = rf.fit(train_df)
predictions = rfModel.transform(test_df)
accuracy_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = accuracy_evaluator.evaluate(predictions)
f1_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
f1 = f1_evaluator.evaluate(predictions)

rf_time = time() - rf_start
logger.info(f"Random Forest Time: {rf_time:.2f}s - Accuracy: {accuracy:.2f} - F1 Score: {f1:.2f}")

25/05/08 14:23:10 WARN MemoryStore: Not enough space to cache rdd_253_1 in memory! (computed 222.8 MiB so far)
25/05/08 14:23:10 WARN BlockManager: Persisting block rdd_253_1 to disk instead.
25/05/08 14:23:10 WARN MemoryStore: Not enough space to cache rdd_253_6 in memory! (computed 222.8 MiB so far)
25/05/08 14:23:10 WARN BlockManager: Persisting block rdd_253_6 to disk instead.
25/05/08 14:23:10 WARN MemoryStore: Not enough space to cache rdd_253_4 in memory! (computed 222.8 MiB so far)
25/05/08 14:23:10 WARN BlockManager: Persisting block rdd_253_4 to disk instead.
25/05/08 14:23:10 WARN MemoryStore: Not enough space to cache rdd_253_2 in memory! (computed 222.8 MiB so far)
25/05/08 14:23:10 WARN BlockManager: Persisting block rdd_253_2 to disk instead.
25/05/08 14:23:10 WARN MemoryStore: Not enough space to cache rdd_253_5 in memory! (computed 222.8 MiB so far)
25/05/08 14:23:10 WARN BlockManager: Persisting block rdd_253_5 to disk instead.
25/05/08 14:23:10 WARN MemoryStore: Not 

In [None]:
spark.stop()

25/05/08 14:24:19 WARN BlockManager: Block rdd_253_5 could not be removed as it was not found on disk or in memory
25/05/08 14:24:19 WARN BlockManager: Block rdd_253_1 could not be removed as it was not found on disk or in memory
25/05/08 14:24:19 WARN BlockManager: Block rdd_253_0 could not be removed as it was not found on disk or in memory


25/05/08 14:24:19 WARN BlockManager: Block rdd_253_2 was not removed normally.
java.util.concurrent.RejectedExecutionException: Task scala.concurrent.impl.CallbackRunnable@3a81d62 rejected from java.util.concurrent.ThreadPoolExecutor@d6fe367[Shutting down, pool size = 1, active threads = 1, queued tasks = 0, completed tasks = 495]
	at java.base/java.util.concurrent.ThreadPoolExecutor$AbortPolicy.rejectedExecution(ThreadPoolExecutor.java:2055)
	at java.base/java.util.concurrent.ThreadPoolExecutor.reject(ThreadPoolExecutor.java:825)
	at java.base/java.util.concurrent.ThreadPoolExecutor.execute(ThreadPoolExecutor.java:1355)
	at scala.concurrent.impl.ExecutionContextImpl$$anon$4.execute(ExecutionContextImpl.scala:138)
	at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:72)
	at scala.concurrent.impl.Promise$DefaultPromise.$anonfun$tryComplete$1(Promise.scala:288)
	at scala.concurrent.impl.Promise$DefaultPromise.$anonfun$tryComplete$1$adapted(Promise.scala:288)
	at scal