In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install emoji
import emoji
import pandas as pd
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql import functions as f
from IPython.display import display, clear_output
import pandas as pd
from pyspark.ml import PipelineModel
from pyspark.sql.functions import udf
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

Collecting emoji
  Downloading emoji-1.4.1.tar.gz (185 kB)
[?25l[K     |█▊                              | 10 kB 36.5 MB/s eta 0:00:01[K     |███▌                            | 20 kB 33.3 MB/s eta 0:00:01[K     |█████▎                          | 30 kB 19.5 MB/s eta 0:00:01[K     |███████                         | 40 kB 16.3 MB/s eta 0:00:01[K     |████████▉                       | 51 kB 8.7 MB/s eta 0:00:01[K     |██████████▋                     | 61 kB 9.0 MB/s eta 0:00:01[K     |████████████▍                   | 71 kB 7.9 MB/s eta 0:00:01[K     |██████████████▏                 | 81 kB 8.7 MB/s eta 0:00:01[K     |████████████████                | 92 kB 9.5 MB/s eta 0:00:01[K     |█████████████████▊              | 102 kB 8.2 MB/s eta 0:00:01[K     |███████████████████▌            | 112 kB 8.2 MB/s eta 0:00:01[K     |█████████████████████▎          | 122 kB 8.2 MB/s eta 0:00:01[K     |███████████████████████         | 133 kB 8.2 MB/s eta 0:00:01[K     |███████

In [None]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from sklearn.metrics import classification_report, accuracy_score

In [None]:
import sparknlp
# `import sparknlp` alone does not expose the annotator classes; the pipeline
# cells below reference these names directly, so import them explicitly to keep
# the notebook runnable from a fresh kernel.
from sparknlp.base import DocumentAssembler, Finisher
from sparknlp.annotator import (
    BertSentenceEmbeddings,
    ClassifierDLApproach,
    DistilBertEmbeddings,
    LemmatizerModel,
    RoBertaEmbeddings,
    SentenceDetector,
    SentenceEmbeddings,
    Tokenizer,
    UniversalSentenceEncoder,
    WordEmbeddingsModel,
    XlmRoBertaEmbeddings,
)

# Start the Spark session through Spark NLP, requesting the GPU build.
spark = sparknlp.start(gpu=True)

# **Read Dataset**

In [None]:
# Both CSV files are read with the same explicit schema (no type inference):
# the tweet text and a float sentiment label.
schema = "text STRING, sentiment FLOAT"
train, test = (
    spark.read.csv(csv_path, header=True, schema=schema)
    for csv_path in (
        '/content/train_vaccine_tweet.csv',
        '/content/test_vaccine_tweet.csv',
    )
)

In [None]:
# Preview the raw rows of both splits; cell values longer than 100 characters are truncated.
train.show(truncate=100)
test.show(truncate=100)

+----------------------------------------------------------------------------------------------------+---------+
|                                                                                                text|sentiment|
+----------------------------------------------------------------------------------------------------+---------+
|@latingle Too bad they didn't order #Moderna or #JohnsonAndJohnsonVaccine  FFS! Incompetent noobs...|      0.0|
|Yes @DollyParton ❤️❤️❤️❤️🥰🥰🥰. AND THANK YOU! #Moderna   Dolly Parton, 75, receives Moderna COV...|      2.0|
| I wish anyone taking the mRNA vaccines good luck  You’ll dearly need it  #vaccine #Pfizer #Moderna |      2.0|
|@BBCWorld New crime against humanity :Israeli occupation banned the entry of 2000 shots of #Sputn...|      0.0|
|                                                                      @BDUTT #Covaxin airlifted?!? ?|      1.0|
|UPDATE | Brazil grateful to Russia for allowing Anvisa to inspect #SputnikV production, ambassado.

In [None]:
# Print the training schema, then the count of sentiment values per label.
train.printSchema()
train.groupBy("sentiment").agg(f.count("sentiment")).show()

root
 |-- text: string (nullable = true)
 |-- sentiment: float (nullable = true)

+---------+----------------+
|sentiment|count(sentiment)|
+---------+----------------+
|      2.0|            3150|
|      1.0|            4320|
|      0.0|            1530|
+---------+----------------+



In [None]:
# Print the test schema, then the count of sentiment values per label.
test.printSchema()
test.groupBy("sentiment").agg(f.count("sentiment")).show()

root
 |-- text: string (nullable = true)
 |-- sentiment: float (nullable = true)

+---------+----------------+
|sentiment|count(sentiment)|
+---------+----------------+
|      2.0|             350|
|      1.0|             480|
|      0.0|             170|
+---------+----------------+



# **Preprocessing Text**

In [None]:
def emoji2text(text):
    """Replace every emoji in ``text`` with its textual name.

    The default ``:name:`` delimiters are swapped for single spaces so the
    emoji name reads as ordinary words.

    Args:
        text: The tweet text, or None when the source cell is null.

    Returns:
        The demojized string, or None when ``text`` is None.
    """
    # A null cell reaches a Python UDF as None, and demojize cannot handle it.
    if text is None:
        return None
    return emoji.demojize(text, delimiters=(" ", " "))

# Register emoji2text as a Spark UDF whose result type is a string column.
udf_emoji2text = udf(emoji2text,StringType())

In [None]:
# These patterns are passed to Spark's regexp_replace, which uses Java regex syntax.
user_regex = r"(@\w{1,15})"
# Raw string: "\w" inside a plain literal is an invalid Python escape sequence.
hashtag_replace_regex = r"#(\w{1,})"
url_regex = r"(https?:\/\/\S+|www\.\S+)"
# The dot before the top-level domain is escaped so it only matches a literal ".".
email_regex = r"[\w.-]+@[\w.-]+\.[a-zA-Z]{1,}"
# Word boundaries stop the retweet marker from being cut out of words like "SMART".
RT_regex = r"\bRT\b"

def cleaning_process(data):
    """Clean the ``text`` column of a Spark DataFrame of tweets.

    Steps, in order: remove URLs, e-mail addresses and @mentions, drop the
    "#" from hashtags, remove the "RT" marker, convert emoji to their names,
    replace every non-letter with a space, collapse repeated spaces, trim,
    lower-case, and finally drop rows whose text is empty.

    Args:
        data: DataFrame with a string column named ``text``.

    Returns:
        A DataFrame with the same columns, a cleaned ``text`` column, and
        rows with empty text removed.
    """
    text = f.col("text")
    data = (
        data
        # Remove URLs first so an "@" or "#" inside a link is never half-edited.
        .withColumn("text", f.regexp_replace(text, url_regex, ""))
        # Remove e-mail addresses before mentions; otherwise the mention
        # pattern eats "@domain" and the address can no longer be matched.
        .withColumn("text", f.regexp_replace(text, email_regex, ""))
        # Remove @mentions.
        .withColumn("text", f.regexp_replace(text, user_regex, ""))
        # Drop the "#" but keep the hashtag word.
        .withColumn("text", f.regexp_replace(text, hashtag_replace_regex, "$1"))
        # Remove the standalone retweet marker.
        .withColumn("text", f.regexp_replace(text, RT_regex, ""))
        # Convert emoji to text.
        .withColumn("text", udf_emoji2text(text))
        # Replace digits and every other non-letter character with a space.
        .withColumn("text", f.regexp_replace(text, "[^a-zA-Z]", " "))
        # Collapse runs of spaces.
        .withColumn("text", f.regexp_replace(text, " +", " "))
        # Strip leading and trailing spaces.
        .withColumn("text", f.trim(text))
        # Normalize to lower case.
        .withColumn("text", f.lower(text))
        # Keep only rows that still have some text.
        .filter(text != "")
    )
    return data

In [None]:
# Clean both splits. The names train/test are rebound to the cleaned frames,
# so the raw frames are no longer reachable after this cell.
train = cleaning_process(train)
test = cleaning_process(test)

In [None]:
# Preview the cleaned text of both splits (cells truncated at 100 characters).
train.show(truncate=100)
test.show(truncate=100)

+----------------------------------------------------------------------------------------------------+---------+
|                                                                                                text|sentiment|
+----------------------------------------------------------------------------------------------------+---------+
|   too bad they didn t order moderna or johnsonandjohnsonvaccine ffs incompetent noobs this means my|      0.0|
|yes red heart red heart red heart red heart smiling face with hearts smiling face with hearts smi...|      2.0|
|       i wish anyone taking the mrna vaccines good luck you ll dearly need it vaccine pfizer moderna|      2.0|
|      new crime against humanity israeli occupation banned the entry of shots of sputnikv vaccine to|      0.0|
|                                                                                   covaxin airlifted|      1.0|
|update brazil grateful to russia for allowing anvisa to inspect sputnikv production ambassador 

# **TF-IDF + Naive Bayes (✔️)**

In [None]:
# Spark NLP stages used by the TF-IDF pipelines:
# text -> document -> sentences -> token -> lemma -> token_features (array column).
document = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

sentence = (
    SentenceDetector()
    .setInputCols(['document'])
    .setOutputCol('sentences')
)

token = (
    Tokenizer()
    .setInputCols(["sentences"])
    .setOutputCol("token")
)

# Stop-word removal is currently disabled:
#stop_words = StopWordsCleaner.pretrained('stopwords_en', 'en').setInputCols(["token"]).setOutputCol("cleanTokens").setCaseSensitive(False)

# Pretrained English lemmatizer applied to the tokens.
lemmatizer = (
    LemmatizerModel.pretrained("lemma_antbnc", "en")
    .setInputCols(["token"])
    .setOutputCol("lemma")
)

# Expose the lemmas as an array column named token_features while keeping
# the annotation columns (clean-up is switched off).
finisher = (
    Finisher()
    .setInputCols(["lemma"])
    .setOutputCols("token_features")
    .setOutputAsArray(True)
    .setCleanAnnotations(False)
)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


In [None]:
# Hashed term frequencies computed from the lemma array.
hashtf = HashingTF(inputCol="token_features", outputCol="raw_features")
# Inverse document frequency; minDocFreq is set to 5.
idf = IDF(inputCol="raw_features", outputCol="features", minDocFreq=5)
# NOTE(review): smoothing=111 is far above the pyspark default of 1.0 — confirm
# this is intentional.
nv = NaiveBayes(labelCol="sentiment", featuresCol="features", smoothing=111)

# Full pipeline: Spark NLP preprocessing -> TF-IDF -> Naive Bayes (stop_words stage disabled).
nlp_pipeline_nv = Pipeline(
    stages=[document, sentence, token, lemmatizer, finisher, hashtf, idf, nv]
)

In [None]:
# Fit the Naive Bayes pipeline on the training split, then score both splits.
model_nv = nlp_pipeline_nv.fit(train)
predict_train = model_nv.transform(train)
predict_test = model_nv.transform(test)

In [None]:
# Collect labels and predictions to pandas and print the per-class report.
pred_test_rp = predict_test.select('sentiment', 'prediction').toPandas()
print(
    classification_report(
        pred_test_rp['sentiment'],
        pred_test_rp['prediction'],
        target_names=['Negative', 'Neutral', 'Positive'],
    )
)

              precision    recall  f1-score   support

    Negative       1.00      0.03      0.06       170
     Neutral       0.60      0.92      0.72       480
    Positive       0.80      0.58      0.67       350

    accuracy                           0.65      1000
   macro avg       0.80      0.51      0.48      1000
weighted avg       0.74      0.65      0.59      1000



# **TF-IDF + Logistic Regression (✔️)**

In [None]:
# Same TF-IDF features as the Naive Bayes section, re-created for this pipeline.
hashtf = HashingTF(inputCol="token_features", outputCol="raw_features")
idf = IDF(inputCol="raw_features", outputCol="features", minDocFreq=5)
# Logistic regression with maxIter=10 and regParam=0.01.
lr = LogisticRegression(maxIter=10, regParam=0.01, featuresCol="features", labelCol="sentiment")

# Full pipeline: Spark NLP preprocessing -> TF-IDF -> Logistic Regression (stop_words stage disabled).
nlp_pipeline_lr = Pipeline(
    stages=[document, sentence, token, lemmatizer, finisher, hashtf, idf, lr]
)

In [None]:
# Fit the Logistic Regression pipeline on the training split, then score both splits.
model_lr = nlp_pipeline_lr.fit(train)
predict_train = model_lr.transform(train)
predict_test = model_lr.transform(test)

In [None]:
# Collect labels and predictions to pandas and print the per-class report.
pred_test_rp = predict_test.select('sentiment', 'prediction').toPandas()
print(
    classification_report(
        pred_test_rp['sentiment'],
        pred_test_rp['prediction'],
        target_names=['Negative', 'Neutral', 'Positive'],
    )
)

              precision    recall  f1-score   support

    Negative       0.76      0.62      0.68       170
     Neutral       0.74      0.76      0.75       480
    Positive       0.71      0.75      0.73       350

    accuracy                           0.73      1000
   macro avg       0.74      0.71      0.72      1000
weighted avg       0.73      0.73      0.73      1000



# **Bert base uncased and ClassifierDLApproach**

In [None]:
# Stages for the sentence-BERT classifier: text -> document -> sentence embeddings -> label.
document = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Pretrained sentence-level BERT embeddings computed on the whole document.
embeddings = (
    BertSentenceEmbeddings.pretrained("sent_bert_base_uncased", "en")
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
)

# Classifier trained on the embeddings; max epochs 10, batch size 32, output logs on.
classifierdl = (
    ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("prediction")
    .setLabelColumn("sentiment")
    .setMaxEpochs(10)
    .setBatchSize(32)
    .setEnableOutputLogs(True)
)

sent_bert_base_uncased download started this may take some time.
Approximate size to download 392.5 MB
[OK!]


In [None]:
# Assemble the three BERT stages into one pipeline.
nlp_pipeline_BERT = Pipeline(stages=[document, embeddings, classifierdl])

In [None]:
# Fit the sentence-BERT pipeline on the training split and score the test split.
# NOTE(review): the variable name mentions "labse", but the pipeline above loads
# sent_bert_base_uncased — confirm the name is just a leftover.
model_BERT_labse = nlp_pipeline_BERT.fit(train)

predict_test = model_BERT_labse.transform(test)

In [None]:
# Preview the scored test rows, including the annotation columns.
predict_test.show()

+--------------------+---------+--------------------+--------------------+--------------------+
|                text|sentiment|            document| sentence_embeddings|          prediction|
+--------------------+---------+--------------------+--------------------+--------------------+
|pfizerbiontech to...|      0.0|[{document, 0, 97...|[{sentence_embedd...|[{category, 0, 97...|
|mom is doing pani...|      0.0|[{document, 0, 99...|[{sentence_embedd...|[{category, 0, 99...|
|today is my birth...|      1.0|[{document, 0, 14...|[{sentence_embedd...|[{category, 0, 14...|
|omi hospital kara...|      1.0|[{document, 0, 55...|[{sentence_embedd...|[{category, 0, 55...|
|our beloved pm sh...|      2.0|[{document, 0, 11...|[{sentence_embedd...|[{category, 0, 11...|
|super happy to ge...|      2.0|[{document, 0, 94...|[{sentence_embedd...|[{category, 0, 94...|
|covaxin s phase t...|      1.0|[{document, 0, 83...|[{sentence_embedd...|[{category, 0, 83...|
|dose at terapanth...|      1.0|[{docume

In [None]:
# prediction.result is an array of label strings; take its first element and
# cast it to double so it can be compared with the numeric sentiment column.
predict_test = (
    predict_test
    .select('text', 'sentiment', 'prediction.result')
    .withColumn('prediction', f.col('result').getItem(0).cast("double"))
)
predict_test.printSchema()

root
 |-- text: string (nullable = true)
 |-- sentiment: float (nullable = true)
 |-- result: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- prediction: double (nullable = true)



In [None]:
# Collect labels and predictions to pandas and print the per-class report.
pred_test_rp = predict_test.select('sentiment', 'prediction').toPandas()
print(
    classification_report(
        pred_test_rp['sentiment'],
        pred_test_rp['prediction'],
        target_names=['Negative', 'Neutral', 'Positive'],
    )
)

              precision    recall  f1-score   support

    Negative       0.00      0.00      0.00       170
     Neutral       0.00      0.00      0.00       480
    Positive       0.35      1.00      0.52       350

    accuracy                           0.35      1000
   macro avg       0.12      0.33      0.17      1000
weighted avg       0.12      0.35      0.18      1000



  _warn_prf(average, modifier, msg_start, len(result))


Macro-averaged precision, recall and F1 on the test set:
- epoch = 1: 0.12, 0.33, 0.17
- epoch = 5: 0.12, 0.33, 0.17
- epoch = 10, batchsize = 32: 0.12, 0.33, 0.17

# **Universal Sentence Encoder and ClassifierDLApproach (✔️)**
https://nlp.johnsnowlabs.com/2020/04/17/tfhub_use.html

In [None]:
# Stages for the Universal Sentence Encoder classifier:
# text -> document -> sentence embeddings -> label.
document = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Pretrained TF-Hub Universal Sentence Encoder applied to the whole document.
embeddings = (
    UniversalSentenceEncoder.pretrained("tfhub_use", "en")
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
)

# Classifier trained on the embeddings; batch size 64, max epochs 10, lr 0.005, output logs on.
classifierdl = (
    ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("prediction")
    .setLabelColumn("sentiment")
    .setBatchSize(64)
    .setMaxEpochs(10)
    .setLr(0.005)
    .setEnableOutputLogs(True)
)

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]


In [None]:
# Assemble the three USE stages into one pipeline.
nlp_pipeline_USE = Pipeline(stages=[document, embeddings, classifierdl])

In [None]:
# Fit the USE pipeline on the training split and score the test split.
model_USE = nlp_pipeline_USE.fit(train)

predict_test = model_USE.transform(test)

In [None]:
# Preview the scored test rows (cells truncated at 100 characters).
predict_test.show(truncate=100)
# Uncomment to inspect the raw sentence embeddings:
#predict_test.select("sentence_embeddings").show(truncate=False)

+----------------------------------------------------------------------------------------------------+---------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+
|                                                                                                text|sentiment|                                                                                            document|                                                                                 sentence_embeddings|                                                                                          prediction|
+----------------------------------------------------------------------------------------------------+---------+--------------------------------------------------------

In [None]:
# prediction.result is an array of label strings; take its first element and
# cast it to double so it can be compared with the numeric sentiment column.
predict_test = (
    predict_test
    .select('text', 'sentiment', 'prediction.result')
    .withColumn('prediction', f.col('result').getItem(0).cast("double"))
)
predict_test.printSchema()

root
 |-- text: string (nullable = true)
 |-- sentiment: float (nullable = true)
 |-- result: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- prediction: double (nullable = true)



In [None]:
# Collect labels and predictions to pandas and print the per-class report.
pred_test_rp = predict_test.select('sentiment', 'prediction').toPandas()
print(
    classification_report(
        pred_test_rp['sentiment'],
        pred_test_rp['prediction'],
        target_names=['Negative', 'Neutral', 'Positive'],
    )
)

              precision    recall  f1-score   support

    Negative       0.72      0.59      0.65       170
     Neutral       0.72      0.82      0.77       480
    Positive       0.77      0.70      0.73       350

    accuracy                           0.74      1000
   macro avg       0.74      0.70      0.72      1000
weighted avg       0.74      0.74      0.74      1000



Macro-averaged precision, recall and F1 (%) on the test set:
- epoch = 1: 68, 64, 65
- epoch = 5: 72, 71, 71
- epoch = 10: 74, 70, 72

# **GloVe840B and ClassifierDLApproach (✔️)**
https://nlp.johnsnowlabs.com/2020/01/22/glove_840B_300.html

In [None]:
# Stages for the GloVe classifier:
# text -> document -> sentences -> token -> lemma -> word embeddings
# -> averaged sentence embeddings -> label.
document = DocumentAssembler() \
          .setInputCol("text") \
          .setOutputCol("document")

sentence = SentenceDetector() \
          .setInputCols(['document']) \
          .setOutputCol('sentences')

token = Tokenizer() \
        .setInputCols(["sentences"]) \
        .setOutputCol("token")

# Stop-word removal is currently disabled:
#stop_words = StopWordsCleaner.pretrained('stopwords_en', 'en').setInputCols(["token"]).setOutputCol("cleanTokens").setCaseSensitive(False)

lemmatizer = LemmatizerModel.pretrained("lemma_antbnc", "en") \
            .setInputCols(["token"]) \
            .setOutputCol("lemma")

# Load the pretrained model through the class itself, as every other section
# does; there is no need to build a throwaway WordEmbeddingsModel() first.
glove_embeddings = WordEmbeddingsModel.pretrained('glove_840B_300','xx') \
                  .setInputCols(["sentences","lemma"]) \
                  .setOutputCol("embeddings") \
                  .setCaseSensitive(False)

# Pool the word vectors of each sentence into one vector by averaging.
embeddingsSentence = SentenceEmbeddings() \
                    .setInputCols(["sentences", "embeddings"]) \
                    .setOutputCol("sentence_embeddings") \
                    .setPoolingStrategy("AVERAGE")

# Classifier trained on the pooled embeddings; batch size 32, max epochs 10, lr 0.003, output logs on.
classifierdl = ClassifierDLApproach().setInputCols(["sentence_embeddings"]) \
              .setOutputCol("prediction") \
              .setLabelColumn("sentiment") \
              .setBatchSize(32) \
              .setMaxEpochs(10) \
              .setLr(0.003) \
              .setEnableOutputLogs(True)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
glove_840B_300 download started this may take some time.
Approximate size to download 2.3 GB
[OK!]


In [None]:
# Assemble the GloVe stages in execution order.
nlp_pipeline_glove = Pipeline(
    stages=[
        document, sentence, token, lemmatizer,
        glove_embeddings, embeddingsSentence, classifierdl,
    ]
)

In [None]:
# Fit the GloVe pipeline on the training split and score the test split.
model_glove = nlp_pipeline_glove.fit(train)

predict_test = model_glove.transform(test)

In [None]:
# Preview the scored test rows (cells truncated at 100 characters).
predict_test.show(truncate=100)
# Uncomment to inspect the raw sentence embeddings:
#predict_test.select("sentence_embeddings").show(truncate=False)

+----------------------------------------------------------------------------------------------------+---------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+
|                                                                                                text|sentiment|                                                                    

In [None]:
# prediction.result is an array of label strings; take its first element and
# cast it to double so it can be compared with the numeric sentiment column.
predict_test = (
    predict_test
    .select('text', 'sentiment', 'prediction.result')
    .withColumn('prediction', f.col('result').getItem(0).cast("double"))
)
predict_test.printSchema()

root
 |-- text: string (nullable = true)
 |-- sentiment: float (nullable = true)
 |-- result: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- prediction: double (nullable = true)



In [None]:
# Collect labels and predictions to pandas and print the per-class report.
pred_test_rp = predict_test.select('sentiment', 'prediction').toPandas()
print(
    classification_report(
        pred_test_rp['sentiment'],
        pred_test_rp['prediction'],
        target_names=['Negative', 'Neutral', 'Positive'],
    )
)

              precision    recall  f1-score   support

    Negative       0.77      0.66      0.71       170
     Neutral       0.72      0.83      0.77       480
    Positive       0.78      0.67      0.72       350

    accuracy                           0.74      1000
   macro avg       0.76      0.72      0.73      1000
weighted avg       0.75      0.74      0.74      1000



- 5 epoch: 73, 72, 73
- 10 epoch: 74, 71, 72
- 15 epoch: 75, 71, 73
- 20 epoch: 72, 72, 72
----------------------------
- 5 epoch, batchsize = 32, lr=0.003: 74, 71, 72
- 10 epoch, batchsize = 32, lr=0.003: 76, 73, 74 **Highest**
- 15 epoch, batchsize = 32, lr=0.003: 74, 73, 73
- 20 epoch, batchsize = 32, lr=0.003: 75, 73, 74
----------------------------
- 5 epoch, batchsize = 8, lr=0.003: 16, 33, 22
- 10 epoch, batchsize = 8, lr=0.003: 75, 74, 74
- 15 epoch, batchsize = 8, lr=0.003: 74, 72, 73
- 50 epoch, batchsize = 8, lr=0.003: 75, 72, 73
----------------------------
- 5 epoch, batchsize = 16, lr=0.003: 75, 73, 73
- 10 epoch, batchsize = 16, lr=0.003: 75, 72, 73
- 15 epoch, batchsize = 16, lr=0.003: 73, 72, 72
- 20 epoch, batchsize = 16, lr=0.003: 75, 73, 74

# **DistilBERT base model and ClassifierDLApproach (✔️)**
https://nlp.johnsnowlabs.com/2021/05/20/distilbert_base_cased_en.html

In [None]:
# Stages for the DistilBERT classifier:
# text -> document -> sentences -> token -> lemma -> word embeddings
# -> averaged sentence embeddings -> label.
document = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

sentence = (
    SentenceDetector()
    .setInputCols(['document'])
    .setOutputCol('sentences')
)

token = (
    Tokenizer()
    .setInputCols(["sentences"])
    .setOutputCol("token")
)

# Stop-word removal is currently disabled:
#stop_words = StopWordsCleaner.pretrained('stopwords_en', 'en').setInputCols(["token"]).setOutputCol("cleanTokens").setCaseSensitive(False)

lemmatizer = (
    LemmatizerModel.pretrained("lemma_antbnc", "en")
    .setInputCols(["token"])
    .setOutputCol("lemma")
)

# NOTE(review): a *cased* checkpoint is loaded with case sensitivity turned off —
# confirm this combination is intended.
distilbert_embeddings = (
    DistilBertEmbeddings.pretrained("distilbert_base_cased", "en")
    .setInputCols(["lemma","sentences"])
    .setOutputCol("embeddings")
    .setCaseSensitive(False)
)

# Pool the word vectors of each sentence into one vector by averaging.
embeddingsSentence = (
    SentenceEmbeddings()
    .setInputCols(["sentences", "embeddings"])
    .setOutputCol("sentence_embeddings")
    .setPoolingStrategy("AVERAGE")
)

# Classifier trained on the pooled embeddings; batch size 64, max epochs 10, lr 0.005, output logs on.
classifierdl = (
    ClassifierDLApproach().setInputCols(["sentence_embeddings"])
    .setOutputCol("prediction")
    .setLabelColumn("sentiment")
    .setBatchSize(64)
    .setMaxEpochs(10)
    .setLr(0.005)
    .setEnableOutputLogs(True)
)

# classifierdl = ClassifierDLApproach().setInputCols(["embeddings"]) \
#               .setOutputCol("prediction") \
#               .setLabelColumn("sentiment") \
#               .setBatchSize(64) \
#               .setMaxEpochs(10) \
#               .setLr(0.005) \
#               .setEnableOutputLogs(True)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
distilbert_base_cased download started this may take some time.
Approximate size to download 232.7 MB
[OK!]


In [None]:
# Assemble the DistilBERT stages in execution order.
nlp_pipeline_distilbert = Pipeline(
    stages=[
        document, sentence, token, lemmatizer,
        distilbert_embeddings, embeddingsSentence, classifierdl,
    ]
)

In [None]:
# Fit the DistilBERT pipeline on the training split and score the test split.
model_distilbert = nlp_pipeline_distilbert.fit(train)

predict_test = model_distilbert.transform(test)

In [None]:
# Preview the scored test rows (cells truncated at 100 characters).
predict_test.show(truncate=100)
# Uncomment to inspect the raw sentence embeddings:
#predict_test.select("sentence_embeddings").show(truncate=False)

+----------------------------------------------------------------------------------------------------+---------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+
|                                                                                                text|sentiment|                                                                    

In [None]:
# prediction.result is an array of label strings; take its first element and
# cast it to double so it can be compared with the numeric sentiment column.
predict_test = (
    predict_test
    .select('text', 'sentiment', 'prediction.result')
    .withColumn('prediction', f.col('result').getItem(0).cast("double"))
)
predict_test.printSchema()

root
 |-- text: string (nullable = true)
 |-- sentiment: float (nullable = true)
 |-- result: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- prediction: double (nullable = true)



In [None]:
# Collect labels and predictions to pandas and print the per-class report.
pred_test_rp = predict_test.select('sentiment', 'prediction').toPandas()
print(
    classification_report(
        pred_test_rp['sentiment'],
        pred_test_rp['prediction'],
        target_names=['Negative', 'Neutral', 'Positive'],
    )
)

              precision    recall  f1-score   support

    Negative       0.78      0.64      0.70       170
     Neutral       0.73      0.78      0.75       480
    Positive       0.73      0.73      0.73       350

    accuracy                           0.74      1000
   macro avg       0.75      0.71      0.73      1000
weighted avg       0.74      0.74      0.74      1000



Macro-averaged precision, recall and F1 on the test set:
- epoch = 5: 0.16, 0.33, 0.22
- epoch = 10: 0.75, 0.71, 0.73

# **RoBERTa and ClassifierDLApproach**

In [None]:
# Stages for the RoBERTa classifier:
# text -> document -> sentences -> token -> lemma -> word embeddings
# -> averaged sentence embeddings -> label.
document = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

sentence = (
    SentenceDetector()
    .setInputCols(['document'])
    .setOutputCol('sentences')
)

token = (
    Tokenizer()
    .setInputCols(["sentences"])
    .setOutputCol("token")
)

# Stop-word removal is currently disabled:
#stop_words = StopWordsCleaner.pretrained('stopwords_en', 'en').setInputCols(["token"]).setOutputCol("cleanTokens").setCaseSensitive(False)

lemmatizer = (
    LemmatizerModel.pretrained("lemma_antbnc", "en")
    .setInputCols(["token"])
    .setOutputCol("lemma")
)

# Pretrained RoBERTa base word embeddings computed over the lemmas.
roberta_embeddings = (
    RoBertaEmbeddings.pretrained("roberta_base", "en")
    .setInputCols("sentences", "lemma")
    .setOutputCol("embeddings")
)

# Pool the word vectors of each sentence into one vector by averaging.
embeddingsSentence = (
    SentenceEmbeddings()
    .setInputCols(["sentences", "embeddings"])
    .setOutputCol("sentence_embeddings")
    .setPoolingStrategy("AVERAGE")
)

# Classifier trained on the pooled embeddings; batch size 64, max epochs 10, lr 0.005, output logs on.
classifierdl = (
    ClassifierDLApproach().setInputCols(["sentence_embeddings"])
    .setOutputCol("prediction")
    .setLabelColumn("sentiment")
    .setBatchSize(64)
    .setMaxEpochs(10)
    .setLr(0.005)
    .setEnableOutputLogs(True)
)

# embeddingsFinisher = EmbeddingsFinisher() \
#                     .setInputCols("embeddings") \
#                     .setOutputCols("finished_embeddings") \
#                     .setOutputAsVector(True) \
#                     .setCleanAnnotations(False)

# classifierdl = ClassifierDLApproach().setInputCols(["finished_embeddings"]) \
#               .setOutputCol("prediction") \
#               .setLabelColumn("sentiment") \
#               .setBatchSize(64) \
#               .setMaxEpochs(10) \
#               .setLr(0.005) \
#               .setEnableOutputLogs(True)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
roberta_base download started this may take some time.
Approximate size to download 284.8 MB
[OK!]


In [None]:
# Assemble the RoBERTa stages in execution order.
nlp_pipeline_roberta = Pipeline(
    stages=[
        document, sentence, token, lemmatizer,
        roberta_embeddings, embeddingsSentence, classifierdl,
    ]
)

In [None]:
# Fit the RoBERTa pipeline on the training split and score the test split.
model_roberta = nlp_pipeline_roberta.fit(train)

predict_test = model_roberta.transform(test)

In [None]:
# Preview the scored test rows (cells truncated at 100 characters).
predict_test.show(truncate=100)
# Uncomment to inspect the raw sentence embeddings:
#predict_test.select("sentence_embeddings").show(truncate=False)

+----------------------------------------------------------------------------------------------------+---------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------+
|                                                                                                text|sentiment|                                                                                      

In [None]:
# prediction.result is an array of label strings; take its first element and
# cast it to double so it can be compared with the numeric sentiment column.
predict_test = (
    predict_test
    .select('text', 'sentiment', 'prediction.result')
    .withColumn('prediction', f.col('result').getItem(0).cast("double"))
)
predict_test.printSchema()

root
 |-- text: string (nullable = true)
 |-- sentiment: float (nullable = true)
 |-- result: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- prediction: double (nullable = true)



In [None]:
# Collect labels and predictions to pandas and print the per-class report.
pred_test_rp = predict_test.select('sentiment', 'prediction').toPandas()
print(
    classification_report(
        pred_test_rp['sentiment'],
        pred_test_rp['prediction'],
        target_names=['Negative', 'Neutral', 'Positive'],
    )
)

              precision    recall  f1-score   support

    Negative       0.00      0.00      0.00       170
     Neutral       0.48      1.00      0.65       480
    Positive       0.00      0.00      0.00       350

    accuracy                           0.48      1000
   macro avg       0.16      0.33      0.22      1000
weighted avg       0.23      0.48      0.31      1000



  _warn_prf(average, modifier, msg_start, len(result))


- epoch = 10: 0.12, 0.33, 0.17

# **XLM-RoBERTa and ClassifierDLApproach**

In [None]:
# Stages for the XLM-RoBERTa classifier:
# text -> document -> sentences -> token -> lemma -> word embeddings
# -> averaged sentence embeddings -> label.
document = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

sentence = (
    SentenceDetector()
    .setInputCols(['document'])
    .setOutputCol('sentences')
)

token = (
    Tokenizer()
    .setInputCols(["sentences"])
    .setOutputCol("token")
)

# Stop-word removal is currently disabled:
#stop_words = StopWordsCleaner.pretrained('stopwords_en', 'en').setInputCols(["token"]).setOutputCol("cleanTokens").setCaseSensitive(False)

lemmatizer = (
    LemmatizerModel.pretrained("lemma_antbnc", "en")
    .setInputCols(["token"])
    .setOutputCol("lemma")
)

# Pretrained multilingual XLM-RoBERTa base word embeddings computed over the lemmas.
xlmroberta_embeddings = (
    XlmRoBertaEmbeddings.pretrained("xlm_roberta_base", "xx")
    .setInputCols("sentences", "lemma")
    .setOutputCol("embeddings")
)

# Pool the word vectors of each sentence into one vector by averaging.
embeddingsSentence = (
    SentenceEmbeddings()
    .setInputCols(["sentences", "embeddings"])
    .setOutputCol("sentence_embeddings")
    .setPoolingStrategy("AVERAGE")
)

# Classifier trained on the pooled embeddings; batch size 64, max epochs 10, lr 0.005, output logs on.
classifierdl = (
    ClassifierDLApproach().setInputCols(["sentence_embeddings"])
    .setOutputCol("prediction")
    .setLabelColumn("sentiment")
    .setBatchSize(64)
    .setMaxEpochs(10)
    .setLr(0.005)
    .setEnableOutputLogs(True)
)

# classifierdl = ClassifierDLApproach().setInputCols(["embeddings"]) \
#               .setOutputCol("prediction") \
#               .setLabelColumn("sentiment") \
#               .setBatchSize(64) \
#               .setMaxEpochs(10) \
#               .setLr(0.005) \
#               .setEnableOutputLogs(True)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
xlm_roberta_base download started this may take some time.
Approximate size to download 619.5 MB
[OK!]


In [None]:
# Assemble the XLM-RoBERTa stages in execution order; the trainable
# classifier comes last.
nlp_pipeline_xlmroberta = Pipeline(
    stages=[
        document,
        sentence,
        token,
        lemmatizer,
        xlmroberta_embeddings,
        embeddingsSentence,
        classifierdl,
    ]
)

In [None]:
# Fit the XLM-RoBERTa pipeline on the training split.
model_xlmroberta = nlp_pipeline_xlmroberta.fit(train)

# Run the fitted pipeline over the test split; `predict_test` holds the
# pipeline's output columns alongside the original ones.
predict_test = model_xlmroberta.transform(test)

In [None]:
# Preview the transformed test rows, cutting long cell values at 100 characters.
predict_test.show(truncate=100)
# Disabled: print the full sentence-embedding column without truncation.
#predict_test.select("sentence_embeddings").show(truncate=False)

+----------------------------------------------------------------------------------------------------+---------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------+
|                                                                                                text|sentiment|                                                                                      

In [None]:
# Reduce the XLM-RoBERTa output to text, gold label and the result array, and
# expose the first result entry as a numeric `prediction` column.
predict_test = (
    predict_test
    .select('text', 'sentiment', 'prediction.result')
    .withColumn('prediction', f.col('result').getItem(0).cast("double"))
)
predict_test.printSchema()

root
 |-- text: string (nullable = true)
 |-- sentiment: float (nullable = true)
 |-- result: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- prediction: double (nullable = true)



In [None]:
# Collect only the gold label and the predicted label into a pandas frame.
pred_test_rp = predict_test.select('sentiment', 'prediction').toPandas()

# `labels` pins class ids 0/1/2 to the names in `target_names`, so the mapping
# does not depend on which labels happen to occur in the data.
# `zero_division=0` still reports 0.00 for classes that were never predicted,
# but without emitting UndefinedMetricWarning.
print(classification_report(
    pred_test_rp['sentiment'],
    pred_test_rp['prediction'],
    labels=[0.0, 1.0, 2.0],
    target_names=['Negative', 'Neutral', 'Positive'],
    zero_division=0,
))

              precision    recall  f1-score   support

    Negative       0.00      0.00      0.00       170
     Neutral       0.48      1.00      0.65       480
    Positive       0.00      0.00      0.00       350

    accuracy                           0.48      1000
   macro avg       0.16      0.33      0.22      1000
weighted avg       0.23      0.48      0.31      1000



  _warn_prf(average, modifier, msg_start, len(result))


- epoch = 10: 0.16, 0.33, 0.22 (l2: 0.12, 0.33, 0.17)




# **XLNet and ClassifierDLApproach**

In [None]:
# Stages for the XLNet experiment. Columns flow:
# text -> document -> sentences -> token -> lemma -> embeddings
# -> sentence_embeddings -> prediction.
document = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

sentence = (
    SentenceDetector()
    .setInputCols(['document'])
    .setOutputCol('sentences')
)

token = (
    Tokenizer()
    .setInputCols(["sentences"])
    .setOutputCol("token")
)

# Stop-word cleaning is switched off in this pipeline; the disabled stage:
#stop_words = StopWordsCleaner.pretrained('stopwords_en', 'en').setInputCols(["token"]).setOutputCol("cleanTokens").setCaseSensitive(False)

lemmatizer = (
    LemmatizerModel.pretrained("lemma_antbnc", "en")
    .setInputCols(["token"])
    .setOutputCol("lemma")
)

# Pretrained XLNet embeddings computed over the `lemma` column.
xlnet_embeddings = (
    XlnetEmbeddings.pretrained("xlnet_base_cased", "en")
    .setInputCols("sentences", "lemma")
    .setOutputCol("embeddings")
)

# Pool the `embeddings` column into `sentence_embeddings` (AVERAGE strategy).
embeddingsSentence = (
    SentenceEmbeddings()
    .setInputCols(["sentences", "embeddings"])
    .setOutputCol("sentence_embeddings")
    .setPoolingStrategy("AVERAGE")
)

# Trainable classifier; labels are read from the `sentiment` column.
classifierdl = (
    ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("prediction")
    .setLabelColumn("sentiment")
    .setBatchSize(64)
    .setMaxEpochs(10)
    .setLr(0.005)
    .setEnableOutputLogs(True)
)

# Disabled alternative: the same classifier settings, but reading the
# "embeddings" column directly instead of "sentence_embeddings".

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
xlnet_base_cased download started this may take some time.
Approximate size to download 417.5 MB
[OK!]


In [None]:
# Assemble the XLNet stages in execution order; the trainable classifier
# comes last.
nlp_pipeline_xlnet = Pipeline(
    stages=[
        document,
        sentence,
        token,
        lemmatizer,
        xlnet_embeddings,
        embeddingsSentence,
        classifierdl,
    ]
)

In [None]:
# Fit the XLNet pipeline on the training split.
model_xlnet = nlp_pipeline_xlnet.fit(train)

# Run the fitted pipeline over the test split; this replaces the previous
# experiment's `predict_test`.
predict_test = model_xlnet.transform(test)

In [None]:
# Preview the transformed test rows, cutting long cell values at 100 characters.
predict_test.show(truncate=100)
# Disabled: print the full sentence-embedding column without truncation.
#predict_test.select("sentence_embeddings").show(truncate=False)

+----------------------------------------------------------------------------------------------------+---------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------+
|                                                                                                text|sentiment|                                                                                      

In [None]:
# Reduce the XLNet output to text, gold label and the result array, and expose
# the first result entry as a numeric `prediction` column.
predict_test = (
    predict_test
    .select('text', 'sentiment', 'prediction.result')
    .withColumn('prediction', f.col('result').getItem(0).cast("double"))
)
predict_test.printSchema()

root
 |-- text: string (nullable = true)
 |-- sentiment: float (nullable = true)
 |-- result: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- prediction: double (nullable = true)



In [None]:
# Collect only the gold label and the predicted label into a pandas frame.
pred_test_rp = predict_test.select('sentiment', 'prediction').toPandas()

# `labels` pins class ids 0/1/2 to the names in `target_names`, so the mapping
# does not depend on which labels happen to occur in the data.
# `zero_division=0` still reports 0.00 for classes that were never predicted,
# but without emitting UndefinedMetricWarning.
print(classification_report(
    pred_test_rp['sentiment'],
    pred_test_rp['prediction'],
    labels=[0.0, 1.0, 2.0],
    target_names=['Negative', 'Neutral', 'Positive'],
    zero_division=0,
))

              precision    recall  f1-score   support

    Negative       0.00      0.00      0.00       170
     Neutral       0.00      0.00      0.00       480
    Positive       0.35      1.00      0.52       350

    accuracy                           0.35      1000
   macro avg       0.12      0.33      0.17      1000
weighted avg       0.12      0.35      0.18      1000



  _warn_prf(average, modifier, msg_start, len(result))


- epoch = 10: 0.12, 0.33, 0.17 (macro-avg precision, recall, F1)




# **BERT-base uncased and ClassifierDLApproach**

In [None]:
# Stages for the BERT-base-uncased experiment. Columns flow:
# text -> document -> sentences -> token -> lemma -> embeddings
# -> sentence_embeddings -> prediction.
document = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

sentence = (
    SentenceDetector()
    .setInputCols(['document'])
    .setOutputCol('sentences')
)

token = (
    Tokenizer()
    .setInputCols(["sentences"])
    .setOutputCol("token")
)

# Stop-word cleaning is switched off in this pipeline; the disabled stage:
#stop_words = StopWordsCleaner.pretrained('stopwords_en', 'en').setInputCols(["token"]).setOutputCol("cleanTokens").setCaseSensitive(False)

lemmatizer = (
    LemmatizerModel.pretrained("lemma_antbnc", "en")
    .setInputCols(["token"])
    .setOutputCol("lemma")
)

# Pretrained BERT embeddings computed over the `lemma` column, with case
# sensitivity turned off.
bertbase_embeddings = (
    BertEmbeddings.pretrained("bert_base_uncased", "en")
    .setInputCols(["sentences", "lemma"])
    .setOutputCol("embeddings")
    .setCaseSensitive(False)
)

# Pool the `embeddings` column into `sentence_embeddings` (AVERAGE strategy).
embeddingsSentence = (
    SentenceEmbeddings()
    .setInputCols(["sentences", "embeddings"])
    .setOutputCol("sentence_embeddings")
    .setPoolingStrategy("AVERAGE")
)

# Trainable classifier; labels are read from the `sentiment` column.
classifierdl = (
    ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("prediction")
    .setLabelColumn("sentiment")
    .setBatchSize(64)
    .setMaxEpochs(5)
    .setLr(0.005)
    .setEnableOutputLogs(True)
)

# Disabled alternative: a classifier reading the "embeddings" column directly,
# with batch size 32, 5 epochs and lr 0.005.

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
bert_base_uncased download started this may take some time.
Approximate size to download 392.5 MB
[OK!]


In [None]:
# Assemble the BERT-base-uncased stages in execution order; the trainable
# classifier comes last.
nlp_pipeline_bertbaseuncased = Pipeline(
    stages=[
        document,
        sentence,
        token,
        lemmatizer,
        bertbase_embeddings,
        embeddingsSentence,
        classifierdl,
    ]
)

In [None]:
# Fit the BERT-base-uncased pipeline on the training split.
model_bertbaseuncased = nlp_pipeline_bertbaseuncased.fit(train)

# Run the fitted pipeline over the test split; this replaces the previous
# experiment's `predict_test`.
predict_test = model_bertbaseuncased.transform(test)

In [None]:
# Preview the transformed test rows, cutting long cell values at 100 characters.
predict_test.show(truncate=100)
# Disabled: print the full sentence-embedding column without truncation.
#predict_test.select("sentence_embeddings").show(truncate=False)

+----------------------------------------------------------------------------------------------------+---------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------+
|                                                                                                text|sentiment|                                                                                      

In [None]:
# Reduce the BERT output to text, gold label and the result array, and expose
# the first result entry as a numeric `prediction` column.
predict_test = (
    predict_test
    .select('text', 'sentiment', 'prediction.result')
    .withColumn('prediction', f.col('result').getItem(0).cast("double"))
)
predict_test.printSchema()

root
 |-- text: string (nullable = true)
 |-- sentiment: float (nullable = true)
 |-- result: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- prediction: double (nullable = true)



In [None]:
# Collect only the gold label and the predicted label into a pandas frame.
pred_test_rp = predict_test.select('sentiment', 'prediction').toPandas()

# `labels` pins class ids 0/1/2 to the names in `target_names`, so the mapping
# does not depend on which labels happen to occur in the data.
# `zero_division=0` still reports 0.00 for classes that were never predicted,
# but without emitting UndefinedMetricWarning.
print(classification_report(
    pred_test_rp['sentiment'],
    pred_test_rp['prediction'],
    labels=[0.0, 1.0, 2.0],
    target_names=['Negative', 'Neutral', 'Positive'],
    zero_division=0,
))

              precision    recall  f1-score   support

    Negative       0.00      0.00      0.00       170
     Neutral       0.48      1.00      0.65       480
    Positive       0.00      0.00      0.00       350

    accuracy                           0.48      1000
   macro avg       0.16      0.33      0.22      1000
weighted avg       0.23      0.48      0.31      1000



  _warn_prf(average, modifier, msg_start, len(result))


- epoch = 5, batchsize = 32, lr = 0.005: 0.16, 0.33, 0.22
- epoch = 10, batchsize = 32, lr = 0.005: 0.16, 0.33, 0.22
- epoch = 5, batchsize = 64, lr = 0.005: 0.16, 0.33, 0.22
- epoch = 10, batchsize = 64, lr = 0.005: 0.16, 0.33, 0.22

# **Elmo and ClassifierDLApproach**

In [None]:
# Stages for the ELMo experiment. Columns flow:
# text -> document -> sentences -> token -> lemma -> embeddings
# -> sentence_embeddings -> prediction.
document = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

sentence = (
    SentenceDetector()
    .setInputCols(['document'])
    .setOutputCol('sentences')
)

token = (
    Tokenizer()
    .setInputCols(["sentences"])
    .setOutputCol("token")
)

# Stop-word cleaning is switched off in this pipeline; the disabled stage:
#stop_words = StopWordsCleaner.pretrained('stopwords_en', 'en').setInputCols(["token"]).setOutputCol("cleanTokens").setCaseSensitive(False)

lemmatizer = (
    LemmatizerModel.pretrained("lemma_antbnc", "en")
    .setInputCols(["token"])
    .setOutputCol("lemma")
)

# Pretrained ELMo embeddings computed over the `lemma` column, using the
# "elmo" pooling layer.
elmo_embeddings = (
    ElmoEmbeddings.pretrained("elmo", "en")
    .setInputCols(["sentences", "lemma"])
    .setOutputCol("embeddings")
    .setPoolingLayer("elmo")
)

# Pool the `embeddings` column into `sentence_embeddings` (AVERAGE strategy).
embeddingsSentence = (
    SentenceEmbeddings()
    .setInputCols(["sentences", "embeddings"])
    .setOutputCol("sentence_embeddings")
    .setPoolingStrategy("AVERAGE")
)

# Trainable classifier; labels are read from the `sentiment` column.
classifierdl = (
    ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("prediction")
    .setLabelColumn("sentiment")
    .setBatchSize(64)
    .setMaxEpochs(10)
    .setLr(0.005)
    .setEnableOutputLogs(True)
)

# Disabled alternative: a classifier reading the "embeddings" column directly,
# with batch size 64, 5 epochs and lr 0.005.

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
elmo download started this may take some time.
Approximate size to download 334.1 MB
[OK!]


In [None]:
# Assemble the ELMo stages in execution order; the trainable classifier
# comes last.
nlp_pipeline_elmo = Pipeline(
    stages=[
        document,
        sentence,
        token,
        lemmatizer,
        elmo_embeddings,
        embeddingsSentence,
        classifierdl,
    ]
)

In [None]:
# Fit the ELMo pipeline on the training split.
model_elmo = nlp_pipeline_elmo.fit(train)

# Run the fitted pipeline over the test split; this replaces the previous
# experiment's `predict_test`.
predict_test = model_elmo.transform(test)

In [None]:
# Preview the transformed test rows, cutting long cell values at 100 characters.
predict_test.show(truncate=100)
# Disabled: print the full sentence-embedding column without truncation.
#predict_test.select("sentence_embeddings").show(truncate=False)

+----------------------------------------------------------------------------------------------------+---------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------+
|                                                                                                text|sentiment|                                                                                      

In [None]:
# Reduce the ELMo output to text, gold label and the result array, and expose
# the first result entry as a numeric `prediction` column.
predict_test = (
    predict_test
    .select('text', 'sentiment', 'prediction.result')
    .withColumn('prediction', f.col('result').getItem(0).cast("double"))
)
predict_test.printSchema()

root
 |-- text: string (nullable = true)
 |-- sentiment: float (nullable = true)
 |-- result: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- prediction: double (nullable = true)



In [None]:
# Collect only the gold label and the predicted label into a pandas frame.
pred_test_rp = predict_test.select('sentiment', 'prediction').toPandas()

# `labels` pins class ids 0/1/2 to the names in `target_names`, so the mapping
# does not depend on which labels happen to occur in the data.
# `zero_division=0` still reports 0.00 for classes that were never predicted,
# but without emitting UndefinedMetricWarning.
print(classification_report(
    pred_test_rp['sentiment'],
    pred_test_rp['prediction'],
    labels=[0.0, 1.0, 2.0],
    target_names=['Negative', 'Neutral', 'Positive'],
    zero_division=0,
))

              precision    recall  f1-score   support

    Negative       0.00      0.00      0.00       170
     Neutral       0.48      1.00      0.65       480
    Positive       0.00      0.00      0.00       350

    accuracy                           0.48      1000
   macro avg       0.16      0.33      0.22      1000
weighted avg       0.23      0.48      0.31      1000



  _warn_prf(average, modifier, msg_start, len(result))


- epoch = 5, batchsize = 64, lr = 0.005: 0.16, 0.33, 0.22
- epoch = 10, batchsize = 64, lr = 0.005: 0.16, 0.33, 0.22
- epoch = 10, batchsize = 32, lr = 0.005: 0.16, 0.33, 0.22

# **BERT labse and ClassifierDLApproach (✔️)**
https://nlp.johnsnowlabs.com/2020/09/23/labse.html

In [None]:
# Stages for the LaBSE experiment. Columns flow:
# text -> document -> sentence_embeddings -> prediction.
document = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# Pretrained LaBSE sentence embeddings computed directly from the document
# column (no tokenizer or lemmatizer stage in this pipeline).
embeddings = (
    BertSentenceEmbeddings.pretrained("labse", "xx")
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
)

# Trainable classifier; labels are read from the `sentiment` column.
classifierdl = (
    ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("prediction")
    .setLabelColumn("sentiment")
    .setBatchSize(64)
    .setMaxEpochs(1)
    .setLr(0.005)
    .setEnableOutputLogs(True)
)

labse download started this may take some time.
Approximate size to download 1.7 GB
[OK!]


In [None]:
# Three-stage LaBSE pipeline: document assembly, sentence embeddings, then the
# trainable classifier.
nlp_pipeline_BERT = Pipeline(
    stages=[
        document,
        embeddings,
        classifierdl,
    ]
)

In [None]:
# Fit the LaBSE pipeline on the training split.
model_BERT_labse = nlp_pipeline_BERT.fit(train)

# Run the fitted pipeline over the test split; this replaces the previous
# experiment's `predict_test`.
predict_test = model_BERT_labse.transform(test)

In [None]:
# Preview the LaBSE pipeline output on the test split.
predict_test.show()

In [None]:
# Reduce the LaBSE output to text, gold label and the result array, and expose
# the first result entry as a numeric `prediction` column.
predict_test = (
    predict_test
    .select('text', 'sentiment', 'prediction.result')
    .withColumn('prediction', f.col('result').getItem(0).cast("double"))
)
predict_test.printSchema()

In [None]:
# Collect only the gold label and the predicted label into a pandas frame.
pred_test_rp = predict_test.select('sentiment', 'prediction').toPandas()

# `labels` pins class ids 0/1/2 to the names in `target_names`, so the mapping
# does not depend on which labels happen to occur in the data.
# `zero_division=0` keeps the report warning-free if a class is ever left
# without predictions.
print(classification_report(
    pred_test_rp['sentiment'],
    pred_test_rp['prediction'],
    labels=[0.0, 1.0, 2.0],
    target_names=['Negative', 'Neutral', 'Positive'],
    zero_division=0,
))

              precision    recall  f1-score   support

    Negative       0.85      0.62      0.72       170
     Neutral       0.76      0.79      0.77       480
    Positive       0.73      0.78      0.75       350

    accuracy                           0.76      1000
   macro avg       0.78      0.73      0.75      1000
weighted avg       0.76      0.76      0.76      1000



# **Save & Load model**

In [None]:
# SAVE MODEL
# Persist only the last stage of the fitted pipeline (the pipeline was built
# with `classifierdl` as its final stage) to Google Drive, overwriting anything
# already stored at this path.
model_BERT_labse.stages[-1].write().overwrite().save('/content/drive/MyDrive/labse/bert_labse_model')

In [None]:
# LOAD MODEL
# Rebuild the inference pipeline stages: the document assembler and the
# pretrained LaBSE embeddings are recreated, and the classifier is restored
# from the copy saved on Google Drive.
document = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

embeddings = (
    BertSentenceEmbeddings.pretrained("labse", "xx")
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
)

# `load` is a class-level loader, so call it on the class itself instead of
# first constructing an empty ClassifierDLModel instance that is discarded.
classifierdl = (
    ClassifierDLModel.load("/content/drive/MyDrive/labse/bert_labse_model")
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("prediction")
)

labse download started this may take some time.
Approximate size to download 1.7 GB
[OK!]


In [None]:
# Inference pipeline built from the reloaded stages: document assembly,
# sentence embeddings, then the restored classifier.
nlp_pipeline_BERT = Pipeline(
    stages=[
        document,
        embeddings,
        classifierdl,
    ]
)

In [None]:
# Every stage in this pipeline is already fitted (assembler, pretrained
# embeddings, reloaded classifier), so fit() trains nothing and only wraps the
# stages into a PipelineModel. Fit on a one-row placeholder frame instead of
# `test`, so the held-out split is never handed to fit().
empty_df = spark.createDataFrame([[""]]).toDF("text")
model_BERT_labse = nlp_pipeline_BERT.fit(empty_df)

# Score the test split with the reloaded classifier.
predict_test = model_BERT_labse.transform(test)

In [None]:
# Preview the output of the reloaded pipeline on the test split.
predict_test.show()

+--------------------+---------+--------------------+--------------------+--------------------+
|                text|sentiment|            document| sentence_embeddings|          prediction|
+--------------------+---------+--------------------+--------------------+--------------------+
|pfizerbiontech to...|      0.0|[[document, 0, 97...|[[sentence_embedd...|[[category, 0, 97...|
|mom is doing pani...|      0.0|[[document, 0, 99...|[[sentence_embedd...|[[category, 0, 99...|
|today is my birth...|      1.0|[[document, 0, 14...|[[sentence_embedd...|[[category, 0, 14...|
|omi hospital kara...|      1.0|[[document, 0, 55...|[[sentence_embedd...|[[category, 0, 55...|
|our beloved pm sh...|      2.0|[[document, 0, 11...|[[sentence_embedd...|[[category, 0, 11...|
|super happy to ge...|      2.0|[[document, 0, 94...|[[sentence_embedd...|[[category, 0, 94...|
|covaxin s phase t...|      1.0|[[document, 0, 83...|[[sentence_embedd...|[[category, 0, 83...|
|dose at terapanth...|      1.0|[[docume

In [None]:
# Reduce the reloaded model's output to text, gold label and the result array,
# and expose the first result entry as a numeric `prediction` column.
predict_test = (
    predict_test
    .select('text', 'sentiment', 'prediction.result')
    .withColumn('prediction', f.col('result').getItem(0).cast("double"))
)
predict_test.printSchema()

root
 |-- text: string (nullable = true)
 |-- sentiment: float (nullable = true)
 |-- result: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- prediction: double (nullable = true)



In [None]:
# Collect only the gold label and the predicted label into a pandas frame.
pred_test_rp = predict_test.select('sentiment', 'prediction').toPandas()

# `labels` pins class ids 0/1/2 to the names in `target_names`, so the mapping
# does not depend on which labels happen to occur in the data.
# `zero_division=0` keeps the report warning-free if a class is ever left
# without predictions.
print(classification_report(
    pred_test_rp['sentiment'],
    pred_test_rp['prediction'],
    labels=[0.0, 1.0, 2.0],
    target_names=['Negative', 'Neutral', 'Positive'],
    zero_division=0,
))

              precision    recall  f1-score   support

    Negative       0.85      0.62      0.72       170
     Neutral       0.76      0.79      0.77       480
    Positive       0.73      0.78      0.75       350

    accuracy                           0.76      1000
   macro avg       0.78      0.73      0.75      1000
weighted avg       0.76      0.76      0.76      1000

