# BERT Model for Classification Using Twitter COVID19 Dataset

Library Import

In [None]:
!pip install transformers

import tensorflow as tf
import numpy as np
import pandas as pd
import zipfile
import os
import nltk
import matplotlib.pyplot as plt
import transformers as trfs
import torch
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow import keras
from tensorflow.keras import layers
from google.colab import files
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import train_test_split
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback
from sklearn.preprocessing import LabelEncoder

# Fetch the NLTK stop-word corpus; the notebook later reads it through
# stopwords.words('english').
nltk.download('stopwords')

Collecting transformers
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 5.2 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.11.4-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 40.5 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 44.8 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.4 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 49.7 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyy

True

Kaggle.json upload

In [None]:
uploaded = files.upload()

for fn in uploaded.keys():
  print('Kaggle.json has been uploaded!')

!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json
Kaggle.json has been uploaded!


Dataset Download

Dataset source: https://www.kaggle.com/datasets/datatattle/covid-19-nlp-text-classification

In [None]:
# Download the dataset archive with the Kaggle CLI.
!kaggle datasets download -d datatattle/covid-19-nlp-text-classification
# Extract the CSV files; -o overwrites existing files without prompting.
!unzip -o covid-19-nlp-text-classification.zip

Downloading covid-19-nlp-text-classification.zip to /content
  0% 0.00/4.38M [00:00<?, ?B/s]100% 4.38M/4.38M [00:00<00:00, 44.6MB/s]

Archive:  covid-19-nlp-text-classification.zip
  inflating: Corona_NLP_test.csv     
  inflating: Corona_NLP_train.csv    


# **Part a**


Dataset preview using the pandas library

In [None]:
# Both files are comma-separated and are decoded as latin-1.
csv_options = {'delimiter': ',', 'encoding': 'latin-1'}

dataset_train = pd.read_csv('Corona_NLP_train.csv', **csv_options)
dataset_test = pd.read_csv('Corona_NLP_test.csv', **csv_options)

# Preview the first rows of the training split.
dataset_train.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


**Preprocess the dataset**

1.   Drop unwanted columns:
      *   UserName
      *   ScreenName
      *   Location
      *   TweetAt

2.   Used columns:
      *   OriginalTweet: x (tweet from users)
      *   Sentiment: y, which contains 5 labels (extremely negative, negative, neutral, positive, extremely positive)





In [None]:
original_tweet_field = "OriginalTweet"
sentiment_field = "Sentiment"

# Integer class id for each sentiment label, ordered from most negative (0)
# to most positive (4).
SENTIMENT_TO_ID = {
  'Extremely Negative': 0,
  'Negative': 1,
  'Neutral': 2,
  'Positive': 3,
  'Extremely Positive': 4,
}


def convert_sentiment_to_numbers(dataset):
  """Replace the text labels in the sentiment column with integer class ids.

  The column named by ``sentiment_field`` is rewritten on ``dataset`` itself
  in a single pass. Values that are not keys of ``SENTIMENT_TO_ID`` (unknown
  labels, missing values, already-converted ids) are left unchanged, so the
  function can safely be applied twice.

  Args:
    dataset: pandas DataFrame containing a ``sentiment_field`` column.

  Returns:
    The same ``dataset`` object, after the column has been converted.
  """
  labels = dataset[sentiment_field]
  # Build the whole converted column and assign it at once. Writing ints into
  # a text column row by row keeps the column as object dtype and is rejected
  # outright by string-dtype columns.
  dataset[sentiment_field] = labels.map(
      lambda label: SENTIMENT_TO_ID.get(label, label))
  return dataset

# Keep only the tweet text and its label. The explicit copy makes each frame
# independent of the frame it was sliced from, so the column assignments done
# on it afterwards do not trigger pandas' SettingWithCopyWarning.
selected_columns = [original_tweet_field, sentiment_field]
dataset_train = dataset_train[selected_columns].copy()
dataset_test = dataset_test[selected_columns].copy()

# Turn the text labels into integer class ids.
dataset_train = convert_sentiment_to_numbers(dataset_train)
dataset_test = convert_sentiment_to_numbers(dataset_test)

dataset_train

Unnamed: 0,OriginalTweet,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,2
1,advice Talk to your neighbours family to excha...,3
2,Coronavirus Australia: Woolworths to give elde...,3
3,My food stock is not the only one which is emp...,3
4,"Me, ready to go at supermarket during the #COV...",0
...,...,...
41152,Airline pilots offering to stock supermarket s...,2
41153,Response to complaint not provided citing COVI...,0
41154,You know itÂs getting tough when @KameronWild...,3
41155,Is it wrong that the smell of hand sanitizer i...,2


**Remove Stopwords**

In NLP, stop words are a set of very common words in a language, such as “a”, “the” and “is”. They are often removed during preprocessing because they carry little information on their own, which lets the model focus on the more informative words. Be aware that the removal is not entirely free of consequences for sentiment analysis: the list also covers negations, so “is not the only one” in the fourth tweet below is reduced to “one”. In this code, I use NLTK's English stop-word list to remove stop words.

In [None]:
# A set gives constant-time membership tests; with the plain list returned by
# NLTK every single word of every tweet would trigger a linear scan.
stop = set(stopwords.words('english'))


def remove_stopwords(text):
  """Return ``text`` without the whitespace-separated tokens found in ``stop``.

  Matching is exact and case-sensitive: a token is dropped only when it is
  literally an entry of ``stop``. Capitalised or punctuated tokens (e.g. "My",
  "Is", "PLEASE,") therefore survive, as the preview below shows.
  The remaining tokens are re-joined with single spaces.
  """
  return ' '.join(word for word in text.split() if word not in stop)


dataset_train[original_tweet_field] = dataset_train[original_tweet_field].apply(remove_stopwords)

dataset_test[original_tweet_field] = dataset_test[original_tweet_field].apply(remove_stopwords)

dataset_train

Unnamed: 0,OriginalTweet,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,2
1,advice Talk neighbours family exchange phone n...,3
2,Coronavirus Australia: Woolworths give elderly...,3
3,"My food stock one empty... PLEASE, panic, THER...",3
4,"Me, ready go supermarket #COVID19 outbreak. No...",0
...,...,...
41152,Airline pilots offering stock supermarket shel...,2
41153,Response complaint provided citing COVID-19 re...,0
41154,You know itÂs getting tough @KameronWilds rat...,3
41155,Is wrong smell hand sanitizer starting turn on...,2


**Perform English Stemming**

Stemming is the process of reducing a word to its stem by stripping affixes (suffixes and prefixes), which yields a base form that approximates the root of the word (its lemma). This technique is important in NLP because it maps the inflected forms of a word onto one common base form. In this code, I use the Snowball stemmer, the successor to the Porter stemmer, which makes the stemming more precise than the original Porter stemmer.

In [None]:
stemmer = SnowballStemmer("english")


def stem_words(text):
  """Stem each whitespace-separated token of ``text`` and re-join with single spaces."""
  return ' '.join(map(stemmer.stem, text.split()))


# Apply the same stemming to the tweet column of both splits.
for frame in (dataset_train, dataset_test):
  frame[original_tweet_field] = frame[original_tweet_field].apply(stem_words)

dataset_train

Unnamed: 0,OriginalTweet,Sentiment
0,@menyrbi @phil_gahan @chrisitv https://t.co/if...,2
1,advic talk neighbour famili exchang phone numb...,3
2,"coronavirus australia: woolworth give elderly,...",3
3,"my food stock one empty... please, panic, ther...",3
4,"me, readi go supermarket #covid19 outbreak. no...",0
...,...,...
41152,airlin pilot offer stock supermarket shelv #nz...,2
41153,respons complaint provid cite covid-19 relat d...,0
41154,you know itâ get tough @kameronwild ration to...,3
41155,is wrong smell hand sanit start turn on? #coro...,2


Feed the data into PyTorch by wrapping it in a `torch.utils.data.Dataset` that returns `torch.Tensor` items

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer output with optional labels.

    Args:
        encodings: mapping from field name to an indexable collection with
            one entry per example; it must contain the key ``"input_ids"``,
            whose length defines the dataset length.
        labels: optional indexable collection with one label per example.
            ``None`` or an empty collection means "no labels" (e.g. for
            inference).
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        Every field of ``encodings`` is converted with ``torch.tensor``; a
        ``"labels"`` entry is added when labels were supplied.
        """
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Test for presence explicitly instead of relying on truthiness:
        # ``if self.labels:`` raises for multi-element numpy arrays/tensors.
        # The length check keeps an empty collection meaning "no labels".
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples (length of ``encodings["input_ids"]``)."""
        return len(self.encodings["input_ids"])

**Implement BertTokenizer Model from pretrained models**

After all the preprocessing steps, we use BertTokenizer. BertTokenizer is one of the most popular tokenizers for a wide range of language-based machine learning tasks. BERT uses the WordPiece tokenizer. It works by splitting words either into their full forms or into word pieces, so that one word can be broken into multiple tokens. Using BERT makes it easy to identify related words, as they will usually share some of the same input tokens, which are then fed into the first layers of BERT.

In [None]:
# Checkpoint identifier used when loading the pretrained tokenizer.
model_name = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name,
)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

*   Convert OriginalTweet data field into Tokenized BERT format
*   Combine OriginalTweet and Sentiment field into Dataset Pytorch format



In [None]:
# BERT only has 512 position embeddings ("max_position_embeddings": 512 in the
# model config), so a longer sequence cannot be fed to the model. Truncating
# at 600 would let such a sequence through and crash the forward pass.
max_sequence_length = 512


def tokenize_tweets(texts):
    """Tokenize a list of tweets with the BERT tokenizer.

    Sequences are padded to the longest one in ``texts`` and truncated to
    ``max_sequence_length`` tokens.
    """
    return tokenizer(texts,
                     padding=True,
                     truncation=True,
                     max_length=max_sequence_length)


twitter_train = list(dataset_train[original_tweet_field])
twitter_test = list(dataset_test[original_tweet_field])
sentiments_train = list(dataset_train[sentiment_field])
sentiments_test = list(dataset_test[sentiment_field])

train_twitter_tokenized = tokenize_tweets(twitter_train)
test_twitter_tokenized = tokenize_tweets(twitter_test)

# Pair the token ids with their class ids for use with the Trainer.
train_dataset = Dataset(train_twitter_tokenized, sentiments_train)
test_dataset = Dataset(test_twitter_tokenized, sentiments_test)

# **Part b**

Initialize BertForSequenceClassification from Huggingface Pretrained model

In [None]:
# Load the pretrained checkpoint with a sequence-classification head sized
# for the five sentiment classes.
bert_model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=5,
)

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding

Train BERT model

In [None]:
def compute_metrics(p):
    """Compute accuracy and support-weighted precision, recall and F1.

    Args:
        p: a pair ``(predictions, labels)``. ``predictions`` holds one row of
            class scores per example (the predicted class is the argmax over
            axis 1); ``labels`` holds the true class ids.

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``.
    """
    scores, labels = p
    predicted = np.argmax(scores, axis=1)

    # zero_division=0: a class that is never predicted (common during the
    # first evaluation steps) contributes 0, which is the value scikit-learn
    # already falls back to, but without emitting an UndefinedMetricWarning
    # on every evaluation.
    weighted = {"average": "weighted", "zero_division": 0}

    return {
        "accuracy": accuracy_score(y_true=labels, y_pred=predicted),
        "precision": precision_score(y_true=labels, y_pred=predicted, **weighted),
        "recall": recall_score(y_true=labels, y_pred=predicted, **weighted),
        "f1": f1_score(y_true=labels, y_pred=predicted, **weighted),
    }

# Training configuration: 3 epochs, batches of 8, fixed seed.
arguments = TrainingArguments(
    output_dir="output",
    seed=0,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate every 100 optimisation steps.
    evaluation_strategy="steps",
    eval_steps=100,
    load_best_model_at_end=True,
)

# Early stopping with a patience of 3 evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# NOTE(review): eval_dataset is the test split, so early stopping and the
# choice of the best checkpoint are both driven by test data. Consider holding
# out a validation split from the training data for this purpose.
trainer = Trainer(
    model=bert_model,
    args=arguments,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    callbacks=[early_stopping],
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


The actual training run is performed in Part c, which covers both training and testing of the model

# **Part c**

In [None]:
# Start fine-tuning with the Trainer configured in Part b; the evaluation
# metrics are reported at every evaluation step.
trainer.train()

***** Running training *****
  Num examples = 41157
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 15435


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,No log,1.572006,0.248815,0.1067,0.248815,0.102512
200,No log,1.644528,0.168773,0.116336,0.168773,0.066177
300,No log,1.578798,0.328857,0.29168,0.328857,0.224096
400,No log,1.294359,0.434966,0.370181,0.434966,0.381615
500,1.461100,1.282688,0.432596,0.455445,0.432596,0.396453
600,1.461100,1.242178,0.488942,0.488002,0.488942,0.486435
700,1.461100,1.370048,0.465508,0.507162,0.465508,0.437394
800,1.461100,1.229119,0.467615,0.554318,0.467615,0.419399
900,1.461100,1.236862,0.517378,0.527046,0.517378,0.493083
1000,1.196500,1.147166,0.536072,0.563021,0.536072,0.527899


***** Running Evaluation *****
  Num examples = 3798
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 3798
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 3798
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 3798
  Batch size = 8
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 3798
  Batch size = 8
Saving model checkpoint to output/checkpoint-500
Configuration saved in output/checkpoint-500/config.json
Model weights saved in output/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 3798
  Batch size = 8
***** Running Evaluation *****
  Num examples = 3798
  Batch size = 8
***** Running Evaluation *****
  Num examples = 3798
  Batch size = 8
***** Running Evaluation *****
  Num examples = 379

**Performance Metrics**

Based on all of the metrics above (on training and testing table result), we can see that the accuracy, precision, recall and f1 score are quite good.

Below are the definitions of each of the metrics:

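*   **Precision**: of all the tweets the model assigns to a class, the fraction that truly belong to that class, TP / (TP + FP). Here it is averaged over the five classes, weighted by class frequency.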
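*   **Recall**: of all the tweets that truly belong to a class, the fraction the model assigns to that class, TP / (TP + FN). Here it is averaged over the five classes, weighted by class frequency.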

*   **F1-Score**: the harmonic mean of precision and recall, 2 · precision · recall / (precision + recall); it is high only when both precision and recall are high.

*   **Accuracy**: number of correct predictions made as a ratio of all predictions made.