# Required modules

In [None]:
!pip install transformers datasets

# Dataset

In [None]:
from google.colab import drive

# Directory under which Google Drive is mounted; the archive extracted in the
# next cell is read from below this path.
DRIVE_MOUNT_POINT = '/content/drive/'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
!unzip /content/drive/MyDrive/Kaggle_Text_Classification/Coronavirus_tweets.zip

In [None]:
from datasets import load_dataset

# Split name -> CSV file (file names are relative to the working directory).
csv_splits = {
    'train': 'Corona_NLP_train.csv',
    'test': 'Corona_NLP_test.csv',
}

# The CSV files are decoded as ISO-8859-1 (Latin-1).
dataset = load_dataset('csv', data_files=csv_splits, encoding="ISO-8859-1")

In [None]:
# Display the loaded dataset object (the 'train' and 'test' splits read from the CSV files).
dataset

# BERT model for Covid-19 tweet classification

In [None]:
# Choose the pretrained checkpoint and load its tokenizer.
# Only the tokenizer is created in this cell; `model_name` is reused later
# when the sequence-classification model is loaded.
from transformers import AutoTokenizer
model_name = "bert-base-cased"
bert_tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Demo: run the BERT tokenizer (WordPiece) on one sample sentence and print
# the tokens it produces.
text = "Covid-19 has finally ended!"
encoded_text = bert_tokenizer(text)
bert_tokens = encoded_text.tokens()
print(bert_tokens)

In [None]:
# Convert labels given in the form of strings to integers
SENTIMENT_TO_ID = {
    'Positive': 0,
    'Negative': 1,
    'Neutral': 2,
    'Extremely Positive': 3,
    'Extremely Negative': 4,
}


def label2int(label):
    """Map one example's sentiment string to its integer class id.

    Args:
        label: A mapping (one dataset row) that stores the sentiment string
            under the key ``'Sentiment'``.

    Returns:
        A dict ``{'labels': idx}`` where ``idx`` is the class id from
        ``SENTIMENT_TO_ID``.

    Raises:
        ValueError: If the sentiment is not one of the five known class
            names. Unknown values used to fall through to id 0 and were
            therefore silently treated as 'Positive'.
    """
    sentiment = label['Sentiment']
    try:
        idx = SENTIMENT_TO_ID[sentiment]
    except (KeyError, TypeError):
        # Fail loudly instead of mislabelling the example as class 0.
        raise ValueError(
            f"Unknown sentiment label: {sentiment!r}; "
            f"expected one of {sorted(SENTIMENT_TO_ID)}"
        ) from None
    return {'labels': idx}

# Tokenize the entire data
def tokenize_data(example):
    """Tokenize the ``'OriginalTweet'`` text with ``bert_tokenizer``.

    Args:
        example: A dataset row, or a batch of rows when used with
            ``dataset.map(..., batched=True)``, holding the tweet text under
            the key ``'OriginalTweet'``.

    Returns:
        The tokenizer's output for the given text(s).
    """
    # padding='max_length' pads every sequence up to the maximum length;
    # truncation=True additionally cuts anything longer, so no input can
    # exceed the length the model accepts.
    return bert_tokenizer(example['OriginalTweet'], padding='max_length', truncation=True)


# Original CSV columns that are dropped while the integer labels are added.
remove_columns = ['UserName', 'ScreenName', 'Location', 'TweetAt', 'OriginalTweet', 'Sentiment']

# Step 1: tokenize the tweets in batches.
# Step 2: add the integer 'labels' column and drop the columns listed above.
dataset = (
    dataset
    .map(tokenize_data, batched=True)
    .map(label2int, remove_columns=remove_columns)
)

In [None]:
# Display the dataset again, now after tokenization and label conversion.
dataset

In [None]:
# Training configuration and the classification model, both from the
# Hugging Face transformers library.
from transformers import TrainingArguments, AutoModelForSequenceClassification

# Train for 3 epochs; "test_trainer" is the Trainer's output directory.
args_train = TrainingArguments(output_dir="test_trainer", num_train_epochs=3)

# Same checkpoint as the tokenizer, with a 5-class classification head
# (one class per sentiment label).
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=5,
)

In [None]:
# Split the 'train' split into 40,000 training rows and 1,000 evaluation rows.
# Both subsets are taken from the same seeded shuffle using disjoint index
# ranges, so they do not overlap.
shuffled_train = dataset['train'].shuffle(seed=0)
train_dataset = shuffled_train.select(range(40000))
eval_dataset = shuffled_train.select(range(40000, 41000))

In [None]:
# Build the Trainer that fine-tunes the BERT model on the text
# classification task.
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args_train,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [None]:
# Run fine-tuning with the Trainer configured in the previous cell.
trainer.train()

In [None]:
# Evaluate the model
import numpy as np


def compute_metrics(eval_pred):
    """Compute classification accuracy from model outputs.

    Accuracy is computed directly with numpy. ``datasets.load_metric`` was
    deprecated in favour of the separate ``evaluate`` package and is not
    available in recent ``datasets`` releases, so relying on it makes this
    cell fail on a fresh, unpinned install.

    Args:
        eval_pred: A pair ``(logits, labels)``; ``logits`` holds one score
            per class along its last axis and ``labels`` holds the integer
            class ids.

    Returns:
        ``{"accuracy": value}`` where ``value`` is the fraction of examples
        whose highest-scoring class equals the label.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit.
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float(np.mean(predictions == np.asarray(labels)))}

# The Trainer was constructed without a metric function, so attach
# compute_metrics before evaluating; otherwise it is never called and no
# accuracy is reported.
trainer.compute_metrics = compute_metrics
trainer.evaluate()