<a href="https://colab.research.google.com/github/abhijeetk597/twitter-sentiment-analysis/blob/main/twitter_sentiment_analysis_using_BERT_Part_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Part 2: Fine Tuning BERT on train data using 🤗 Hugging_Face trainer

In [2]:
# Install necessary libraries
!pip install -q transformers[torch] datasets evaluate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m731.7/731.7 MB[0m [31m907.1 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.6/410.6 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.6/121.6 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.5/56.5 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m124.2/124.2 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m196.0/196.0 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m166.0/166.0 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.1/99.1 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━

In [4]:
# Import libraries
import pandas as pd
import numpy as np

from datasets import Dataset
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate

In [19]:
# import data saved in part 1
train_df = pd.read_csv("train_data_processed.csv")
test_df = pd.read_csv("test_data_processed.csv")

In [20]:
# renaming columns as required for fine-tuning
# we will use only 20% of train data for fine-tuning
train_df = train_df.sample(frac=1).reset_index(drop=True)
train_df = train_df.sample(int(len(train_df)*.2)).reset_index(drop=True)
train_df = train_df.rename(columns={"text_clean": "text", "Sentiment": "label"})
test_df = test_df.rename(columns={"text_clean": "text", "Sentiment": "label"})

In [21]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10799 entries, 0 to 10798
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    10799 non-null  object
 1   label   10799 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 168.9+ KB


In [26]:
# create dataset suitable for trainer
x = train_df[["label", "text"]].to_dict(orient="list")
train_dataset = Dataset.from_dict(x)
train_dataset[0]

{'label': 2,
 'text': 'children and the underprivileged are doing more proportionally to combat coronavirus than billionaires just saying you staying at home you using sanitizer amp face masks you sharing supplies you shopping to your needs supporting local business you too'}

In [27]:
y = test_df[["label", "text"]].to_dict(orient="list")
test_dataset = Dataset.from_dict(y)
test_dataset[0]

{'label': 0,
 'text': 'anyone planning to look after vulnerable elderly during coronavirus covid19 pandemic in uk some have no broadband tabletonline bankingonline shopping to reduce loneliness wld be a good ide'}

In [9]:
# tokenizer for bert
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [24]:
# preprocess function and mapping this function to dataset
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/10799 [00:00<?, ? examples/s]

Map:   0%|          | 0/3787 [00:00<?, ? examples/s]

In [11]:
# for dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [12]:
# ids and labels
id2label = {0: "NEGATIVE", 1: "NEUTRAL", 2: "POSITIVE"}
label2id = {"NEGATIVE": 0, "NEUTRAL": 1, "POSITIVE": 2}

In [13]:
# instantiate model
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=3, id2label=id2label, label2id=label2id
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# select metric
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [29]:
# train / fine-tune
training_args = TrainingArguments(
    output_dir="BERT_finetuned_model",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.378356,0.874307
2,0.252400,0.415714,0.878796


TrainOutput(global_step=676, training_loss=0.2337002274552746, metrics={'train_runtime': 291.6849, 'train_samples_per_second': 74.046, 'train_steps_per_second': 2.318, 'total_flos': 680073156391356.0, 'train_loss': 0.2337002274552746, 'epoch': 2.0})

In [38]:
from transformers import pipeline

classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
classifier("very true china has done a great job of more than 80000 people infected in china over 67000 have recovered according to data compiled by johns hopkins university there were 40 new confirmed cases of cor")

[{'label': 'POSITIVE', 'score': 0.9642297625541687}]

We achieved 87.87% accuracy on fine-tuning BERT with our train data.

Ref Hugging Face 🤗:
- [Create Dataset](https://huggingface.co/docs/datasets/en/create_dataset)
- [Sequence classification fine-tuning](https://huggingface.co/docs/transformers/tasks/sequence_classification)