In [None]:
# Fine Tuning pretrained BERT for Sentiment Classification using Transformers in Python
# Source: https://medium.com/nerd-for-tech/fine-tuning-pretrained-bert-for-sentiment-classification-using-transformers-in-python-931ed142e37 Retrieved in July 2022

In [None]:
# import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 9.8 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 14.5 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 80.8 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 83.0 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstal

In [None]:
# loading the train dataset
# (relative path: the CSV has to be present in the notebook's working directory)
train=pd.read_csv("CovidMaskTrainingDatasetPre-processed.csv")


In [None]:
# loading the test dataset
# Exactly one of the two period files is active at a time; the Period2 line is kept
# commented out as the alternative input.
#test=pd.read_csv("CovidMaskPeriod2Pre-processed.csv")
test=pd.read_csv("CovidMaskPeriod1Pre-processed.csv")


In [None]:
test.head()

In [None]:
train.head()

In [None]:
# function to convert the sentiment score to 0,1 or 2
def getSentimentScore(score):
    """Bucket a numeric sentiment score into a class id.

    Returns:
        0 when ``score`` is below zero, 1 when it equals zero, and 2 in every
        other case.
    """
    if score < 0:
        return 0
    return 1 if score == 0 else 2
# Run every value of train['Sentiment'] through getSentimentScore and store the
# resulting class id (0, 1 or 2) in a new column.
train['TextBlob_SentimentNumber'] = train['Sentiment'].apply(getSentimentScore)

In [None]:
train

Unnamed: 0,text,Sentiment,cleanText,textLemma,TextBlob_SentimentNumber
0,@emeraldtyger @NYGovCuomo Open your home to a ...,-0.162338,open home covid positive elder r supposed go t...,open home covid positive elder r supposed go t...,0
1,"Saudi Arabia records 48 coronavirus deaths, 3,...",0.136364,saudi arabia records 48 coronavirus deaths 394...,saudi arabia record 48 coronavirus death 3943 ...,2
2,@venusin12th There’s a website you signup for ...,0.400000,there’s website signup pay subscription it’s c...,there s website signup pay subscription it s c...,2
3,@WarRoomPandemic @VOG_2020 Exactly! Why NO maj...,0.140625,exactly major news agency ask question virus c...,exactly major news agency ask question virus c...,2
4,i just wish this corona shit go away &amp; eve...,-0.200000,wish corona shit go away everybody stop dying,wish corona shit go away everybody stop dying,0
...,...,...,...,...,...
584334,"10,286,768 #Coronavirus cases as of 2020-06-29...",0.000000,10286768 coronavirus cases 20200629 104001,10286768 coronavirus case 20200629 104001,1
584335,Listen to the newest episode of The Marketplac...,0.000000,listen newest episode marketplace ideas podcas...,listen newest episode marketplace idea podcast...,1
584336,What measures are you taking to reassure custo...,0.225000,measures taking reassure customers youre safe ...,measure taking reassure customer youre safe bu...,2
584337,A study found that if the nation continues to ...,0.000000,study found nation continues see elevated infe...,study found nation continues see elevated infe...,1


In [None]:
# Pull the model inputs (lemmatized text) and targets (class ids) out of the
# training frame as plain Python lists
trainText = train['textLemma'].to_numpy().tolist()
trainLabel = train['TextBlob_SentimentNumber'].to_numpy().tolist()


In [None]:
# split 80(training)/20(validation)
# The split is always taken from the full `train` frame. trainText/trainLabel are
# reassigned by this very cell, so using them as inputs would make a second run of
# the cell split the already reduced training portion again.
from sklearn.model_selection import train_test_split
allText = train['textLemma'].values.tolist()
allLabel = train['TextBlob_SentimentNumber'].values.tolist()
# stratify keeps the class proportions equal in both parts; random_state fixes the shuffle
trainText, validationText, trainLabel, validationLabel = train_test_split(
    allText, allLabel, test_size=.2, random_state=42, stratify=allLabel
)


In [None]:
# importing libraries
import torch
from torch.utils.data import Dataset
from transformers import DistilBertTokenizerFast,DistilBertForSequenceClassification
from transformers import Trainer,TrainingArguments

In [None]:
#	Loading the pretrained tokenizer  
# num_labels describes the classification head, not the tokenizer, so it is not passed here.
# NOTE(review): the textLemma samples shown above look lower-cased while this is the
# cased checkpoint — confirm that this pairing is intended.
modelTokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')

Downloading tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

In [None]:
# Tokenize the training and validation texts; both calls use identical options
# (padding, truncation, PyTorch tensors as output)
trainEncoding = modelTokenizer(
    trainText,
    padding=True,
    truncation=True,
    return_tensors='pt',
)
validationEncoding = modelTokenizer(
    validationText,
    padding=True,
    truncation=True,
    return_tensors='pt',
)

In [None]:
#create Pytorch datasets using encodings and labels 
class SentimentDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with one label per example.

    Args:
        encodings: mapping whose values can be indexed by example position
            (tensors or nested lists), e.g. the tokenizer output.
        labels: sequence holding one label per example; its length defines
            the dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _toTensor(value):
        """Return ``value`` as a tensor that does not share storage with its source."""
        # torch.tensor(<tensor>) emits a copy-construct UserWarning; clone().detach()
        # is the recommended way to copy an existing tensor.
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with every encoding field of example ``idx`` plus its label under 'labels'."""
        item = {key: self._toTensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._toTensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)


In [None]:
# Wrap the training and validation encodings together with their labels in
# SentimentDataset objects
trainDataset = SentimentDataset(encodings=trainEncoding, labels=trainLabel)
validationDataset = SentimentDataset(encodings=validationEncoding, labels=validationLabel)


In [None]:
# creating a function for metrics calculation
from sklearn.metrics import accuracy_score, f1_score
def metricsFunction(label):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        label: pair that unpacks into (prediction scores, true labels). The
            predicted class of each row is the argmax of its scores along axis 1.

    Returns:
        dict with the keys "accuracy" and "f1_score".
    """
    scores, references = label
    predictedClass = np.argmax(scores, axis=1)
    return {
        "accuracy": accuracy_score(y_true=references, y_pred=predictedClass),
        "f1_score": f1_score(references, predictedClass, average='weighted'),
    }

In [None]:
# defining training arguments
trainingArgs = TrainingArguments(
    output_dir='./res',              # outcome directory (checkpoints are written here)
    evaluation_strategy="steps",     # evaluate periodically during training, not only at the end
    num_train_epochs=3,              # training epochs number
    per_device_train_batch_size=32,  # training batch size per device
    per_device_eval_batch_size=64,   # evaluation batch size
    learning_rate=3e-5,
    warmup_steps=500,                # learning rate warmup steps number
    weight_decay=0.01,               # weight decay
    logging_dir='./logs4',           # logs directory
    load_best_model_at_end=True,     # reload the best checkpoint once training finishes
    # The logged run wrote a checkpoint every 500 steps for ~43.8k steps; without a
    # limit all of them stay on disk. Keep only the most recent ones (the best
    # checkpoint is retained by the Trainer when load_best_model_at_end is set).
    save_total_limit=2,
)

In [None]:
# Load pretrained DistilBERT with a sequence-classification head for 3 labels
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-cased", num_labels=3)
# Assemble the Trainer from the model, the training arguments, both datasets
# and the metric callback
trainer = Trainer(
    args=trainingArgs,
    model=model,
    compute_metrics=metricsFunction,
    train_dataset=trainDataset,
    eval_dataset=validationDataset,
)
# Launch fine-tuning; as the last expression of the cell its result is displayed
trainer.train()

Downloading pytorch_model.bin:   0%|          | 0.00/251M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier

Step,Training Loss,Validation Loss,Accuracy,F1 Score
500,0.8444,0.583558,0.778382,0.779498
1000,0.5319,0.48824,0.828679,0.828548
1500,0.4872,0.450466,0.844466,0.842645
2000,0.4566,0.430501,0.854528,0.854165
2500,0.4342,0.419031,0.853536,0.854467
3000,0.4113,0.397757,0.865344,0.86536
3500,0.4014,0.393736,0.867295,0.8675
4000,0.4036,0.391761,0.86768,0.867917
4500,0.3988,0.394442,0.869862,0.869659
5000,0.3954,0.37899,0.874166,0.873887


***** Running Evaluation *****
  Num examples = 116868
  Batch size = 64
Saving model checkpoint to ./res/checkpoint-500
Configuration saved in ./res/checkpoint-500/config.json
Model weights saved in ./res/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 116868
  Batch size = 64
Saving model checkpoint to ./res/checkpoint-1000
Configuration saved in ./res/checkpoint-1000/config.json
Model weights saved in ./res/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 116868
  Batch size = 64
Saving model checkpoint to ./res/checkpoint-1500
Configuration saved in ./res/checkpoint-1500/config.json
Model weights saved in ./res/checkpoint-1500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 116868
  Batch size = 64
Saving model checkpoint to ./res/checkpoint-2000
Configuration saved in ./res/checkpoint-2000/config.json
Model weights saved in ./res/checkpoint-2000/pytorch_model.bin
***** Running Evaluation *****
  Nu

TrainOutput(global_step=43827, training_loss=0.32427284250645205, metrics={'train_runtime': 28689.8478, 'train_samples_per_second': 48.882, 'train_steps_per_second': 1.528, 'total_flos': 4.644432874173773e+16, 'train_loss': 0.32427284250645205, 'epoch': 3.0})

In [None]:
# evaluate the model using the validation dataset
# (no dataset is passed here, so the eval_dataset given to the Trainer is used;
# the returned metrics dict is displayed as the cell output)
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 116868
  Batch size = 64


{'eval_loss': 0.32451164722442627,
 'eval_accuracy': 0.8900212205222987,
 'eval_f1_score': 0.8901032899650411,
 'eval_runtime': 230.0055,
 'eval_samples_per_second': 508.11,
 'eval_steps_per_second': 7.943,
 'epoch': 3.0}

In [None]:
# The test frame has no labels; add a placeholder Sentiment column (all zeros)
# because SentimentDataset needs one label per example
test['Sentiment'] = 0
# texts to classify and their placeholder labels as plain lists
testText = test['textLemma'].to_numpy().tolist()
testLabel = test['Sentiment'].to_numpy().tolist()
# tokenize the test texts with the same options as the training/validation sets
testEncoding = modelTokenizer(
    testText,
    padding=True,
    truncation=True,
    return_tensors='pt',
)
# wrap encodings and placeholder labels in a SentimentDataset
testDataset = SentimentDataset(encodings=testEncoding, labels=testLabel)
# run the fine-tuned model over the test set
preds = trainer.predict(test_dataset=testDataset)

***** Running Prediction *****
  Num examples = 366266
  Batch size = 64


In [None]:
# Softmax over dimension 1 turns the first element of preds (the raw model
# outputs) into per-class probabilities for every test example
probabilities = torch.softmax(torch.from_numpy(preds[0]), dim=1)
# back to a numpy array for the pandas steps that follow
predictions = probabilities.numpy()

In [None]:
predictions


array([[0.13382936, 0.61746526, 0.24870543],
       [0.9312634 , 0.00341962, 0.06531709],
       [0.05926002, 0.01155891, 0.9291811 ],
       ...,
       [0.02905289, 0.9338011 , 0.03714601],
       [0.01483336, 0.23316966, 0.75199693],
       [0.00454289, 0.00394607, 0.99151105]], dtype=float32)

In [None]:
# the numpy array probabilities are converted into a data frame
# Columns follow the class-id order 0/1/2 (Negative, Neutral, Positive).
# Despite its name, probabilitiesArray is a pandas DataFrame.
probabilitiesArray = pd.DataFrame(predictions,columns=['Negative','Neutral','Positive'])
probabilitiesArray.head()

Unnamed: 0,Negative,Neutral,Positive
0,0.133829,0.617465,0.248705
1,0.931263,0.00342,0.065317
2,0.05926,0.011559,0.929181
3,0.034537,0.003225,0.962238
4,0.059267,0.853316,0.087417


In [None]:
# function to generate the sentiment label
def labelFunction(x):
    """Translate a class id into its sentiment name.

    Returns 'Negative' for 0, 'Neutral' for 1 and 'Positive' for any other value.
    """
    if x == 0:
        return 'Negative'
    return 'Neutral' if x == 1 else 'Positive'
# adding the sentiment labels into the sentiment column in test dataset
# argmax over axis 1 gives, per row, the index of the largest probability (the class id)
resultSentiments = np.argmax(predictions,axis=1)
# overwrite the Sentiment column with the class ids, then map each id to its name
test['Sentiment'] = resultSentiments
test['Sentiment'] = test['Sentiment'].map(labelFunction)
test.head()

Unnamed: 0,id,text,created_at,cleanText,textLemma,Sentiment
0,1277300891335491584,"#Weakness \n""While Trump almost invariably esc...",Jun,weakness trump almost invariably eschews masks...,weakness trump almost invariably eschews mask ...,Neutral
1,1277300871928266752,@JoeBiden I don't think I have ever been so di...,Jun,dont think ever disappointed presidential cand...,dont think ever disappointed presidential cand...,Negative
2,1277300855763582976,ALSOO the mask mostly prevents people from spr...,Jun,alsoo mask mostly prevents people spreading pr...,alsoo mask mostly prevents people spreading pr...,Positive
3,1277300855134273536,@bga1228 @Im_YoPusha @Lisaaa40 @Bethenny I’m n...,Jun,i’m saying exactly i’m saying people caring se...,i m saying exactly i m saying people caring se...,Positive
4,1277300821839970309,@timmyvoe @piersmorgan Have experience in deal...,Jun,experience dealing virus outbreaks mask wearin...,experience dealing virus outbreak mask wearing...,Neutral


In [None]:
# concatenation of test and probabilitiesArray datasets
# `test` is reassigned here, so a second run of this cell would append the
# probability columns again. Dropping any columns left from a previous run first
# keeps the cell idempotent; on the first run nothing is dropped.
test = pd.concat(
    [test.drop(columns=probabilitiesArray.columns, errors='ignore'), probabilitiesArray],
    axis=1,       # side by side, aligned on the row index
    join='inner', # keep only rows present in both frames
)

In [None]:
# calculate the maximum value between the 3 sentiments
# (row-wise maximum over the three probability columns)
test["SentimentScoreMax"]=test[["Negative", "Neutral","Positive"]].max(axis=1)

In [None]:
# generate the sentiments level (intensity) and add in a new column called SentimentLevel
def getScore(test):
    """Build the sentiment-intensity label for one row.

    Args:
        test: row-like object indexable by 'Sentiment' (one of 'Negative',
            'Neutral', 'Positive') and 'SentimentScoreMax' (numeric score).

    Returns:
        'High <Sentiment>' when the score is at least 0.5, 'Low <Sentiment>' when
        it is below 0.5, and NaN when the sentiment is not one of the three known
        names or the score is NaN.
    """
    sentiment = test['Sentiment']
    if sentiment not in ('Negative', 'Neutral', 'Positive'):
        return np.nan
    score = test['SentimentScoreMax']
    # A score of exactly 0.5 counts as "High"; with strict comparisons on both
    # sides it matched no branch and produced NaN.
    if score >= 0.5:
        return 'High ' + sentiment
    if score < 0.5:
        return 'Low ' + sentiment
    # Only reached when the score is NaN (both comparisons are False).
    # np.nan is used because the np.NaN alias no longer exists in NumPy 2.
    return np.nan
        
# Evaluate getScore once per row (axis=1) and store the intensity label
test['SentimentLevel'] = test.apply(getScore, axis=1)

In [None]:
test.head()

Unnamed: 0,id,text,created_at,cleanText,textLemma,Sentiment,Negative,Neutral,Positive,SentimentScoreMax,SentimentLevel
0,1277300891335491584,"#Weakness \n""While Trump almost invariably esc...",Jun,weakness trump almost invariably eschews masks...,weakness trump almost invariably eschews mask ...,Neutral,0.133829,0.617465,0.248705,0.617465,High Neutral
1,1277300871928266752,@JoeBiden I don't think I have ever been so di...,Jun,dont think ever disappointed presidential cand...,dont think ever disappointed presidential cand...,Negative,0.931263,0.00342,0.065317,0.931263,High Negative
2,1277300855763582976,ALSOO the mask mostly prevents people from spr...,Jun,alsoo mask mostly prevents people spreading pr...,alsoo mask mostly prevents people spreading pr...,Positive,0.05926,0.011559,0.929181,0.929181,High Positive
3,1277300855134273536,@bga1228 @Im_YoPusha @Lisaaa40 @Bethenny I’m n...,Jun,i’m saying exactly i’m saying people caring se...,i m saying exactly i m saying people caring se...,Positive,0.034537,0.003225,0.962238,0.962238,High Positive
4,1277300821839970309,@timmyvoe @piersmorgan Have experience in deal...,Jun,experience dealing virus outbreaks mask wearin...,experience dealing virus outbreak mask wearing...,Neutral,0.059267,0.853316,0.087417,0.853316,High Neutral


In [None]:
# saving the testing dataset with predicted sentiments in a CSV file
# mode='w' overwrites an existing file; index=False leaves the row index out.
# The commented-out line is the Period2 counterpart of the active one.
# NOTE(review): newer pandas releases spell this keyword `lineterminator` —
# confirm which spelling the installed pandas version accepts.
test.to_csv('CovidMaskPeriod1Predicted.csv', mode='w', encoding='utf-8', index=False, line_terminator='\n')
#test.to_csv('CovidMaskPeriod2Predicted.csv', mode='w', encoding='utf-8', index=False, line_terminator='\n')




In [None]:
from google.colab import files

# Download the CSV written by test.to_csv(...); the name has to match that file
# exactly, otherwise the download fails on a fresh runtime.
files.download('CovidMaskPeriod1Predicted.csv')
#files.download('CovidMaskPeriod2Predicted.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>