<a href="https://colab.research.google.com/github/ajaybyan/Fine-Tune-HuggingFace-Model/blob/main/Tweet_Sentiment_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#! pip install transformers[torch] datasets

### Load Data

In [3]:
import pandas as pd
import numpy as np

In [4]:
train_df = pd.read_csv('Corona_NLP_train.csv',  encoding="ISO-8859-1")

In [5]:
train_df.isnull().sum()

UserName            0
ScreenName          0
Location         8590
TweetAt             0
OriginalTweet       0
Sentiment           0
dtype: int64

In [6]:
train_df.shape

(41157, 6)

In [7]:
train_df.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [8]:
train_df.Sentiment.unique()

array(['Neutral', 'Positive', 'Extremely Negative', 'Negative',
       'Extremely Positive'], dtype=object)

In [9]:
# taking sample of train data
train_df = train_df.sample(frac=0.7)
train_df.shape

(28810, 6)

In [10]:
test_df = pd.read_csv('Corona_NLP_test.csv',  encoding="ISO-8859-1")
test_df.shape

(3798, 6)

In [11]:
# label encoding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

train_df['Sentiment'] = le.fit_transform(train_df['Sentiment'])
test_df['Sentiment'] = le.transform(test_df['Sentiment'])

In [None]:
# Preprocess tweets to replace username and link with placeholder

def preprocess(text):
  new_text = ""
  for t in text.split(" "):
      t = '@user' if t.startswith('@') and len(t) > 1 else t
      t = 'http' if t.startswith('http') else t
      new_text = new_text+" "+ t
  return new_text.strip()

In [None]:
train_df['OriginalTweet'] = train_df['OriginalTweet'].apply(preprocess)
test_df['OriginalTweet'] = test_df['OriginalTweet'].apply(preprocess)

### Create Dataset Object

Select only OriginalTweets and there Sentiments to create the dataset.

In [45]:
from datasets import Dataset

# create dataset objects with the tweets and sentiments
train_ds = Dataset.from_dict({"text":train_df.OriginalTweet.tolist(), "labels":train_df.Sentiment.tolist()})
test_ds = Dataset.from_dict({"text":test_df.OriginalTweet.tolist(), "labels":test_df.Sentiment.tolist()})

In [46]:
train_ds[0]

{'text': '@user Can you make an announcement to the people.... Please stop being so nasty to us Grocery store workers. We are getting cussed and threatened for limiting items.  Thank you for everything you are doing for our State #HealthatHome #Positivit',
 'labels': 0}

### Tokenize Data

In [15]:
import transformers
from transformers import AutoTokenizer

In [44]:
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-xlm-roberta-base")

Downloading (…)lve/main/config.json:   0%|          | 0.00/652 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [47]:
# tokenize text

def tokenize(examples):

  return tokenizer(examples['text'], truncation=True)

train_ds = train_ds.map(tokenize)
test_ds = test_ds.map(tokenize)

Map:   0%|          | 0/28810 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/3798 [00:00<?, ? examples/s]

In [48]:
train_ds[0].keys()

dict_keys(['text', 'labels', 'input_ids', 'attention_mask'])

In [49]:
# convert to torch format
train_ds.set_format('torch', columns = ['input_ids', 'attention_mask', 'labels'])
test_ds.set_format('torch', columns = ['input_ids', 'attention_mask', 'labels'])

In [50]:
train_ds[0]

{'labels': tensor(0),
 'input_ids': tensor([     0,   1374,  65918,   4171,    398,   3249,    142,   3398,  85018,
            674,     47,     70,   3395,      5,     27,  30607,   7279,   8035,
            221,     24,   8946,     47,   1821,  14854,   3443,     53,   4343,
         133325,      5,   1401,    621,  20949,    314,      7,   5281,    136,
         120332,     33,    297,    100,  17475,    214,  55769,      5,  25689,
            398,    100,  26818,    398,    621,  20594,    100,   2446,  22836,
            468, 177109,    257,  55376,    468,   7192,  61054,   3760,      2]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])}

### Training - FineTuning

In [51]:
# match label ids to label
id2label = {0: "Extremely Negative", 1: "Extremely Positive", 2: "Negative", 3:"Neutral", 4:"Positive"}
label2id = {"Extremely Negative": 0, "Extremely Positive": 1, "Negative":2, "Neutral":3, "Positive":4}

In [52]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="micro")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}


In [53]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [55]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained('cardiffnlp/twitter-xlm-roberta-base', num_labels = 5, id2label=id2label, label2id=label2id)

Downloading pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [56]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        warmup_steps=500,
        evaluation_strategy="epoch",
        logging_dir="./logs",
        learning_rate=5e-5,
        weight_decay = 0.1,
    )

In [57]:
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
    data_collator = data_collator
)

In [58]:
trainer.train()

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.7568,0.71,0.730384,0.730384,0.730384,0.730384
2,0.4503,0.529665,0.795682,0.795682,0.795682,0.795682
3,0.2759,0.563909,0.826751,0.826751,0.826751,0.826751


TrainOutput(global_step=5403, training_loss=0.5664977956740432, metrics={'train_runtime': 2116.4722, 'train_samples_per_second': 40.837, 'train_steps_per_second': 2.553, 'total_flos': 3679208736815700.0, 'train_loss': 0.5664977956740432, 'epoch': 3.0})

In [59]:
trainer.evaluate(eval_dataset=test_ds)

{'eval_loss': 0.5639094114303589,
 'eval_accuracy': 0.8267509215376514,
 'eval_f1': 0.8267509215376514,
 'eval_precision': 0.8267509215376514,
 'eval_recall': 0.8267509215376514,
 'eval_runtime': 18.4416,
 'eval_samples_per_second': 205.948,
 'eval_steps_per_second': 3.254,
 'epoch': 3.0}