In [2]:
!pip install -U transformers datasets huggingface_hub emoji
!sudo apt-get install git-lfs --yes

Collecting transformers
  Downloading transformers-4.48.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting emoji
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading transformers-4.48.0-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.3).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [1]:
# import all necessary dependencies

import pandas as pd
import re
import emoji
import string
import torch
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification, TrainingArguments, Trainer, pipeline
from huggingface_hub import HfFolder, notebook_login
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
# give notebook access to huggingface account
# make sure to use token with 'write' access
# storing git credential isn't neccessary

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
# model to use
model_id = "roberta-base"

# repository name created in huggingface account
repository_id = "atharva-m/RoBERTa_Sentiment_Analysis"

# load dataset
train_data = pd.read_csv('/content/train.csv')

In [None]:
# check null values

train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      31962 non-null  int64 
 1   label   31962 non-null  int64 
 2   tweet   31962 non-null  object
dtypes: int64(2), object(1)
memory usage: 749.2+ KB


In [None]:
train_data.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [None]:
# thoroughly inspect tweets
train_data[10:50]

# lot of tweets contain non-UTF characters which the model cannot understand

Unnamed: 0,id,label,tweet
10,11,0,â #ireland consumer price index (mom) climb...
11,12,0,we are so selfish. #orlando #standwithorlando ...
12,13,0,i get to see my daddy today!! #80days #getti...
13,14,1,@user #cnn calls #michigan middle school 'buil...
14,15,1,no comment! in #australia #opkillingbay #se...
15,16,0,ouch...junior is angryð#got7 #junior #yugyo...
16,17,0,i am thankful for having a paner. #thankful #p...
17,18,1,retweet if you agree!
18,19,0,its #friday! ð smiles all around via ig use...
19,20,0,"as we all know, essential oils are not made of..."


In [5]:
# create a preprocess function

def preprocess_text(text):
    """
    Preprocesses text by removing emojis, punctuations, links, mentions,
    non-UTF8 characters, cleaning hashtags, filtering special characters,
    and removing multiple spaces.

    Args:
        text: The text to be preprocessed.

    Returns:
        The preprocessed text.
    """

    # Remove emojis
    text = emoji.replace_emoji(text, replace="")

    # Remove punctuations, links, mentions, and new line characters
    text = text.replace('\r', '').replace('\n', ' ').replace('\n', ' ').lower()
    text = re.sub(r"(?:\@|https?\://)\S+", "", text)
    text = re.sub(r'[^\x00-\x7f]',r'', text)
    banned_list = string.punctuation + 'Ã'+'±'+'ã'+'¼'+'â'+'»'+'§'
    table = str.maketrans('', '', banned_list)
    text = text.translate(table)

    # Clean hashtags
    new_tweet = " ".join(word.strip() for word in re.split('#(?!(?:hashtag)\b)[\w-]+(?=(?:\s+#[\w-]+)*\s*$)', text))
    text = " ".join(word.strip() for word in re.split('#|_', new_tweet))

    # Filter special characters
    sent = []
    for word in text.split(' '):
        if ('$' in word) | ('&' in word):
            sent.append('')
        else:
            sent.append(word)
    text = ' '.join(sent)

    # Remove multiple spaces
    text = re.sub("\s\s+" , " ", text)

    return text

In [6]:
# reorder columns to get ['input_ids', 'attention_mask', 'label'] format which will be used later
reodered_columns = ['id', 'tweet', 'label']
new_train = train_data[reodered_columns]

# drop 'id' column as it is unneccessary
new_train.drop('id', axis=1)

Unnamed: 0,tweet,label
0,@user when a father is dysfunctional and is s...,0
1,@user @user thanks for #lyft credit i can't us...,0
2,bihday your majesty,0
3,#model i love u take with u all the time in ...,0
4,factsguide: society now #motivation,0
...,...,...
31957,ate @user isz that youuu?ðððððð...,0
31958,to see nina turner on the airwaves trying to...,0
31959,listening to sad songs on a monday morning otw...,0
31960,"@user #sikh #temple vandalised in in #calgary,...",1


In [7]:
# apply the preprocess function to all the tweets

new_train['tweet'] = new_train['tweet'].astype(str).apply(preprocess_text)

In [None]:
new_train.head()

Unnamed: 0,id,tweet,label
0,1,when a father is dysfunctional and is so selfi...,0
1,2,thanks for lyft credit i cant use cause they d...,0
2,3,bihday your majesty,0
3,4,model i love u take with u all the time in ur,0
4,5,factsguide society now motivation,0


In [9]:
# initialize tokenizer
tokenizer = RobertaTokenizerFast.from_pretrained(model_id)

# split the clean dataset into training and testing sets
train_ds, val_ds = train_test_split(new_train, test_size=0.2, random_state=42, stratify=new_train['label'])

# convert the datasets to 'Dataset' object
train_dataset = Dataset.from_pandas(train_ds)
val_dataset = Dataset.from_pandas(val_ds)

# create a tokenizer function to tokenize the 'tweet' column
def tokenize(batch):
  return tokenizer(batch["tweet"], padding=True, truncation=True, max_length=256)

# apply the tokenizer function to both datasets
tokenized_train = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))
tokenized_val = val_dataset.map(tokenize, batched=True, batch_size=len(val_dataset))

# set the format to 'torch'
tokenized_train.set_format("torch", columns=["input_ids", "attention_mask", "label"])
tokenized_val.set_format("torch", columns=["input_ids", "attention_mask", "label"])

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Map:   0%|          | 0/25569 [00:00<?, ? examples/s]

Map:   0%|          | 0/6393 [00:00<?, ? examples/s]

In [None]:
# initialize roberta model
model = RobertaForSequenceClassification.from_pretrained(model_id)

# define training arguments
training_args = TrainingArguments(
    output_dir=repository_id,
    num_train_epochs=5,
    per_device_train_batch_size=50,
    per_device_eval_batch_size=50,
    evaluation_strategy="epoch",
    logging_dir=f"{repository_id}/logs",
    logging_strategy="steps",
    logging_steps=10,
    learning_rate=5e-5,
    weight_decay=0.0000001,
    warmup_steps=500,
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=2,
    report_to="tensorboard",
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token(),
)

# define 'Trainer' to train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# start training

trainer.train()

Epoch,Training Loss,Validation Loss
1,0.1276,0.111564
2,0.1097,0.099421
3,0.0662,0.116475
4,0.0542,0.14475
5,0.019,0.162977


TrainOutput(global_step=2560, training_loss=0.08073847150626534, metrics={'train_runtime': 1131.2349, 'train_samples_per_second': 113.014, 'train_steps_per_second': 2.263, 'total_flos': 2890716887480400.0, 'train_loss': 0.08073847150626534, 'epoch': 5.0})

In [None]:
# choose the best model

trainer.evaluate()

{'eval_loss': 0.09942052513360977,
 'eval_runtime': 14.5877,
 'eval_samples_per_second': 438.246,
 'eval_steps_per_second': 8.775,
 'epoch': 5.0}

In [None]:
# save our tokenizer and create model card
tokenizer.save_pretrained(repository_id)
trainer.create_model_card()

In [None]:
# push the model to huggingface repository

trainer.push_to_hub()

events.out.tfevents.1726000277.8984ad1f964d.707.0:   0%|          | 0.00/60.6k [00:00<?, ?B/s]

events.out.tfevents.1726001431.8984ad1f964d.707.1:   0%|          | 0.00/359 [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/atharva-m/RoBERTa_Sentiment_Analysis/commit/c326a1501bd0250a0bfc8f13490d06e7800a145d', commit_message='End of training', commit_description='', oid='c326a1501bd0250a0bfc8f13490d06e7800a145d', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# create a custom pipeline to test the model
def model_pipeline(text):
  # define pipeline
  pip = pipeline('text-classification', repository_id, device='cuda')

  # preprocess the input
  preprocessed_text = preprocess_text(text)

  # get prediction
  result = pip(preprocessed_text)
  predicted_label = result[0]["label"]
  print(f"Predicted label: {predicted_label}")

# define input
text = "The Lakers game at the weekend was amazing"

# call model
model_pipeline(text)

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Predicted label: LABEL_0


In [None]:
text = "Women should not be allowed to vote"

model_pipeline(text)

Predicted label: LABEL_1


In [12]:
# create a custom function to define and derive the evaluation metrics
def compute_metrics(eval_pred):
    # unpack the tuple of predictions and true labels from eval_pred
    logits, labels = eval_pred

    # convert the logits (raw model outputs) into class predictions by taking the argmax
    # the result is a tensor with predicted class indices
    predictions = torch.argmax(torch.tensor(logits), axis=1).numpy()

    # get the evaluation metrics
    accuracy = accuracy_score(labels, predictions)
    precision = precision_score(labels, predictions, average="weighted")
    recall = recall_score(labels, predictions, average="weighted")
    f1 = f1_score(labels, predictions, average="weighted")

    # return the metrics
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

In [17]:
# define temporary training arguments (no need to train again)
training_args = TrainingArguments(
    output_dir="./results",
    per_device_eval_batch_size=50,
    logging_dir="./logs",
    report_to="none"
)

# get model from repository
model = RobertaForSequenceClassification.from_pretrained(repository_id)

# initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics,
)

# evaluate the model
results = trainer.evaluate()
print(results)

{'eval_loss': 0.09942052513360977, 'eval_model_preparation_time': 0.0058, 'eval_accuracy': 0.9613639918661036, 'eval_precision': 0.9626825763068382, 'eval_recall': 0.9613639918661036, 'eval_f1': 0.9619595110644236, 'eval_runtime': 14.1619, 'eval_samples_per_second': 451.422, 'eval_steps_per_second': 9.038}
