In [1]:
import torch
import pytorch_lightning as pl
from transformers import (
    AutoTokenizer,
    AutoModel,
    AdamW,
    get_cosine_schedule_with_warmup
)
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
import torch.nn as nn
import torch.nn.functional as F
from torchmetrics.functional import accuracy
from pytorch_lightning.callbacks import ModelCheckpoint

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch; 
print(torch.cuda.is_available(), torch.cuda.get_device_name(0))

True NVIDIA GeForce RTX 3050 Laptop GPU


loading the dataset

In [3]:
tweet_dataset = load_dataset('tweet_eval', 'sentiment')

In [4]:
print(tweet_dataset['train'][58])
# print(tweet_dataset)

{'text': 'Don\\u2019t forget Mitch Daniels is going to be on Steven Colbert\\u2019s show Thursday.  Think this will come up as a topic?', 'label': 1}


In [5]:
import pandas as pd

df_train = pd.read_csv("/home/AGFirass/Documents/Github/Fine-tuning-RoBERTa---Unhealthy-Comment-Corpus/second_data/second_train.csv", encoding="ISO-8859-1")
df_test = pd.read_csv("/home/AGFirass/Documents/Github/Fine-tuning-RoBERTa---Unhealthy-Comment-Corpus/second_data/second_test.csv", encoding="ISO-8859-1")

In [6]:
df_train.head(5)

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26


In [7]:
df_train = df_train[['text', 'sentiment']]
df_train = df_train.rename(columns={'sentiment': 'label'})

In [8]:
sentiment_mapping = {
    "negative": 0,
    "neutral": 1,
    "positive": 2
}

df_train["label"] = df_train["label"].map(sentiment_mapping)

In [9]:
df_train.head(20)

Unnamed: 0,text,label
0,"I`d have responded, if I were going",1
1,Sooo SAD I will miss you here in San Diego!!!,0
2,my boss is bullying me...,0
3,what interview! leave me alone,0
4,"Sons of ****, why couldn`t they put them on t...",0
5,http://www.dothebouncy.com/smf - some shameles...,1
6,2am feedings for the baby are fun when he is a...,2
7,Soooo high,1
8,Both of you,1
9,Journey!? Wow... u just became cooler. hehe....,2


In [10]:
df_test

Unnamed: 0,textID,text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,morning,0-20,Afghanistan,38928346.0,652860.0,60.0
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,noon,21-30,Albania,2877797.0,27400.0,105.0
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,night,31-45,Algeria,43851044.0,2381740.0,18.0
3,01082688c6,happy bday!,positive,morning,46-60,Andorra,77265.0,470.0,164.0
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,noon,60-70,Angola,32866272.0,1246700.0,26.0
...,...,...,...,...,...,...,...,...,...
4810,,,,,,,,,
4811,,,,,,,,,
4812,,,,,,,,,
4813,,,,,,,,,


In [11]:
df_test = df_test[['text', 'sentiment']]
df_test = df_test.rename(columns={'sentiment': 'label'})

In [12]:
sentiment_mapping = {
    "negative": 0,
    "neutral": 1,
    "positive": 2
}

df_test["label"] = df_test["label"].map(sentiment_mapping)

In [13]:
print(df_test["label"].isna().sum())

1281


In [14]:
df_test = df_test.dropna(subset=["label"])

In [15]:
df_test["label"] = df_test["label"].astype(int)

In [16]:
df_test.head(15)

Unnamed: 0,text,label
0,Last session of the day http://twitpic.com/67ezh,1
1,Shanghai is also really exciting (precisely -...,2
2,"Recession hit Veronique Branquinho, she has to...",0
3,happy bday!,2
4,http://twitpic.com/4w75p - I like it!!,2
5,that`s great!! weee!! visitors!,2
6,I THINK EVERYONE HATES ME ON HERE lol,0
7,"soooooo wish i could, but im in school and my...",0
8,and within a short time of the last clue all ...,1
9,What did you get? My day is alright.. haven`...,1


In [17]:
df_train = df_train[['text', 'label']]
df_test = df_test[['text', 'label']]

In [18]:
merged_train = pd.concat([tweet_dataset['train'].to_pandas(), df_train], ignore_index=True)
merged_test = pd.concat([tweet_dataset['test'].to_pandas(), df_test], ignore_index=True)

preprocessing & tokenization

In [19]:
model_name = 'roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)

class TweetDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        text = self.data[idx]['text']
        label = self.data[idx]['label']
        
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [20]:
merged_train = merged_train.to_dict(orient="records")
merged_test = merged_test.to_dict(orient="records")

creating splits

In [21]:
train_dataset = TweetDataset(merged_train, tokenizer)
test_dataset = TweetDataset(merged_test, tokenizer)
val_dataset = TweetDataset(tweet_dataset['validation'], tokenizer)

data module

In [22]:
class TweetDataModule(pl.LightningDataModule):
    def __init__(self, batch_size=16):
        super().__init__()
        self.batch_size = batch_size
        
    def setup(self, stage=None):
        pass
        
    def train_dataloader(self):
        return DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=4)
    
    def val_dataloader(self):
        return DataLoader(val_dataset, batch_size=self.batch_size, num_workers=4)
    
    def test_dataloader(self):
        return DataLoader(test_dataset, batch_size=self.batch_size, num_workers=4)

model

In [23]:
class SentimentClassifier(pl.LightningModule):
    def __init__(self, num_classes=3):
        super().__init__()
        self.roberta = AutoModel.from_pretrained(model_name)
        self.classifier = nn.Linear(self.roberta.config.hidden_size, num_classes)
        self.loss_fn = nn.CrossEntropyLoss()
        
    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]  # Use [CLS] token
        logits = self.classifier(pooled_output)
        
        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)
            
        return loss, logits
    
    def training_step(self, batch, batch_idx):
        loss, logits = self(**batch)
        self.log('train_loss', loss)
        return loss
    
    def validation_step(self, batch, batch_idx):
        loss, logits = self(**batch)
        preds = torch.argmax(logits, dim=1)
        acc = accuracy(preds, batch['labels'], task='multiclass', num_classes=3)
        self.log_dict({'val_loss': loss, 'val_acc': acc}, prog_bar=True)
        return loss
    
    def test_step(self, batch, batch_idx):
        loss, logits = self(**batch)
        preds = torch.argmax(logits, dim=1)
        acc = accuracy(preds, batch['labels'], task='multiclass', num_classes=3)
        self.log_dict({'test_loss': loss, 'test_acc': acc})
        return loss
    
    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=2e-5)
        return optimizer

training

In [24]:
dm = TweetDataModule(batch_size=8)
model = SentimentClassifier()

checkpoint_callback = ModelCheckpoint(
    monitor='val_acc',
    mode='max',
    save_top_k=1,
    filename='best-checkpoint'
)

trainer = pl.Trainer(
    max_epochs=5,
    accelerator='auto',
    precision='16-mixed',
    callbacks=[checkpoint_callback]
)

trainer.fit(model, dm)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/AGFirass/Documents/Github/Fine-tuning-RoBERTa---Unhealthy-Comment-Corpus/venv/lib/python3.13/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one o

Epoch 0:  22%|██▏       | 1980/9137 [04:04<14:42,  8.11it/s, v_num=3]      

ValueError: Caught ValueError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/AGFirass/Documents/Github/Fine-tuning-RoBERTa---Unhealthy-Comment-Corpus/venv/lib/python3.13/site-packages/torch/utils/data/_utils/worker.py", line 349, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
  File "/home/AGFirass/Documents/Github/Fine-tuning-RoBERTa---Unhealthy-Comment-Corpus/venv/lib/python3.13/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
            ~~~~~~~~~~~~^^^^^
  File "/tmp/ipykernel_23079/3387846227.py", line 17, in __getitem__
    encoding = self.tokenizer(
        text,
    ...<3 lines>...
        return_tensors='pt'
    )
  File "/home/AGFirass/Documents/Github/Fine-tuning-RoBERTa---Unhealthy-Comment-Corpus/venv/lib/python3.13/site-packages/transformers/tokenization_utils_base.py", line 2877, in __call__
    encodings = self._call_one(text=text, text_pair=text_pair, **all_kwargs)
  File "/home/AGFirass/Documents/Github/Fine-tuning-RoBERTa---Unhealthy-Comment-Corpus/venv/lib/python3.13/site-packages/transformers/tokenization_utils_base.py", line 2937, in _call_one
    raise ValueError(
    ...<2 lines>...
    )
ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).


evaluation

In [None]:
trainer.test(dataloaders=dm.test_dataloader())

inference example

In [12]:
def predict_sentiment(text, model, tokenizer):
    encoding = tokenizer(
        text,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    
    with torch.no_grad():
        outputs = model.roberta(**encoding)
        pooled_output = outputs.last_hidden_state[:, 0, :]
        logits = model.classifier(pooled_output)
        
    probs = F.softmax(logits, dim=1)
    return probs


example usage

In [None]:
# sample_text = "I love how this model understands emojis! 😍"
sample_text = "RIP Duo, he would have loved this emotional manipulation."
probabilities = predict_sentiment(sample_text, model, tokenizer)
predicted_class = torch.argmax(probabilities).item()
print(f"Predicted sentiment: {['negative', 'neutral', 'positive'][predicted_class]}")