RoBERTa Model Implementation for Classification (using sliding window approach)

In [1]:
import torch
from torch.utils.data import Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def sliding_window_tokenize(texts, tokenizer, max_length=512, stride=256):
    encodings = []
    for text in texts:
        tokens = tokenizer.tokenize(text)
        token_ids = tokenizer.convert_tokens_to_ids(tokens)
        for start in range(0, len(token_ids), stride):
            end = min(start + max_length, len(token_ids))
            input_ids = token_ids[start:end]
            attention_mask = [1] * len(input_ids)
            padding_length = max_length - len(input_ids)
            input_ids += [tokenizer.pad_token_id] * padding_length
            attention_mask += [0] * padding_length
            encodings.append({'input_ids': input_ids, 'attention_mask': attention_mask})
    return encodings

class SlidingWindowDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val) for key, val in self.encodings[idx].items()}
        item['labels'] = torch.tensor(self.labels[idx // len(self.encodings[0])])
        return item

    def __len__(self):
        return len(self.labels) * len(self.encodings[0])

In [3]:
import pandas as pd

df = pd.read_csv('Input.csv') 
texts = df['Text'].tolist()
labels = df['Label'].tolist()

label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

train_texts, val_texts, train_labels, val_labels = train_test_split(texts, encoded_labels, test_size=0.1)

train_encodings = sliding_window_tokenize(train_texts, tokenizer, max_length=512, stride=256)
val_encodings = sliding_window_tokenize(val_texts, tokenizer, max_length=512, stride=256)

train_dataset = SlidingWindowDataset(train_encodings, train_labels)
val_dataset = SlidingWindowDataset(val_encodings, val_labels)

model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

  3%|▎         | 10/342 [02:13<1:12:19, 13.07s/it]

{'loss': 1.3982, 'grad_norm': 2.9817054271698, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.09}


  6%|▌         | 20/342 [04:17<1:06:54, 12.47s/it]

{'loss': 1.3738, 'grad_norm': 3.302122116088867, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.18}


  9%|▉         | 30/342 [06:22<1:05:18, 12.56s/it]

{'loss': 1.3898, 'grad_norm': 3.591437816619873, 'learning_rate': 3e-06, 'epoch': 0.26}


 12%|█▏        | 40/342 [08:33<1:05:40, 13.05s/it]

{'loss': 1.3964, 'grad_norm': 3.8775970935821533, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.35}


 15%|█▍        | 50/342 [10:46<1:04:30, 13.25s/it]

{'loss': 1.3901, 'grad_norm': 1.9768247604370117, 'learning_rate': 5e-06, 'epoch': 0.44}


 18%|█▊        | 60/342 [12:53<1:00:44, 12.92s/it]

{'loss': 1.4101, 'grad_norm': 1.5315808057785034, 'learning_rate': 6e-06, 'epoch': 0.53}


 20%|██        | 70/342 [15:03<59:03, 13.03s/it]  

{'loss': 1.3934, 'grad_norm': 2.108271360397339, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.61}


 23%|██▎       | 80/342 [17:12<56:26, 12.93s/it]

{'loss': 1.3863, 'grad_norm': 2.166874647140503, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.7}


 26%|██▋       | 90/342 [19:20<53:23, 12.71s/it]

{'loss': 1.3957, 'grad_norm': 1.8043886423110962, 'learning_rate': 9e-06, 'epoch': 0.79}


 29%|██▉       | 100/342 [21:29<52:12, 12.94s/it]

{'loss': 1.3949, 'grad_norm': 2.3684511184692383, 'learning_rate': 1e-05, 'epoch': 0.88}


 32%|███▏      | 110/342 [23:36<48:41, 12.59s/it]

{'loss': 1.3849, 'grad_norm': 3.296285629272461, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.96}


 35%|███▌      | 120/342 [41:08<17:50:55, 289.44s/it]

{'loss': 1.3828, 'grad_norm': 2.4403011798858643, 'learning_rate': 1.2e-05, 'epoch': 1.05}


 38%|███▊      | 130/342 [43:17<1:13:08, 20.70s/it]  

{'loss': 1.3931, 'grad_norm': 1.7376954555511475, 'learning_rate': 1.3000000000000001e-05, 'epoch': 1.14}


 41%|████      | 140/342 [45:26<44:01, 13.08s/it]  

{'loss': 1.3809, 'grad_norm': 4.312652587890625, 'learning_rate': 1.4000000000000001e-05, 'epoch': 1.23}


 44%|████▍     | 150/342 [47:34<41:10, 12.87s/it]

{'loss': 1.403, 'grad_norm': 3.984259843826294, 'learning_rate': 1.5e-05, 'epoch': 1.32}


 47%|████▋     | 160/342 [49:41<38:13, 12.60s/it]

{'loss': 1.3774, 'grad_norm': 2.781034469604492, 'learning_rate': 1.6000000000000003e-05, 'epoch': 1.4}


 50%|████▉     | 170/342 [51:48<36:18, 12.66s/it]

{'loss': 1.3836, 'grad_norm': 2.667254686355591, 'learning_rate': 1.7000000000000003e-05, 'epoch': 1.49}


 53%|█████▎    | 180/342 [53:57<34:38, 12.83s/it]

{'loss': 1.394, 'grad_norm': 3.5915427207946777, 'learning_rate': 1.8e-05, 'epoch': 1.58}


 56%|█████▌    | 190/342 [56:08<34:24, 13.58s/it]

{'loss': 1.3961, 'grad_norm': 4.310908317565918, 'learning_rate': 1.9e-05, 'epoch': 1.67}


 58%|█████▊    | 200/342 [58:19<29:42, 12.55s/it]

{'loss': 1.3843, 'grad_norm': 2.318768262863159, 'learning_rate': 2e-05, 'epoch': 1.75}


 61%|██████▏   | 210/342 [1:00:22<27:19, 12.42s/it]

{'loss': 1.3975, 'grad_norm': 2.069550037384033, 'learning_rate': 2.1e-05, 'epoch': 1.84}


 64%|██████▍   | 220/342 [1:02:23<24:00, 11.81s/it]

{'loss': 1.4081, 'grad_norm': 3.3788340091705322, 'learning_rate': 2.2000000000000003e-05, 'epoch': 1.93}


 67%|██████▋   | 230/342 [1:04:23<22:14, 11.92s/it]

{'loss': 1.3756, 'grad_norm': 2.529125213623047, 'learning_rate': 2.3000000000000003e-05, 'epoch': 2.02}


 70%|███████   | 240/342 [1:06:26<21:29, 12.65s/it]

{'loss': 1.3714, 'grad_norm': 2.5425479412078857, 'learning_rate': 2.4e-05, 'epoch': 2.11}


 73%|███████▎  | 250/342 [1:08:45<20:59, 13.69s/it]

{'loss': 1.3306, 'grad_norm': 4.75630521774292, 'learning_rate': 2.5e-05, 'epoch': 2.19}


 76%|███████▌  | 260/342 [1:10:48<17:04, 12.49s/it]

{'loss': 1.4111, 'grad_norm': 4.397022724151611, 'learning_rate': 2.6000000000000002e-05, 'epoch': 2.28}


 79%|███████▉  | 270/342 [1:13:11<18:27, 15.38s/it]

{'loss': 1.4042, 'grad_norm': 2.5951955318450928, 'learning_rate': 2.7000000000000002e-05, 'epoch': 2.37}


 82%|████████▏ | 280/342 [1:16:13<19:17, 18.67s/it]

{'loss': 1.3844, 'grad_norm': 4.611429214477539, 'learning_rate': 2.8000000000000003e-05, 'epoch': 2.46}


 85%|████████▍ | 290/342 [1:19:20<16:14, 18.74s/it]

{'loss': 1.3756, 'grad_norm': 2.026846170425415, 'learning_rate': 2.9e-05, 'epoch': 2.54}


 88%|████████▊ | 300/342 [1:22:15<11:06, 15.87s/it]

{'loss': 1.3904, 'grad_norm': 8.587235450744629, 'learning_rate': 3e-05, 'epoch': 2.63}


 91%|█████████ | 310/342 [1:24:32<07:56, 14.89s/it]

{'loss': 1.3915, 'grad_norm': 4.809105396270752, 'learning_rate': 3.1e-05, 'epoch': 2.72}


 94%|█████████▎| 320/342 [1:27:32<04:57, 13.52s/it]

{'loss': 1.386, 'grad_norm': 2.7878432273864746, 'learning_rate': 3.2000000000000005e-05, 'epoch': 2.81}


 96%|█████████▋| 330/342 [1:29:34<02:26, 12.21s/it]

{'loss': 1.393, 'grad_norm': 3.4179837703704834, 'learning_rate': 3.3e-05, 'epoch': 2.89}


 99%|█████████▉| 340/342 [1:31:31<00:23, 11.67s/it]

{'loss': 1.3796, 'grad_norm': 2.6450376510620117, 'learning_rate': 3.4000000000000007e-05, 'epoch': 2.98}


100%|██████████| 342/342 [1:31:55<00:00, 16.13s/it]

{'train_runtime': 5515.9664, 'train_samples_per_second': 0.496, 'train_steps_per_second': 0.062, 'train_loss': 1.388234747780694, 'epoch': 3.0}





TrainOutput(global_step=342, training_loss=1.388234747780694, metrics={'train_runtime': 5515.9664, 'train_samples_per_second': 0.496, 'train_steps_per_second': 0.062, 'total_flos': 719884774342656.0, 'train_loss': 1.388234747780694, 'epoch': 3.0})

In [8]:
from collections import Counter
import torch

model.save_pretrained('./fine-tuned-roberta512-sliding-window')
tokenizer.save_pretrained('./fine-tuned-roberta512-sliding-window')

def predict(texts):
    predictions = []
    for text in texts:
        encodings = sliding_window_tokenize([text], tokenizer, max_length=512, stride=256)
        text_predictions = []
        for enc in encodings:
            inputs = {key: torch.tensor(val).unsqueeze(0) for key, val in enc.items()}
            with torch.no_grad():
                outputs = model(**inputs)
            pred = torch.argmax(outputs.logits, dim=1).tolist()
            text_predictions.append(pred[0])
        most_common_pred = Counter(text_predictions).most_common(1)[0][0]
        predictions.append(most_common_pred)
    return predictions

example_texts = ['''A property owner files a lawsuit against their neighbor for encroaching on their land and demands that the boundary be restored to its rightful place.''']
predictions = predict(example_texts)
decoded_predictions = [label_encoder.inverse_transform([p])[0] for p in predictions]
print(decoded_predictions)

['Civil Case']


In [9]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from collections import Counter
import joblib
import torch
tokenizer = RobertaTokenizer.from_pretrained('./fine-tuned-roberta512-sliding-window')
model = RobertaForSequenceClassification.from_pretrained('./fine-tuned-roberta512-sliding-window')
label_encoder = joblib.load('label_encoder.joblib')

In [5]:
def sliding_window_tokenize(texts, tokenizer, max_length=512, stride=256):
    encodings = []
    for text in texts:
        tokens = tokenizer.tokenize(text)
        token_ids = tokenizer.convert_tokens_to_ids(tokens)
        for start in range(0, len(token_ids), stride):
            end = min(start + max_length, len(token_ids))
            input_ids = token_ids[start:end]
            attention_mask = [1] * len(input_ids)
            padding_length = max_length - len(input_ids)
            input_ids += [tokenizer.pad_token_id] * padding_length
            attention_mask += [0] * padding_length
            encodings.append({'input_ids': input_ids, 'attention_mask': attention_mask})
    return encodings

In [10]:
def predict(texts):
    predictions = []
    for text in texts:
        encodings = sliding_window_tokenize([text], tokenizer, max_length=512, stride=256)
        text_predictions = []
        for enc in encodings:
            inputs = {key: torch.tensor(val).unsqueeze(0) for key, val in enc.items()}
            with torch.no_grad():
                outputs = model(**inputs)
            pred = torch.argmax(outputs.logits, dim=1).tolist()
            text_predictions.append(pred[0])
        most_common_pred = Counter(text_predictions).most_common(1)[0][0]
        predictions.append(most_common_pred)
    return predictions

example_texts = ['''A property owner files a lawsuit against their neighbor for encroaching on their land and demands that the boundary be restored to its rightful place.''']
predictions = predict(example_texts)
decoded_predictions = [label_encoder.inverse_transform([p])[0] for p in predictions]
print(decoded_predictions)

['Civil Case']
