In [None]:
# import pandas as pd
# import random
# from lorem_text import lorem

# # Generate synthetic data
# num_samples = 1000  # Adjust as needed
# data = {
#     "text": [lorem.paragraph() for _ in range(num_samples)],  # Random paragraphs
#     "label": [random.uniform(0, 10) for _ in range(num_samples)]  # Regression labels (0-10)
# }

# # Convert to DataFrame
# df = pd.DataFrame(data)
# df.to_csv("synthetic_dataset.csv", index=False)
# print("Synthetic dataset created!")

Synthetic dataset created!


In [1]:
import torch
from torch.utils.data import DataLoader
from transformers import LongformerModel, LongformerConfig, AutoTokenizer
from peft import get_peft_model, LoraConfig, TaskType
from datasets import load_dataset
from torch.optim import AdamW
from torch.nn import MSELoss
from tqdm import tqdm
import os

In [2]:
# Load Tokenizer & Model
from transformers import LongformerModel, LongformerConfig, AutoTokenizer

model_name = "allenai/longformer-base-4096"
config = LongformerConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [3]:
from transformers import LongformerPreTrainedModel

class LongformerForRegression(LongformerPreTrainedModel):
    def __init__(self, config):
        super().__init__(config) 

        self.longformer = LongformerModel(config)
        self.regression_head = torch.nn.Linear(config.hidden_size, 1)  

        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, inputs_embeds=None, 
                labels=None, output_attentions=False, output_hidden_states=False, return_dict=True):
        outputs = self.longformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True
        )
        pooled_output = outputs.last_hidden_state[:, 0]  
        logits = self.regression_head(pooled_output)  
        
        loss = None
        if labels is not None:
            loss_fn = torch.nn.MSELoss()
            loss = loss_fn(logits.view(-1), labels.view(-1))

        return {"loss": loss if loss is not None else torch.tensor(0.0, requires_grad=True), 
                "logits": logits}


# Initialize model
model = LongformerForRegression.from_pretrained(model_name, config=config)

Some weights of LongformerForRegression were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['regression_head.bias', 'regression_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Apply LoRA with PEFT
from peft import get_peft_model, LoraConfig, TaskType

peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,  
    inference_mode=False,        
    r=8,                         
    lora_alpha=32,               
    lora_dropout=0.1,            
    target_modules=["query", "key", "value"]  
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 442,368 || all params: 149,102,593 || trainable%: 0.2967


In [5]:
import torch
from torch.utils.data import Dataset, DataLoader

class RegressionDataset(Dataset):
    def __init__(self, dataset, tokenizer, max_length=512):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        text = self.dataset[idx]["text"]
        label = float(self.dataset[idx]["label"])  

        encoding = self.tokenizer(
            text, padding="max_length", truncation=True, max_length=self.max_length, return_tensors="pt"
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(0),  
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.float32)
        }
    
import pandas as pd 
df = pd.read_csv("synthetic_dataset.csv")

train_dataset = RegressionDataset(df[:800].to_dict(orient="records"), tokenizer)
test_dataset = RegressionDataset(df[800:].to_dict(orient="records"), tokenizer)

# Define dataloaders
BATCH_SIZE = 2
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


In [6]:
from torch.optim import AdamW
from transformers import get_scheduler

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer & Scheduler
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)
num_training_steps = len(train_loader) * 3  # num_epochs = 3
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Loss Function
loss_fn = MSELoss()


In [7]:
def train_one_epoch(model, dataloader, optimizer, loss_fn, device, lr_scheduler):
    model.train()
    total_loss = 0
    progress_bar = tqdm(dataloader, desc="Training", leave=False)

    for batch in progress_bar:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs["logits"].squeeze(-1)

        loss = loss_fn(logits, labels)
        loss.backward()

        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

    return total_loss / len(dataloader)

In [8]:
def evaluate(model, dataloader, loss_fn, device):
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs["logits"].squeeze(-1)

            loss = loss_fn(logits, labels)
            total_loss += loss.item()

    return total_loss / len(dataloader)

In [9]:
EPOCHS = 3
for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")

    train_loss = train_one_epoch(model, train_loader, optimizer, loss_fn, device, lr_scheduler)
    eval_loss = evaluate(model, test_loader, loss_fn, device)

    print(f"Train Loss: {train_loss:.4f} | Eval Loss: {eval_loss:.4f}")

Epoch 1/3


                                                                        

Train Loss: 12.0644 | Eval Loss: 9.4871
Epoch 2/3


                                                                        

Train Loss: 8.2747 | Eval Loss: 9.9386
Epoch 3/3


                                                                        

Train Loss: 8.2714 | Eval Loss: 9.9198


In [6]:
# Training Setup
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./longformer-regression",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=True,  # Enable mixed precision training
    logging_dir="./logs",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer
)

# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,8.792159
2,12.922300,8.817775
3,8.504700,8.930118


TrainOutput(global_step=1200, training_loss=10.302698567708333, metrics={'train_runtime': 300.0419, 'train_samples_per_second': 7.999, 'train_steps_per_second': 3.999, 'total_flos': 791477755084800.0, 'train_loss': 10.302698567708333, 'epoch': 3.0})

In [4]:
# Training Setup
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./longformer-regression",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=True,  # Enable mixed precision training
    logging_dir="./logs",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer
)

# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,9.701742
2,9.510500,8.778942
3,8.584400,8.792759


TrainOutput(global_step=1200, training_loss=8.92286631266276, metrics={'train_runtime': 281.5991, 'train_samples_per_second': 8.523, 'train_steps_per_second': 4.261, 'total_flos': 788216264294400.0, 'train_loss': 8.92286631266276, 'epoch': 3.0})