In [None]:
# %pip install sentencepiece

# from transformers import T5Tokenizer, T5ForConditionalGeneration
# import torch
# import pandas as pd
# from torch.cuda.amp import autocast, GradScaler

# # Check if GPU is available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"Using device: {device}")  # Should print "cuda" if GPU is detected

# # Load dataset
# qa_data = pd.read_csv('QA_datasets.csv')  # Contains question-answer pairs
# print(qa_data.columns)  # Verify column names

# # Load T5 model and tokenizer
# tokenizer = T5Tokenizer.from_pretrained('t5-small')
# model = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)

# # Enable gradient checkpointing for memory efficiency
# model.gradient_checkpointing_enable()

# # Prepare training data
# inputs = tokenizer(["Question: " + q for q in qa_data['Question']], 
#                     truncation=True, padding=True, max_length=64, return_tensors='pt')

# labels = tokenizer([a for a in qa_data['Answer']], 
#                    truncation=True, padding=True, max_length=64, return_tensors='pt')

# # Move to GPU
# inputs, attention_mask = inputs['input_ids'].to(device), inputs['attention_mask'].to(device)
# labels = labels['input_ids'].to(device)

# # Ensure labels ignore padding tokens
# labels[labels == tokenizer.pad_token_id] = -100  

# # Optimizer and gradient scaler
# optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
# scaler = GradScaler()

# # Training settings
# epochs = 5  # Increase epochs for better performance
# batch_size = 4
# accumulation_steps = 4  

# # Fine-tune T5 on GPU
# for epoch in range(epochs):
#     model.train()
#     optimizer.zero_grad()
#     total_loss = 0

#     for i in range(0, inputs.shape[0], batch_size):
#         batch_inputs = inputs[i:i+batch_size]
#         batch_labels = labels[i:i+batch_size]
#         batch_attention_mask = attention_mask[i:i+batch_size]

#         with autocast():  # Mixed precision
#             outputs = model(input_ids=batch_inputs, attention_mask=batch_attention_mask, labels=batch_labels)
#             loss = outputs.loss / accumulation_steps  # Normalize loss

#         scaler.scale(loss).backward()

#         if (i // batch_size + 1) % accumulation_steps == 0 or (i + batch_size >= inputs.shape[0]):
#             scaler.step(optimizer)
#             scaler.update()
#             optimizer.zero_grad()

#         total_loss += loss.item() * accumulation_steps  # Accumulate loss correctly

#     avg_loss = total_loss / (inputs.shape[0] / batch_size)  # Compute average loss
#     print(f"Epoch {epoch + 1}, Average Loss: {avg_loss:.6f}")

# # Save the trained model
# model.save_pretrained('qa_model_t5')
# tokenizer.save_pretrained('t5_tokenizer')

# # Clear GPU memory
# torch.cuda.empty_cache()

# print("Fine-tuning completed successfully!")


# lastest version
### T5

In [1]:
# %pip install sentencepiece

from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import pandas as pd
from torch.cuda.amp import autocast, GradScaler

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")  # Should print "cuda" if GPU is detected

# Load dataset
qa_data = pd.read_csv('../data/QA_banking.csv')  # Contains question-answer pairs
print(qa_data.columns)  # Verify column names

# Load T5 model and tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)

# Enable gradient checkpointing for memory efficiency
model.gradient_checkpointing_enable()

# Prepare training data
inputs = tokenizer(["Question: " + q for q in qa_data['Question']], 
                    truncation=True, padding=True, max_length=64, return_tensors='pt')

labels = tokenizer([a for a in qa_data['Answer']], 
                   truncation=True, padding=True, max_length=64, return_tensors='pt')

# Move to GPU
inputs, attention_mask = inputs['input_ids'].to(device), inputs['attention_mask'].to(device)
labels = labels['input_ids'].to(device)

# Ensure labels ignore padding tokens
labels[labels == tokenizer.pad_token_id] = -100  

# Optimizer and gradient scaler
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
scaler = GradScaler()

# Training settings
epochs = 5  # Increase epochs for better performance
batch_size = 4
accumulation_steps = 4  

# Fine-tune T5 on GPU
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    total_loss = 0

    for i in range(0, inputs.shape[0], batch_size):
        batch_inputs = inputs[i:i+batch_size]
        batch_labels = labels[i:i+batch_size]
        batch_attention_mask = attention_mask[i:i+batch_size]

        with autocast():  # Mixed precision
            outputs = model(input_ids=batch_inputs, attention_mask=batch_attention_mask, labels=batch_labels)
            loss = outputs.loss / accumulation_steps  # Normalize loss

        scaler.scale(loss).backward()

        if (i // batch_size + 1) % accumulation_steps == 0 or (i + batch_size >= inputs.shape[0]):
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        total_loss += loss.item() * accumulation_steps  # Accumulate loss correctly

    avg_loss = total_loss / (inputs.shape[0] / batch_size)  # Compute average loss
    print(f"Epoch {epoch + 1}, Average Loss: {avg_loss:.6f}")

# Save the trained model
model.save_pretrained('../model/qa_model_t5')
tokenizer.save_pretrained('../model/t5_tokenizer')

# Clear GPU memory
torch.cuda.empty_cache()

print("Fine-tuning completed successfully!")


Note: you may need to restart the kernel to use updated packages.


  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda
Index(['Question', 'Answer'], dtype='object')


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
  scaler = GradScaler()
  with autocast():  # Mixed precision
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch 1, Average Loss: 4.738472
Epoch 2, Average Loss: 4.559990
Epoch 3, Average Loss: 4.095875
Epoch 4, Average Loss: 3.792747
Epoch 5, Average Loss: 3.707226
Fine-tuning completed successfully!


### mBERT

In [8]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.nn.utils.rnn import pad_sequence
from torch.nn import CrossEntropyLoss

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load dataset
qa_data = pd.read_csv('../data/QA_banking.csv')  # Contains question-answer pairs
print(qa_data.columns)  # Ensure 'Question' and 'Answer' exist

# Encode labels (if answers are categorical)
label_encoder = LabelEncoder()
qa_data['Answer'] = label_encoder.fit_transform(qa_data['Answer'])
num_labels = len(label_encoder.classes_)

# Train-test split
train_data, val_data = train_test_split(qa_data, test_size=0.1, random_state=42)

# Custom dataset class
class QADataset(Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        question = self.data.iloc[idx]['Question']
        answer = self.data.iloc[idx]['Answer']

        # Tokenize question
        encoded = self.tokenizer(question, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'labels': torch.tensor(answer, dtype=torch.long)
        }

# Initialize mBERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=num_labels).to(device)

# Prepare datasets and dataloaders
train_dataset = QADataset(train_data, tokenizer, max_length=64)
val_dataset = QADataset(val_data, tokenizer, max_length=64)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# Optimizer and loss function
optimizer = AdamW(model.parameters(), lr=5e-5)
loss_fn = CrossEntropyLoss()

# Training settings
epochs = 5
model.train()

# Training loop
for epoch in range(epochs):
    total_loss = 0

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}, Loss: {avg_loss:.4f}")

# Save model
model.save_pretrained('../model/qa_model_mbert')
tokenizer.save_pretrained('../model/mbert_tokenizer')

print("Fine-tuning completed successfully!")


Using device: cuda
Index(['Question', 'Answer'], dtype='object')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 4.0372
Epoch 2, Loss: 3.8909
Epoch 3, Loss: 3.8673
Epoch 4, Loss: 3.8149
Epoch 5, Loss: 3.7284
Fine-tuning completed successfully!
