In [1]:
# %pip install transformers
# %pip install torch
# %pip install pandas

from transformers import BertTokenizer, BertForQuestionAnswering
import torch
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from torch.cuda.amp import autocast, GradScaler

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")  # Should print "cuda" if GPU is detected

# Load dataset
qa_data = pd.read_csv('../data/QA_greeting.csv')  # Update to your new dataset name
print(qa_data.columns)  # Verify column names

# Load mBERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
model = BertForQuestionAnswering.from_pretrained("bert-base-multilingual-cased").to(device)

# Define custom dataset for QA
class QADataset(Dataset):
    def __init__(self, questions, answers, tokenizer, max_length):
        self.questions = questions
        self.answers = answers
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.questions[idx]
        answer = self.answers[idx]
        
        # Tokenize question and answer
        encoding = self.tokenizer(
            question,
            answer,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        # Identify the start and end positions of the answer within the input
        answer_tokens = self.tokenizer.tokenize(answer)
        input_tokens = self.tokenizer.tokenize(question) + [self.tokenizer.sep_token] + answer_tokens

        start_position = input_tokens.index(answer_tokens[0])
        end_position = start_position + len(answer_tokens) - 1

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "start_positions": torch.tensor(start_position),
            "end_positions": torch.tensor(end_position),
        }

# Prepare the dataset
max_length = 128
qa_dataset = QADataset(qa_data["question"].tolist(), qa_data["answer"].tolist(), tokenizer, max_length)
data_loader = DataLoader(qa_dataset, batch_size=8, shuffle=True)

# Optimizer and gradient scaler
optimizer = AdamW(model.parameters(), lr=5e-5)
scaler = GradScaler()

# Training settings
epochs = 5  # Adjust epochs for better performance

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in data_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        start_positions = batch["start_positions"].to(device)
        end_positions = batch["end_positions"].to(device)

        optimizer.zero_grad()

        with autocast():  # Mixed precision
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                start_positions=start_positions,
                end_positions=end_positions,
            )
            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

    avg_loss = total_loss / len(data_loader)
    print(f"Epoch {epoch + 1}, Average Loss: {avg_loss:.6f}")

# Save the trained model
model.save_pretrained('../model/qa_greeting_model_mbert')
tokenizer.save_pretrained('../model/qa_greeting_mbert_tokenizer')

# Clear GPU memory
torch.cuda.empty_cache()

print("Fine-tuning with mBERT completed successfully!")


  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda
Index(['question', 'answer'], dtype='object')


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()
  with autocast():  # Mixed precision


Epoch 1, Average Loss: 2.309006
Epoch 2, Average Loss: 0.564699
Epoch 3, Average Loss: 0.335947
Epoch 4, Average Loss: 0.235814
Epoch 5, Average Loss: 0.156824


MemoryError: 

In [1]:
# %pip install transformers
# %pip install torch
# %pip install pandas

from transformers import BertTokenizer, BertForQuestionAnswering
import torch
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from torch.cuda.amp import autocast, GradScaler
import gc

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")  # Should print "cuda" if GPU is detected

# Load dataset
qa_data = pd.read_csv('../data/QA_greeting.csv')  # Update to your new dataset name
print(qa_data.columns)  # Verify column names

# Load mBERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
model = BertForQuestionAnswering.from_pretrained("bert-base-multilingual-cased").to(device)

# Define custom dataset for QA
class QADataset(Dataset):
    def __init__(self, questions, answers, tokenizer, max_length):
        self.questions = questions
        self.answers = answers
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.questions[idx]
        answer = self.answers[idx]
        
        # Tokenize question and answer
        encoding = self.tokenizer(
            question,
            answer,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        # Identify the start and end positions of the answer within the input
        answer_tokens = self.tokenizer.tokenize(answer)
        input_tokens = self.tokenizer.tokenize(question) + [self.tokenizer.sep_token] + answer_tokens

        start_position = input_tokens.index(answer_tokens[0])
        end_position = start_position + len(answer_tokens) - 1

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "start_positions": torch.tensor(start_position),
            "end_positions": torch.tensor(end_position),
        }

# Prepare the dataset
max_length = 128
qa_dataset = QADataset(qa_data["question"].tolist(), qa_data["answer"].tolist(), tokenizer, max_length)
data_loader = DataLoader(qa_dataset, batch_size=8, shuffle=True)

# Optimizer and gradient scaler
optimizer = AdamW(model.parameters(), lr=5e-5)
scaler = GradScaler()

# Training settings
epochs = 5  # Adjust epochs for better performance

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in data_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        start_positions = batch["start_positions"].to(device)
        end_positions = batch["end_positions"].to(device)

        optimizer.zero_grad()

        with autocast():  # Mixed precision
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                start_positions=start_positions,
                end_positions=end_positions,
            )
            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

    avg_loss = total_loss / len(data_loader)
    print(f"Epoch {epoch + 1}, Average Loss: {avg_loss:.6f}")

# 🔹 Free up memory before saving
torch.cuda.empty_cache()
gc.collect()

# 🔹 Save the trained model with memory optimization
model.save_pretrained('../model/qa_greeting_model_mbert', max_shard_size="500MB")
tokenizer.save_pretrained('../model/qa_greeting_mbert_tokenizer')

# Clear GPU memory after saving
torch.cuda.empty_cache()

print("Fine-tuning with mBERT completed successfully!")


  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda
Index(['question', 'answer'], dtype='object')


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()
  with autocast():  # Mixed precision


Epoch 1, Average Loss: 2.430560
Epoch 2, Average Loss: 0.597897
Epoch 3, Average Loss: 0.308619
Epoch 4, Average Loss: 0.234925
Epoch 5, Average Loss: 0.167643
Fine-tuning with mBERT completed successfully!
