In [14]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from sklearn.model_selection import train_test_split
from tqdm import tqdm  # Import tqdm for progress bars

# Load CSV file
csv_file_path = "/home/cleaned_file.csv"  # Path to the CSV file
df = pd.read_csv(csv_file_path)

train_df, test_df = train_test_split(df, test_size=0.1)

# Custom Dataset for NER task
class NERDataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = 1024

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        input_text = f"Find the value and unit (full form)\n{row['entity_name']}: {row['result']}"
        target_text = row['entity_value']
        # Tokenize input and target texts
        input_ids = self.tokenizer(input_text, padding='max_length', max_length=self.max_length, return_tensors="pt", add_special_tokens=False).input_ids.squeeze()
        if len(input_ids) > self.max_length:
            input_ids = input_ids[:self.max_length]

        outputs = self.tokenizer(target_text, padding='max_length', max_length=self.max_length, return_tensors="pt", add_special_tokens=False).input_ids.squeeze()

        return {
            "input_ids": input_ids,
            "labels": outputs
        }
# Load the tokenizer and model (flan-t5-base)
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Create Dataset and DataLoader for training and testing sets
train_dataset = NERDataset(train_df, tokenizer)
test_dataset = NERDataset(test_df, tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True, num_workers=6,persistent_workers=True,pin_memory=True)
test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=False, num_workers=6,persistent_workers=True,pin_memory=True)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)



In [15]:
from tqdm import tqdm
import torch

num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    try:
        for batch_idx, batch in enumerate(tqdm(train_dataloader, desc=f"Epoch {epoch + 1} Training")):
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass
            outputs = model(input_ids=input_ids, labels=labels)
            loss = outputs.loss

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

    except Exception as e:
        print(f"Error in batch {batch_idx}")
        print(e)
        print(f"Input IDs: {tokenizer.decode(input_ids)}")
        break  # Stop training if an error occurs

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch: {epoch + 1}, Training Loss: {avg_loss}")

    # Save the model and tokenizer
    model.save_pretrained("/home/flan-t5-ner-finetuned_model_final")
    tokenizer.save_pretrained("/home/flan-t5-ner-finetuned_tokenizer_final")
    print(f"Model saved for epoch {epoch + 1}.")

Epoch 1 Training: 100%|██████████| 2242/2242 [18:50<00:00,  1.98it/s]


Epoch: 1, Training Loss: 0.29518631722547123
Model saved for epoch 1.


Epoch 2 Training: 100%|██████████| 2242/2242 [18:46<00:00,  1.99it/s]


Epoch: 2, Training Loss: 0.008896959202619972
Model saved for epoch 2.


Epoch 3 Training: 100%|██████████| 2242/2242 [18:46<00:00,  1.99it/s]


Epoch: 3, Training Loss: 0.006691509638443576
Model saved for epoch 3.


Epoch 4 Training: 100%|██████████| 2242/2242 [18:46<00:00,  1.99it/s]


Epoch: 4, Training Loss: 0.0058554369491216765
Model saved for epoch 4.


Epoch 5 Training: 100%|██████████| 2242/2242 [18:45<00:00,  1.99it/s]


Epoch: 5, Training Loss: 0.005467786431752052
Model saved for epoch 5.


Epoch 6 Training: 100%|██████████| 2242/2242 [18:44<00:00,  1.99it/s]


Epoch: 6, Training Loss: 0.005066965144820491
Model saved for epoch 6.


Epoch 7 Training: 100%|██████████| 2242/2242 [18:43<00:00,  2.00it/s]


Epoch: 7, Training Loss: 0.004851196481587999
Model saved for epoch 7.


Epoch 8 Training:  11%|█         | 247/2242 [02:08<17:14,  1.93it/s]


KeyboardInterrupt: 

In [None]:
model.eval()
test_loss = 0
print("Test Set Results:\n")
with torch.no_grad():
    for batch_idx, batch in enumerate(tqdm(test_dataloader, desc="Testing")):
        input_text = batch['input_text']  # Assuming 'input_text' is in your dataset
        labels = batch['labels'].to(device)

        # Ensure input is tokenized and in the right format (List[str])
        if isinstance(input_text, list):
            input_ids = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).input_ids.to(device)
        else:
            input_ids = tokenizer([input_text], return_tensors="pt", padding=True, truncation=True).input_ids.to(device)

        print(f"Batch {batch_idx} Input IDs: {input_ids}")  # Debugging line

        # Generate predictions
        outputs = model.generate(input_ids=input_ids, max_length=512)

        # Calculate loss for reporting
        loss = model(input_ids=input_ids, labels=labels).loss
        test_loss += loss.item()

        # Decode inputs, labels, and outputs for readability
        decoded_inputs = [tokenizer.decode(ids, skip_special_tokens=True) for ids in input_ids]
        decoded_labels = [tokenizer.decode(ids, skip_special_tokens=True) for ids in labels]
        decoded_outputs = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outputs]

        # Print input, predicted output, and ground truth for comparison
        for inp, pred, truth in zip(decoded_inputs, decoded_outputs, decoded_labels):
            print(f"Input: {inp}")
            print(f"Predicted Output: {pred}")
            print(f"True Output: {truth}")
            print("\n------------------\n")

avg_test_loss = test_loss / len(test_dataloader)
print(f"Test Loss: {avg_test_loss}")

In [6]:
# Save the fine-tuned model
model.save_pretrained("/home/flan-t5-ner-finetuned_model_finak")
print("Model saved as 'flan-t5-ner-finetuned'.")

# Save the tokenizer
tokenizer.save_pretrained("/home/flan-t5-ner-finetuned_tokenizer_final")


Model saved as 'flan-t5-ner-finetuned'.


('/home/flan-t5-ner-finetuned_tokenizer_final/tokenizer_config.json',
 '/home/flan-t5-ner-finetuned_tokenizer_final/special_tokens_map.json',
 '/home/flan-t5-ner-finetuned_tokenizer_final/spiece.model',
 '/home/flan-t5-ner-finetuned_tokenizer_final/added_tokens.json')