In [2]:
!pip install peft

Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.12.0-py3-none-any.whl (296 kB)
Installing collected packages: peft
Successfully installed peft-0.12.0


In [7]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from sklearn.model_selection import train_test_split

# Load CSV file
text = "Recognize the given entity value and units(in full form)"
csv_file_path = "/home/inference_results.csv"  # Path to the CSV file
df = pd.read_csv(csv_file_path)

# Split data into training and testing sets (80% training, 20% testing)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Custom Dataset for NER task
class NERDataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = 512

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        input_text = f"{row['entity_name']}: {row['result']} \n {text}"
        target_text = row['entity_value']

        # Tokenize input and target texts
        input_ids = self.tokenizer(input_text, padding='max_length', max_length=self.max_length, return_tensors="pt").input_ids.squeeze()
        outputs = self.tokenizer(target_text, padding='max_length', max_length=self.max_length, return_tensors="pt").input_ids.squeeze()

        return {
            "input_ids": input_ids,
            "labels": outputs
        }


tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Create Dataset and DataLoader for training and testing sets
train_dataset = NERDataset(train_df, tokenizer)
test_dataset = NERDataset(test_df, tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=False)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training Loop
num_epochs = 10
model.train()
for epoch in range(num_epochs):
    total_loss = 0
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, labels=labels)
        loss = outputs.loss

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch: {epoch + 1}, Training Loss: {avg_loss}")

# Testing Loop with Output Printing
model.eval()
test_loss = 0
print("Test Set Results:\n")
with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        # Generate predictions
        outputs = model.generate(input_ids=input_ids, max_length=512)

        # Calculate loss for reporting
        loss = model(input_ids=input_ids, labels=labels).loss
        test_loss += loss.item()

        # Decode inputs, labels, and outputs for readability
        decoded_inputs = [tokenizer.decode(ids, skip_special_tokens=True) for ids in input_ids]
        decoded_labels = [tokenizer.decode(ids, skip_special_tokens=True) for ids in labels]
        decoded_outputs = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outputs]

        # Print input, predicted output, and ground truth for comparison
        for inp, pred, truth in zip(decoded_inputs, decoded_outputs, decoded_labels):
            print(f"Input: {inp}")
            print(f"Predicted Output: {pred}")
            print(f"True Output: {truth}")
            print("\n------------------\n")

avg_test_loss = test_loss / len(test_dataloader)
print(f"Test Loss: {avg_test_loss}")

# Save the fine-tuned model
model.save_pretrained("t5-ner-finetuned")
print("Model saved as 't5-ner-finetuned'.")

# Save the tokenizer
tokenizer.save_pretrained("t5-ner-finetuned")




Epoch: 1, Training Loss: 4.617118086133685
Epoch: 2, Training Loss: 0.8074463520731244
Epoch: 3, Training Loss: 0.451425701379776
Epoch: 4, Training Loss: 0.22630473332745688
Epoch: 5, Training Loss: 0.16003939083644322
Epoch: 6, Training Loss: 0.135900256889207
Epoch: 7, Training Loss: 0.11897693361554827
Epoch: 8, Training Loss: 0.1048823818564415
Epoch: 9, Training Loss: 0.10535648252282824
Epoch: 10, Training Loss: 0.08473124142204012
Test Set Results:

Input: item_weight: 50 ml Recognize the given entity value and units(in full form)
Predicted Output: 50 ml Recognize the given entity value and units(in full form)(in full form)
True Output: 18.55 gram

------------------

Input: item_weight: The image does not provide any information regarding item_weight, so I cannot return a value for it. Recognize the given entity value and units(in full form)
Predicted Output: item_weight: The image does not provide any information regarding item_weight, so I cannot return a value for it.
True 

('t5-ner-finetuned/tokenizer_config.json',
 't5-ner-finetuned/special_tokens_map.json',
 't5-ner-finetuned/spiece.model',
 't5-ner-finetuned/added_tokens.json')

In [13]:
from transformers import pipeline
task = "text2text-generation"
model_name = "google/flan-t5-base"
input_text = "Get the value and units(in full form) not entity name"
text2text_generator = pipeline(
    task,
    model = model_name)

text2text_generator("1400MG")


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'generated_text': '1400MG'}]