## load model

In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq


# Name the checkpoint once so the tokenizer and the model weights are always
# loaded from the same source.
BASE_CHECKPOINT = "google/flan-t5-base"

tokenizer = T5Tokenizer.from_pretrained(BASE_CHECKPOINT)

# device_map="auto" leaves the device placement of the weights to the loader.
model = T5ForConditionalGeneration.from_pretrained(
    BASE_CHECKPOINT,
    device_map="auto",
)

# Collator handed to the trainer further down to assemble tokenized examples into batches.
data_collator = DataCollatorForSeq2Seq(model=model, tokenizer=tokenizer)

  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## load dataset

In [2]:
from datasets import load_dataset
# Fetch the ChatDoctor HealthCareMagic dataset. The displayed result shows a
# single "train" split with 'instruction', 'input' and 'output' columns.
ds = load_dataset("lavita/ChatDoctor-HealthCareMagic-100k")
ds

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 112165
    })
})

## split the dataset

In [3]:
# Keep the first 80,000 rows of the original "train" split, then hold out 30%
# of them for evaluation (56,000 train / 24,000 test).
# The fixed seed makes the shuffle behind the split repeatable, so restarting
# the kernel reproduces the same train/test partition.
# NOTE: `ds` is rebound here; re-running this cell without re-running the
# loading cell would subsample the already-split "train" set instead.
subset = ds["train"].take(80000)
ds = subset.train_test_split(test_size=0.3, seed=42)
ds

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 56000
    })
    test: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 24000
    })
})

## processing dataset

In [4]:
# Task instruction that preprocess_function prepends to every patient message
# before tokenization.
# NOTE(review): the inference cell builds its prompt with a different prefix;
# confirm the checkpoint loaded there was trained with that wording.
prefix = "If you are a doctor, please answer the medical questions based on the patient's description: "

# Define the preprocessing function

def preprocess_function(examples):
    """Tokenize one batch of examples into encoder inputs and decoder labels.

    Args:
        examples: batch mapping whose "input" and "output" entries are
            parallel lists of strings (patient message / reference answer).

    Returns:
        The tokenizer output for the prefixed inputs, extended with a
        "labels" entry holding the token ids of the reference answers.
    """
    # Prepend the task instruction to every patient message.
    prompts = []
    for doc in examples["input"]:
        prompts.append(prefix + doc)

    # Both sides are truncated to at most 512 tokens.
    model_inputs = tokenizer(prompts, truncation=True, max_length=512)
    targets = tokenizer(text_target=examples["output"], truncation=True, max_length=512)

    model_inputs["labels"] = targets["input_ids"]
    return model_inputs

# batched=True hands preprocess_function whole batches (lists of strings) at a
# time; the progress bars below show the map running over both the train and
# the test split.
tokenized_dataset = ds.map(preprocess_function, batched=True)

Map: 100%|██████████| 56000/56000 [00:30<00:00, 1817.69 examples/s]
Map: 100%|██████████| 24000/24000 [00:13<00:00, 1833.26 examples/s]


## compute_metrics

In [5]:
import nltk
import evaluate
import numpy as np

# sent_tokenize needs NLTK's Punkt sentence-tokenizer data. Older NLTK releases
# read the "punkt" package while newer ones look up "punkt_tab", so fetch both
# to keep compute_metrics working whichever version is installed.
for _resource in ("punkt", "punkt_tab"):
    nltk.download(_resource, quiet=True)

# ROUGE scorer used by compute_metrics.
metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
    """Score generated answers against the reference answers with ROUGE.

    Args:
        eval_preds: pair ``(preds, labels)`` of token-id arrays. ``preds`` may
            also arrive wrapped in a tuple, in which case its first element is
            used.

    Returns:
        The dictionary produced by ``metric.compute`` for the decoded texts.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is not a vocabulary id: it marks ignored / padded positions and can
    # appear in the predictions as well as in the labels. Replace it with the
    # pad token in both arrays so decoding never receives a negative id.
    pad_id = tokenizer.pad_token_id
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLsum expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    return metric.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
    )

## fine-tuning

In [7]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
# Hyper-parameters of the run, gathered in one mapping so they can be read at a glance.
# NOTE(review): no generation length is configured here, so evaluation uses the
# model's default generation settings — confirm that is long enough for ROUGE
# against full-length reference answers.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy`; check against the installed version before upgrading.
training_options = dict(
    output_dir="./t5-dataset2-results",
    evaluation_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=10,
    save_total_limit=5,
    predict_with_generate=True,  # compute_metrics decodes token ids, so evaluation must generate text
    push_to_hub=False,
)
training_args = Seq2SeqTrainingArguments(**training_options)

# Wire model, data, collator and metric function together.
trainer = Seq2SeqTrainer(
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    tokenizer=tokenizer,
    eval_dataset=tokenized_dataset["test"],
    train_dataset=tokenized_dataset["train"],
    args=training_args,
    model=model,
)

trainer.train()




  0%|          | 61/42000 [03:03<24:10:47,  2.08s/it] 

{'loss': 3.0732, 'grad_norm': 2.7126808166503906, 'learning_rate': 9.999404761904763e-05, 'epoch': 0.0}



  0%|          | 61/42000 [03:20<24:10:47,  2.08s/it] 

{'loss': 3.1032, 'grad_norm': 3.388437509536743, 'learning_rate': 9.998809523809525e-05, 'epoch': 0.0}



  0%|          | 61/42000 [03:38<24:10:47,  2.08s/it] 

{'loss': 3.2036, 'grad_norm': 4.284571647644043, 'learning_rate': 9.998214285714286e-05, 'epoch': 0.0}




KeyboardInterrupt: 

## inference

In [1]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
from textwrap import fill

import torch

# NOTE(review): this path differs from the output_dir used by the training cell
# ("./t5-dataset2-results"); confirm it points at the intended run.
last_checkpoint = "./third-results/checkpoint-4000"

# Prefer the GPU, but fall back to the CPU so the cell also runs on machines
# without CUDA instead of raising on `.to("cuda")`.
device = "cuda" if torch.cuda.is_available() else "cpu"

finetuned_model = T5ForConditionalGeneration.from_pretrained(last_checkpoint).to(device)
finetuned_tokenizer = T5Tokenizer.from_pretrained(last_checkpoint)
question = "what are marine toxins?"

# NOTE(review): the prompt prefix here is not the one used by the preprocessing
# cell; it should match whatever this checkpoint was fine-tuned with.
input_text = "Please answer this medical related question: " + question
input_ids = finetuned_tokenizer(input_text, return_tensors="pt").input_ids.to(device)

outputs = finetuned_model.generate(
    input_ids,
    max_length=200,
    min_length=20,
    repetition_penalty=2.0,
)
answer = finetuned_tokenizer.decode(outputs[0], skip_special_tokens=True)

# Wrap the answer at 100 columns for readability.
print(fill(answer, width=100))

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Marine toxins are substances that cause damage to the body's tissues and organs. They can be toxic,
but they do not affect other parts of the body. The most common types of marine toxins include:
Lymphadenoma (the type of lymph nodes in the blood) Affected people may have an increased risk for
developing certain diseases such as cancer or heart disease. Some cases of this condition occur when
there is too much fluid in the brain or spinal cord. In some instances, it causes pain, swelling,
loss of appetite, nausea, vomiting, diarrhea, headache, seizures, fatigue, weight gain, muscle
weakness, difficulty swallowing, and/or confusion.
