In [1]:
!pip install -q rouge
!pip install -q rouge-score
!pip install -q evaluate
!pip install -q sacrebleu

In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd
import torch

from transformers import pipeline, T5Tokenizer, TFT5Model, T5ForConditionalGeneration, AutoTokenizer
from transformers import Trainer, TrainingArguments, TrainerCallback
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, AutoModelForSeq2SeqLM

#evaluation packages
#rogue score
from rouge import Rouge
from evaluate import load
# BERTScore leverages the pre-trained contextual embeddings from BERT and matches words in candidate and reference sentences by cosine similarity.
# from bert_score import BERTScorer
#bleu score
import sacrebleu

In [3]:
# This cell will authenticate you and mount your Drive in the Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load Data

In [4]:
!pip install -q datasets # Install the 'datasets' library
from datasets import load_dataset # Now you can import the library

ds = load_dataset("Bilal-Mamji/Medical-summary")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
# Show dataset dict
print(ds)

# First training input for testing
dialogue_sample0 = ds['train']['input'][0]
print('First input:')
dialogue_sample0

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 9250
    })
    validation: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 500
    })
    test: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 250
    })
})
First input:


"Doctor: Hello, how can I help you today?\nPatient: My son has been having some issues with speech and development. He's 13 years old now.\nDoctor: I see. Can you tell me more about his symptoms? Does he have any issues with muscle tone or hypotonia?\nPatient: No, he doesn't have hypotonia. But he has mild to moderate speech and developmental delay, and he's been diagnosed with attention deficit disorder.\nDoctor: Thank you for sharing that information. We'll run some tests, including an MRI, to get a better understanding of your son's condition. \n(After the tests)\nDoctor: The MRI results are in, and I'm glad to say that there are no structural brain anomalies. However, I did notice some physical characteristics. Does your son have any facial features like retrognathia, mild hypertelorism, or a slightly elongated philtrum and thin upper lip?\nPatient: Yes, he has all of those features. His hands are also broad and short. And his feet have mild syndactyly of the second and third toe, 

In [6]:
# ds['train']['input']  # Gets lits of train inputs
# ds['train'][0]  # Get's first instance of train data

## Summary Instructions

In [7]:
basic_instruct = "Summarize: "
SOAP_instruct = "Create a medical SOAP summary of this dialogue.: "
SOAP_instruct_full = "Create a Medical SOAP note summary from the dialogue, following these guidelines: S (Subjective): Summarize the patient's reported symptoms, including chief complaint and relevant history. Rely on the patient's statements as the primary source and ensure standardized terminology. O (Objective): Highlight critical findings such as vital signs, lab results, and imaging, emphasizing important details like the side of the body affected and specific dosages. Include normal ranges where relevant. A (Assessment): Offer a concise assessment combining subjective and objective data. State the primary diagnosis and any differential diagnoses, noting potential complications and the prognostic outlook. P (Plan): Outline the management plan, covering medication, diet, consultations, and education. Ensure to mention necessary referrals to other specialties and address compliance challenges. Considerations: Compile the report based solely on the transcript provided. Maintain confidentiality and document sensitively. Use concise medical jargon and abbreviations for effective doctor communication. Please format the summary in a clean, simple list format without using markdown or bullet points. Use 'S:', 'O:', 'A:', 'P:' directly followed by the text. Avoid any styling or special characters."

## Split and Save Data

In [8]:
# Separate data by split
train_dataset = ds['train']
valid_dataset = ds['validation']
test_dataset = ds['test']

# todo Use a subset of training data for debugging
# train_dataset = train_dataset.select(range(150))
# valid_dataset = valid_dataset.select(range(50))

In [9]:
train_dataset[0]

{'input': "Doctor: Hello, how can I help you today?\nPatient: My son has been having some issues with speech and development. He's 13 years old now.\nDoctor: I see. Can you tell me more about his symptoms? Does he have any issues with muscle tone or hypotonia?\nPatient: No, he doesn't have hypotonia. But he has mild to moderate speech and developmental delay, and he's been diagnosed with attention deficit disorder.\nDoctor: Thank you for sharing that information. We'll run some tests, including an MRI, to get a better understanding of your son's condition. \n(After the tests)\nDoctor: The MRI results are in, and I'm glad to say that there are no structural brain anomalies. However, I did notice some physical characteristics. Does your son have any facial features like retrognathia, mild hypertelorism, or a slightly elongated philtrum and thin upper lip?\nPatient: Yes, he has all of those features. His hands are also broad and short. And his feet have mild syndactyly of the second and t

In [10]:
# Save splits to separate csv files, to load only part at a time later
train_filepath = 'drive/MyDrive/DS266 Project/data/train.csv'
valid_filepath = 'drive/MyDrive/DS266 Project/data/valid.csv'
test_filepath = 'drive/MyDrive/DS266 Project/data/test.csv'

pd.DataFrame(train_dataset).to_csv(train_filepath, index=False)
pd.DataFrame(valid_dataset).to_csv(valid_filepath, index=False)
pd.DataFrame(test_dataset).to_csv(test_filepath, index=False)

# Save this because we'll need to tell the trainer how many examples we have
num_train_examples = len(train_dataset)
num_train_examples

9250

In [11]:
# Remove the full dataset from memory
ds = None
train_dataset = None
valid_dataset = None
test_dataset = None

# Load Models

In [24]:
# Load the pre-trained T5 model and tokenizer
model_name = "google/t5-base"  # also t5-small and t5-large
tokenizer = T5Tokenizer.from_pretrained(model_name)  # Load tokenizer
model = T5ForConditionalGeneration.from_pretrained(model_name)  # Load model

OSError: google/t5-base is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

## Stream Dataset

In [13]:
# Stream load
# Hugging face load_dataset(), when only 1 file, it assumes the entire dataset is train data, thus the indexing ['train'] to remove that dictionary key level
train_dataset = load_dataset("csv", data_files=train_filepath, streaming=True)['train']
valid_dataset = load_dataset("csv", data_files=valid_filepath, streaming=False)['train']    # Set stream to false since it is a much smaller dataset and does not seem to be limiting factor

Generating train split: 0 examples [00:00, ? examples/s]

# Preprocess Encounter Data



In [14]:
## Preprocess: append instructions, set max length,
max_length = 600 #TODO Not long enough to capture entire input, but keeping it short to keep memory down

def preprocess_data(encounter):
    ''' Function to tokenize input and target output '''
    orig_text, target_text = encounter['input'], encounter['output']
    orig_text = basic_instruct + orig_text      #TODO Add/Change instruction

    # Tokenize input dialogue
    orig_encoded = tokenizer.batch_encode_plus(
        [orig_text],
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    orig_input_ids = orig_encoded['input_ids'][0]
    orig_attention_mask = orig_encoded['attention_mask'][0]

    # Tokenize ground truth summary
    target_encoded = tokenizer.batch_encode_plus(
        [target_text],
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    label_ids = target_encoded['input_ids'][0]

    # Check text processing step
    # print('--------')
    # print(orig_text)
    # print(target_text)

    return {'input_ids': orig_input_ids,
            'attention_mask': orig_attention_mask,
            'labels': label_ids}

In [15]:
# Map the preprocessing function to the datasets (it will be called when batches are loaded)

train_dataset = train_dataset.map(preprocess_data)
valid_dataset = valid_dataset.map(preprocess_data)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [16]:
valid_dataset[0]['input_ids']
tokenizer.decode(valid_dataset[0]['input_ids'])

"Summarize: Doctor: Hello! I see that you were referred to our hospital for a lung adenocarcinoma measuring 28 mm in your right upper lobe. Is that correct? Patient: Yes, that's right. Doctor: We have planned a UVATS procedure to resect the tumor. Are you familiar with this procedure? Patient: Not really, can you explain it to me? Doctor: Of course. During the procedure, you will be placed in the left lateral decubitus position under general anesthesia. We will then make a 4-cm skin incision for the main port in the sixth intercostal space at the anterior axillary line. Patient: Okay, I see. Doctor: A wound retractor will be used to allow the insertion of a flexible thoracoscope, endoscopic autosuturing device, and vessel-sealing device via the main port incision. This will also allow us to extract the specimen after the operation. Patient: Hmm, I understand. Doctor: During the operation, we found an incomplete interlobar fissure between the upper and middle lobe, as well as abnormal l

# Training

In [17]:
#TODO Modify this filepath to where you want to save the model after fine-tuning
dir_path = 'drive/MyDrive/DS266 Project/model_checkpoints/'
file_path = dir_path + 't5base-finetuned-soap_01'

In [18]:
#TODO Specify batch size and other training arguments

batch_size = 16  #todo
num_epochs = 3
eval_steps = 50


args = Seq2SeqTrainingArguments(
    file_path,
    evaluation_strategy='steps',
    eval_steps = eval_steps,
    logging_steps = eval_steps,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_checkpointing=True,  #todo Decrease computational cost
    # gradient_accumulation_steps=2,  #todo  Accumulate gradients to decrease computational cost
    max_steps=int(num_epochs * num_train_examples / batch_size)   # Streaming dataset, we don't know how much data. Steps are the number of batches per epoch * num of epochs
)



In [19]:
# log examples

# def compute_metrics(eval_predictions):
#   # Get only the first 5 predictions, labels, and inputs
#     predictions, labels, inputs = eval_predictions[0][:5], eval_predictions[1][:5], eval_predictions[2][:5]

#     decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
#     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
#     decoded_inputs = tokenizer.batch_decode(inputs, skip_special_tokens=True)

#     for input_text, pred, label in zip(decoded_inputs, decoded_preds, decoded_labels):
#         print(f"Input Text: {input_text}\nPredicted Summary: {pred}\nGround Truth: {label}\n-------------")

    # ... other metrics calculation ...
    # return {"rouge-l": rouge_score(decoded_preds, decoded_labels)}


In [20]:
# from transformers import TrainerCallback

# class CustomCallback(TrainerCallback):
#     def on_train_step_end(self, args, state, control, **kwargs):
#         print('callback function run')
#         if state.global_step % 50 == 0:
#             print('divisible by 50 step')
#             # Get a batch of validation data
#             eval_dataloader = self.trainer.get_eval_dataloader()
#             batch = next(iter(eval_dataloader))

#             # Move the batch to the device
#             batch = {k: v.to(self.trainer.device) for k, v in batch.items()}

#             # Generate predictions
#             with torch.no_grad():
#                 outputs = self.trainer.model(**batch)
#                 logits = outputs.logits

#             # Decode predictions and labels
#             predictions = torch.argmax(logits, dim=-1)
#             decoded_preds = self.tokenizer.batch_decode(predictions, skip_special_tokens=True)
#             decoded_labels = self.tokenizer.batch_decode(batch['labels'], skip_special_tokens=True)
#             decoded_inputs = self.tokenizer.batch_decode(batch['input_ids'], skip_special_tokens=True)

#             # Log the first 5 examples
#             for input_text, pred_text, target_text in zip(decoded_inputs[:5], decoded_preds[:5], decoded_labels[:5]):
#                 print(f"Input: {input_text}")
#                 print(f"Prediction: {pred_text}")
#                 print(f"Target: {target_text}")
#                 print("-" * 50)

In [21]:
# Define the trainer, passing in the model, training args, and data generators

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    # compute_metrics = compute_metrics,
    # callbacks=[CustomCallback()]
)

max_steps is given, it will override any value given in num_train_epochs


In [22]:
# free up GPU memory
import gc
import torch

gc.collect()
torch.cuda.empty_cache()

# Gemini suggestion to prevent fragmentation
!export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

In [23]:
# Call train
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mjonnyluo[0m ([33mjonnyluo-university-of-california-berkeley[0m). Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss


KeyboardInterrupt: 

## Load from Checkpoint


In [16]:
# Set directory to desired model
file_path = dir_path + 't5base-finetuned-soap'
model_saved = T5ForConditionalGeneration.from_pretrained(file_path + '/checkpoint-1734')

# Generate: Validate and Test

In [17]:
## Try using preprocess mapper to add instructions
## Validation data check

# Move the model to the GPU
# model_saved = model_saved.cuda()  # Move the model to the GPU

# Preprocess: Tokenize, add instructions
valid_dataset = valid_dataset.map(preprocess_data)

for i, encounter in enumerate(valid_dataset):
    if i >= 5:
        break  #TODO Stop after 3 iterations

    print(encounter['input'])   # Print input dialogue
    print(encounter['output'])  # Print ground truth

    # Convert input_ids to tensor before generation
    input_ids_tensor = torch.tensor(encounter['input_ids']).unsqueeze(0)  # Add batch dimension
    # Generate summary prediction
    predict_output_ids = model_saved.generate(input_ids_tensor, min_length=200, max_length=400,   #TODO set the model
                                              do_sample = True, num_beams=5, no_repeat_ngram_size=3,
                                              early_stopping=True, temperature=0)
    # Decode tokens to human text
    print([tokenizer.decode(out_ids, skip_special_tokens=True,
                               clean_up_tokenization_spaces=False) for out_ids in predict_output_ids])
    print('---------')

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Doctor: Hello! I see that you were referred to our hospital for a lung adenocarcinoma measuring 28 mm in your right upper lobe. Is that correct?
Patient: Yes, that's right.
Doctor: We have planned a UVATS procedure to resect the tumor. Are you familiar with this procedure?
Patient: Not really, can you explain it to me?
Doctor: Of course. During the procedure, you will be placed in the left lateral decubitus position under general anesthesia. We will then make a 4-cm skin incision for the main port in the sixth intercostal space at the anterior axillary line.
Patient: Okay, I see.
Doctor: A wound retractor will be used to allow the insertion of a flexible thoracoscope, endoscopic autosuturing device, and vessel-sealing device via the main port incision. This will also allow us to extract the specimen after the operation.
Patient: Hmm, I understand.
Doctor: During the operation, we found an incomplete interlobar fissure between the upper and middle lobe, as well as abnormal lobulation of



["S: The patient reports being referred for a lung adenocarcinoma measuring 28 mm in the right upper lobe. The patient has not been familiar with the procedure. O: During the UVATS procedure, the patient was placed in the left lateral decubitus position under general anesthesia, followed by a 4-cm skin incision for the main port in the sixth intercostal space at the anterior axillary line. A wound retractor was used to allow the insertion of a flexible thoracoscope, endoscopic autosuturing device, and vessel-sealing device. Post-surgery, an incomplete interlobar fissure was found between the upper and midlobes of the lung. A: The primary diagnosis is a recurrent lung tumor with a history of lung cancer. Differential diagnoses could include other causes of lung tumors, but these are less likely given the patient's history and clinical presentation. P: The management plan includes a UVATS operation to resect the tumor. A follow-up appointment will be scheduled to monitor the progress of 

In [23]:
# # Reduce unnecessary output
# # transformers.logging.set_verbosity_error()


# # Move the model to the GPU
# # model_saved = model_saved.cuda()  # Move the model to the GPU

# # Check some validation outputs
# for encounter in valid_dataset.select(range(3)):
#     input = encounter['input']    # Get input dialogue from encounter
#     print(input)

#     # Append instruction and tokenize input
#     predict_inputs = tokenizer([basic_instruct + input], return_tensors='pt')
#     # Move input tensors to the GPU
#     # predict_inputs = predict_inputs.to('cuda') #This line moves the input tensors to the same device as the model.
#     # Generate summary through decoding
#     predict_output_ids = model_saved.generate(predict_inputs['input_ids'], min_length=200, max_length=400,
#                                               num_beams=5, no_repeat_ngram_size=3)
#     # Decode tokens to human text
#     print([tokenizer.decode(out_ids, skip_special_tokens=True,
#                                clean_up_tokenization_spaces=False) for out_ids in predict_output_ids])
#     print('---------')

AttributeError: 'IterableDataset' object has no attribute 'select'

# Evaluator

In [39]:
def generate_predictions(test_dataset, model, tokenizer, device): # Add device parameter to swtich from T4 to local device
    '''function to tokenize the test data input & ground truth and generate predictions'''
    predictions = []
    references = []

    # Preprocess: tokenize input and output
    test_dataset = test_dataset.map(preprocess_data)

    for example in test_dataset:
        #tokenize inputs
        # inputs = tokenizer(
        #     example["input"], return_tensors="pt", max_length=900, truncation=True, padding="max_length"
        # )

        # generate tokenized output predictions
        input_ids = torch.tensor([example['input_ids']]).to(device)
        output_ids = model_saved.generate(input_ids, min_length=200, max_length=400,    #TODO set the model, consider creating function
                                              num_beams=5, no_repeat_ngram_size=3)
        # Decode to human language
        prediction = tokenizer.decode(output_ids[0], skip_special_tokens=True,
                                      clean_up_tokenization_spaces=False)
        predictions.append(prediction)

        #reference text (ground truth)
        references.append(example['output'])

    return predictions, references

In [37]:
# debug by not streaming
valid_dataset = load_dataset("csv", data_files=valid_filepath)['train']
valid_dataset_first3 = valid_dataset.select(range(3))

In [40]:
#load rouge metric
rouge = load("rouge") #lrouge metric using load function
#gpu to local device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#move model to local
model.to(device)

#generate predictions and references
predictions, references = generate_predictions(valid_dataset, model, tokenizer, device)


#calcuating rouge score
rouge_results = rouge.compute(predictions=predictions, references=references)
print("ROUGE Results:", rouge_results)

#BLEU expects references as a list of lists
references = [[ref] for ref in references]
#calculating BLEU score
bleu_score = sacrebleu.corpus_bleu(predictions, references)
print("BLEU Score:", bleu_score.score)

ROUGE Results: {'rouge1': 0.5354700040430262, 'rouge2': 0.27954138753000163, 'rougeL': 0.33394486831423514, 'rougeLsum': 0.3838069935500195}
BLEU Score: 32.48970153123982


In [41]:
predictions[:3]

["S: The patient reports being referred for a lung adenocarcinoma measuring 28 mm in the right upper lobe. The patient has not been familiar with the procedure. O: During the UVATS procedure, the patient was placed in the left lateral decubitus position under general anesthesia, followed by a 4-cm skin incision for the main port in the sixth intercostal space at the anterior axillary line. A wound retractor was used to allow the insertion of a flexible thoracoscope, endoscopic autosuturing device, and vessel-sealing device. Post-surgery, an incomplete interlobar fissure was found between the upper and midlobes of the lung. A: The primary diagnosis is a recurrent lung tumor with a history of lung cancer. Differential diagnoses could include other causes of lung tumors, but these are less likely given the patient's history and clinical presentation. P: The management plan includes a UVATS operation to resect the tumor. A follow-up appointment will be scheduled to monitor the progress of 

In [None]:
!pip install evaluate
!pip install rouge_score

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=54bbe02d04e13b3f66e9ec89883130a4ba0d59df1c8545f5d0f4cbb415abaad2
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
import evaluate

rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred.predictions[0], eval_pred.label_ids

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
        rouge_types=[
            'rouge1',
            'rouge2',
            'rougeL'
        ]
    )

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

# Training

In [None]:
training_args = TrainingArguments(
    output_dir=OUT_DIR,
    num_train_epochs=EPOCHS,
    # per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # warmup_steps=500,
    # weight_decay=0.01,
    logging_dir=OUT_DIR,
    # logging_steps=10,
    # evaluation_strategy='steps',
    # eval_steps=200,
    save_strategy='epoch',
    save_total_limit=2,
    report_to='tensorboard',
    # learning_rate=0.0001,
    # dataloader_num_workers=4
)

# training_args = TrainingArguments(
#     output_dir=OUT_DIR,
#     num_train_epochs=3,
#     per_device_train_batch_size=16,
#     save_steps=10_000,
#     save_total_limit=2,
# )

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    # preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    compute_metrics=compute_metrics
)

history = trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss


## Using t5-dialogie-summarization model
https://huggingface.co/chanifrusydi/t5-dialogue-summarization?library=transformers

In [None]:
#pipe = pipeline("summarization", model="chanifrusydi/t5-dialogue-summarization")

# Using stock T5 models
pipe = pipeline("summarization", model="t5-3b")

config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/11.4G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
summary = pipe(dialogue_sample0)
print(summary[0]['summary_text'])

## Fine tune T5

## TODO: filter **instructions**

In [None]:
t5_model = TFT5ForConditionalGeneration.from_pretrained('t5-large') #also t5-small and t5-large
t5_tokenizer = T5Tokenizer.from_pretrained('t5-large')

t5_model.summary()

Set input

In [None]:
t5_input_text = "summarize: " + dialogue_sample0

In [None]:
t5_inputs = t5_tokenizer([t5_input_text], return_tensors='tf')

Summarize

In [None]:
t5_summary_ids = t5_model.generate(t5_inputs['input_ids'],
                                    num_beams=3,
                                    no_repeat_ngram_size=10,
                                    min_length=100,
                                    max_length=500)

print([t5_tokenizer.decode(g, skip_special_tokens=True,
                           clean_up_tokenization_spaces=False) for g in t5_summary_ids])

In [None]:
# Load the pre-trained T5 model and tokenizer
model_name = "t5-base"  # also t5-small and t5-large
tokenizer = T5Tokenizer.from_pretrained(model_name)  # Load tokenizer
model = T5ForConditionalGeneration.from_pretrained(model_name)  # Load model

In [None]:
ds

In [None]:
def tokenize_function(examples):
    inputs = tokenizer(examples["input"], max_length=512, padding="max_length", truncation=True)
    targets = tokenizer(examples["output"], max_length=512, padding="max_length", truncation=True)
    inputs["labels"] = targets["input_ids"]
    inputs["decoder_input_ids"] = targets["input_ids"]
    return inputs

tokenized_dataset = ds.map(tokenize_function, batched=True)

train_data = tokenized_dataset["train"]
val_data = tokenized_dataset["validation"]

In [None]:
train_data

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    save_steps=10_000,
    save_total_limit=2,
)

# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
)

# Start training
trainer.train()

In [None]:
trainer.evaluate()