In [None]:
from transformers import T5ForConditionalGeneration, AutoTokenizer, TrainingArguments, Trainer, EarlyStoppingCallback
from datasets import load_dataset, Dataset, DatasetDict
import re
import torch
from evaluate import load
from tqdm import tqdm
import numpy as np
from evaluate import load
import pandas as pd

In [None]:
# ------------------------
# 1. Install Required Libraries
# ------------------------
#!pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu124
#!pip install transformers datasets evaluate -q
#!pip install tqdm numpy
#!pip install rouge_score

In [24]:
# ------------------------------------------------------------------------
# 2. Load Dataset 
# ------------------------------------------------------------------------
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("urvog/llama2_transcripts_healthcare_callcenter")
print(len(dataset["train"]["text"]))
train_set = dataset["train"].select(range(0, 800))        
validation_set = dataset["train"].select(range(800, 900)) 
test_set = dataset["train"].select(range(900, 1000))       


split_dataset = DatasetDict({
    "train": train_set,
    "validation": validation_set,
    "test": test_set
})


print(split_dataset)

1000
DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 800
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 100
    })
    test: Dataset({
        features: ['text'],
        num_rows: 100
    })
})


In [25]:
#print(dataset)

# ------------------------------------------------------------------------
# 3. Load Pre-trained Model & Tokenizer
# ------------------------------------------------------------------------

model_checkpoint = "google/flan-t5-base"
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenizer.add_tokens(["<MASK>"]) #add <MASK> token to tokenizer

model.resize_token_embeddings(len(tokenizer))

Embedding(32101, 768)

In [None]:

# ------------------------------------------------------------------------
# 3. Mask the 3rd-agent responses in the datasets
# ------------------------------------------------------------------------

def mask_dataset(dataset, datatype):
    # Set max index based on split size
    if datatype == "test" or datatype == "validation":
        max = 99
    elif datatype == "train":
        max = 799
    else:
        raise ValueError(f"Unknown datatype: {datatype}")
        
    processed_methods = []
    processed_targets = []
    i = 0

    # Track success/failure
    yes = 0
    no = 0

    # Loop through the dataset and apply masking
    while i <= max:
        if (i + 1) % 50 == 0:
            print(f"Processed {i + 1} examples from {datatype}")

        # Get original transcript
        full_transcript = dataset[datatype]['text'][i]

        # Flatten the transcript (remove newlines and extra whitespace)
        flattened = " ".join(full_transcript.split())

        # Find all Agent responses using regex
        agent_responses = re.findall(r"Agent \d+: (.*?)(?=Customer:|Agent \d+:|$)", flattened)

        if len(agent_responses) >= 3:
            target = agent_responses[2].strip()
            masked = flattened.replace(target, "<MASK>", 2)

            processed_methods.append(masked)
            processed_targets.append(target)
            yes += 1
        else:
            no += 1 

        i += 1

    print(f"{datatype} — Successfully masked: {yes}, Skipped: {no}")
    return {
        "processed_method": processed_methods,
        "target_block": processed_targets
    }
valid = mask_dataset(split_dataset, "validation")
test = mask_dataset(split_dataset, "test")
train = mask_dataset(split_dataset, "train")

Processed 50 examples from validation
Processed 100 examples from validation
validation — Successfully masked: 95, Skipped: 5
Processed 50 examples from test
Processed 100 examples from test
test — Successfully masked: 95, Skipped: 5
Processed 50 examples from train
Processed 100 examples from train
Processed 150 examples from train
Processed 200 examples from train
Processed 250 examples from train
Processed 300 examples from train
Processed 350 examples from train
Processed 400 examples from train
Processed 450 examples from train
Processed 500 examples from train
Processed 550 examples from train
Processed 600 examples from train
Processed 650 examples from train
Processed 700 examples from train
Processed 750 examples from train
Processed 800 examples from train
train — Successfully masked: 766, Skipped: 34


In [27]:
print(train["processed_method"][0])
print(train["target_block"][0])
#print(train)

<s>[INST] Classify the following call transcript: Agent 3: Thank you for calling HealthHarbor, my name is Agent 3. How can I assist you today? Customer: Hi Agent 3, my name is Emma Johnson. I've been experiencing some symptoms lately and I wanted to seek medical advice or get a symptom assessment. Agent 3: I'm sorry to hear that, Emma. I'll do my best to help you. Can you please describe the symptoms you've been experiencing? Customer: Sure. I've been having a persistent headache for the past few days, and it's been accompanied by dizziness and occasional nausea. I'm not sure what could be causing it. Agent 3: <MASK> Customer: No major lifestyle changes, but I have noticed that my vision seems a bit blurry at times. And I've been feeling more fatigued than usual. Agent 3: Thank you for sharing that information, Emma. Blurry vision and fatigue can also be related to your symptoms. It's important to consider all these factors for a proper assessment. Based on your symptoms, I would recom

In [29]:

# ------------------------------------------------------------------------------------------------
# 4. We prepare now the fine-tuning dataset using the tokenizer we preloaded
# ------------------------------------------------------------------------------------------------

def preprocess_function(dataset):
    inputs = dataset["processed_method"]
    targets = dataset["target_block"]
    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=256, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


train = Dataset.from_dict(train)
valid = Dataset.from_dict(valid)
test = Dataset.from_dict(test)
train = train.map(preprocess_function, batched=True)
valid = valid.map(preprocess_function, batched = True)
test = test.map(preprocess_function, batched = True)
#print(valid)
#print(train)
#print(test)


Map: 100%|██████████| 766/766 [00:00<00:00, 1854.91 examples/s]
Map: 100%|██████████| 95/95 [00:00<00:00, 2064.68 examples/s]
Map: 100%|██████████| 95/95 [00:00<00:00, 1978.72 examples/s]


In [30]:
# ------------------------------------------------------------------------
# 5. Define Training Arguments and Trainer
# ------------------------------------------------------------------------


training_args = TrainingArguments(
    output_dir=".google/flan-t5-base",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_total_limit=2,
    logging_steps=100,
    push_to_hub=False,
)



trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=valid,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)


  trainer = Trainer(


In [48]:

model2 = model.to('cuda')
model2.eval()

all_inputs_untrained = test["processed_method"]
batch_size = 8  
decoded_outputs = []
# ------------------------------------------------------------------------
# 8. Run the model generation in batches in order to run code without memory errors
# ------------------------------------------------------------------------

for i in tqdm(range(0, len(all_inputs_untrained), batch_size)):
    batch = all_inputs_untrained[i:i+batch_size]

    # Tokenize batch
    inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {k: v.to('cuda') for k, v in inputs.items()}

    with torch.no_grad():
        outputs_untrained = model2.generate(**inputs, max_length=256)

    # Decode each output
    decoded_batch = tokenizer.batch_decode(outputs_untrained, skip_special_tokens=True)
    decoded_outputs.extend(decoded_batch)
    outputs_untrained = decoded_outputs

100%|██████████| 12/12 [00:22<00:00,  1.90s/it]


In [40]:
print(len(outputs_untrained))

95


In [66]:
# ------------------------
# 6. Train the Model
# ------------------------
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0947,0.11195
2,0.0826,0.110143
3,0.0792,0.109998
4,0.0686,0.114833
5,0.065,0.115739


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


TrainOutput(global_step=1915, training_loss=0.07693713601513258, metrics={'train_runtime': 574.0438, 'train_samples_per_second': 13.344, 'train_steps_per_second': 6.672, 'total_flos': 1311188342538240.0, 'train_loss': 0.07693713601513258, 'epoch': 5.0})

In [75]:
save_path = ".google/flan-t5-base"

# Save model
trainer.save_model(save_path)

# Save tokenizer
tokenizer.save_pretrained(save_path)

('.google/flan-t5-base\\tokenizer_config.json',
 '.google/flan-t5-base\\special_tokens_map.json',
 '.google/flan-t5-base\\tokenizer.json')

In [41]:

save_path = ".google/flan-t5-base"
# Load the saved model
model = T5ForConditionalGeneration.from_pretrained(save_path)

# Load the saved tokenizer
tokenizer = AutoTokenizer.from_pretrained(save_path)

In [42]:

# ------------------------
# 7. Test Code Translation
# ------------------------
model2 = model.to('cuda')
input_code = test["processed_method"][2]
print(test["target_block"][2])
inputs = tokenizer(input_code, return_tensors="pt", padding=True, truncation=True)
outputs = model2.generate(**inputs.to('cuda'), max_length=256)
print(tokenizer.decode(outputs[0]))
model2.eval()

all_inputs = test["processed_method"]
batch_size = 8  
decoded_outputs = []

Of course, Sarah. HealthHarbor is a hospital that offers a wide range of medical services. We have specialized doctors, advanced diagnostic tools, and state-of-the-art facilities. If you're experiencing serious symptoms or if it's an emergency, it's best to visit our hospital. However, for general check-ups or less urgent issues, you can consider visiting a clinic.
<pad> Of course, Sarah. HealthHarbor is a comprehensive healthcare facility that offers a wide range of medical services. We have departments for general medicine, pediatrics, obstetrics and gynecology, orthopedics, and more. We also have specialized clinics for various conditions such as cardiology, neurology, and oncology.</s>


In [43]:

# ------------------------------------------------------------------------
# 8. Run the model generation in batches in order to run code without memory errors
# ------------------------------------------------------------------------

for i in tqdm(range(0, len(all_inputs), batch_size)):
    batch = all_inputs[i:i+batch_size]

    # Tokenize batch
    inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {k: v.to('cuda') for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model2.generate(**inputs, max_length=256)

    # Decode each output
    decoded_batch = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    decoded_outputs.extend(decoded_batch)

100%|██████████| 12/12 [00:22<00:00,  1.85s/it]


In [44]:
outputs_trained = decoded_outputs
print(outputs_trained)

['Thank you, Mr. Smith. How can I assist you with your insurance and billing inquiries today?', 'Thank you, Sarah. How can I assist you with your medication refills and prescription inquiries today?', 'Of course, Sarah. HealthHarbor is a comprehensive healthcare facility that offers a wide range of medical services. We have departments for general medicine, pediatrics, obstetrics and gynecology, orthopedics, and more. We also have specialized clinics for various conditions such as cardiology, neurology, and oncology.', 'Thank you, Ms. Johnson. How can I assist you with your healthcare provider recommendations?', 'Thank you, Emily. How can I assist you with your medication refills today?', 'Thank you, Emily. How can I assist you with your insurance and billing inquiries today?', 'Thank you, Sarah. Let me pull up your records. How can I assist you with your medication refills today?', 'Thank you, Jessica. Let me check the system for your lab and test results. Please bear with me for a mo

In [45]:
for i in range(5):
    print(test["target_block"][i+80])
    print(f"Prediction: {outputs_trained[i+80]}")

Thank you, Emily. Let me pull up your account. Okay, I see your information here. How can I assist you today?
Prediction: Thank you, Emily. How can I assist you with your insurance and billing inquiries today?
Of course, John. I'll be happy to assist you with that. Could you please provide me with your account number or the date of service mentioned on the bill?
Prediction: Of course, John. Let me check that for you. Can you please provide me with your account number or date of service?
Thank you, Sarah. Could you please provide me with your date of birth and the medication you need a refill for?
Prediction: Thank you, Sarah. Can you please provide me with your date of birth and the medication you need a refill for?
Thank you, Emily. Let me pull up your file. I see here that you had some lab tests done. Can you please tell me which specific tests were conducted?
Prediction: Thank you, Emily. Let me pull up your records. Could you please provide me with the details of the tests you had 

In [None]:
rouge = load("rouge")
predictions = decoded_outputs
references = test["target_block"]
# Calculate the ROUGE scores
rouge_scores = rouge.compute(predictions=predictions, references=references, use_stemmer=True)

# Print separate ROUGE scores
print("ROUGE Scores:")
for metric, score in rouge_scores.items():
    print(f"{metric}: {round(score, 4)}")
print("Overall ROUGE Score: ", sum(rouge_scores.values()) / len(rouge_scores))

ROUGE Scores:
rouge1: 0.633
rouge2: 0.4708
rougeL: 0.5879
rougeLsum: 0.5891
Overall ROUGE Score:  0.5701928157335951


In [49]:
df = pd.DataFrame({
    "inputs": test["processed_method"],
    "target_block": test["target_block"],
    "untrained_model_predictions": outputs_untrained,
    "trained_model_predictions": outputs_trained
})

# Step 4: Save to CSV
df.to_csv("results.csv", index=False)
print("Saved to results.csv")

Saved to results.csv
