In [1]:
from datasets import load_from_disk
my_dataset = load_from_disk("/home/UNT/ap1923/ap1923/betterDeliberation/src/deliberation/storedDeliberations/my_hf_dataset")
my_dataset["train"][0]

{'models_used': ['meta-llama/Llama-3. 1-70B-Instruct'],
 'rounds': 2,
 'number_of_agents': 3,
 'problem': 'You are given the following symptoms fatigue, cough, high_fever, breathlessness, mucoid_sputum. Your task is to diagnose a disease that best matches the symptoms.',
 'gold_diagnosis': 'Bronchial Asthma',
 'conversation_id': 88,
 'history_current': 'The symptoms of fatigue, cough, high_fever, breathlessness, and mucoid_sputum are commonly associated with respiratory infections, particularly pneumonia. Pneumonia is an infection that inflames the air sacs in one or both lungs, which may fill with fluid. This condition can cause cough, which may produce mucus, and can lead to fatigue, high fever, and difficulty breathing (breathlessness). The presence of mucoid sputum, which is thick and sticky, further supports this diagnosis as it is often seen in',
 'history_neighbour': 'Based on the given symptoms of fatigue, cough, high fever, breathlessness, and mucoid sputum, I would diagnose t

In [3]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
tokenizer.pad_token = tokenizer.eos_token
# meta-llama/Llama-3.1-8B-Instruct
# meta-llama/Llama-3.2-1B

In [4]:
def preprocess_function(examples):
    combined_inputs = [
        f"{p} {hc} {hn} {r}"
        for p, hc, hn, r in zip(
            examples["problem"],
            examples["history_current"],
            examples["history_neighbour"],
            examples["reply"],
        )
    ]
    return tokenizer(combined_inputs, truncation=True, padding="max_length")

In [5]:
tokenized_data = my_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns= my_dataset["train"].column_names,
)

In [6]:
block_size = 128


def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

lm_dataset = tokenized_data.map(group_texts, batched=True, num_proc=4)

In [8]:
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [9]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
model_untouched = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [10]:
training_args = TrainingArguments(
    output_dir="finetunedDistilledBert_model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()
trainer.save_model("finetunedDistilledBert_model")
tokenizer.save_pretrained("finetunedDistilledBert_model")

  trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,No log,1.565049
2,2.048400,1.385418
3,1.524900,1.338614


('finetunedDistilledBert_model/tokenizer_config.json',
 'finetunedDistilledBert_model/special_tokens_map.json',
 'finetunedDistilledBert_model/vocab.json',
 'finetunedDistilledBert_model/merges.txt',
 'finetunedDistilledBert_model/added_tokens.json',
 'finetunedDistilledBert_model/tokenizer.json')

In [11]:
import math

eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 3.81


In [12]:
untouched_trainer = Trainer(
    model=model_untouched,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)
eval_results = untouched_trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

  untouched_trainer = Trainer(


Perplexity: 31.23


In [14]:
prompt = "What will be the most likely disease diagnosis for a patient showing the following symptoms: continous sneezing, chills and watering of the eye"
# using conversation_id 11 and the gold diagnosis is allergy
prompt_two = "You are given the following symptoms: continous sneezing, chills and watering of the eye. Your task is to diagnose a disease that best matches the symptoms."
from transformers import pipeline

generator = pipeline("text-generation", model="finetunedDistilledBert_model")
print('prompt1 result', generator(prompt))
print('prompt2 result', generator(prompt_two))

generator_two = pipeline("text-generation", model=model_untouched, tokenizer=tokenizer)
print('prompt1 result from untouched model', generator_two(prompt))
print('prompt2 result from untouched model', generator_two(prompt_two))



Device set to use cuda:0
Device set to use cuda:0


prompt1 result [{'generated_text': 'What will be the most likely disease diagnosis for a patient showing the following symptoms: continous sneezing, chills and watering of the eye, as well as breathlessness. A likely diagnosis would be polychromic - a condition caused by the'}]
prompt2 result [{'generated_text': 'You are given the following symptoms: continous sneezing, chills and watering of the eye. Your task is to diagnose a disease that best matches the symptoms. To rule out other possible causes, I recommend consulting a healthcare professional for a more'}]
prompt1 result from untouched model [{'generated_text': 'What will be the most likely disease diagnosis for a patient showing the following symptoms: continous sneezing, chills and watering of the eye, itching, sweating, soreness and diarrhea, swelling, and abdominal pain. Although these signs are rarely'}]
prompt2 result from untouched model [{'generated_text': 'You are given the following symptoms: continous sneezing, chills

In [17]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
model_path = "/home/UNT/ap1923/ap1923/modelfinetuning/finetunedDistilledBert_model"
# tokenizer = AutoTokenizer.from_pretrained(model_path)
model_three = AutoModelForCausalLM.from_pretrained(model_path)

device = model_three.device
inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
outputs = model_three.generate(inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95)
print('fine tuned model result', tokenizer.batch_decode(outputs, skip_special_tokens=True))

# same thing but from untouched model
device = model_untouched.device
inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
outputs = model_untouched.generate(inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95)
tokenizer.batch_decode(outputs, skip_special_tokens=True)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


fine tuned model result ['What will be the most likely disease diagnosis for a patient showing the following symptoms: continous sneezing, chills and watering of the eye While in the early stages of symptoms such as cough, high fever, and breathlessness, they all present a condition characterized by rapid onset of the respiratory disease jaundice, which can be caused by a virus or viral infection. However, if symptoms such as chest pain, spotting urination, and spotting urination are not enough to effectively diagnose the disease, the symptoms provided are collectively indicative of a systemic infection or viral infection, such as a flu. The symptoms of a cough, high fever,']


['What will be the most likely disease diagnosis for a patient showing the following symptoms: continous sneezing, chills and watering of the eye.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n']