In [13]:
from datasets import load_from_disk
my_dataset = load_from_disk("/home/UNT/ap1923/ap1923/betterDeliberation/src/deliberation/storedDeliberations/context110/new110/my_hf_dataset_new110")
my_dataset["train"][0]

{'models_used': ['meta-llama/Llama-3. 1-70B-Instruct'],
 'rounds': 2,
 'number_of_agents': 3,
 'problem': 'You are given the following symptoms joint_pain, skin_peeling, silver_like_dusting, small_dents_in_nails, inflammatory_nails. Your task is to diagnose a disease that best matches the symptoms.',
 'gold_diagnosis': 'Psoriasis',
 'conversation_id': 803,
 'history_current': "Both responses accurately diagnose the condition as Psoriasis based on the provided symptoms. However, Response 2 provides a clearer explanation of each symptom's relevance to Psoriasis, making it a more comprehensive and informative answer. A definitive diagnosis by a dermatologist is still necessary for confirmation.",
 'history_neighbour': 'Both responses suggest psoriasis as the likely diagnosis, and they are consistent in their descriptions of the characteristic symptoms. However, Response 1 provides a clearer explanation of the underlying cause of psoriasis, describing it as an autoimmune condition that cau

In [14]:
my_dataset

DatasetDict({
    train: Dataset({
        features: ['models_used', 'rounds', 'number_of_agents', 'problem', 'gold_diagnosis', 'conversation_id', 'history_current', 'history_neighbour', 'reply'],
        num_rows: 5827
    })
    validation: Dataset({
        features: ['models_used', 'rounds', 'number_of_agents', 'problem', 'gold_diagnosis', 'conversation_id', 'history_current', 'history_neighbour', 'reply'],
        num_rows: 728
    })
    test: Dataset({
        features: ['models_used', 'rounds', 'number_of_agents', 'problem', 'gold_diagnosis', 'conversation_id', 'history_current', 'history_neighbour', 'reply'],
        num_rows: 729
    })
})

In [15]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
tokenizer.pad_token = tokenizer.eos_token
# meta-llama/Llama-3.1-8B-Instruct
# meta-llama/Llama-3.2-1B

In [16]:
def preprocess_function(examples):
    combined_inputs = [
        f"{p} {hc} {hn} {r}"
        for p, hc, hn, r in zip(
            examples["problem"],
            examples["history_current"],
            examples["history_neighbour"],
            examples["reply"],
        )
    ]
    return tokenizer(combined_inputs, truncation=True, padding="max_length")

In [17]:
tokenized_data = my_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns= my_dataset["train"].column_names,
)

Map (num_proc=4):   0%|          | 0/5827 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/728 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/729 [00:00<?, ? examples/s]

In [18]:
block_size = 128


def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

lm_dataset = tokenized_data.map(group_texts, batched=True, num_proc=4)

Map (num_proc=4):   0%|          | 0/5827 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/728 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/729 [00:00<?, ? examples/s]

In [19]:
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [20]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
model_untouched = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")

In [21]:
training_args = TrainingArguments(
    output_dir="finetunedDistilledBert_model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()
trainer.save_model("finetunedDistilledBert_model")
tokenizer.save_pretrained("finetunedDistilledBert_model")

  trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,1.1634,1.074373
2,1.021,0.973795
3,0.9818,0.946868


('finetunedDistilledBert_model/tokenizer_config.json',
 'finetunedDistilledBert_model/special_tokens_map.json',
 'finetunedDistilledBert_model/vocab.json',
 'finetunedDistilledBert_model/merges.txt',
 'finetunedDistilledBert_model/added_tokens.json',
 'finetunedDistilledBert_model/tokenizer.json')

In [22]:
import math

eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 2.58


In [23]:
untouched_trainer = Trainer(
    model=model_untouched,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)
eval_results = untouched_trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

  untouched_trainer = Trainer(


Perplexity: 31.40


In [28]:
prompt = "What will be the most likely disease diagnosis for a patient showing the following symptoms: continous sneezing, chills and watering of the eye"
# using conversation_id 11 and the gold diagnosis is allergy
prompt_two = "You are given the following symptoms: continous sneezing, chills and watering of the eye. Your task is to diagnose a disease that best matches the symptoms."
from transformers import pipeline

generator = pipeline("text-generation", model="finetunedDistilledBert_model")
print('prompt1 result', generator(prompt))
print('prompt2 result', generator(prompt_two))

generator_two = pipeline("text-generation", model=model_untouched, tokenizer=tokenizer)
print('prompt1 result from untouched model', generator_two(prompt))
print('prompt2 result from untouched model', generator_two(prompt_two))



Device set to use cuda:0
Device set to use cuda:0


prompt1 result [{'generated_text': 'What will be the most likely disease diagnosis for a patient showing the following symptoms: continous sneezing, chills and watering of the eye. Impetigo is a highly contagious bacterial skin infection that typically causes red sores, black sores'}]
prompt2 result [{'generated_text': 'You are given the following symptoms: continous sneezing, chills and watering of the eye. Your task is to diagnose a disease that best matches the symptoms. Both responses accurately diagnose the condition as Allergic Rhinitis based on the provided'}]
prompt1 result from untouched model [{'generated_text': 'What will be the most likely disease diagnosis for a patient showing the following symptoms: continous sneezing, chills and watering of the eye, cough, and diarrhea.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n'}]
prompt2 result from untouched model [{'generated_text': 'You are given the following symptoms: continous sneezing, chills and watering of the eye. Your task is to diagnos

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
model_path = "/home/UNT/ap1923/ap1923/modelfinetuning/finetunedDistilledBert_model"
# tokenizer = AutoTokenizer.from_pretrained(model_path)
model_three = AutoModelForCausalLM.from_pretrained(model_path)

device = model_three.device
inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
outputs = model_three.generate(inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95)
print('fine tuned model result', tokenizer.batch_decode(outputs, skip_special_tokens=True))

# same thing but from untouched model
device = model_untouched.device
inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
outputs = model_untouched.generate(inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95)
tokenizer.batch_decode(outputs, skip_special_tokens=True)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


fine tuned model result ['What will be the most likely disease diagnosis for a patient showing the following symptoms: continous sneezing, chills and watering of the eye A medical professional would diagnose the condition as Influenza (Flu). The combination of symptoms such as continuous sneezing, chills, watering from the eyes, and watering from the eyes suggest an upper respiratory tract infection. It is essential to consult a healthcare professional for a definitive diagnosis and appropriate treatment. Based on the symptoms provided, the disease diagnosis is likely Influenza (Flu). The combination of symptoms such as continuous sneezing, chills, and watering of the eyes strongly suggests Influ']


['What will be the most likely disease diagnosis for a patient showing the following symptoms: continous sneezing, chills and watering of the eye, mouth, nose, eyes, back, chest, and chest â€” those who have a high risk of developing an infection or a disease such as flu, flu, or a few more types of pneumonia.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n']