In [1]:
from datasets import load_from_disk
my_dataset = load_from_disk("/home/UNT/ap1923/ap1923/betterDeliberation/src/deliberation/storedDeliberations/my_hf_dataset")
my_dataset

DatasetDict({
    train: Dataset({
        features: ['models_used', 'rounds', 'number_of_agents', 'problem', 'gold_diagnosis', 'conversation_id', 'history_current', 'history_neighbour', 'reply'],
        num_rows: 441
    })
    validation: Dataset({
        features: ['models_used', 'rounds', 'number_of_agents', 'problem', 'gold_diagnosis', 'conversation_id', 'history_current', 'history_neighbour', 'reply'],
        num_rows: 55
    })
    test: Dataset({
        features: ['models_used', 'rounds', 'number_of_agents', 'problem', 'gold_diagnosis', 'conversation_id', 'history_current', 'history_neighbour', 'reply'],
        num_rows: 56
    })
})

In [2]:
my_dataset["train"][0]

{'models_used': ['meta-llama/Llama-3. 1-70B-Instruct'],
 'rounds': 2,
 'number_of_agents': 3,
 'problem': 'You are given the following symptoms fatigue, cough, high_fever, breathlessness, mucoid_sputum. Your task is to diagnose a disease that best matches the symptoms.',
 'gold_diagnosis': 'Bronchial Asthma',
 'conversation_id': 88,
 'history_current': 'The symptoms of fatigue, cough, high_fever, breathlessness, and mucoid_sputum are commonly associated with respiratory infections, particularly pneumonia. Pneumonia is an infection that inflames the air sacs in one or both lungs, which may fill with fluid. This condition can cause cough, which may produce mucus, and can lead to fatigue, high fever, and difficulty breathing (breathlessness). The presence of mucoid sputum, which is thick and sticky, further supports this diagnosis as it is often seen in',
 'history_neighbour': 'Based on the given symptoms of fatigue, cough, high fever, breathlessness, and mucoid sputum, I would diagnose t

In [3]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilroberta-base")

def preprocess_function(examples):
    combined_inputs = [
        f"{p} {hc} {hn} {r}"
        for p, hc, hn, r in zip(
            examples["problem"],
            examples["history_current"],
            examples["history_neighbour"],
            examples["reply"],
        )
    ]
    return tokenizer(combined_inputs, truncation=True, padding="max_length")

tokenized_eli5 = my_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=my_dataset["train"].column_names,
)

Map (num_proc=4):   0%|          | 0/441 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/55 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/56 [00:00<?, ? examples/s]

In [4]:
print(tokenized_eli5)
# print("length", len(tokenized_eli5["train"][0]['input_ids']))
# tokenized_eli5["train"][0]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 441
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 55
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 56
    })
})


In [5]:
block_size = 128


def group_texts(examples):
    print("examples", examples)
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    return result

lm_dataset = tokenized_eli5.map(group_texts, batched=True, num_proc=4)

Map (num_proc=4):   0%|          | 0/441 [00:00<?, ? examples/s]

examplesexamples  examplesexamples  {'input_ids': [[0, 1185, 32, 576, 5, 511, 5298, 23600, 6, 872, 1215, 1116, 1215, 3340, 594, 1459, 6, 28670, 1215, 32981, 6, 9078, 1215, 1116, 1215, 571, 9354, 6, 3425, 1215, 22853, 4, 2486, 3685, 16, 7, 29263, 10, 2199, 14, 275, 2856, 5, 5298, 4, 7253, 15, 5, 5298, 47, 1286, 6, 38, 74, 29263, 20691, 1001, 293, 6673, 1580, 337, 8526, 27643, 11817, 36, 31595, 495, 322, 20, 5298, 47, 2801, 32, 10266, 3059, 19, 35683, 495, 6, 10, 1881, 147, 9377, 10395, 7964, 124, 88, 5, 2714, 6673, 29962, 6, 3735, 19535, 4, 468, 1075, 2838, 8, 872, 9, 11037, 64, 5948, 528, 7, 5, 6701, 7150, 12257, 9, 9377, 10395, 4, 2060, 12623, 6204, 2400, 8, 9078, 9, 20038, 32, 67, 6097, 5298, 9, 35683, 495, 4, 18387, 35188, 16, 10, 540, 1537, 28667, 6, 53, 24, 64, 28, 1330, 7, 5, 32776, 9, 5, 2714, 6673, 29962, 8, 9377, 20, 5298, 1286, 32, 22206, 9, 10, 36805, 696, 4, 7253, 15, 5, 5298, 9, 23600, 6, 872, 9, 11037, 6, 28670, 2400, 6, 9078, 9, 20038, 6, 8, 3425, 35188, 6, 38, 74, 29263

Map (num_proc=4):   0%|          | 0/55 [00:00<?, ? examples/s]

examplesexamplesexamplesexamples    {'input_ids': [[0, 1185, 32, 576, 5, 511, 5298, 8698, 1215, 605, 15374, 6, 239, 1215, 506, 6294, 6, 1823, 1215, 3916, 8632, 1215, 10800, 19170, 4, 2486, 3685, 16, 7, 29263, 10, 2199, 14, 275, 2856, 5, 5298, 4, 7253, 15, 5, 5298, 1286, 6, 38, 74, 29263, 5, 3186, 19, 7947, 73, 30968, 4, 46376, 1258, 35, 40221, 21025, 16, 10, 1537, 28667, 9, 3319, 7947, 73, 30968, 6, 147, 5, 809, 18, 9161, 467, 16, 11166, 13969, 6, 981, 7, 2408, 872, 8, 8698, 35790, 16628, 4, 755, 11696, 64, 67, 5948, 11, 7947, 73, 30968, 1484, 528, 7, 23609, 5580, 11341, 4, 18355, 12, 3916, 8632, 9872, 712, 5, 810, 9, 20933, 7947, 6, 10, 5912, 20579, 7910, 4, 20, 4069, 9, 209, 5298, 6, 1605, 11, 5, 5377, 9, 239, 12, 10848, 3650, 6, 3649, 7947, 73, 30968, 25, 10, 533, 9726, 4, 7029, 3044, 6, 215, 25, 7253, 15, 5, 5298, 1286, 6, 10, 678, 2199, 9726, 16, 7947, 73, 30968, 4, 40221, 21025, 16, 10, 1537, 28667, 9, 3319, 7947, 73, 30968, 528, 7, 5, 809, 18, 12561, 7, 15709, 20012, 4, 755, 116

Map (num_proc=4):   0%|          | 0/56 [00:00<?, ? examples/s]

examplesexamples examples examples  {'input_ids': [[0, 1185, 32, 576, 5, 511, 5298, 11152, 1215, 29, 858, 5841, 154, 6, 1481, 30579, 6, 1855, 5622, 4, 2486, 3685, 16, 7, 29263, 10, 2199, 14, 275, 2856, 5, 5298, 4, 7253, 15, 5, 5298, 9, 11152, 18013, 5841, 154, 6, 1481, 30579, 6, 8, 1855, 5622, 6, 10, 533, 9726, 16, 10, 7696, 2853, 17960, 7910, 6, 215, 25, 5, 1537, 2569, 50, 6626, 4, 1216, 5298, 32, 747, 7513, 30, 10, 422, 2855, 8658, 6, 15744, 6, 8, 10, 12867, 14599, 4, 83, 3299, 18, 9434, 16, 5131, 13, 10, 4692, 9726, 8, 1416, 563, 6, 61, 189, 680, 81, 12, 627, 12, 24774, 12102, 7, 3616, 5298, 8, 1079, 4, 3702, 1452, 34339, 32, 45, 2375, 136, 7696, 11341, 4, 7253, 15, 5, 5298, 1286, 6, 5, 533, 9726, 16, 5, 1537, 2569, 6, 61, 16, 10, 7696, 2853, 17960, 7910, 4, 37737, 18013, 5841, 154, 6, 1481, 30579, 6, 8, 1855, 5622, 32, 1537, 5298, 9, 42, 1881, 6, 747, 7513, 30, 10, 422, 2855, 8658, 6, 15744, 6, 8, 10, 12867, 14599, 4, 616, 15671, 7443, 24238, 8, 23807, 32, 678, 6, 5, 5298, 1286, 32

In [6]:
lm_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1764
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 220
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 224
    })
})

In [None]:
from transformers import DataCollatorForLanguageModeling, TrainingArguments, Trainer
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
from transformers import AutoModelForMaskedLM
model_untouched = AutoModelForMaskedLM.from_pretrained("distilbert/distilroberta-base")
model = AutoModelForMaskedLM.from_pretrained("distilbert/distilroberta-base")

Some weights of the model checkpoint at distilbert/distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
training_args = TrainingArguments(
    output_dir="deli_modelling",
    eval_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    logging_steps=5,
    weight_decay=0.01,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()
trainer.save_model("deli_modelling")
tokenizer.save_pretrained("deli_modelling")

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.1654,0.453578
2,0.3321,0.419401
3,0.3389,0.412282


('deli_modelling/tokenizer_config.json',
 'deli_modelling/special_tokens_map.json',
 'deli_modelling/vocab.json',
 'deli_modelling/merges.txt',
 'deli_modelling/added_tokens.json',
 'deli_modelling/tokenizer.json')

In [21]:
import math
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 1.49


## Inference

In [13]:
text = "The symptoms of itching, skin rash, nodal skin eruptions, and dischromic patches suggest a diagnosis of <mask>." 
from transformers import pipeline

mask_filler = pipeline(
    "fill-mask",
    model="deli_modelling",
    tokenizer="deli_modelling"
)
mask_filler(text, top_k=3)

Device set to use cuda:0


[{'score': 0.22924388945102692,
  'token': 7947,
  'token_str': ' HIV',
  'sequence': 'The symptoms of itching, skin rash, nodal skin eruptions, and dischromic patches suggest a diagnosis of HIV.'},
 {'score': 0.0902792289853096,
  'token': 32285,
  'token_str': ' schizophrenia',
  'sequence': 'The symptoms of itching, skin rash, nodal skin eruptions, and dischromic patches suggest a diagnosis of schizophrenia.'},
 {'score': 0.07672925293445587,
  'token': 17296,
  'token_str': ' AIDS',
  'sequence': 'The symptoms of itching, skin rash, nodal skin eruptions, and dischromic patches suggest a diagnosis of AIDS.'}]

In [19]:
mask_filler = pipeline(
    "fill-mask",
    model=model_untouched,
    tokenizer=tokenizer
)
mask_filler(text, top_k=3)

Device set to use cuda:0


[{'score': 0.11855622380971909,
  'token': 30360,
  'token_str': ' acne',
  'sequence': 'The symptoms of itching, skin rash, nodal skin eruptions, and dischromic patches suggest a diagnosis of acne.'},
 {'score': 0.11488759517669678,
  'token': 1668,
  'token_str': ' cancer',
  'sequence': 'The symptoms of itching, skin rash, nodal skin eruptions, and dischromic patches suggest a diagnosis of cancer.'},
 {'score': 0.09250980615615845,
  'token': 32285,
  'token_str': ' schizophrenia',
  'sequence': 'The symptoms of itching, skin rash, nodal skin eruptions, and dischromic patches suggest a diagnosis of schizophrenia.'}]

In [22]:
trainer_1 = Trainer(
    model=model_untouched,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)
eval_results = trainer_1.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

  trainer_1 = Trainer(


Perplexity: 1496.53


In [None]:
from transformers import AutoModelForMaskedLM, AutoTokenizer
import torch
model_path = "/home/UNT/ap1923/ap1923/modelfinetuning/deli_modelling"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForMaskedLM.from_pretrained(model_path)
text = "The symptoms of itching, skin rash, nodal skin eruptions, and dischromic patches suggest a diagnosis of <mask>." 
inputs = tokenizer(text, return_tensors="pt")
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
logits = model(**inputs).logits
mask_token_logits = logits[0, mask_token_index, :]
print('mask_token_logits', mask_token_logits.shape,  mask_token_logits)

mask_token_logits torch.Size([1, 50265]) tensor([[-0.5628,  0.6115,  2.2776,  ..., -3.1758, -2.6760,  4.5781]],
       grad_fn=<IndexBackward0>)


In [18]:
top_3_tokens = torch.topk(mask_token_logits, 3, dim=1).indices[0].tolist()

for token in top_3_tokens:
    print(text.replace(tokenizer.mask_token, tokenizer.decode([token])))
# The Milky Way is a spiral galaxy.
# The Milky Way is a massive galaxy.
# The Milky Way is a small galaxy.

The symptoms of itching, skin rash, nodal skin eruptions, and dischromic patches suggest a diagnosis of  HIV.
The symptoms of itching, skin rash, nodal skin eruptions, and dischromic patches suggest a diagnosis of  schizophrenia.
The symptoms of itching, skin rash, nodal skin eruptions, and dischromic patches suggest a diagnosis of  AIDS.
