In [1]:
from huggingface_hub import notebook_login
# notebook_login()

In [1]:
# from datasets import load_dataset
# eli5 = load_dataset("eli5_category", split="train[:5000]")
from datasets import load_from_disk
eli5 = load_from_disk("~/ap1923/eli5_5k_split")
eli5 = eli5.flatten()
eli5

DatasetDict({
    train: Dataset({
        features: ['q_id', 'title', 'selftext', 'category', 'subreddit', 'answers.a_id', 'answers.text', 'answers.score', 'answers.text_urls', 'title_urls', 'selftext_urls'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['q_id', 'title', 'selftext', 'category', 'subreddit', 'answers.a_id', 'answers.text', 'answers.score', 'answers.text_urls', 'title_urls', 'selftext_urls'],
        num_rows: 1000
    })
})

In [2]:
eli5["train"][0]

{'q_id': '798v9m',
 'title': 'What would happen if one of my eyes are covered for a long time?',
 'selftext': '',
 'category': 'Biology',
 'subreddit': 'explainlikeimfive',
 'answers.a_id': ['dp03dep', 'dp031p3'],
 'answers.text': ["If you are a child and still in a developing stage for vision and perception, covering one of your eyes can cause amblyopia. Amblyopia -a disorder also called lazy eye- causes decreased vision in covered eye due to the interruption in the eye-brain pathway. If done with growing up, i think it is fine to strut around with pirate's eyepatch.",
  'What do you mean by "a long time"? Is it half an hour? Half a month? Five years? Twenty years? In like 15 minutes, your pupil would dilate(non-native English speaker, is this right? I am thinking expand), letting in more light. Therefore, if you plan on going somewhere dark, or you think it suddenly will become dark where you are, wear an eyepatch on eye. When entering the dark room, switch Wich eye you are covering.

In [3]:
# print("length", eli5["train"][0]["answers.text"])
eli5

DatasetDict({
    train: Dataset({
        features: ['q_id', 'title', 'selftext', 'category', 'subreddit', 'answers.a_id', 'answers.text', 'answers.score', 'answers.text_urls', 'title_urls', 'selftext_urls'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['q_id', 'title', 'selftext', 'category', 'subreddit', 'answers.a_id', 'answers.text', 'answers.score', 'answers.text_urls', 'title_urls', 'selftext_urls'],
        num_rows: 1000
    })
})

In [4]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilroberta-base")

def preprocess_function(examples):
    return tokenizer([" ".join(x) for x in examples["answers.text"]])

tokenized_eli5 = eli5.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=eli5["train"].column_names,
)

In [5]:
print(tokenized_eli5)
# print("length", len(tokenized_eli5["train"][0]['input_ids']))
# tokenized_eli5["train"][0]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1000
    })
})


In [6]:
block_size = 128


def group_texts(examples):
    print("examples", examples)
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    return result

lm_dataset = tokenized_eli5.map(group_texts, batched=True, num_proc=4)

In [7]:
lm_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 10719
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 2441
    })
})

In [26]:
from transformers import DataCollatorForLanguageModeling, TrainingArguments, Trainer
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
from transformers import AutoModelForMaskedLM
model = AutoModelForMaskedLM.from_pretrained("distilbert/distilroberta-base")

Some weights of the model checkpoint at distilbert/distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [31]:
training_args = TrainingArguments(
    output_dir="my_awesome_eli5_mlm_model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()
trainer.save_model("my_awesome_eli5_mlm_model")
tokenizer.save_pretrained("my_awesome_eli5_mlm_model")

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.7112,2.116071
2,1.8803,2.086677
3,2.0831,2.049644


('my_awesome_eli5_mlm_model/tokenizer_config.json',
 'my_awesome_eli5_mlm_model/special_tokens_map.json',
 'my_awesome_eli5_mlm_model/vocab.json',
 'my_awesome_eli5_mlm_model/merges.txt',
 'my_awesome_eli5_mlm_model/added_tokens.json',
 'my_awesome_eli5_mlm_model/tokenizer.json')

In [32]:
import math
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 7.69


## Inference

In [34]:
text = "The Milky Way is a <mask> galaxy."
from transformers import pipeline

mask_filler = pipeline(
    "fill-mask",
    model="my_awesome_eli5_mlm_model",
    tokenizer="my_awesome_eli5_mlm_model"
)
mask_filler(text, top_k=3)

Device set to use cuda:0


[{'score': 0.6642128825187683,
  'token': 21300,
  'token_str': ' spiral',
  'sequence': 'The Milky Way is a spiral galaxy.'},
 {'score': 0.06015821918845177,
  'token': 2232,
  'token_str': ' massive',
  'sequence': 'The Milky Way is a massive galaxy.'},
 {'score': 0.03243763744831085,
  'token': 3065,
  'token_str': ' giant',
  'sequence': 'The Milky Way is a giant galaxy.'}]

In [59]:
from transformers import AutoModelForMaskedLM, AutoTokenizer
import torch
model_path = "/home/UNT/ap1923/ap1923/maskedmodelling/my_awesome_eli5_mlm_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForMaskedLM.from_pretrained(model_path)
text = "The Milky Way is a <mask> galaxy."
# text2 = "The Milky Way is a galaxy."
inputs = tokenizer(text, return_tensors="pt")
# inputs2 = tokenizer(text2, return_tensors="pt")
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
# print('inputs', inputs)
# print('inputs', inputs2)
# print('decoded', tokenizer.decode([50264]))
# print('mask token index', mask_token_index)
# print('mask token id', tokenizer.mask_token_id)
# print('mask token index', mask_token_index)
logits = model(**inputs).logits
mask_token_logits = logits[0, mask_token_index, :]
print('mask_token_logits', mask_token_logits.shape,  mask_token_logits)

mask_token_logits torch.Size([1, 50265]) tensor([[-2.5637, -3.2536,  4.1820,  ..., -1.9573, -2.9342,  2.7244]],
       grad_fn=<IndexBackward0>)


In [60]:
top_3_tokens = torch.topk(mask_token_logits, 3, dim=1).indices[0].tolist()

for token in top_3_tokens:
    print(text.replace(tokenizer.mask_token, tokenizer.decode([token])))
# The Milky Way is a spiral galaxy.
# The Milky Way is a massive galaxy.
# The Milky Way is a small galaxy.

The Milky Way is a  spiral galaxy.
The Milky Way is a  massive galaxy.
The Milky Way is a  giant galaxy.
