In [None]:
#hide
!nvidia-smi

Fri Apr 30 17:09:13 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 450.36.06    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Quadro M4000        On   | 00000000:00:05.0 Off |                  N/A |
| 46%   26C    P8    11W / 120W |    553MiB /  8126MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
#hide
%load_ext autoreload
%autoreload 2

In [None]:
#all_slow

In [None]:
#hide
import sys
if 'google.colab' in sys.modules:
    !pip install -Uqq fastcore sentencepiece
    !pip install -Uqq --no-deps fastai
    !pip install -Uqq transformers datasets wandb
    !pip install git+git://github.com/aikindergarten/fasthugs.git

# Masked Language Modeling

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
from datasets import load_dataset, concatenate_datasets

from fastai.text.all import *
from fasthugs.learner import TransLearner
from fasthugs.data import *

## Setup

In [None]:
# Pretrained checkpoint name, used to load both the tokenizer and the model.
model_name = 'distilroberta-base'

# data: chunk length in tokens and the train / validation batch sizes
max_length = 128
bs = 16
val_bs = 4 * bs

# training: learning rate
lr = 3e-5

## Data preprocessing

In this example notebook we use HuggingFace datasets for preprocessing (as shown in the example notebook [here](https://github.com/huggingface/notebooks/blob/master/examples/language_modeling.ipynb)).

In [None]:
ds_name = 'imdb'

In [None]:
# Load the dataset named by `ds_name`; later runs reuse the local HuggingFace cache.
dataset = load_dataset(ds_name)

Reusing dataset imdb (/root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/4ea52f2e58a08dbc12c2bd52d0d92b30b88c00230b4522801b3636782f625c5b)


In [None]:
# Keep only the first 2000 examples of the 'train' split.
# NOTE: this rebinds `dataset` from the object returned by `load_dataset` to a
# single split, so the cell is not idempotent: re-running it alone would index
# the already-selected split with 'train'. Re-run the `load_dataset` cell first.
dataset = dataset['train'].select(range(2000))

In [None]:
dataset.column_names

['label', 'text']

In [None]:
# dataset['unsupervised'][2]

In [None]:
# Tokenizer for the checkpoint named in `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def tokenize(batch):
    "Tokenize `batch['text']`, also requesting the attention mask and the special-tokens mask."
    tokenizer_kwargs = dict(
        return_attention_mask=True,
        return_special_tokens_mask=True,
        verbose=False,
    )
    return tokenizer(batch['text'], **tokenizer_kwargs)

In [None]:
# Tokenize in batches of 100 using 4 processes; the original columns are removed
# so that only the tokenizer outputs remain.
# Variant for when `dataset` still holds all splits (not narrowed to 'train'):
# dataset = dataset.map(tokenize, batched=True, batch_size=100, remove_columns=dataset['train'].column_names, num_proc=4)
dataset = dataset.map(tokenize, batched=True, batch_size=100, remove_columns=dataset.column_names, num_proc=4)







In [None]:
# Number of tokens in each chunk produced by `group_texts`.
block_size = max_length

def group_texts(examples):
    """Concatenate a batch of tokenized examples and re-split it into fixed-size chunks.

    `examples` maps each column name to a list of per-example lists. Every
    column is flattened into one long list and cut into consecutive pieces of
    `block_size` items (`block_size` is a notebook-level setting). The trailing
    remainder that does not fill a whole block is dropped; padding could be
    used instead if the model supported it.

    Returns a dict with the same keys as `examples`, each holding the list of
    chunks, plus a "labels" entry that is a shallow copy of the "input_ids"
    chunks.
    """
    # Flatten with a comprehension: `sum(list_of_lists, [])` re-copies the
    # accumulator on every addition, which is quadratic in the total length.
    concatenated_examples = {
        k: [item for seq in examples[k] for item in seq] for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Round down to a whole number of blocks (the small remainder is dropped).
    total_length = (total_length // block_size) * block_size
    # Split every column into chunks of `block_size`.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# Regroup the tokenized examples into fixed-size chunks: batches of 1000 examples, 4 processes.
lm_dataset = dataset.map(group_texts, batched=True, batch_size=1000, num_proc=4)







In [None]:
# lm_dataset = concatenate_datasets([lm_dataset['train'], lm_dataset['unsupervised'], lm_dataset['test']])

## Training

In [None]:
import random
# Shuffle the example indices with a fixed seed so the train/validation split
# is identical on every run. A dedicated Random instance is used so the global
# `random` state is left untouched.
N = len(lm_dataset)
idx = list(range(N))
random.Random(42).shuffle(idx)

In [None]:
# The first 90% of the shuffled indices are for training, the rest for validation.
split = int(N*0.9)
train_idx, valid_idx = idx[:split], idx[split:]

In [None]:
# DataBlock with a single language-modeling block built around `tokenizer`;
# the items at the positions listed in `valid_idx` make up the validation set.
dblock = DataBlock(blocks=[TransformersLMBlock(tokenizer=tokenizer)],
                   splitter=IndexSplitter(valid_idx))

In [None]:
# Build the DataLoaders over the chunked dataset (batch size `bs`, validation
# batch size `val_bs`), then preview a batch.
dls = dblock.dataloaders(lm_dataset, bs=bs, val_bs=val_bs, num_workers=4)
dls.show_batch()

Unnamed: 0,text
0,"routine for an metaph audience. Then ten or twelve comics are selected to live<mask><mask> house<mask> and do ""Surviv<mask>"" style competitions using comedic tactics.<mask> one will<mask> determined as ""Last Comic Standing."" I dowid<mask> up comedy, so boldly is the one<mask> show must<mask> to my<mask>.<mask> are usually some<mask> funny comics<mask> through. It<mask><mask><mask> of such talents as<mask><mask>zo<mask>odden, Ralphie Silk,<mask> Josh Blue.<br /><br />My negative criticisms is the<mask> that there is the possibility that a lot of these comics were selected for their contribution to reality show drama."
1,"have gotten<mask> attention while it was being aired, it was definitely an original and very special<mask> that should have<mask> appreciated much more than it was.</s><s>This<mask><mask> as<mask> once was and comparing this with the two remakes, THE MONEY PIT and ARE WE DONE YET?,<mask> points out all the more how the 40's movie makers had a<mask> for comedy which has since<mask> regretfully,<mask> lost.<br /><br />I was 15 when I first saw<mask> and even at that tender age, there was much I could laugh at.<mask> of<mask> being familiar<mask> adult frustrations, I see<mask> whole"
2,"Sep. 11<mask>, I thought to myself ""It's OK, the policemen and firemen<mask><mask> the people out that<mask>"". To<mask> honest, I<mask> it was an<mask><mask><mask> was in<mask><mask> year of high orally and getting changed from gym and getting ready<mask> go to my<mask> class. Someone came into<mask> locker room shouting ""Some building just got bombed in New York!"", we all got dressed quickly<mask> ran to our classrooms as we watched the first tower burning on<mask>. Not only<mask> seconds later live on TV does the<mask> plane<mask> into the other World Trade Center and we<mask> this was<mask> accident.<mask> few"
3,"seems<mask> work someway, but is deeply flawed and influenced by events. The<mask> character played by the director is a playwright whose mid-life personal and creative crisis is amplified by<mask> pressure of the events and<mask> the fact that he<mask> lucky enough to leave<mask> terror attack<mask><mask><mask> the bomb explodes. He hires a private detective to follow his girlfriend who is a TV investigative reporter whom he suspects is falling in for the subject<mask> her next show - another failed man, former military, whose business and family life<mask>les under the events. He starts to write a play that carbon-copies the reality and will bring it to"
4,"aha, well, yes<mask> i don't think a movie with prematurely budget like this could afford ""good"" actors or effects so they worked with what they had. the guts and entrails were actually very convincing<mask><mask> movie<mask> a little<mask>ppy going from sequence to sequence but overall, this is one of the better movies i have seen lately that<mask>'t follow any<mask> or predictability<mask> very goodished a laugh<mask></s><s>Well this<mask> was probobly one of the funniest scary movie i have ever seen. The effects are so bad you just have to laugh<mask><mask><mask> acting, well lets say its no mel"
5,"<mask> the<mask><mask> framed in decorated moving triangles or circles. Trans<mask> are filled with<mask>, and Celtic knots.<mask> the trees to the floors, many things in this world are<mask> in shapes or<mask>.<br /�br />Clocking in<mask> 70Na minus credits, The Secret of Kells is a fun little history lesson with a little<mask> and silliness thrown in to keep<mask> (maybe just<mask>) exped. I<mask> one has to generally be open-<mask><mask> to The Secret of Kells as half art piece, half movie about history.<mask> looking<mask> it was animated with Adobe illustrator, It's a very"
6,"<mask> wounded<mask> manages to set fire to a gas<mask>, providing a perfect target for his fellow bombard<mask>. Stylistically, Bomb<mask>ier is<mask> of the most schizophrenic of war films, with moments of subtle poignancy (the death<mask> trainee Eddie Albert) alternating with scenes of ludicrous ""Yellow Peril"" melodrama (the Japanese<mask> hiss through their teeth as<mask> torture the helpless<mask>). Though it can't help but seem<mask> today,<mask>ard constituents remains an<mask> propaganda effort (<mask> film<mask> sometimes erroneously<mask> as the debut of Robert Ryan, who'd actually been appearing<mask> the cameras since 1940"
7,",<mask> so doing, reveals a lot about the reporter's character.) <br<mask><mask><mask> />Firefall<mask> this episode appears to have a bad reputation among fans, but I enjoyed it because it's got a great<mask> herring and a really creepy, almost unstoppable-seeming monster.<br<mask><mask>br<mask>Though I've singled<mask> these three<mask> for praise, I'd say that most of the stories are entertaining at the very least. For my money,<mask> are only<mask> complete turkeys in the 20-<mask> run: Primal Scream, which is about monkey-men running rampant in Chicago, and<mask> Sentry, which"
8,"be resisted, that<mask> just who we are.<br /><br />There is a consistent<mask>ness from start to finish century<mask> photography<mask> sharp composition,<mask> pleasant<mask> when<mask><mask> provocative content, well suited music and laugh out loud scripting.<br /><br /><mask> out for the very young ""lone wise voice""... brilliant<mask> wisdom<mask> innocence balancing comedy from the human condition.</s><s>This is one ofRepresent unfortunate films that<mask> an even more sad, unfortunate<mask> at the box office. I saw this<mask> at a local art cinema,in revival form,shortly after it tanked in<mask> cinemas. It"


In [None]:
# Grab one batch and look at its input ids next to the corresponding labels.
b = dls.one_batch()
b[0]['input_ids'], b[0]['labels']

({'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1]]), 'input_ids': tensor([[   95,    10,   828,  ...,  1183,   143,  3238],
         [    8,   841,   116,  ...,  1589, 49007,  3809],
         [   80,   664, 50264,  ...,     9,   256,  2459],
         ...,
         [    7,   120,    69,  ...,    31, 50264, 27942],
         [   21, 50264, 50264,  ...,   843, 50264, 17768],
         [ 3121,  4558,    53,  ...,   747,  6269,     6]]), 'labels': tensor([[ -100,  -100,  -100,  ...,  -100,  -100,  -100],
         [ -100,  -100,  -100,  ...,  -100,  -100,  -100],
         [ -100,  -100, 13148,  ...,  -100,  -100,  -100],
         ...,
         [ -100,  -100,  -100,  ...,  -100,     5,  -100],
         [ -100,    14,    24,  ...,  -100,    29,     8],
         [ -100,  -100,  -100,  ...,  -100,  -100,  -10

The labels are constructed by `DataCollatorForLanguageModeling` and the loss computed by the model is used for training.

In [None]:
# Load the pretrained masked-LM model for the checkpoint named in `model_name`.
model = AutoModelForMaskedLM.from_pretrained(model_name)
# loss_func=noop: no separate fastai loss function is applied, because the
# loss computed by the model itself is used for training.
learn = TransLearner(dls, model, loss_func=noop, metrics=perplexity)

As masking is done randomly on the fly, the validation score may vary between runs.

In [None]:
learn.validate()

(#2) [2.126232862472534,8.38322639465332]

In [None]:
# Train for 2 epochs with the learning rate defined in the Setup cell
# (rather than repeating the literal value here).
learn.fit_flat_cos(2, lr)

epoch,train_loss,valid_loss,perplexity,time
0,2.38168,2.202039,9.043432,03:40
1,2.275901,2.131905,8.430911,03:41
