In [None]:
from transformers import BertConfig, BertModel

# Build a BERT model straight from a default config (no pretrained checkpoint
# is loaded in this cell) purely to report how many parameters it has.
bert_base = BertConfig()
model = BertModel(bert_base)

# Express the raw parameter count in millions before printing it.
n_params_millions = model.num_parameters() / 1_000_000
print(f"{n_params_millions} million parameters")

109.48224 million parameters


In [None]:
from transformers import AlbertConfig, AlbertModel

# Override three size-related fields of the ALBERT config; every other field
# keeps the library default.
albert_base = AlbertConfig(
    hidden_size=768,
    num_attention_heads=12,
    intermediate_size=3072,
)

# Instantiate the model from that config and report its size in millions.
model = AlbertModel(albert_base)
n_params_millions = model.num_parameters() / 1_000_000
print(f"{n_params_millions} million parameters")

11.683584 million parameters


In [None]:
from transformers import AlbertTokenizer

# Sentence pushed through the model below.
text = "The cat is so sad ."

# Tokenizer and weights both come from the 'albert-base-v2' checkpoint.
# AlbertModel itself is imported by an earlier cell of this notebook.
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertModel.from_pretrained('albert-base-v2')

# Encode the sentence ('pt' asks the tokenizer for PyTorch tensors) and run
# a single forward pass, unpacking the encoded fields as keyword arguments.
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=760289.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=684.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=47376696.0), HTML(value='')))




In [None]:
!pip install pandas

Collecting pandas
  Downloading pandas-1.1.5-cp36-cp36m-manylinux1_x86_64.whl (9.5 MB)
     |################################| 9.5 MB 24.8 MB/s eta 0:00:01
Collecting pytz>=2017.2
  Downloading pytz-2021.3-py2.py3-none-any.whl (503 kB)
     |################################| 503 kB 27.6 MB/s eta 0:00:01
Installing collected packages: pytz, pandas
Successfully installed pandas-1.1.5 pytz-2021.3
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.


In [None]:
import pandas as pd
from transformers import pipeline

# Fill-mask pipeline backed by the 'albert-base-v2' checkpoint.
fillmask = pipeline('fill-mask', model='albert-base-v2')

# Ask the pipeline to fill the [MASK] slot, then show its candidate
# completions as a table (last expression is the cell's displayed value).
albert_predictions = fillmask("The cat is so [MASK] .")
pd.DataFrame(albert_predictions)

Unnamed: 0,sequence,score,token,token_str
0,[CLS] the cat is so cute.[SEP],0.281025,10901,▁cute
1,[CLS] the cat is so adorable.[SEP],0.094893,26354,▁adorable
2,[CLS] the cat is so happy.[SEP],0.042963,1700,▁happy
3,[CLS] the cat is so funny.[SEP],0.040976,5066,▁funny
4,[CLS] the cat is so affectionate.[SEP],0.024233,28803,▁affectionate


In [None]:
# RoBERTa

from transformers import RobertaConfig, RobertaModel

# Model built from a default RobertaConfig (no pretrained checkpoint loaded here).
# NOTE(review): the count this cell prints is identical to the one printed for the
# default BertConfig earlier in the notebook, so the default RobertaConfig may not
# reflect the size of the 'roberta-base' checkpoint - confirm before quoting it.
conf = RobertaConfig()
model = RobertaModel(conf)

# Report the parameter count in millions.
n_params_millions = model.num_parameters() / 1_000_000
print(f"{n_params_millions} million parameters")

109.48224 million parameters


In [None]:
from transformers import RobertaTokenizer

# Sentence pushed through the model below.
text = "The cat is so sad ."

# Tokenizer and weights both come from the 'roberta-base' checkpoint.
# RobertaModel itself is imported by an earlier cell of this notebook.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')

# Encode the sentence ('pt' asks the tokenizer for PyTorch tensors) and run
# a single forward pass, unpacking the encoded fields as keyword arguments.
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)

In [None]:
from transformers import pipeline

# Name the tokenizer checkpoint explicitly instead of passing the notebook-global
# `tokenizer`: that name is rebound to different tokenizers in several cells of
# this notebook, so re-running this cell out of order could silently pair the
# RoBERTa model with another model's tokenizer.
fillmask = pipeline("fill-mask", model="roberta-base", tokenizer="roberta-base")

# `pd` is the pandas alias imported in an earlier cell. '<mask>' marks the slot
# the pipeline is asked to fill; the candidates are displayed as a table.
pd.DataFrame(fillmask("The cat is so <mask> ."))

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at roberta-base and are newly initialized: ['lm_head.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Unnamed: 0,sequence,score,token,token_str
0,<s>The cat is so cute.</s>,0.191843,11962,Ġcute
1,<s>The cat is so sweet.</s>,0.051524,4045,Ġsweet
2,<s>The cat is so funny.</s>,0.033595,6269,Ġfunny
3,<s>The cat is so handsome.</s>,0.032893,19222,Ġhandsome
4,<s>The cat is so beautiful.</s>,0.032314,2721,Ġbeautiful


In [None]:
# find the mask token

# Load the ALBERT tokenizer for 'albert-base-v2' and display the string it
# uses as its mask placeholder (the cell's last expression is what gets shown).
# Note: this rebinds the notebook-global name `tokenizer`.
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
tokenizer.mask_token

'[MASK]'

In [None]:
# Same lookup for RoBERTa: load the 'roberta-base' tokenizer and display the
# string it uses as its mask placeholder.
# Note: this rebinds the notebook-global name `tokenizer`.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
tokenizer.mask_token

'<mask>'

In [None]:
# Take the mask placeholder from the pipeline's own tokenizer instead of
# hard-coding it, then ask the current `fillmask` pipeline to complete the prompt.
# NOTE(review): which model answers depends on the last cell that assigned
# `fillmask`; the recorded output suggests the RoBERTa pipeline - confirm.
mask = fillmask.tokenizer.mask_token
fillmask(f"The cat is very {mask}.")

[{'sequence': '<s>The cat is very happy.</s>',
  'score': 0.0522228442132473,
  'token': 1372,
  'token_str': 'Ġhappy'},
 {'sequence': '<s>The cat is very shy.</s>',
  'score': 0.04954611882567406,
  'token': 9152,
  'token_str': 'Ġshy'},
 {'sequence': '<s>The cat is very friendly.</s>',
  'score': 0.0414116233587265,
  'token': 5192,
  'token_str': 'Ġfriendly'},
 {'sequence': '<s>The cat is very cute.</s>',
  'score': 0.0360177680850029,
  'token': 11962,
  'token_str': 'Ġcute'},
 {'sequence': '<s>The cat is very smart.</s>',
  'score': 0.031842395663261414,
  'token': 2793,
  'token_str': 'Ġsmart'}]

In [None]:
#ELECTRA

# Rebind `fillmask` to a fill-mask pipeline on the ELECTRA small generator
# checkpoint (`pipeline` is imported in an earlier cell).
fillmask = pipeline("fill-mask", model="google/electra-small-generator")

# Build the prompt around whatever mask placeholder this pipeline's tokenizer
# uses, and display the candidate completions.
mask = fillmask.tokenizer.mask_token
fillmask(f"The cat is very {mask} .")

[{'sequence': '[CLS] the cat is very friendly. [SEP]',
  'score': 0.09815675020217896,
  'token': 5379,
  'token_str': 'friendly'},
 {'sequence': '[CLS] the cat is very cute. [SEP]',
  'score': 0.08666384965181351,
  'token': 10140,
  'token_str': 'cute'},
 {'sequence': '[CLS] the cat is very sensitive. [SEP]',
  'score': 0.05456872284412384,
  'token': 7591,
  'token_str': 'sensitive'},
 {'sequence': '[CLS] the cat is very shy. [SEP]',
  'score': 0.03385474532842636,
  'token': 11004,
  'token_str': 'shy'},
 {'sequence': '[CLS] the cat is very smart. [SEP]',
  'score': 0.031776364892721176,
  'token': 6047,
  'token_str': 'smart'}]