In [1]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelWithLMHead

# Load model using Pipeline

Pipeline consists of a tokenizer and a model and postprocessing for output

In [2]:
# by default DistillBERT is loaded and cashed
# simply pass a str with task name
classifier = pipeline('sentiment-analysis')

Call of a pipeline function returns task-specific pipelines like TextClassificationPipeline, SummarizationPipeline etc.

In [3]:
type(classifier)

transformers.pipelines.TextClassificationPipeline

In [4]:
# check the device (cpu of gpu)
classifier.device

device(type='cpu')

In [5]:
# check whether pytorch or tenserflow is used
classifier.framework

'pt'

In [6]:
# we can pass text directly
classifier('I like this song!')

[{'label': 'POSITIVE', 'score': 0.9998732805252075}]

In [7]:
# or as a batch
classifier(['Good', 'Bad'])

[{'label': 'POSITIVE', 'score': 0.9998160600662231},
 {'label': 'NEGATIVE', 'score': 0.999782383441925}]

# Load without pipeline

In [8]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelWithLMHead.from_pretrained("distilbert-base-uncased")



In [19]:
type(model)

transformers.modeling_distilbert.DistilBertForMaskedLM

In [52]:
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
outputs = model(input_ids)
last_hidden_states = outputs[0]

In [53]:
last_hidden_states.shape

torch.Size([1, 8, 30522])

In [9]:
tokenizer.encode('Hello')

[101, 7592, 102]

In [10]:
tokenizer.encode('Hello')

[101, 7592, 102]

In [11]:
tokenizer.all_special_tokens

['[SEP]', '[CLS]', '[PAD]', '[UNK]', '[MASK]']

In [12]:
# This model doesn't support sentiment analysis, but it can perform another task:
pipe = pipeline('fill-mask', model=model, tokenizer=tokenizer)

In [13]:
pipe.predict('hi [MASK]')

	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  ../torch/csrc/utils/python_arg_parser.cpp:766.)
  masked_index = (input_ids == self.tokenizer.mask_token_id).nonzero()


[{'sequence': '[CLS] hi! [SEP]',
  'score': 0.450893372297287,
  'token': 999,
  'token_str': '!'},
 {'sequence': '[CLS] hi. [SEP]',
  'score': 0.09369070827960968,
  'token': 1012,
  'token_str': '.'},
 {'sequence': '[CLS] hikari [SEP]',
  'score': 0.07224056124687195,
  'token': 20224,
  'token_str': '##kari'},
 {'sequence': '[CLS] hirata [SEP]',
  'score': 0.02200491353869438,
  'token': 14660,
  'token_str': '##rata'},
 {'sequence': '[CLS] hi ; [SEP]',
  'score': 0.02121773175895214,
  'token': 1025,
  'token_str': ';'}]

# Load rubert

In [14]:
rubert_tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
rubert_model = AutoModelWithLMHead.from_pretrained("DeepPavlov/rubert-base-cased")

Some weights of BertForMaskedLM were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
pipe = pipeline('fill-mask', model=rubert_model, tokenizer=rubert_tokenizer)

In [16]:
pipe('Привет [MASK]')

[{'sequence': '[CLS] привет затем [SEP]',
  'score': 0.00020898805814795196,
  'token': 6575,
  'token_str': 'затем'},
 {'sequence': '[CLS] привет кра [SEP]',
  'score': 0.0002084413863485679,
  'token': 6394,
  'token_str': 'кра'},
 {'sequence': '[CLS] привет мед [SEP]',
  'score': 0.0001815023715607822,
  'token': 5212,
  'token_str': 'мед'},
 {'sequence': '[CLS] приветвз [SEP]',
  'score': 0.0001307556958636269,
  'token': 58964,
  'token_str': '##вз'},
 {'sequence': '[CLS] приветерез [SEP]',
  'score': 0.0001249431079486385,
  'token': 91381,
  'token_str': '##ерез'}]

In [80]:
inputs = rubert_tokenizer('привет, как дела?', 'Нормально')
output = rubert_model(inputs)

AttributeError: 

In [30]:
import torch

In [77]:
input_ids = torch.tensor(rubert_tokenizer.encode("Привет как дела")).unsqueeze(0)  # Batch size 1
a, b = rubert_model(input_ids)

ValueError: not enough values to unpack (expected 2, got 1)

In [76]:
outputs[0][0][0].shape

torch.Size([119547])

In [68]:
outputs[0].shape

torch.Size([1, 5, 119547])