In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.14.1-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 26.3 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 444 kB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 66.6 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 60.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 69.2 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Atte

In [3]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
tokenizer

PreTrainedTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [4]:
raw_text = "Today is going to be a hectic day!"

tokens = tokenizer.tokenize(raw_text)
tokens

['Today', 'is', 'going', 'to', 'be', 'a', 'he', '##ctic', 'day', '!']

In [25]:
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

[3570, 1110, 1280, 1106, 1129, 170, 1119, 11143, 1285, 106]

In [26]:
tokenizer.decode(ids)

'Today is going to be a hectic day!'

In [27]:
input_ids = tokenizer.prepare_for_model(ids)
input_ids, type(input_ids)

({'input_ids': [101, 3570, 1110, 1280, 1106, 1129, 170, 1119, 11143, 1285, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
 transformers.tokenization_utils_base.BatchEncoding)

In [10]:
tokenizer.decode(input_ids['input_ids'])

'[CLS] Today is going to be a hectic day! [SEP]'

In [35]:
ids

[3570, 1110, 1280, 1106, 1129, 170, 1119, 11143, 1285, 106]

In [36]:
tokenizer.decode(ids)

'Today is going to be a hectic day!'

In [38]:
from transformers import AutoModelForSequenceClassification
import torch

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased")
logits = model(torch.tensor([ids])).logits
predictions = torch.nn.functional.softmax(logits, dim = -1)
predictions

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

tensor([[0.8181, 0.1819]], grad_fn=<SoftmaxBackward0>)

In [42]:
model.config.label2id

{'LABEL_0': 0, 'LABEL_1': 1}

In [39]:
logits

tensor([[ 0.3747, -1.1286]], grad_fn=<AddmmBackward0>)

In [43]:
model.config

BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.14.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

In [49]:
from transformers.utils.dummy_pt_objects import AutoModelForTokenClassification
## doing all at once 

import torch

from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
raw_text = "I've been waiting for this course for my whole life"

tokens = tokenizer.tokenize(raw_text)
ids = tokenizer.convert_tokens_to_ids(tokens)
import torch
logits = model(torch.tensor([ids]))
prediction = torch.nn.functional.softmax(logits.logits, dim = -1)
prediction


tensor([[0.0022, 0.9978]], grad_fn=<SoftmaxBackward0>)

In [50]:
# trying with batched ids 

torch.nn.functional.softmax(model(torch.tensor([ids,ids])).logits, dim = -1)

tensor([[0.0022, 0.9978],
        [0.0022, 0.9978]], grad_fn=<SoftmaxBackward0>)

## `Padding` and `Attention Mask`

In [51]:

tokenizer.pad_token_id

0

In [59]:
batched_id = [
              [200, 300,400],
              [400, 100,tokenizer.pad_token_id]
]
print(f"First model  : {model(torch.tensor([[200,300,400]])).logits}")
print(f"Second model  : {model(torch.tensor([[400,100]])).logits}")
print(f"Bacched model  : {model(torch.tensor(batched_id)).logits}")

First model  : tensor([[ 1.0187, -0.9559]], grad_fn=<AddmmBackward0>)
Second model  : tensor([[0.0874, 0.0905]], grad_fn=<AddmmBackward0>)
Bacched model  : tensor([[ 1.0187, -0.9559],
        [ 0.6479, -0.5964]], grad_fn=<AddmmBackward0>)


In [52]:
batched_id = [
              [200, 300,400],
              [400, 100,tokenizer.pad_token_id]
]

attention_mask = [
                  [1,1,1],
                  [1,1,0]
]

output = model(torch.tensor(batched_id), attention_mask = torch.tensor(attention_mask))
output.logits

tensor([[ 1.0187, -0.9559],
        [ 0.0874,  0.0905]], grad_fn=<AddmmBackward0>)

### Try out section 

In [81]:
raw_text = [
            "I've been waiting for a HuggingFace course my whle life.",
            "I hate this so much!"
]
token_1  = tokenizer.tokenize(raw_text[0])
token_2 = tokenizer.tokenize(raw_text[1])
id_1= tokenizer.convert_tokens_to_ids(token_1)
id_2 = tokenizer.convert_tokens_to_ids(token_2)
print(id_1)
print(id_2)
i = len(id_2)
j = len(id_1)

while i <j:
  id_2.append(tokenizer.pad_token_id)
  i= i+1

batch_ids = [id_1, id_2]

attention_mask = [
                  [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1],
                  [1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0]
]

print(len(attention_mask[1]), len(attention_mask[0]))

print(attention_mask[1].count(1))

torch.nn.functional.softmax(model(torch.tensor(batch_ids), attention_mask = torch.tensor(attention_mask)).logits,dim = -1)


[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 1059, 7317, 2063, 2166, 1012]
[1045, 5223, 2023, 2061, 2172, 999]
16 16
8


tensor([[0.7464, 0.2536],
        [0.9980, 0.0020]], grad_fn=<SoftmaxBackward0>)

In [77]:
from transformers import pipeline

classifier = pipeline("sentiment-analysis", model = checkpoint)
classifier(raw_text)

[{'label': 'NEGATIVE', 'score': 0.8816303610801697},
 {'label': 'NEGATIVE', 'score': 0.9994558691978455}]