## Models (BertModel)

In [3]:
import transformers

# Display the installed transformers version; the recorded outputs in this
# notebook were produced with the version shown below.
transformers.__version__

'4.36.2'

In [4]:
from transformers import BertConfig, BertModel

In [11]:
# Start from the default BERT configuration (hyperparameters only, no weights).
config = BertConfig()

# Build the model from that configuration. The original cell called
# BertConfig(config) here, which produced a second config object, not a model.
model = BertModel(config)

In [12]:
# Show the hyperparameters held by the default configuration
# (hidden size, number of layers and attention heads, vocabulary size, ...).
print(config)

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.36.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [14]:
from transformers import BertConfig, BertModel
# Default configuration -> model built directly from it; no checkpoint is
# loaded in this cell.
config = BertConfig()
model = BertModel(config)

In [15]:
from transformers import BertModel

# Load the "bert-base-cased" checkpoint instead of building the model from a
# bare config; the progress bars below show config.json and model.safetensors
# being fetched.
model = BertModel.from_pretrained("bert-base-cased")

config.json: 100%|██████████| 570/570 [00:00<00:00, 190kB/s]
model.safetensors: 100%|██████████| 436M/436M [00:15<00:00, 28.7MB/s] 


In [16]:
# Write the model to a local directory; the directory listing further down
# shows the two files this creates (config.json and model.safetensors).
model.save_pretrained("directory_on_my_computer")

In [22]:
ls directory_on_my_computer

 Volume in drive C has no label.
 Volume Serial Number is D4EC-AFE4

 Directory of c:\Users\antho\code\NLP\directory_on_my_computer

01/13/2024  10:50 PM    <DIR>          .
01/13/2024  10:50 PM    <DIR>          ..
01/13/2024  10:50 PM               682 config.json
01/13/2024  10:50 PM       433,263,448 model.safetensors
               2 File(s)    433,264,130 bytes
               2 Dir(s)  14,606,368,768 bytes free


## Using Transformer models for inference

In [25]:
# Example inputs for the inference walkthrough.
sequences = ["Hello!", "Cool", "Nice!"]

In [26]:
# Hard-coded token IDs, one row per sequence. Every row starts with 101,
# ends with 102 and has the same length, so the rows form a rectangular
# list that can be turned into a tensor directly.
# NOTE(review): these IDs do not appear to come from the "bert-base-cased"
# vocabulary used for the model above (they look like uncased-vocabulary IDs)
# - confirm before interpreting the model output.
encoded_sequences = [
    [101, 7592, 999, 102],
    [101, 4658, 1012, 102],
    [101, 3835, 999, 102],
]

In [28]:
import torch 

# Convert the nested list of IDs into a tensor; this is the object passed to
# the model in the next cell.
model_input = torch.tensor(encoded_sequences)

print(model_input)

tensor([[ 101, 7592,  999,  102],
        [ 101, 4658, 1012,  102],
        [ 101, 3835,  999,  102]])


In [None]:
# Call the model on the tensor of token IDs and print whatever it returns.
output = model(model_input)

print(output)

## Tokenization

In [1]:
# Simplest word-level tokenization: break the sentence apart on whitespace.
tokenized_text = str.split("Jim Henson was a puppeteeer")
print(tokenized_text)

['Jim', 'Henson', 'was', 'a', 'puppeteeer']


Loading and saving tokenizers

In [3]:
# Loading the BERT tokenizer trained with the same checkpoint as BERT is done
# the same way as loading the model, except that we use the BertTokenizer class:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [4]:
# Similar to AutoModel, the AutoTokenizer class picks the proper tokenizer
# class in the library based on the checkpoint name, and can be used directly
# with any checkpoint:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [7]:
# Calling the tokenizer on raw text returns a dict with input_ids,
# token_type_ids and attention_mask (see the output below).
tokenizer("Using Transformer network is simple \n")

{'input_ids': [101, 7993, 13809, 23763, 2443, 1110, 3014, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [8]:
# Save the tokenizer into the same directory the model was saved to earlier;
# the returned tuple lists the paths of the tokenizer files.
tokenizer.save_pretrained("directory_on_my_computer")

('directory_on_my_computer\\tokenizer_config.json',
 'directory_on_my_computer\\special_tokens_map.json',
 'directory_on_my_computer\\vocab.txt',
 'directory_on_my_computer\\added_tokens.json',
 'directory_on_my_computer\\tokenizer.json')

### Encoding 


Translating text to numbers is known as encoding. Encoding is a two-step process: tokenization, followed by conversion of the tokens to input IDs.

In [10]:
# Step 1 of encoding - transforming text to tokens.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Note: despite the plural name, `sequences` holds a single string here.
sequences = "Using a Transfomer network is simple"
# tokenize() only splits the text; the output below shows that one word is
# broken into several pieces ('Trans', '##fo', '##mer').
tokens = tokenizer.tokenize(sequences)

print(tokens)

['Using', 'a', 'Trans', '##fo', '##mer', 'network', 'is', 'simple']


In [12]:
# Step 2 of encoding - transforming tokens into IDs: each token from the
# previous cell is mapped to an integer, one ID per token.
ids = tokenizer.convert_tokens_to_ids(tokens)

print(ids)

[7993, 170, 13809, 14467, 4027, 2443, 1110, 3014]


In [14]:
# Decoding IDs back to text. The literal list is the same sequence of IDs
# printed by the previous cell; decode() turns it back into the original
# sentence, joining the word pieces again.
decoding_string = tokenizer.decode([7993, 170, 13809, 14467, 4027, 2443, 1110, 3014])

print(decoding_string)

Using a Transfomer network is simple


 ## Handling multiple sequences


In [24]:
## Batching inputs

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequences = "I've be waiting for HuggingFace course my whole life."

# Encode by hand: text -> tokens -> vocabulary IDs.
tokens = tokenizer.tokenize(sequences)
# Convert the *tokens*, not the raw string. The original cell passed
# `sequences` here, which looks the whole sentence up as one single token and
# yields a single ID instead of one ID per token.
ids = tokenizer.convert_tokens_to_ids(tokens)
# 1-D tensor: one sequence of IDs with no batch dimension around it.
input_ids = torch.tensor(ids)

# This line will fail: the model is given a single un-batched sequence.
model(input_ids)


The problem is that we sent a single sequence to the model, whereas 🤗 Transformers models expect multiple sentences by default. 

In [34]:
# Calling the tokenizer with return_tensors="pt" does not just convert the
# list of input IDs into a tensor: it adds a dimension on top of it (note the
# double brackets in the output below).
tokenized_input = tokenizer(sequences, return_tensors="pt")

print(tokenized_input)

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2022,  3403,  2005, 17662, 12172,  2607,
          2026,  2878,  2166,  1012,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [33]:
# Same call as above, but print only the input IDs: a 2-D tensor holding one
# row, i.e. a batch containing a single sequence.
tokenized_input = tokenizer(sequences, return_tensors="pt")

print(tokenized_input["input_ids"])

tensor([[  101,  1045,  1005,  2310,  2022,  3403,  2005, 17662, 12172,  2607,
          2026,  2878,  2166,  1012,   102]])


In [36]:
# Second attempt: same manual encoding as before, but this time the IDs get a
# leading batch dimension before they are handed to the model.

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequences = "I've be waiting for HuggingFace course my whole life."

# text -> tokens -> IDs (done by hand, so the 101/102 IDs that the tokenizer
# call added in the earlier cells are not present here).
tokens = tokenizer.tokenize(sequences)
ids = tokenizer.convert_tokens_to_ids(tokens)

# Shape (len(ids),) -> (1, len(ids)): a batch holding exactly one sequence.
input_ids = torch.tensor(ids).unsqueeze(0)
print("input_IDs: ", input_ids)

output = model(input_ids)
print("Logits: ", output.logits)

input_IDs:  tensor([[ 1045,  1005,  2310,  2022,  3403,  2005, 17662, 12172,  2607,  2026,
          2878,  2166,  1012]])
Logits:  tensor([[-2.3035,  2.4422]], grad_fn=<AddmmBackward0>)


In [39]:
# Batching is the act of sending multiple sentences through the model, all at
# once. If you only have one sentence, you can just build a batch with a
# single sequence; here the same sequence is repeated to form a batch of two.
batch_ids = [ids, ids]
# Print the batch that was just built (the original cell printed `ids`,
# leaving `batch_ids` unused and never shown).
print(batch_ids)

[[1045, 1005, 2310, 2022, 3403, 2005, 17662, 12172, 2607, 2026, 2878, 2166, 1012], [1045, 1005, 2310, 2022, 3403, 2005, 17662, 12172, 2607, 2026, 2878, 2166, 1012]]
