<a href="https://colab.research.google.com/github/blurred421/LFD473-code/blob/main/notebooks/Chapter8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chapter 8: Pretrained Models for Natural Language Processing

In [1]:
!pip install datasets transformers

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xx

## 8.2 Learning Objectives

By the end of this chapter, you should be able to:
- understand the role of tokenization in preprocessing sentences as inputs
- load pretrained models and pipelines for NLP using HuggingFace
- understand the general idea behind generative models for NLP

## 8.3 Natural Language Processing

### 8.3.1 Model

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch0/model_step1.png)

In [2]:
import torch
from transformers import RobertaConfig, RobertaModel

configuration = RobertaConfig()
configuration

RobertaConfig {
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.48.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 50265
}

In [3]:
model = RobertaModel(configuration)
model

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(512, 768, padding_idx=1)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dr

In [4]:
repo_id = "FacebookAI/roberta-base"
model = RobertaModel.from_pretrained(repo_id)
model

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dr

### 8.3.2 Tokenizers

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch0/data_step3.png)

In [5]:
from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained(repo_id)
tokenizer

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

RobertaTokenizer(name_or_path='FacebookAI/roberta-base', vocab_size=50265, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}
)

#### 8.3.2.1 Tokenizer

In [6]:
input_batch = ["I am really liking this course!", "This course is too complicated!"]

tokenized = tokenizer.tokenize(input_batch[0])
tokenized

['I', 'Ġam', 'Ġreally', 'Ġliking', 'Ġthis', 'Ġcourse', '!']

In [7]:
tokenizer.convert_tokens_to_ids('I'), tokenizer.decode(tokenizer.convert_tokens_to_ids('I'))

(100, 'I')

In [8]:
tokenizer.convert_tokens_to_ids(tokenized), tokenizer.decode(tokenizer.convert_tokens_to_ids(tokenized))

([100, 524, 269, 25896, 42, 768, 328], 'I am really liking this course!')

#### 8.3.2.2 Vocabulary

In [9]:
tokens_to_idx = tokenizer.get_vocab()
tokens_to_idx

{'<s>': 0,
 '<pad>': 1,
 '</s>': 2,
 '<unk>': 3,
 '.': 4,
 'Ġthe': 5,
 ',': 6,
 'Ġto': 7,
 'Ġand': 8,
 'Ġof': 9,
 'Ġa': 10,
 'Ġin': 11,
 '-': 12,
 'Ġfor': 13,
 'Ġthat': 14,
 'Ġon': 15,
 'Ġis': 16,
 'âĢ': 17,
 "'s": 18,
 'Ġwith': 19,
 'ĠThe': 20,
 'Ġwas': 21,
 'Ġ"': 22,
 'Ġat': 23,
 'Ġit': 24,
 'Ġas': 25,
 'Ġsaid': 26,
 'Ļ': 27,
 'Ġbe': 28,
 's': 29,
 'Ġby': 30,
 'Ġfrom': 31,
 'Ġare': 32,
 'Ġhave': 33,
 'Ġhas': 34,
 ':': 35,
 'Ġ(': 36,
 'Ġhe': 37,
 'ĠI': 38,
 'Ġhis': 39,
 'Ġwill': 40,
 'Ġan': 41,
 'Ġthis': 42,
 ')': 43,
 'ĠâĢ': 44,
 'Ġnot': 45,
 'Ŀ': 46,
 'Ġyou': 47,
 'ľ': 48,
 'Ġtheir': 49,
 'Ġor': 50,
 'Ġthey': 51,
 'Ġwe': 52,
 'Ġbut': 53,
 'Ġwho': 54,
 'Ġmore': 55,
 'Ġhad': 56,
 'Ġbeen': 57,
 'Ġwere': 58,
 'Ġabout': 59,
 ',"': 60,
 'Ġwhich': 61,
 'Ġup': 62,
 'Ġits': 63,
 'Ġcan': 64,
 'Ġone': 65,
 'Ġout': 66,
 'Ġalso': 67,
 'Ġ$': 68,
 'Ġher': 69,
 'Ġall': 70,
 'Ġafter': 71,
 '."': 72,
 '/': 73,
 'Ġwould': 74,
 "'t": 75,
 'Ġyear': 76,
 'Ġwhen': 77,
 'Ġfirst': 78,
 'Ġshe': 79,
 'Ġtwo': 

In [10]:
tokens_to_idx['ed'], tokens_to_idx['ing'], tokens_to_idx['Ġonly'], tokens_to_idx['only']

(196, 154, 129, 8338)

In [11]:
tokenizer.tokenize('I am dissecting this, am I?')

['I', 'Ġam', 'Ġdissect', 'ing', 'Ġthis', ',', 'Ġam', 'ĠI', '?']

In [12]:
tokenizer.tokenize('I am playing with the word play.')

['I', 'Ġam', 'Ġplaying', 'Ġwith', 'Ġthe', 'Ġword', 'Ġplay', '.']

#### 8.3.2.3 Max Length

In [13]:
tokenizer.max_len_single_sentence, tokenizer.model_max_length

(510, 512)

In [14]:
truncated_token_ids = tokenizer(input_batch[0], truncation=True, max_length=5)['input_ids']
truncated_token_ids

[0, 100, 524, 269, 2]

In [15]:
tokenizer.decode(truncated_token_ids)

'<s>I am really</s>'

#### 8.3.2.4 Special Tokens

In [16]:
tokenizer.special_tokens_map

{'bos_token': '<s>',
 'eos_token': '</s>',
 'unk_token': '<unk>',
 'sep_token': '</s>',
 'pad_token': '<pad>',
 'cls_token': '<s>',
 'mask_token': '<mask>'}

In [17]:
token_ids = tokenizer.encode(input_batch[0], add_special_tokens=True)
token_ids

[0, 100, 524, 269, 25896, 42, 768, 328, 2]

In [18]:
tokenizer.decode(token_ids)

'<s>I am really liking this course!</s>'

In [19]:
input_batch = ["I am really liking this course!", "This course is too complicated!"]
transformed = tokenizer(input_batch)['input_ids']
transformed

[[0, 100, 524, 269, 25896, 42, 768, 328, 2],
 [0, 713, 768, 16, 350, 6336, 328, 2]]

### 8.3.3 Inference

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch0/model_step5.png)

In [20]:
torch.as_tensor(transformed)

ValueError: expected sequence of length 9 at dim 1 (got 8)

In [21]:
tokenizer.pad_token_id

1

In [None]:
input_batch = ["I am really liking this course!", "This course is too complicated!"]
model_input = tokenizer(input_batch, padding=True, return_tensors='pt')['input_ids']
model_input, model_input.shape

(tensor([[    0,   100,   524,   269, 25896,    42,   768,   328,     2],
         [    0,   713,   768,    16,   350,  6336,   328,     2,     1]]),
 torch.Size([2, 9]))

In [None]:
model.eval()
output = model(model_input)
output.last_hidden_state.shape

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


torch.Size([2, 9, 768])

### 8.3.4 Attaching a Head

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch0/model_step1.png)

In [None]:
from transformers import RobertaForSequenceClassification

torch.manual_seed(11)
model_with_head = RobertaForSequenceClassification.from_pretrained(repo_id, num_labels=2)
model_with_head

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [None]:
classifier_head = model_with_head.classifier
classifier_head

RobertaClassificationHead(
  (dense): Linear(in_features=768, out_features=768, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (out_proj): Linear(in_features=768, out_features=2, bias=True)
)

In [None]:
model_with_head.eval()
output = model_with_head(model_input)
output, output.logits.shape

(SequenceClassifierOutput(loss=None, logits=tensor([[-0.1540,  0.0212],
         [-0.1685,  0.0220]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None),
 torch.Size([2, 2]))

### 8.3.5 Logits and Loss Functions

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch0/model_step2.png)

#### 8.3.5.1 One Logit or Two Logits?

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch6/one_logit.png)

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch6/two_logits.png)


#### 8.3.5.2 Cross-Entropy Loss

So, in a nutshell, if your classifier head produces more than one logit (and it is OK to use two for a binary classification), you must use the cross-entropy loss.

The table below may help you organize a little bit the ideas presented in this section (right now, we're going to use the last column only):

|                         | BCE Loss               | BCE With Logits Loss     | NLL Loss                    | Cross-Entropy Loss   
| --- | --- | --- | --- | --- |
|     Classification      | binary                | binary                | multiclass / binary                | multiclass / binary
| Model output (each data point) | probability           | logit                 | array of two or more log probabilities | array of two or more logits    
| Label (each data point) | float (0.0 or 1.0)    | float (0.0 or 1.0)    | long (class index)         | long (class index)
|   Model's last layer    | Sigmoid               | Linear                | LogSoftmax                 | Linear              

#### 8.3.5.3 Losses in Hugging Face Models

In [None]:
input_batch = ["I am really liking this course!", "This course is too complicated!"]
model_input = tokenizer(input_batch, padding=True, return_tensors='pt')['input_ids']
labels = torch.as_tensor([1, 0])

In [None]:
model_with_head.train()
output = model_with_head(model_input, labels=labels)
output

SequenceClassifierOutput(loss=tensor(0.6711, grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0871,  0.1107],
        [ 0.0525, -0.0134]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [None]:
import torch.nn as nn

loss_fn = nn.CrossEntropyLoss()
loss = loss_fn(output.logits, labels)
loss

tensor(0.6711, grad_fn=<NllLossBackward0>)

## 8.4 TensorBoard

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch0/model_step4.png)

First, we need to load TensorBoard’s extension for Jupyter. It is possible to run some special commands inside Jupyter Notebooks using a `%` characters at the start of a line, they are built-in [magic commands](https://ipython.readthedocs.io/en/stable/interactive/magics.html). A magic is a kind of shortcut that extends a notebook's capabilities. Once it is loaded, we can run TensorBoard using the newly available magic:

In [None]:
%load_ext tensorboard
%tensorboard --logdir runs

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch6/empty_tensorboard.png)

In [None]:
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter('runs/test')

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch6/tensorboard_losses.png)

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch6/smooth_slider.png)

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch6/tensorboard_losses_smooth.png)

## 8.6 HuggingFace Pipelines

![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch6/hf_nlp_tasks.png)

In [None]:
from transformers.pipelines import SUPPORTED_TASKS
SUPPORTED_TASKS['text-classification']['default']

{'model': {'pt': ('distilbert/distilbert-base-uncased-finetuned-sst-2-english',
   'af0f99b'),
  'tf': ('distilbert/distilbert-base-uncased-finetuned-sst-2-english',
   'af0f99b')}}

In [None]:
from transformers import pipeline

model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
classifier = pipeline('text-classification', model=model_name)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


![](https://raw.githubusercontent.com/dvgodoy/assets/main/PyTorchInPractice/images/ch0/model_step5.png)

In [None]:
input_batch = ["I am really liking this course!", "This course is too complicated!"]

classifier(input_batch)

[{'label': 'POSITIVE', 'score': 0.9997199177742004},
 {'label': 'NEGATIVE', 'score': 0.9996912479400635}]

### 8.6.1 Transforms / Tokenizer

In [None]:
classifier.tokenizer

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased-finetuned-sst-2-english', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [None]:
tokenized_dict = classifier.tokenizer(input_batch)
tokenized_dict

{'input_ids': [[101, 1045, 2572, 2428, 16663, 2023, 2607, 999, 102], [101, 2023, 2607, 2003, 2205, 8552, 999, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]]}

In [None]:
classifier.tokenizer.decode(tokenized_dict['input_ids'][0])

'[CLS] i am really liking this course! [SEP]'

In [None]:
from transformers import AutoTokenizer

hf_tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

tokenized_output = hf_tokenizer(input_batch, add_special_tokens=True, padding=True, return_tensors='pt')
tokenized_output

{'input_ids': tensor([[  101,  1045,  2572,  2428, 16663,  2023,  2607,   999,   102],
        [  101,  2023,  2607,  2003,  2205,  8552,   999,   102,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0]])}

### 8.6.2 Model

In [None]:
classifier.model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [None]:
classifier.model.config

DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased-finetuned-sst-2-english",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "finetuning_task": "sst-2",
  "hidden_dim": 3072,
  "id2label": {
    "0": "NEGATIVE",
    "1": "POSITIVE"
  },
  "initializer_range": 0.02,
  "label2id": {
    "NEGATIVE": 0,
    "POSITIVE": 1
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.44.0",
  "vocab_size": 30522
}

In [None]:
from transformers import AutoModel
headless_model = AutoModel.from_pretrained('distilbert-base-uncased')

In [None]:
headless_model

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Li

In [None]:
import torch
headless_model.eval()

with torch.inference_mode():
    output = headless_model(tokenized_output['input_ids'])

output['last_hidden_state'].shape

torch.Size([2, 9, 768])

## 8.7 Generative Models

In [None]:
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

model = AutoModelForCausalLM.from_pretrained('gpt2')
tokenizer = AutoTokenizer.from_pretrained('gpt2')

In [None]:
sentence = "Hello, how are you"

In [None]:
tokenized = tokenizer(sentence, return_tensors="pt")
tokenized

{'input_ids': tensor([[15496,    11,   703,   389,   345]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}

In [None]:
outputs = model(**tokenized)
outputs['logits'].shape

torch.Size([1, 5, 50257])

In [None]:
probabilities = torch.nn.functional.softmax(outputs['logits'][0], dim=1)
values, indices = torch.topk(probabilities, 1)
values, indices

(tensor([[0.0960],
         [0.1005],
         [0.0908],
         [0.6630],
         [0.2651]], grad_fn=<TopkBackward0>),
 tensor([[  11],
         [ 314],
         [ 546],
         [ 345],
         [1804]]))

In [None]:
predictions = tokenizer.decode(indices[:, 0])
predictions

', I about you doing'

In [None]:
tokens = [tokenizer.decode(t) for t in tokenized['input_ids'][0]]
predicted_tokens = predictions.split(' ')

for i, p in enumerate(predicted_tokens):
    print(f"{i+1}. Tokens so far: {' '.join(tokens[:i+1])}\n   Predicted token to follow: {p}")

1. Tokens so far: Hello
   Predicted token to follow: ,
2. Tokens so far: Hello ,
   Predicted token to follow: I
3. Tokens so far: Hello ,  how
   Predicted token to follow: about
4. Tokens so far: Hello ,  how  are
   Predicted token to follow: you
5. Tokens so far: Hello ,  how  are  you
   Predicted token to follow: doing
