In [1]:
'''
GPT-2
Let's first start with GPT-2:
________________________________
'''

# ________________________________
# This script loads the pretrained GPT-2 model, tokenizes an input prompt, and generates a continuation of the input text.

  from .autonotebook import tqdm as notebook_tqdm


GPT2Tokenizer(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True)}, clean_up_tokenization_spaces=True)

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Initialize the GPT2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer

In [2]:
model = GPT2LMHeadModel.from_pretrained("gpt2")
model

Downloading pytorch_model.bin: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 548M/548M [00:32<00:00, 17.1MB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading generation_config.json: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 124/124 [00:00<?, ?B/s]


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [9]:
# Define a text prompt
text = "Our main vald"
# Encode the input text to tensor
indexed_tokens = tokenizer.encode(text, return_tensors='pt')
indexed_tokens

tensor([[5122, 1388, 1188,   67]])

In [10]:
# Generate text until the output length (which includes the context length) reaches 50
output_text = model.generate(indexed_tokens, max_length=50, num_beams=5, no_repeat_ngram_size=2, early_stopping=True)
output_text

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[ 5122,  1388,  1188,    67,   364,   389,    25,   198,   198,    16,
            13,   383,   749,  1593,  1517,   284,  3505,   318,   326,   345,
           836,   470,   761,   284,   760,  1997,   546,   262,   983,   284,
           711,   340,    13,  1002,   345,   765,   284,  2193,   517,   546,
           340,    11,   345,   460,  2198,   503,   262, 22719,    13,   198]])

In [11]:
# Decode the output text
output_text_decoded = tokenizer.decode(output_text[0], skip_special_tokens=True)
print(output_text_decoded)


Our main valders are:

1. The most important thing to remember is that you don't need to know anything about the game to play it. If you want to learn more about it, you can check out the wiki.



In [None]:
'''
BERT
Now, let's see how we can leverage BERT for a classification task:
________________________________
'''

# ________________________________
# This script uses a pretrained BERT model to classify the sentiment of a movie review. You can replace "bert-base-uncased" with the model of your choice from the list of pretrained models available on the Hugging Face model hub.

In [12]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
# Initialize the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer


BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [13]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
model

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [22]:
# Define a text prompt
text = "The movie was fantastic!"
# Encode the input text to tensor
inputs = tokenizer(text, return_tensors="pt")
inputs

{'input_ids': tensor([[  101,  1996,  3185,  2001, 10392,   999,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}

In [23]:
# Classify the text
outputs = model(**inputs)
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[0.0675, 0.2200]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [24]:
# The model returns the classification scores which are usually passed to a softmax function
# Here, we will just print the raw scores
print(outputs.logits)

tensor([[0.0675, 0.2200]], grad_fn=<AddmmBackward0>)
