In [1]:
pip install transformers



In [2]:
from transformers import T5Model, BertModel, GPT2Model

In [3]:
t5 = T5Model.from_pretrained("t5-small")
t5

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

T5Model(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Dropout(p=0.1, inplace=

In [4]:
bert = BertModel.from_pretrained("bert-base-uncased")
bert

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [5]:
gpt2 = GPT2Model.from_pretrained("gpt2")
gpt2

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D(nf=2304, nx=768)
        (c_proj): Conv1D(nf=768, nx=768)
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D(nf=3072, nx=768)
        (c_proj): Conv1D(nf=768, nx=3072)
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)

# Bert with a Classification Head


In [6]:
from transformers import BertForSequenceClassification

bert_classification = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
bert_classification

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

# Example 1: T5 for Translation (Encoder-Decoder)

T5 uses both encoder and decoder - perfect for transformation tasks like translation.

In [11]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

t5_model = T5ForConditionalGeneration.from_pretrained("t5-small")
t5_tokenizer = T5Tokenizer.from_pretrained("t5-small")


text = "translate English to German: I am Good"
input_ids = t5_tokenizer(text, return_tensors= "pt").input_ids


outputs = t5_model.generate(input_ids, max_length=40)
translation = t5_tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"Input: {text}")
print(f"Translation:  {translation}")

Input: translate English to German: I am Good
Translation:  Ich bin gut.


# Example 2: BERT for Classification (Encoder-Only)

BERT uses only the encoder - perfect for understanding and classification tasks like sentiment analysis.


In [14]:
from transformers import pipeline

classifier = pipeline("sentiment-analysis", model= "distilbert-base-uncased-finetuned-sst-2-english")

# Test sentences
sentences = [
    "This movie was absolutely amazing!",
    "Terrible service, very disappointed.",
    "The product works as expected."
]

print("Sentiment Analysis Results:")

for sentence in sentences:
  result = classifier(sentence)[0]
  print(f"Text: {sentence}")
  print(f"Sentiment: {result['label']}, Confidence: {result['score']:.4f}\n")


Device set to use cuda:0


Sentiment Analysis Results:
Text: This movie was absolutely amazing!
Sentiment: POSITIVE, Confidence: 0.9999

Text: Terrible service, very disappointed.
Sentiment: NEGATIVE, Confidence: 0.9996

Text: The product works as expected.
Sentiment: POSITIVE, Confidence: 0.9978



# Example 3: GPT-2 for Text Generation (Decoder-Only)

GPT-2 uses only the decoder - perfect for generating fluent, coherent text.


In [16]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

prompt = "Once upon a time in a distant land,"
input_ids = gpt2_tokenizer(prompt, return_tensors="pt").input_ids

# Generate text
outputs = gpt2_model.generate(
    input_ids,
    max_length=100,
    num_return_sequences=1,
    temperature=0.7,
    do_sample=True,
    top_p=0.9
)

generated_text = gpt2_tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"Prompt: {prompt}")
print(f"\nGenerated Text:\n{generated_text}")

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Prompt: Once upon a time in a distant land,

Generated Text:
Once upon a time in a distant land, the king was a king. And as he was king, his enemies, the king's friends, the king's enemies, the king's friends, were like unto him.

The King's friends, the King's friends, were like unto him. And as he was king, his enemies, the king's friends, were like unto him. And as he was king, his enemies, the king's friends, were like unto him. And


# Summary: Three Architectures in Action

- **T5 (Encoder-Decoder)**: Transforms input to different output → Translation, summarization
- **BERT (Encoder-Only)**: Understands and classifies → Sentiment analysis, NER, classification
- **GPT-2 (Decoder-Only)**: Generates text fluently → Stories, code, conversations

Notice how each architecture excels at its designed task!