In [None]:
!pip install torch transformers  # uncomment and run if needed

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, AutoModelForSeq2SeqLM

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device



device(type='cpu')

In [None]:
bert_name = "distilbert-base-uncased"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_name)
bert_model = AutoModel.from_pretrained(bert_name).to(device)

bert_model.eval()
bert_model.config

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "dtype": "float32",
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.57.2",
  "vocab_size": 30522
}

In [None]:
sent1 = "The bowler warmed up before stepping onto the pitch."
sent2 = "The singer struggled to hold the high pitch of the note."
sent3= "The groundskeeper repaired a divot in the muddy pitch."

encoding1 = bert_tokenizer(sent1, return_tensors="pt")
encoding2 = bert_tokenizer(sent2, return_tensors="pt")
encoding3 = bert_tokenizer(sent3,return_tensors = "pt")

with torch.no_grad():
    out1 = bert_model(**{k: v.to(device) for k, v in encoding1.items()})
    out2 = bert_model(**{k: v.to(device) for k, v in encoding2.items()})
    out3 = bert_model(**{k: v.to(device) for k, v in encoding3.items()})

# BERT outputs: last_hidden_state [batch, seq_len, hidden_dim]
emb1 = out1.last_hidden_state[0]  # [seq_len, hidden]
emb2 = out2.last_hidden_state[0]
emb3 = out3.last_hidden_state[0]

tokens1 = bert_tokenizer.convert_ids_to_tokens(encoding1["input_ids"][0])
tokens2 = bert_tokenizer.convert_ids_to_tokens(encoding2["input_ids"][0])
tokens3 = bert_tokenizer.convert_ids_to_tokens(encoding3["input_ids"][0])

print("Sentence 1 tokens:", tokens1)
print("Sentence 2 tokens:", tokens2)
print("Sentence 3 tokens:", tokens3)

Sentence 1 tokens: ['[CLS]', 'the', 'bowler', 'warmed', 'up', 'before', 'stepping', 'onto', 'the', 'pitch', '.', '[SEP]']
Sentence 2 tokens: ['[CLS]', 'the', 'singer', 'struggled', 'to', 'hold', 'the', 'high', 'pitch', 'of', 'the', 'note', '.', '[SEP]']
Sentence 3 tokens: ['[CLS]', 'the', 'grounds', '##keeper', 'repaired', 'a', 'di', '##vot', 'in', 'the', 'muddy', 'pitch', '.', '[SEP]']


In [None]:
idx1 = tokens1.index("pitch")
idx2 = tokens2.index("pitch")
idx3 = tokens3.index("pitch")

vec1 = emb1[idx1]
vec2 = emb2[idx2]
vec3 = emb3[idx3]

cos_sim = F.cosine_similarity(vec1, vec2, dim=0).item()

print(f"Index of 'pitch' in sentence 1: {idx1}")
print(f"Index of 'pitch' in sentence 2: {idx2}")
print(f"Cosine similarity between 'pitch' embeddings: {cos_sim:.4f}")

cos_sim = F.cosine_similarity(vec1, vec3, dim=0).item()
print(f"Index of 'pitch' in sentence 1: {idx1}")
print(f"Index of 'pitch' in sentence 3: {idx3}")
print(f"Cosine similarity between 'pitch' embeddings: {cos_sim:.4f}")

print("\nNote: similarity is usually far from 1.0, showing BERT encodes different meanings.")

Index of 'pitch' in sentence 1: 9
Index of 'pitch' in sentence 2: 8
Cosine similarity between 'pitch' embeddings: 0.5231
Index of 'pitch' in sentence 1: 9
Index of 'pitch' in sentence 3: 11
Cosine similarity between 'pitch' embeddings: 0.8036

Note: similarity is usually far from 1.0, showing BERT encodes different meanings.


In [None]:
sentences = [
    "This movie was fantastic and I loved it.",
    "The film was terrible and I hated it.",
    "I felt incredibly bored",
    "The film was truly amazing"
]

enc = bert_tokenizer(sentences, return_tensors="pt", padding=True, truncation=True)
with torch.no_grad():
    outputs = bert_model(**{k: v.to(device) for k, v in enc.items()})

cls_embeddings = outputs.last_hidden_state[:, 0, :]  # [CLS] is at position 0

sim = F.cosine_similarity(cls_embeddings[3], cls_embeddings[1], dim=0).item()
print("Cosine similarity between sentence embeddings:", sim)
print("Sentence 1:", sentences[1])
print("Sentence 2:", sentences[3])
#print("\nWe expect them to be somewhat dissimilar because one is positive and the other negative.")

Cosine similarity between sentence embeddings: 0.9562892913818359
Sentence 1: The film was terrible and I hated it.
Sentence 2: The film was truly amazing


In [None]:
print(cls_embeddings)

tensor([[ 0.0664, -0.1415, -0.0490,  ..., -0.1742,  0.3737,  0.1768],
        [ 0.1670,  0.0781, -0.0438,  ..., -0.0933,  0.3885,  0.3234],
        [-0.0032,  0.0148, -0.0414,  ...,  0.0086,  0.2716,  0.2578],
        [-0.0229, -0.1522,  0.1528,  ..., -0.1847,  0.3819,  0.1098]])


In [None]:
import torch
import torch.nn.functional as F
import pandas as pd

# Assume the previous steps defining sentences, bert_tokenizer, bert_model,
# device, and generating cls_embeddings have been successfully run.

# 1. Normalize the CLS embeddings
# Normalization is necessary so that the matrix multiplication calculates the dot product
# of unit vectors, which is equivalent to cosine similarity.
normalized_embeddings = F.normalize(cls_embeddings, p=2, dim=1)

# 2. Compute the 4x4 cosine similarity matrix
# The matrix product of the normalized embeddings and its transpose (E * E^T)
# gives the cosine similarity between all pairs of vectors.
similarity_matrix = torch.matmul(normalized_embeddings, normalized_embeddings.transpose(0, 1))

# 3. Format and Display the Result
sentences = [
    "This movie was fantastic and I loved it.",  # Index 0: Positive
    "The film was terrible and I hated it.",     # Index 1: Negative
    "I felt incredibly bored",                    # Index 2: Negative
    "The film was truly amazing"                 # Index 3: Positive
]

# Convert the PyTorch tensor to a NumPy array and round the values
similarity_matrix_np = similarity_matrix.cpu().numpy().round(4)

# Create a DataFrame for a clean, labeled display
index_cols = [f"S{i} ({'Pos' if i in [0, 3] else 'Neg'})" for i in range(len(sentences))]
similarity_df = pd.DataFrame(similarity_matrix_np, index=index_cols, columns=index_cols)

print("\n## 4x4 Cosine Similarity Matrix (CLS Embeddings)")
print("-" * 50)
print(similarity_df)


## 4x4 Cosine Similarity Matrix (CLS Embeddings)
--------------------------------------------------
          S0 (Pos)  S1 (Neg)  S2 (Neg)  S3 (Pos)
S0 (Pos)    1.0000    0.9724    0.9453    0.9622
S1 (Neg)    0.9724    1.0000    0.9616    0.9563
S2 (Neg)    0.9453    0.9616    1.0000    0.9483
S3 (Pos)    0.9622    0.9563    0.9483    1.0000


task 2: Different prompts with DistilGPT2. For each, generate 2 samples (change top_k / top_p if you like). Observe how sensitive GPT is to the prompt wording. T5 Text **Transformation**


In [None]:
gpt_name = "distilgpt2"
gpt_tokenizer = AutoTokenizer.from_pretrained(gpt_name)
if gpt_tokenizer.pad_token is None:
    gpt_tokenizer.pad_token = gpt_tokenizer.eos_token

gpt_model = AutoModelForCausalLM.from_pretrained(gpt_name).to(device)
gpt_model.eval()
gpt_model.config

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GPT2Config {
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "dtype": "float32",
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.57.2",
  "use_cache": true,
  "vocab_size": 50257
}



*   Top-P (or Nucleus Sampling) means you set a probability threshold ($P$) (e.g., 90%). The model then includes the minimum number of most likely words needed to reach that total probability.Example: top_p=0.85 (or 85%)The model starts with the most probable word: "mat" (40%). (Cumulative probability: 40%)It adds the next most probable word: "pillow" (30%). (Cumulative probability: 40% + 30% = 70%)It adds the next most probable word: "rug" (15%). (Cumulative probability: 70% + 15% = 85%)The cumulative probability has reached the 85% threshold, so the model stops.The final drawing pool contains: "mat," "pillow," and "rug."

*   1. Top-K Sampling (top_k) ✂️Top-K means you set a fixed number ($K$) of the most likely words the model is allowed to choose from.Example: top_k=3The model looks at all its options.It selects the top 3 most probable words: "mat," "pillow," and "rug."All other words, even "floor" (5% probability), are completely removed from the drawing pool.The model then samples the next word only from the remaining three.



max_length  = This sets the maximum total length (in tokens) of the entire generated sequence,

🤯 What the Code is Doing
The core idea of Experiment 3 is to see how much control the top_p parameter gives you over the GPT model's creativity.

It asks the model the same question twice.

It controls the "drawing pool" of words for the model.

1. High Diversity (The Adventurous Choice)
Setting: top_p=0.95

Meaning: The model considers a very large group of possible next words—the 95% most likely tokens.

Result: The output is very creative, unpredictable, and diverse. It might suggest an unusual method or go off on a tangent, like "The best way to learn a new language is to live in a cave for three months with only a dictionary and a goat." (Exaggerated, but you get the idea).

2. Low Diversity (The Safe Choice)
Setting: top_p=0.5

Meaning: The model considers a very small group of possible next words—only the 50% most likely tokens.

Result: The output is very conservative, focused, and predictable. It will almost certainly stick to the most common advice, like "The best way to learn a new language is to practice speaking with native speakers every day."

In [None]:
prompt= "The best way to learn a new language is to"
inputs = gpt_tokenizer(prompt, return_tensors="pt").to(device)

with torch.no_grad():
    output_ids = gpt_model.generate(
        **inputs,
        max_length=60,
        num_return_sequences=1,
        do_sample=True,
        top_p=0.50,
        top_k=50,
        pad_token_id=gpt_tokenizer.eos_token_id,
    )

generated = gpt_tokenizer.decode(output_ids[0], skip_special_tokens=True)
print("PROMPT:", prompt)
print("\nGPT CONTINUATION:")
print(generated)

PROMPT: The best way to learn a new language is to

GPT CONTINUATION:
The best way to learn a new language is to learn from it.



The first time you learn a new language is to learn from it. You can learn from it, and learn from it, by reading through the code, and by reading through the code.
This is the


In [None]:
prompts = [
    "The main purpose of a transformer model is to",
    "The main purpose of a self-attention mechanism is to"
]

for prompt in prompts:
    inputs = gpt_tokenizer(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        output_ids = gpt_model.generate(
            **inputs,
            max_length=40,
            num_return_sequences=1,
            do_sample=True,
            top_p=0.9,
            top_k=50,
            pad_token_id=gpt_tokenizer.eos_token_id,
        )

    generated = gpt_tokenizer.decode(output_ids[0], skip_special_tokens=True)
    print(f"\nPROMPT: {prompt}")
    print(f"OUTPUT: {generated}")


PROMPT: The main purpose of a transformer model is to
OUTPUT: The main purpose of a transformer model is to prevent noise by converting to high-frequency, low-frequency and low-frequency, and to prevent unwanted noise. The use of transformer transformer models can lead

PROMPT: The main purpose of a self-attention mechanism is to
OUTPUT: The main purpose of a self-attention mechanism is to identify and communicate the actions of participants. This is especially useful for learning about the nature of the human brain as well as to evaluate their cognitive


The top_p parameter sets a boundary for the words the model is allowed to pick. (Controls diversity)

top_k parameter simply controls the maximum number of most probable words the model is allowed to choose from at any given step(Controls the initial search pool.)

In [None]:
prompts = [
    "The astronaut looked out the window and saw a strange",
    "The terrified astronaut looked out the window and saw a strange"
]

for prompt in prompts:
    inputs = gpt_tokenizer(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        output_ids = gpt_model.generate(
            **inputs,
            max_length=55,
            num_return_sequences=1,
            do_sample=True,
            top_p=0.7,  # Lower p value to be more focused/less diverse
            top_k=50,
            pad_token_id=gpt_tokenizer.eos_token_id,
        )

    generated = gpt_tokenizer.decode(output_ids[0], skip_special_tokens=True)
    print(f"\nPROMPT: {prompt}")
    print(f"OUTPUT: {generated}")


PROMPT: The astronaut looked out the window and saw a strange
OUTPUT: The astronaut looked out the window and saw a strange object. It looked like a small, black, green object, but it was a little more than a large one. The object was just as big as the object itself. The object was like a black box. It had

PROMPT: The terrified astronaut looked out the window and saw a strange
OUTPUT: The terrified astronaut looked out the window and saw a strange man standing outside.










































T5 Text Transformation

Use t5-small with prompts like "translate English to German:" or "summarize:".
Compare T5 outputs with GPT outputs on the same text.

In [None]:
t5_name = "t5-small"
t5_tokenizer = AutoTokenizer.from_pretrained(t5_name)
t5_model = AutoModelForSeq2SeqLM.from_pretrained(t5_name).to(device)
t5_model.eval()

text = "Machine learning allows computers to learn patterns from data and make predictions without being explicitly programmed."
t5_input = "summarize: " + text

enc = t5_tokenizer(t5_input, return_tensors="pt", truncation=True).to(device)

with torch.no_grad():
    summary_ids = t5_model.generate(
        **enc,
        max_length=40,
        num_beams=4,
        early_stopping=True,
    )

summary = t5_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("ORIGINAL TEXT:\n", text)
print("\nT5 SUMMARY:\n", summary)

print("\nConfig check:")
print("is_encoder_decoder =", t5_model.config.is_encoder_decoder)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

ORIGINAL TEXT:
 Machine learning allows computers to learn patterns from data and make predictions without being explicitly programmed.

T5 SUMMARY:
 machine learning allows computers to learn patterns from data and make predictions without being explicitly programmed.

Config check:
is_encoder_decoder = True


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM
import torch

# 1. Setup Device
device = "cuda" if torch.cuda.is_available() else "cpu"

# 2. Setup T5 (Sequence-to-Sequence: Encoder-Decoder)
t5_name = "t5-small"
t5_tokenizer = AutoTokenizer.from_pretrained(t5_name)
t5_model = AutoModelForSeq2SeqLM.from_pretrained(t5_name).to(device).eval()

# 3. Setup GPT (Causal LM: Decoder-Only)
gpt_name = "distilgpt2"
gpt_tokenizer = AutoTokenizer.from_pretrained(gpt_name)
gpt_model = AutoModelForCausalLM.from_pretrained(gpt_name).to(device).eval()
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token # Set pad token for generation

In [None]:
text_to_summarize = "The recent advancements in renewable energy technology, especially solar and wind power, promise a cleaner future, mitigating the impact of climate change caused by fossil fuels."
t5_prompt = "summarize: " + text_to_summarize

In [None]:
# --- T5 Generation (Summarization) ---
enc_t5 = t5_tokenizer(t5_prompt, return_tensors="pt", truncation=True).to(device)
with torch.no_grad():
    summary_ids = t5_model.generate(**enc_t5, max_length=50, num_beams=4, early_stopping=True)
t5_summary = t5_tokenizer.decode(summary_ids[0], skip_special_tokens=True)


# --- GPT Generation (Treats it as Continuation) ---
enc_gpt = gpt_tokenizer(text_to_summarize, return_tensors="pt", truncation=True).to(device)
with torch.no_grad():
    continuation_ids = gpt_model.generate(**enc_gpt, max_length=50, do_sample=False, pad_token_id=gpt_tokenizer.eos_token_id)
gpt_output = gpt_tokenizer.decode(continuation_ids[0], skip_special_tokens=True)


print("## 1. Summarization Comparison")
print("-" * 50)
print(f"ORIGINAL:\n{text_to_summarize}\n")
print(f"T5 SUMMARY (Trained): {t5_summary}")
print(f"GPT OUTPUT (Continuation): {gpt_output}")

## 1. Summarization Comparison
--------------------------------------------------
ORIGINAL:
The recent advancements in renewable energy technology, especially solar and wind power, promise a cleaner future, mitigating the impact of climate change caused by fossil fuels.

T5 SUMMARY (Trained): recent advancements in renewable energy technology promise a cleaner future, mitigating the impact of climate change caused by fossil fuels.
GPT OUTPUT (Continuation): The recent advancements in renewable energy technology, especially solar and wind power, promise a cleaner future, mitigating the impact of climate change caused by fossil fuels.






















In [None]:
text_to_translate = "The weather is currently cold and rainy."
t5_prompt = "translate English to German: " + text_to_translate

In [None]:
# --- T5 Generation (Translation) ---
enc_t5 = t5_tokenizer(t5_prompt, return_tensors="pt", truncation=True).to(device)
with torch.no_grad():
    translation_ids = t5_model.generate(**enc_t5, max_length=20, num_beams=4, early_stopping=True)
t5_translation = t5_tokenizer.decode(translation_ids[0], skip_special_tokens=True)


# --- GPT Generation (Treats it as Continuation) ---
gpt_prompt = text_to_translate + " German translation:"
enc_gpt = gpt_tokenizer(gpt_prompt, return_tensors="pt", truncation=True).to(device)
with torch.no_grad():
    continuation_ids = gpt_model.generate(**enc_gpt, max_length=30, do_sample=False, pad_token_id=gpt_tokenizer.eos_token_id)
gpt_output = gpt_tokenizer.decode(continuation_ids[0], skip_special_tokens=True)


print("\n## 2. Translation Comparison")
print("-" * 50)
print(f"ORIGINAL:\n{text_to_translate}\n")
print(f"T5 TRANSLATION (Trained): {t5_translation}")
print(f"\nGPT OUTPUT (Guess): {gpt_output}")


## 2. Translation Comparison
--------------------------------------------------
ORIGINAL:
The weather is currently cold and rainy.

T5 TRANSLATION (Trained): Das Wetter ist derzeit kalt und regnerisch.

GPT OUTPUT (Guess): The weather is currently cold and rainy. German translation: “The weather is currently cold and rainy. German translation: “The weather is currently


Conclusion
This comparison highlights that:

T5 (Encoder-Decoder) is superior for structured transformation tasks like summarization and translation because it can consume the entire source text and then generate a completely new target sequence.

GPT (Decoder-Only) is better for creative continuation and generation but struggles with structured input-output tasks unless explicitly fine-tuned for them, often resorting to pattern repetition.