In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline
import torch
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter
import seaborn as sns

# Seed PyTorch and NumPy's global RNG with the same value for reproducibility.
SEED = 42
for seed_rng in (torch.manual_seed, np.random.seed):
    seed_rng(SEED)

print("✅ Libraries imported successfully!")

✅ Libraries imported successfully!


In [2]:
# Pretrained GPT-2 tokenizer; later cells reuse this object.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Inputs of different shapes to show how text is split into tokens.
test_sentences = [
    "Hello world!",
    "The quick brown fox jumps over the lazy dog.",
    "Artificial intelligence is revolutionizing technology.",
    "GPT-2 uses transformer architecture.",
    "Supercalifragilisticexpialidocious",  # one long word, to see sub-word tokenization
]

print("🔍 Exploring Tokenization:")
print("=" * 50)

for sentence in test_sentences:
    tokens = tokenizer.tokenize(sentence)   # the token strings
    token_ids = tokenizer.encode(sentence)  # the numerical IDs

    # One print per sentence; joining with newlines gives the same
    # four-line report as four separate print calls.
    print("\n".join((
        f"\nOriginal: {sentence}",
        f"Tokens: {tokens}",
        f"Token IDs: {token_ids}",
        f"Number of tokens: {len(tokens)}",
    )))

# Size of the tokenizer's vocabulary
print(f"\n📊 Tokenizer vocabulary size: {tokenizer.vocab_size}")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

🔍 Exploring Tokenization:

Original: Hello world!
Tokens: ['Hello', 'Ġworld', '!']
Token IDs: [15496, 995, 0]
Number of tokens: 3

Original: The quick brown fox jumps over the lazy dog.
Tokens: ['The', 'Ġquick', 'Ġbrown', 'Ġfox', 'Ġjumps', 'Ġover', 'Ġthe', 'Ġlazy', 'Ġdog', '.']
Token IDs: [464, 2068, 7586, 21831, 18045, 625, 262, 16931, 3290, 13]
Number of tokens: 10

Original: Artificial intelligence is revolutionizing technology.
Tokens: ['Art', 'ificial', 'Ġintelligence', 'Ġis', 'Ġrevolution', 'izing', 'Ġtechnology', '.']
Token IDs: [8001, 9542, 4430, 318, 5854, 2890, 3037, 13]
Number of tokens: 8

Original: GPT-2 uses transformer architecture.
Tokens: ['G', 'PT', '-', '2', 'Ġuses', 'Ġtransformer', 'Ġarchitecture', '.']
Token IDs: [38, 11571, 12, 17, 3544, 47385, 10959, 13]
Number of tokens: 8

Original: Supercalifragilisticexpialidocious
Tokens: ['Super', 'cal', 'if', 'rag', 'il', 'ist', 'ice', 'xp', 'ial', 'id', 'ocious']
Token IDs: [12442, 9948, 361, 22562, 346, 396, 501, 42372, 

In [6]:
# Words whose sub-word splits are compared below.
analysis_texts = [
    "running",
    "runner",
    "run",
    "unhappiness",
    "ChatGPT",
    "COVID-19",
    "2023",
    "programming",
    "antidisestablishmentarianism"
]

# One record per word; the DataFrame is built once from the finished list.
token_analysis = []
for text in analysis_texts:
    tokens = tokenizer.tokenize(text)  # Tokenize the text
    token_count = len(tokens)  # Count the tokens
    # A text that yields no tokens (e.g. an empty string) would make the
    # division raise ZeroDivisionError; record NaN instead so the row is
    # kept and pandas' mean() (which skips NaN by default) is unaffected.
    chars_per_token = len(text) / token_count if token_count else float("nan")
    token_analysis.append({
        'text': text,
        'tokens': tokens,
        'token_count': token_count,
        'chars_per_token': chars_per_token
    })
    print(f"{text:30} → {tokens} ({token_count} tokens)")

df = pd.DataFrame(token_analysis)
# Mean of the per-word ratios (not total characters / total tokens).
print(f"\n📊 Average characters per token: {df['chars_per_token'].mean():.2f}")
# The word that was split into the most tokens.
print(f"📊 Longest word in tokens: {df.loc[df['token_count'].idxmax(), 'text']}")

running                        → ['running'] (1 tokens)
runner                         → ['runner'] (1 tokens)
run                            → ['run'] (1 tokens)
unhappiness                    → ['un', 'h', 'appiness'] (3 tokens)
ChatGPT                        → ['Chat', 'G', 'PT'] (3 tokens)
COVID-19                       → ['CO', 'VID', '-', '19'] (4 tokens)
2023                           → ['20', '23'] (2 tokens)
programming                    → ['program', 'ming'] (2 tokens)
antidisestablishmentarianism   → ['ant', 'idis', 'establishment', 'arian', 'ism'] (5 tokens)

📊 Average characters per token: 4.12
📊 Longest word in tokens: antidisestablishmentarianism


In [7]:
print("🔄 Loading GPT-2 model (this may take a moment)...")
# Load the 'gpt2' checkpoint and switch it to evaluation mode in one step
# (eval() returns the module itself).
model = GPT2LMHeadModel.from_pretrained('gpt2').eval()

print("✅ GPT-2 model loaded successfully!")

# Explore model architecture
print("\n🏗️ Model Architecture:")
print(f"Model type: {model.__class__.__name__}")

# Total number of scalar weights across every parameter tensor
total_params = sum(weights.numel() for weights in model.parameters())
params_in_millions = total_params / 1e6
print(f"Total parameters: {total_params:,}")
print(f"Model size: ~{params_in_millions:.1f}M parameters")

🔄 Loading GPT-2 model (this may take a moment)...


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

✅ GPT-2 model loaded successfully!

🏗️ Model Architecture:
Model type: GPT2LMHeadModel
Total parameters: 124,439,808
Model size: ~124.4M parameters


In [8]:
# Report the key hyper-parameters stored on the model's config object
print("🔍 Model Structure Analysis:")
print("=" * 50)

config = model.config

for label, value in (
    ("Vocabulary size", config.vocab_size),
    ("Maximum sequence length", config.n_positions),
    ("Number of transformer layers", config.n_layer),
    ("Number of attention heads", config.n_head),
    ("Hidden size", config.n_embd),
):
    print(f"{label}: {value}")

# Side-by-side with the small hand-built network
print("\n🤔 Comparison to Your Neural Network:")
print("Your network had: 2 inputs → 4 hidden → 1 output")
print(f"GPT-2 has: {config.vocab_size} inputs → {config.n_embd} hidden → {config.vocab_size} outputs")
print("Your network: ~50 parameters")
print(f"GPT-2: {total_params:,} parameters")
size_ratio = total_params / 50  # 50 = the approximate parameter count quoted above
print(f"GPT-2 is ~{size_ratio:,.0f}x larger!")

🔍 Model Structure Analysis:
Vocabulary size: 50257
Maximum sequence length: 1024
Number of transformer layers: 12
Number of attention heads: 12
Hidden size: 768

🤔 Comparison to Your Neural Network:
Your network had: 2 inputs → 4 hidden → 1 output
GPT-2 has: 50257 inputs → 768 hidden → 50257 outputs
Your network: ~50 parameters
GPT-2: 124,439,808 parameters
GPT-2 is ~2,488,796x larger!


In [10]:
# Wrap the already-loaded model and tokenizer in a text-generation pipeline
generator = pipeline(task='text-generation', model=model, tokenizer=tokenizer)

# Prompt shared by the sampling experiments
base_prompt = "In the future, artificial intelligence will"

banner = "=" * 60
print(f"🤖 Base prompt: '{base_prompt}'")
print(banner)

🤖 Base prompt: 'In the future, artificial intelligence will'


In [11]:
# Sampling temperatures to compare, lowest to highest
temperatures = [0.1, 0.7, 1.0, 1.5]

print("🌡️ Temperature Experiments:")
print("=" * 50)

# Settings held constant across runs; only the temperature varies.
fixed_sampling_kwargs = dict(
    max_length=60,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
)

for temp in temperatures:
    print(f"\n🔥 Temperature: {temp}")
    print("-" * 30)

    result = generator(base_prompt, temperature=temp, **fixed_sampling_kwargs)

    # The pipeline returns a list of dicts; take the text of the first one
    generated_text = result[0]['generated_text']
    print(generated_text)

for line in (
    "\n🤔 Discussion Questions:",
    "• Which temperature produced the most coherent text?",
    "• Which was most creative/surprising?",
    "• When might you use each temperature setting?",
):
    print(line)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


🌡️ Temperature Experiments:

🔥 Temperature: 0.1
------------------------------
In the future, artificial intelligence will be able to do things like search for information about people, and to do things like search for information about people.

The future of AI is going to be a lot more complex than we've ever imagined.

The future of AI is going to be

🔥 Temperature: 0.7
------------------------------
In the future, artificial intelligence will be able to be used in situations where it is necessary to use complex systems in order to solve problems such as cancer, terrorism, etc. These are not always practical issues for AI, and we need to consider how to use them. We will begin by thinking about

🔥 Temperature: 1.0
------------------------------
In the future, artificial intelligence will be able to replace traditional social networks by making it faster and smarter -- in the same way that Google and AT&T can help you to search a supermarket.

In fact, by 2030, the government plan ca

In [12]:
# Nucleus-sampling thresholds to compare
top_p_values = [0.3, 0.7, 0.9, 1.0]

print("🎯 Top-p Sampling Experiments:")
print("=" * 50)

# Everything except top_p stays fixed, including the temperature.
fixed_sampling_kwargs = dict(
    max_length=60,
    temperature=0.8,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
)

for top_p in top_p_values:
    print(f"\n🎲 Top-p: {top_p}")
    print("-" * 30)

    result = generator(base_prompt, top_p=top_p, **fixed_sampling_kwargs)

    generated_text = result[0]['generated_text']
    print(generated_text)

for line in (
    "\n🤔 Discussion Questions:",
    "• How did the outputs change with different top-p values?",
    "• What's the trade-off between top-p and temperature?",
):
    print(line)

🎯 Top-p Sampling Experiments:

🎲 Top-p: 0.3
------------------------------
In the future, artificial intelligence will be able to learn from our past and learn from our future.

In the future, artificial intelligence will be able to learn from our past and learn from our future.

In the future, artificial intelligence will be able to learn from our past and learn

🎲 Top-p: 0.7
------------------------------
In the future, artificial intelligence will be able to build machines that are as accurate as human eyes. This is a big step forward for AI, but it's not enough. We need to get smarter.

The world's next generation of AI is going to be much more complex than we've

🎲 Top-p: 0.9
------------------------------
In the future, artificial intelligence will be able to understand the natural world better than humans can.

"It's an exciting day for AI because we've discovered that the human brain is able to understand the world better than anything else," said Dr. C.K. Johnson, an assistant

In [13]:
# Prompt styles to compare, keyed by a short label
prompts_to_test = {
    "Direct": "Write about artificial intelligence:",
    "Question": "What is artificial intelligence and how will it change the world?",
    "Story_Start": "Once upon a time, in a world where artificial intelligence was everywhere,",
    "List_Format": "Here are 5 ways artificial intelligence will change our lives:\n1.",
    "Expert_Persona": "As a leading AI researcher, I believe that artificial intelligence will",
    "Few_Shot": "Technology predictions:\n• The internet will connect everyone (1990s)\n• Smartphones will be everywhere (2000s)\n• Artificial intelligence will",
}

print("✍️ Prompt Engineering Experiments:")
print("=" * 60)

# Same sampling settings for every style so only the prompt differs.
prompt_sampling_kwargs = dict(
    max_length=80,
    temperature=0.7,
    top_p=0.9,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
)

for style, prompt in prompts_to_test.items():
    print(f"\n📝 Style: {style}")
    print(f"Prompt: '{prompt}'")
    print("-" * 40)

    result = generator(prompt, **prompt_sampling_kwargs)

    generated_text = result[0]['generated_text']
    print(generated_text)
    print("\n" + "=" * 60)

✍️ Prompt Engineering Experiments:

📝 Style: Direct
Prompt: 'Write about artificial intelligence:'
----------------------------------------
Write about artificial intelligence: A new book by author and AI expert David A. Mazzetti.

Abstract: A new book by AI expert David A. Mazzetti.

Abstract: A new book by AI expert David A. Mazzetti.

Abstract: A new book by AI expert David A. Mazzetti.

Abstract: A new book by AI


📝 Style: Question
Prompt: 'What is artificial intelligence and how will it change the world?'
----------------------------------------
What is artificial intelligence and how will it change the world?

I think it's going to change the world. The way it's going to change the world is because we're going to be able to learn from the mistakes we make. And we're going to learn from the mistakes we make. And I think that's going to change the world.

There's a lot of stuff


📝 Style: Story_Start
Prompt: 'Once upon a time, in a world where artificial intelligence was everywher

In [15]:
# Sample each prompt several times and compare the output lengths
print("📊 Prompt Analysis Exercise:")
print("=" * 50)

analysis_results = []

for style, prompt in list(prompts_to_test.items())[:3]:  # first 3 styles only, for time
    # Three independent samples for this prompt
    outputs = [
        generator(
            prompt,
            max_length=60,
            temperature=0.8,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )[0]['generated_text']
        for _ in range(3)
    ]

    # Average output length in characters (the prompt is part of each output)
    lengths = list(map(len, outputs))
    avg_length = sum(lengths) / len(lengths)

    analysis_results.append(dict(
        style=style,
        prompt=prompt,
        avg_length=avg_length,
        outputs=outputs,
    ))

    print(f"\n{style}:")
    print(f"  Average length: {avg_length:.1f} characters")
    print(f"  Sample output: {outputs[0][:100]}...")

for line in (
    "\n🤔 Reflection Questions:",
    "• Which prompt style was most consistent?",
    "• Which produced the most relevant outputs?",
    "• How might you improve these prompts for better results?",
):
    print(line)

📊 Prompt Analysis Exercise:

Direct:
  Average length: 295.3 characters
  Sample output: Write about artificial intelligence:

This blog has been written from an AI perspective since the ea...

Question:
  Average length: 274.0 characters
  Sample output: What is artificial intelligence and how will it change the world?

These are big questions that you,...

Story_Start:
  Average length: 283.7 characters
  Sample output: Once upon a time, in a world where artificial intelligence was everywhere, the only way to stop it w...

🤔 Reflection Questions:
• Which prompt style was most consistent?
• Which produced the most relevant outputs?
• How might you improve these prompts for better results?


In [16]:
def custom_text_generator(prompt, style="balanced", length="medium"):
    """
    Generate text from ``prompt`` using preset sampling settings.

    Uses the notebook-level ``generator`` pipeline and ``tokenizer``.

    Args:
        prompt (str): The input prompt
        style (str): "creative", "balanced", or "conservative"; selects the
            (temperature, top_p) pair
        length (str): "short", "medium", or "long"; selects ``max_length``
            (40, 70 or 100)

    Returns:
        str: The ``generated_text`` of the first result from the pipeline

    Warns:
        UserWarning: if ``style`` or ``length`` is not one of the recognised
            values. The call still succeeds, falling back to "balanced" /
            "medium", but the fallback is no longer silent.
    """
    import warnings

    # style -> (temperature, top_p)
    style_presets = {
        "creative": (1.2, 0.95),      # higher for creativity / diversity
        "balanced": (0.8, 0.85),      # medium values
        "conservative": (0.4, 0.6),   # lower for consistency / focus
    }
    # length -> max_length handed to the pipeline
    length_presets = {"short": 40, "medium": 70, "long": 100}

    # Unknown values keep the historical fallback, but tell the caller about
    # it so typos and unfilled placeholders do not pass unnoticed.
    if not isinstance(style, str) or style not in style_presets:
        warnings.warn(
            f"Unknown style {style!r}; falling back to 'balanced'. "
            f"Expected one of {sorted(style_presets)}.",
            stacklevel=2,
        )
        style = "balanced"
    if not isinstance(length, str) or length not in length_presets:
        warnings.warn(
            f"Unknown length {length!r}; falling back to 'medium'. "
            f"Expected one of {sorted(length_presets)}.",
            stacklevel=2,
        )
        length = "medium"

    temperature, top_p = style_presets[style]
    max_length = length_presets[length]

    # NOTE(review): per the Hugging Face generation docs, max_length bounds
    # prompt tokens + generated tokens, so long prompts leave less room for
    # new text — confirm this is the intended meaning of `length`.
    result = generator(
        prompt,
        max_length=max_length,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    return result[0]['generated_text']

# Exercise custom_text_generator with one prompt and three presets
test_prompt = "The future of education will be"

print("🧪 Testing Your Text Generator:")
print("=" * 50)

# (style, length) pairs to try
test_combinations = [
    ("creative", "short"),
    ("balanced", "medium"),
    ("conservative", "long"),
]

for style, length in test_combinations:
    print(f"\n📝 Style: {style}, Length: {length}", "-" * 30, sep="\n")

    output = custom_text_generator(test_prompt, style=style, length=length)
    # Generated text followed by its character count
    print(output, f"Characters: {len(output)}", sep="\n")

🧪 Testing Your Text Generator:

📝 Style: creative, Length: short
------------------------------
The future of education will be determined by the success of these public education institutions," said Dr. V.K. Raja, chairman and CEO, Panchayati Mahal Pradesh.


Characters: 164

📝 Style: balanced, Length: medium
------------------------------
The future of education will be determined by how the government decides how to fund it, not how much it costs. The government's decision to fund private schools has the potential to cost taxpayers $1 trillion. The question of how much will be spent on public education is still under debate.

To understand how the government's decision to fund private schools
Characters: 361

📝 Style: conservative, Length: long
------------------------------
The future of education will be determined by the success of the current system of education."

The government has been criticised for failing to address the issue of student debt, which has been a major issue i

In [17]:
# Prompts for several kinds of generated content, keyed by application type
creative_prompts = {
    "Poetry": "Roses are red, violets are blue, artificial intelligence",
    "Story": "It was a dark and stormy night when the AI finally",
    "Product Description": "Introducing the revolutionary new smartphone that",
    "Email": "Dear valued customer, we are excited to announce",
    "Recipe": "How to make the perfect AI-inspired cookies:\nIngredients:\n-",
    "News Headline": "Breaking: Scientists discover that artificial intelligence",
}

# Generation style per application type; anything not listed is "balanced".
style_by_app_type = {
    "Poetry": "creative",
    "Story": "creative",
    "Product Description": "conservative",
    "Email": "conservative",
}

print("🎨 Creative Applications:")
print("=" * 50)

for app_type, prompt in creative_prompts.items():
    print(f"\n🖼️ {app_type}:")
    print(f"Prompt: '{prompt}'")
    print("-" * 40)

    style = style_by_app_type.get(app_type, "balanced")

    output = custom_text_generator(prompt, style=style, length="medium")
    print(output)
    print("\n" + "=" * 60)

🎨 Creative Applications:

🖼️ Poetry:
Prompt: 'Roses are red, violets are blue, artificial intelligence'
----------------------------------------
Roses are red, violets are blue, artificial intelligence systems have the capacity for learning how to recognize your eyes. Humans have the capacity to learn from your experiences and to improve when it comes to behavior. These things should be understood for their impact on humans (the kind of thing we want to encourage and use to save lives). I think


🖼️ Story:
Prompt: 'It was a dark and stormy night when the AI finally'
----------------------------------------
It was a dark and stormy night when the AI finally stopped attacking. You couldn't hear it, so you just kept driving, dodging it. All the enemy fire was coming at you from all directions, and if you continued to drive you might end up in a position where you might hit the road. Once you felt that and the AI got


🖼️ Product Description:
Prompt: 'Introducing the revolutionary new smar

In [18]:
# Prompts that probe different limitations, keyed by what they test
limitation_tests = {
    "Factual Knowledge": "The capital of Fakelandia is",
    "Recent Events": "In 2023, the most important AI breakthrough was",
    "Math": "What is 47 * 83? The answer is",
    "Logic": "If all A are B, and all B are C, then all A are",
    "Consistency": "My favorite color is blue. Later in the conversation, my favorite color is",
}

print("⚠️ Understanding Model Limitations:")
print("=" * 50)

for test_type, prompt in limitation_tests.items():
    print(f"\n🧪 Testing: {test_type}")
    print(f"Prompt: '{prompt}'")
    print("-" * 40)

    # Conservative, short generations for these factual-style prompts
    output = custom_text_generator(prompt, style="conservative", length="short")
    print(output)

    # Left for the reader to judge
    print("🤔 Analysis: Does this look correct/reasonable?")
    print("\n" + "=" * 50)

for line in (
    "\n⚠️ Important Reminders:",
    "• Language models can generate plausible-sounding but incorrect information",
    "• Always verify factual claims from AI-generated content",
    "• Be aware of potential biases in training data",
    "• Use AI as a tool to assist, not replace, critical thinking",
):
    print(line)

⚠️ Understanding Model Limitations:

🧪 Testing: Factual Knowledge
Prompt: 'The capital of Fakelandia is'
----------------------------------------
The capital of Fakelandia is the capital of the Empire of the Empire of the Empire of the Empire of the Empire of the Empire of the Empire of the Empire of the Empire of the Empire of
🤔 Analysis: Does this look correct/reasonable?


🧪 Testing: Recent Events
Prompt: 'In 2023, the most important AI breakthrough was'
----------------------------------------
In 2023, the most important AI breakthrough was the first of its kind, the AI-controlled robot that could control a human's body. The robot was designed to be able to walk, talk
🤔 Analysis: Does this look correct/reasonable?


🧪 Testing: Math
Prompt: 'What is 47 * 83? The answer is'
----------------------------------------
What is 47 * 83? The answer is that it is a very large number.

The number 47 is a number that is not known to the human mind. It is a number that is
🤔 Analysis: Does this 

In [19]:
# Final experiment: design your own use case
print("🎯 FINAL CHALLENGE:")
print("Design your own text generation use case!")
print("=" * 50)

# Fill in the four placeholders below.
# Ideas: Story generator, email assistant, creative writing helper, etc.
# Style values other than "creative"/"conservative" and length values other
# than "short"/"long" are treated as balanced/medium by custom_text_generator.
your_use_case = "____"  # Describe your use case
your_prompt = "____"   # Design your prompt
your_style = "____"    # Choose your style
your_length = "____"   # Choose your length

for line in (
    f"📝 Your use case: {your_use_case}",
    f"📝 Your prompt: '{your_prompt}'",
    f"📝 Your settings: {your_style}, {your_length}",
    "-" * 50,
):
    print(line)

# Generate with the settings chosen above
your_output = custom_text_generator(your_prompt, style=your_style, length=your_length)
print("🎉 Your generated content:")
print(your_output)

for line in (
    "\n📈 Next Steps:",
    "• Experiment with different prompt formats",
    "• Try combining multiple generation calls",
    "• Think about how to validate or improve outputs",
    "• Consider user interface design for your application",
):
    print(line)

🎯 FINAL CHALLENGE:
Design your own text generation use case!
📝 Your use case: ____
📝 Your prompt: '____'
📝 Your settings: ____, ____
--------------------------------------------------
🎉 Your generated content:
____.

In the past few years, the Internet has seen some new and exciting developments. People are now able to watch movies online, and they can share movies and other content online. People are now able to watch other people's videos and photos. There are even more ways to watch online, including in the form of YouTube, Vimeo

📈 Next Steps:
• Experiment with different prompt formats
• Try combining multiple generation calls
• Think about how to validate or improve outputs
• Consider user interface design for your application
