In [10]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch.nn.functional as F

# Fetch the pre-trained GPT-2 checkpoint from Hugging Face and switch it to
# evaluation mode in one step (nn.Module.eval() returns the module itself).
model = GPT2LMHeadModel.from_pretrained('gpt2').eval()

# Dynamic INT8 quantization helper
def quantize_model(model, precision=8):
    """Return a dynamically quantized version of ``model``.

    Only ``torch.nn.Linear`` submodules are targeted, and they are quantized
    with ``dtype=torch.qint8``.

    Args:
        model: The module to quantize.
        precision: Requested bit width. Only ``8`` is implemented.

    Returns:
        Whatever ``torch.quantization.quantize_dynamic`` returns for ``model``.

    Raises:
        ValueError: If ``precision`` is anything other than ``8``. Without this
            guard the function fell through to ``return`` with no value bound
            and failed with an ``UnboundLocalError``.
    """
    if precision != 8:
        raise ValueError(
            f"Unsupported precision {precision!r}: only INT8 dynamic quantization is implemented."
        )

    print(f"Applying INT{precision} dynamic quantization...")

    # NOTE(review): only nn.Linear is listed here. If the model implements its
    # projection layers with a different module class, those layers are left
    # untouched and only the remaining Linear layers (e.g. an output head) get
    # quantized -- confirm against the model's module list.
    return torch.quantization.quantize_dynamic(
        model,  # Model to be quantized
        {torch.nn.Linear},  # Layer types to quantize
        dtype=torch.qint8,  # Quantize to INT8
    )

# Build the INT8 dynamically quantized model via quantize_model(). The result is
# bound to its own name so `model` and `model_quantized` can be compared below.
model_quantized = quantize_model(model, precision=8)

# Load the GPT-2 tokenizer. The helper functions defined below read this
# module-level `tokenizer` name directly instead of taking it as a parameter.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Sample a continuation of a prompt from a model (top-k + nucleus sampling)
def generate_text_from_model(model, prompt, max_length=100, temperature=0.7, top_k=50, top_p=0.9, seed=None):
    """Generate text from ``model`` by sampling a continuation of ``prompt``.

    Uses the module-level ``tokenizer`` for encoding and decoding.

    Args:
        model: Model exposing ``generate``; it is moved to CPU as a side effect.
        prompt: Text to continue.
        max_length: Passed to ``generate`` as ``max_length``.
        temperature: Sampling temperature (controls randomness).
        top_k: Top-k sampling cutoff.
        top_p: Nucleus (top-p) sampling threshold.
        seed: Optional integer. When given, ``torch.manual_seed(seed)`` is
            called before sampling so the result is reproducible and different
            models can be compared under the same random stream. ``None``
            (the default) leaves the global RNG untouched.

    Returns:
        The decoded output string (prompt plus continuation), with special
        tokens skipped.
    """
    if seed is not None:
        torch.manual_seed(seed)

    # Tokenize the input prompt
    inputs = tokenizer(prompt, return_tensors="pt").to('cpu')

    # Move the model to CPU (quantized models are usually on CPU)
    model.to('cpu')

    # Generate text using top-k sampling, top-p (nucleus sampling), and adjusted temperature
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            temperature=temperature,  # Control the randomness
            top_k=top_k,  # Use top-k sampling
            top_p=top_p,  # Use nucleus sampling
            do_sample=True,  # Ensure sampling is used instead of greedy decoding
            # Pass the pad token explicitly; leaving it unset makes generate()
            # fall back to EOS and emit a "Setting `pad_token_id` to
            # `eos_token_id`" notice on every call.
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode the generated tokens back to text
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Shared prompt fed to both models.
# NOTE(review): the literal ends with two trailing spaces -- confirm this is
# intentional, since trailing whitespace becomes part of the tokenized input.
prompt = "University of Virginia is a top-ranking public school in the U.S., where  "

# Sample one continuation from the original model, then one from the quantized model.
generated_text_fp32 = generate_text_from_model(model, prompt, max_length=100)
generated_text_int8 = generate_text_from_model(model_quantized, prompt, max_length=100)

# Show both samples, one per line, for side-by-side comparison.
print(
    f"Original GPT-2 Generated text: {generated_text_fp32}",
    f"Quantized GPT-2 Generated text: {generated_text_int8}",
    sep="\n",
)


Applying INT8 dynamic quantization...


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Original GPT-2 Generated text: University of Virginia is a top-ranking public school in the U.S., where  Â it is ranked No. 1 in the country for undergraduate education and No. 2 in the nation for graduate education. It is also the only U.S. public school that has a public enrollment rate of 2.6 percent.
In 2007, the university announced that it would open its doors to students in the fall of 2010.
The university will offer a three-year, $12,
Quantized GPT-2 Generated text: University of Virginia is a top-ranking public school in the U.S., where  amusab is a way to ensure the two-year, - in-the-back-school-s-" of school.
(Citation and sponsorship given to, and for use in, "K-9 " and "Carry On"
t and the T.A. and, intervening, the, the R&A, the. (-, the. "The and


In [11]:
import math

# Function to calculate the perplexity a model assigns to a piece of text
def calculate_perplexity(model, prompt):
    """Compute the perplexity of ``prompt`` under ``model``.

    The text is tokenized with the module-level ``tokenizer`` and the token ids
    are passed as both inputs and labels; perplexity is ``exp`` of the loss the
    model reports.

    Args:
        model: Model that accepts ``labels`` and returns an object with a
            ``loss`` attribute; it is moved to CPU as a side effect.
        prompt: Text to score.

    Returns:
        The perplexity as a Python float.

    Raises:
        ValueError: If ``prompt`` tokenizes to fewer than two tokens. There is
            then no next-token target to score, and the loss would be NaN or
            the forward pass would fail with an unrelated error.
    """
    # Tokenize the input prompt
    inputs = tokenizer(prompt, return_tensors="pt").to('cpu')

    token_count = inputs["input_ids"].shape[-1]
    if token_count < 2:
        raise ValueError(
            f"Perplexity needs at least 2 tokens, but the prompt produced {token_count}."
        )

    # Move the model to CPU
    model.to('cpu')

    # Forward pass with labels so the model returns its cross-entropy loss
    with torch.no_grad():
        outputs = model(**inputs, labels=inputs["input_ids"])

    # Perplexity is the exponential of the cross-entropy loss
    return math.exp(outputs.loss.item())

# Score the same prompt under the original model and then the quantized model.
perplexity_fp32 = calculate_perplexity(model, prompt)
perplexity_int8 = calculate_perplexity(model_quantized, prompt)

# Report both scores, one per line, for comparison.
print(
    f"Perplexity of Original GPT-2: {perplexity_fp32}",
    f"Perplexity of Quantized GPT-2: {perplexity_int8}",
    sep="\n",
)


Perplexity of Original GPT-2: 25.82721043362427
Perplexity of Quantized GPT-2: 3141.5037193211997
