<a href="https://colab.research.google.com/github/bforoura/GenAI/blob/main/Module8/Pretrained_Shakespeare_No_Finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install transformers datasets

from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
import requests
import torch

# Step 1: Download and combine multiple Shakespeare texts
urls = [
    "https://www.gutenberg.org/files/1041/1041-0.txt",  # Hamlet
    "https://www.gutenberg.org/files/152/152-0.txt",   # Macbeth
    "https://www.gutenberg.org/files/1112/1112-0.txt"   # Othello
]

# Initialize an empty string to hold all text
all_text = ""

# Download each text file and append to all_text
for url in urls:
    response = requests.get(url)
    text = response.text
    all_text += text + "\n\n"  # Separate texts by newlines

# Save combined text to a single file
with open("combined_shakespeare.txt", "w", encoding="utf-8") as file:
    file.write(all_text)


# Step 3: Load the formatted text into a Hugging Face Dataset
with open("combined_shakespeare.txt", "r", encoding="utf-8") as file:
    formatted_data = file.read()

# Create a simple dataset from the formatted text
dataset = Dataset.from_dict({"text": formatted_data.split("\n\n")})  # Split into paragraphs

# Step 4: Tokenize the dataset
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Set the pad_token to eos_token if it's not already set
tokenizer.pad_token = tokenizer.eos_token  # Set pad_token to eos_token

# Tokenize the data
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

# Apply tokenization
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Step 5: Reload the original GPT-2 model (This removes fine-tuning)
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Step 6: Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    eval_steps=500,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    report_to=["none"],  # Disable Weights & Biases logging
    logging_steps=10,
)

# Step 7: Initialize the data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # Set to False because GPT-2 is not a masked language model
)

# Step 8: Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets,  # If you have a separate eval dataset
    data_collator=data_collator
)

# Step 9: Train the model (Optional, if you still want to fine-tune)
# trainer.train()

# Step 10: Generate Text
# After training is complete, or if you just want to generate with the original model

# Step 10.1: Check if GPU is available and set the device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Step 10.2: Move the model to the appropriate device (GPU or CPU)
model.to(device)




Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Map:   0%|          | 0/1057 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [11]:
# Step 10.3: Tokenize the prompt and move the input tensor to the same device
prompt = "to be or not to be"
inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)

# Explicitly set the attention mask and pad_token_id
inputs["attention_mask"] = inputs["attention_mask"].to(device)
inputs["input_ids"] = inputs["input_ids"].to(device)
inputs["pad_token_id"] = tokenizer.pad_token_id  # Ensure padding token ID is set

# Step 10.4: Generate text using the model
generated_text = model.generate(
    inputs["input_ids"],  # Use the input tensor
    max_length=50,
    num_return_sequences=1,
    temperature=0.7,
    top_p=0.9,
    pad_token_id=tokenizer.pad_token_id  # Pass pad_token_id here as well
)

# Step 10.5: Decode and print the generated text
output = tokenizer.decode(generated_text[0], skip_special_tokens=True)
print(output)

to be or not to be.

The first thing to do is to make sure that you are not using the wrong software.

If you are using a different software, you should be able to use it without any problems.


