In [None]:
!pip install transformers datasets



In [None]:

from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from datasets import load_dataset, Dataset
import math

# --- Pre-trained GPT-2 weights and matching tokenizer -----------------------
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# --- Raw corpus --------------------------------------------------------------
dataset = load_dataset("karpathy/tiny_shakespeare",trust_remote_code=True)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    """Tokenize the "text" column of a batch without truncating or padding.

    Args:
        examples: Batch mapping passed in by `dataset.map(..., batched=True)`;
            only its "text" entry is read.

    Returns:
        Whatever the tokenizer call returns for that batch of texts.
    """
    texts = examples["text"]
    # NOTE(review): the huge max_length is presumably there to keep the
    # tokenizer from complaining about very long inputs -- TODO confirm.
    encoded = tokenizer(texts, truncation=False, padding=False, max_length=1e20)
    return encoded


tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Only the first row of each split is used downstream (the recorded output
# suggests each split consists of a single row).
tokenized_train_dataset = tokenized_dataset["train"][0]["input_ids"]
tokenized_validation_dataset = tokenized_dataset["validation"][0]["input_ids"]
tokenized_test_dataset = tokenized_dataset["test"][0]["input_ids"]

def chunk_tokenized_sample(tokenized_sample, chunk_size=256):
    """Split a flat token sequence into consecutive fixed-size pieces.

    Args:
        tokenized_sample: Sliceable sequence of token ids.
        chunk_size: Maximum number of tokens per piece. The final piece is
            shorter when the sequence length is not a multiple of it.

    Returns:
        A list of dicts, one per piece, each holding the piece under
        "input_ids" and an all-ones "attention_mask" of the same length.
    """
    pieces = []
    for start in range(0, len(tokenized_sample), chunk_size):
        window = tokenized_sample[start:start + chunk_size]
        pieces.append({"input_ids": window, "attention_mask": [1] * len(window)})
    return pieces


# Cut each split's token stream into chunks (default chunk size of the helper).
chunked_samples_train = chunk_tokenized_sample(tokenized_train_dataset)
chunked_samples_validation = chunk_tokenized_sample(tokenized_validation_dataset)
chunked_samples_test = chunk_tokenized_sample(tokenized_test_dataset)


def _chunks_to_dataset(chunked_samples):
    """Gather per-chunk dicts into a column-oriented `Dataset`."""
    columns = {"input_ids": [], "attention_mask": []}
    for sample in chunked_samples:
        for key, values in columns.items():
            values.append(sample[key])
    return Dataset.from_dict(columns)


# Convert to dataset format
train_dataset = _chunks_to_dataset(chunked_samples_train)
validation_dataset = _chunks_to_dataset(chunked_samples_validation)
test_dataset = _chunks_to_dataset(chunked_samples_test)

# Create a DataCollator for batching (mlm=False: causal language modelling).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-shakespeare",
    overwrite_output_dir=True,
    do_eval=True,
    num_train_epochs=10,
    learning_rate=5e-5,
    # Log once per epoch directly, instead of deriving a step count from a
    # duplicated batch-size literal (which drifts with multi-GPU or gradient
    # accumulation).
    logging_strategy="epoch",
    eval_strategy="epoch",
    weight_decay=0.01,
    per_device_train_batch_size=32,
)

# Create Trainer. Per-epoch evaluation runs on the validation split so the
# test split stays untouched until the final measurement below.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
)

# Start training
trainer.train()

# Final, one-off evaluation on the held-out test split.
eval_results = trainer.evaluate(eval_dataset=test_dataset)

# Save the fine-tuned model
model.save_pretrained("./gpt2-shakespeare-10epochs-chunk256")
tokenizer.save_pretrained("./gpt2-shakespeare-10epochs-chunk256")
print(eval_results)


Downloading data:   0%|          | 0.00/435k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss
1,3.8462,3.484326
2,3.5586,3.438722
3,3.4592,3.421062
4,3.3954,3.41152
5,3.3499,3.403201
6,3.3142,3.402716
7,3.2851,3.40047
8,3.2635,3.401709
9,3.2477,3.401149
10,3.2371,3.40243


{'eval_loss': 3.402430295944214, 'eval_runtime': 0.3685, 'eval_samples_per_second': 192.674, 'eval_steps_per_second': 24.423, 'epoch': 10.0}


In [None]:
import torch
def generate_text(prompt, max_length=256):
    """Sample one continuation of `prompt` from the module-level `model`.

    Args:
        prompt: Text to condition the generation on.
        max_length: Forwarded to `model.generate` as its `max_length`.

    Returns:
        The decoded text of the single returned sequence, with special tokens
        stripped.
    """
    # Place the inputs on whatever device the model lives on instead of
    # assuming CUDA is available.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Generate output tokens
    output_sequences = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
        do_sample=True,
        num_return_sequences=1,
        # Pass the pad token explicitly so generate() does not warn about
        # falling back to the EOS token on every call.
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode the output tokens to text
    return tokenizer.decode(output_sequences[0], skip_special_tokens=True)


In [None]:
# Sample from a single-space prompt and print it so line breaks render.
sample_text = generate_text(" ")
print(sample_text)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


 I'll speak.

GLOUCESTER:
This is a gentleman's letter:
We have had too much ado with the king's death.

CAPULET:
Then tell me, what is the matter with him?

GLOUCESTER:
The king's son, Henry,
The bastard of the Duke of Clarence,
A true and honourable gentleman,
And heir of the royal throne,
And a loving and gracious father,
To whom, by God's holy mercy, we owe so much,
To be so near in the royal succession,
To so many good and noble souls
As that our king, King Henry,
To the royal heirs of England,
And our royal brothers, Henry,
Is to me the most gracious king,
And God's holy sovereign, to all the subjects
And to all the people, to all the people's royal kin,
To the queen, queen, and all the people's kings,
To all the subjects, all the subjects' subjects,
And all the subjects' subjects,
And all the subjects' subjects,
To all the subjects' subjects, and all the subjects' subjects,
And all the subjects
