In [21]:
import torch
from transformers import (GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments,
                          DataCollatorForLanguageModeling, TrainerCallback)
from datasets import load_dataset
from torch.utils.data import Dataset

# checking the GPU availability
print("GPU Available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU Device:", torch.cuda.get_device_name(0))

class ShakespeareDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=1024):
        self.input_ids = []
        self.attention_masks = []

        print("Tokenizing texts...")
        for i, text in enumerate(texts):
            encodings = tokenizer(
                text,
                truncation=True,
                max_length=max_length,
                padding='max_length',
                return_tensors="pt",
                return_attention_mask=True
            )
            self.input_ids.append(encodings['input_ids'].squeeze(0))
            self.attention_masks.append(encodings['attention_mask'].squeeze(0))

            if i % 100 == 0:
                print(f"Processed {i} texts")

        self.input_ids = torch.stack(self.input_ids)
        self.attention_masks = torch.stack(self.attention_masks)
        self.labels = self.input_ids.clone()
        self.labels[self.attention_masks == 0] = -100
        print("Dataset preparation complete!")

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx],
            'labels': self.labels[idx]
        }

    def __len__(self):
        return len(self.input_ids)

def load_and_preprocess_data():
    print("Loading dataset...")
    dataset = load_dataset("karpathy/tiny_shakespeare")
    texts = dataset["train"]["text"]


    chunk_size = 1024
    text_chunks = []

    for text in texts:
        for i in range(0, len(text), chunk_size // 2):
            chunk = text[i:i + chunk_size]
            if len(chunk) >= chunk_size // 2:
                text_chunks.append(chunk)

    # shuffling and splitting into train and eval sets
    print("Shuffling and splitting data...")
    np.random.shuffle(text_chunks)
    split_point = int(len(text_chunks) * 0.9)
    train_texts = text_chunks[:split_point]
    eval_texts = text_chunks[split_point:]

    print("Loading tokenizer...")
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2", clean_up_tokenization_spaces=True)
    tokenizer.pad_token = tokenizer.eos_token

    print("Creating datasets...")
    train_dataset = ShakespeareDataset(train_texts, tokenizer)
    eval_dataset = ShakespeareDataset(eval_texts, tokenizer)

    print(f"Training dataset size: {len(train_dataset)}")
    print(f"Evaluation dataset size: {len(eval_dataset)}")

    return train_dataset, eval_dataset, tokenizer

def setup_training(tokenizer):
    print("Setting up training configuration...")
    training_args = TrainingArguments(
        output_dir="./shakespeare-gpt2",
        num_train_epochs=5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        warmup_steps=200,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
        save_steps=100,
        eval_steps=100,
        evaluation_strategy="steps",
        save_strategy="steps",
        learning_rate=5e-5,
        fp16=True,
        gradient_accumulation_steps=2,
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        warmup_ratio=0.1,
        report_to=["tensorboard"]
    )

    print("Initializing model...")
    model = GPT2LMHeadModel.from_pretrained("gpt2")
    model.resize_token_embeddings(len(tokenizer))

    return model, training_args

def train_model(model, training_args, train_dataset, eval_dataset, tokenizer):
    print("Initializing trainer...")

    class MetricsCallback(TrainerCallback):
        def on_log(self, args, state, control, logs=None, **kwargs):
            if state.is_local_process_zero and 'loss' in logs:
                print(f"Step {state.global_step}: Training Loss = {logs['loss']:.4f}")
                if 'eval_loss' in logs:
                    print(f"Validation Loss = {logs['eval_loss']:.4f}")

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        callbacks=[MetricsCallback]
    )

    print("\nStarting training...")
    train_result = trainer.train()

    print("\nTraining completed! Final metrics:")
    print(f"Final training loss: {train_result.training_loss:.4f}")

    eval_results = trainer.evaluate()
    print(f"Final validation loss: {eval_results['eval_loss']:.4f}")

    print("\nSaving model...")
    trainer.save_model()
    print("Training complete!")
    return trainer

def generate_text(prompt, model, tokenizer, max_length=150):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    outputs = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=max_length,
        num_return_sequences=1,
        temperature=0.7,
        top_k=20,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=4,
        repetition_penalty=1.3,
        min_length=50,
        length_penalty=1.2
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

if __name__ == "__main__":
    train_dataset, eval_dataset, tokenizer = load_and_preprocess_data()
    model, training_args = setup_training(tokenizer)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"\nUsing device: {device}")
    model.to(device)

    trainer = train_model(model, training_args, train_dataset, eval_dataset, tokenizer)

    test_prompts = [
        "HAMLET: My kingdom for",
        "ROMEO: But soft, what light",
        "MACBETH: Tomorrow, and tomorrow",
        "LEAR: How sharper than a serpent's",
        "PROSPERO: Our revels now are"
    ]

    print("\nTesting text generation with different prompts:")
    for prompt in test_prompts:
        generated_text = generate_text(prompt, model, tokenizer)
        print(f"\nPrompt: {prompt}")
        print(f"Generated text:\n{generated_text}")
        print("-" * 50)


GPU Available: True
GPU Device: Tesla T4
Loading dataset...
Shuffling and splitting data...
Loading tokenizer...
Creating datasets...
Tokenizing texts...
Processed 0 texts
Processed 100 texts
Processed 200 texts
Processed 300 texts
Processed 400 texts
Processed 500 texts
Processed 600 texts
Processed 700 texts
Processed 800 texts
Processed 900 texts
Processed 1000 texts
Processed 1100 texts
Processed 1200 texts
Processed 1300 texts
Processed 1400 texts
Processed 1500 texts
Processed 1600 texts
Processed 1700 texts
Dataset preparation complete!
Tokenizing texts...
Processed 0 texts
Processed 100 texts
Dataset preparation complete!
Training dataset size: 1764
Evaluation dataset size: 196
Setting up training configuration...
Initializing model...




model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]


Using device: cuda
Initializing trainer...

Starting training...


Step,Training Loss,Validation Loss
100,7.5953,3.632193
200,7.0676,3.463009
300,6.8654,3.372488
400,6.9272,3.316606
500,6.4085,3.272451
600,6.2787,3.235863
700,6.2117,3.212636
800,6.2861,3.190828
900,6.2849,3.174132
1000,6.3033,3.166229


Step 10: Training Loss = 9.0362
Step 20: Training Loss = 8.8501
Step 30: Training Loss = 8.5744
Step 40: Training Loss = 8.2354
Step 50: Training Loss = 7.9009
Step 60: Training Loss = 7.9928
Step 70: Training Loss = 7.7849
Step 80: Training Loss = 7.7031
Step 90: Training Loss = 7.6566
Step 100: Training Loss = 7.5953
Step 110: Training Loss = 7.5094
Step 120: Training Loss = 7.3802
Step 130: Training Loss = 7.4130
Step 140: Training Loss = 7.3153
Step 150: Training Loss = 7.2643
Step 160: Training Loss = 7.2835
Step 170: Training Loss = 7.1902
Step 180: Training Loss = 7.1501
Step 190: Training Loss = 7.2861
Step 200: Training Loss = 7.0676
Step 210: Training Loss = 7.2788
Step 220: Training Loss = 7.0992
Step 230: Training Loss = 6.6301
Step 240: Training Loss = 6.9677
Step 250: Training Loss = 7.0075
Step 260: Training Loss = 6.8809
Step 270: Training Loss = 6.9141
Step 280: Training Loss = 6.8572
Step 290: Training Loss = 6.8773
Step 300: Training Loss = 6.8654
Step 310: Training 

There were missing keys in the checkpoint model loaded: ['lm_head.weight'].



Training completed! Final metrics:
Final training loss: 6.6992


Final validation loss: 3.1614

Saving model...
Training complete!

Testing text generation with different prompts:





Prompt: HAMLET: My kingdom for
Generated text:
HAMLET: My kingdom for ever
Is in the hands of this traitor. O, look! my lord's face is pale; his eyes are heavy and he speaks so much ill--but I have seen him not speak well yet!--I am gone to-morrow with thee a thousand times more bitter than death? Nay if thou wert but dead still thy tears should dry them up like snow on an ice block'd shoreless ground.--Thou art slain by me now that hast done it all these years' work thus far alone against mine enemies:' let no one else know what hath been committed hereto save their lives from those who were most near at hand or had heard too late how great danger was upon us which must be avoided till
--------------------------------------------------

Prompt: ROMEO: But soft, what light
Generated text:
ROMEO: But soft, what light is it to your eyes?
I have seen the sun rise and fall in a thousand colours.
And yet I see no moon but that which lies between them.
But how long before this hour shall we