In [2]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments


In [3]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [4]:
import os

# Define the path to the root directory containing all the TXT files
root_path = '/kaggle/input/poemsdataset/forms'

# Get a list of all the TXT files in the root directory and its subdirectories
file_list = []
for dirpath, dirnames, filenames in os.walk(root_path):
    for filename in filenames:
        if filename.endswith('.txt'):
            file_list.append(os.path.join(dirpath, filename))

# Load the tokenizer
tokenizer = tokenizer

# Concatenate all the text data into a single file
output_file = '/kaggle/working/poemd.txt'
with open(output_file, 'w') as f:
    for file_path in file_list:
        with open(file_path, 'r') as f_in:
            f.write(f_in.read())

# Create a TextDataset object from the concatenated text data file
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=output_file,
    block_size=356
)




In [5]:
import os

# Define the path to the directory containing all the TXT files
dir_path = '/kaggle/input/poemsdataset/forms/carol'

# Get a list of all the TXT files in the directory
file_list = [os.path.join(dir_path, f) for f in os.listdir(dir_path) if f.endswith('.txt')]

# Load the tokenizer
tokenizer = tokenizer

# Concatenate all the text data into a single file
output_file = '/kaggle/working/poemd.txt'
with open(output_file, 'w') as f:
    for file_path in file_list:
        with open(file_path, 'r') as f_in:
            f.write(f_in.read())

# Create a TextDataset object from the concatenated text data file
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=output_file,
    block_size=512
)


In [6]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, 
    mlm=False
)


In [7]:
training_args = TrainingArguments(
    output_dir='./results',           # output directory
    num_train_epochs=10,               # total number of training epochs
    per_device_train_batch_size=16,   # batch size per device during training
    per_device_eval_batch_size=32,    # batch size for evaluation
    warmup_steps=500,                 # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                # strength of weight decay
    logging_dir='./logs',             # directory for storing logs
    logging_steps=100,    # number of steps between logging updates
    learning_rate=5e-5
)


In [8]:
trainer = Trainer(
    model=model,                      
    args=training_args,                
    data_collator=data_collator,        
    train_dataset=dataset
)


In [9]:
trainer.train()


***** Running training *****
  Num examples = 6190
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3870
  Number of trainable parameters = 124439808
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
100,3.9634
200,3.8356
300,3.6984
400,3.6413
500,3.5933
600,3.6057
700,3.5429
800,3.5168
900,3.4647
1000,3.4716


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Configuration saved in ./results/checkpoint-500/generation_config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Configuration saved in ./results/checkpoint-1000/generation_config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Configuration saved in ./results/checkpoint-1500/generation_config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-2000
Configuration saved in ./results/checkpoint-2000/config.json
Configuration saved in ./results/checkpoint-2000/generation_config.json
Model weights saved in ./results/checkpoint-2000/pytorch_mo

TrainOutput(global_step=3870, training_loss=3.3423165925097402, metrics={'train_runtime': 3327.8567, 'train_samples_per_second': 18.601, 'train_steps_per_second': 1.163, 'total_flos': 1.12459682304e+16, 'train_loss': 3.3423165925097402, 'epoch': 10.0})

In [12]:
model.save_pretrained("./gpt2_256_full/")

Configuration saved in ./gpt2_256_full/config.json
Configuration saved in ./gpt2_256_full/generation_config.json
Model weights saved in ./gpt2_256_full/pytorch_model.bin


In [None]:
# trainer.evaluate()

In [29]:
from transformers import pipeline, GPT2Tokenizer, AutoModelWithLMHead

# Load the tokenizer and fine-tuned model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = AutoModelWithLMHead.from_pretrained('/kaggle/working/gpt2_256_full')

# Set up the pipeline for generating text
poetry_generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

# Generate a poem based on a prompt
prompt = "Criminal, you took a great piece of my life"
generated_poem = poetry_generator(prompt, max_length=512)

# Print the generated poem


loading file vocab.json from cache at /root/.cache/huggingface/hub/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/vocab.json
loading file merges.txt from cache at /root/.cache/huggingface/hub/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/merges.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12

In [30]:
# generated_poem

In [31]:
print(generated_poem[0]['generated_text'])


Criminal, you took a great piece of my life -
And in it you must repay me.
You came in a robe of scarlet black,
And at a time of great concern
You had my advice, and was ready to answer,
For I said no to it;
If it be so, then no man will give you any rest,
No one will answer me again, if it be so.
For you, a fool, who had no sense -
I cannot tell - but you left behind me a
Sacrament of pain, and a debt unpaid.
My Lord, I shall not be sold again -
Nor may my dear Lord receive it from me,
For I am the son of a prostitute,
And that he was no longer my Lord was written
in the book of repentance.
Your pardon for me must not only be given,
Your life is mine, as if you loved me -
You gave a sacrifice for my sake;
My sins were your own.
Oh, it is not so, for he loved and was faithful -
I, an innocent virgin,
Had been his bride till you began.
He loved me, I was his only love
For I love you more than his mother did
For he was a faithful wife, and that he loved me
And that he was not deceived by

# INFERENCE V2

In [15]:
import torch
from transformers import GPT2Tokenizer, AutoModelWithLMHead

# Load the tokenizer and fine-tuned model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = AutoModelWithLMHead.from_pretrained('/kaggle/working/gpt2_256_full')

# Set the device for inference
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define a function for generating poems
def generate_poem(prompt, max_length=256, temperature=1.0):
    # Tokenize the prompt and convert to tensor
    prompt_tokens = tokenizer.encode(prompt, add_special_tokens=False)
    prompt_tensor = torch.tensor([prompt_tokens]).to(device)

    # Generate text using the fine-tuned model
    generated_tokens = model.generate(
        input_ids=prompt_tensor,
        max_length=max_length+len(prompt_tokens),
        temperature=temperature,
        do_sample=True,
        num_beams=5,
        top_k=50,
        top_p=0.95,
        repetition_penalty=1.5,
        no_repeat_ngram_size=2,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode the generated tokens and return the resulting poem
    poem = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
    return poem


loading file vocab.json from cache at /root/.cache/huggingface/hub/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/vocab.json
loading file merges.txt from cache at /root/.cache/huggingface/hub/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/merges.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12

In [18]:
prompt = "There once was a ship that put to sea"
generated_poem = generate_poem(prompt, max_length=256, temperature=0.7)
print(generated_poem)


Generate config GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "transformers_version": "4.26.1"
}



RuntimeError: probability tensor contains either `inf`, `nan` or element < 0

In [17]:
!zip -r gpt2_256 '/kaggle/working/gpt2_256_full'

  adding: kaggle/working/gpt2_256_full/ (stored 0%)
  adding: kaggle/working/gpt2_256_full/generation_config.json (deflated 24%)
  adding: kaggle/working/gpt2_256_full/pytorch_model.bin (deflated 9%)
  adding: kaggle/working/gpt2_256_full/config.json (deflated 51%)
