In [1]:
import re
import json
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
from transformers import Trainer, TrainingArguments

In [4]:
pf_df = pd.read_csv('datasets/kaggle_poem_dataset.csv')
gt_df = pd.read_csv('datasets/Gutenberg-Poetry.csv')

In [5]:
pf_df.head()

Unnamed: 0.1,Unnamed: 0,Author,Title,Poetry Foundation ID,Content
0,0,Wendy Videlock,!,55489,"Dear Writers, I’m compiling the first in what ..."
1,1,Hailey Leithauser,0,41729,"Philosophic\nin its complex, ovoid emptiness,\..."
2,2,Jody Gladding,1-800-FEAR,57135,We'd like to talk with you about fear t...
3,3,Joseph Brodsky,1 January 1965,56736,The Wise Men will unlearn your name.\nAbove yo...
4,4,Ted Berrigan,3 Pages,51624,For Jack Collom\n10 Things I do Every Day\n\np...


In [6]:
gt_df.head()

Unnamed: 0.1,Unnamed: 0,s,gid
0,0,The Song of Hiawatha is based on the legends a...,19
1,1,"many North American Indian tribes, but especia...",19
2,2,"Ojibway Indians of northern Michigan, Wisconsi...",19
3,3,"They were collected by Henry Rowe Schoolcraft,...",19
4,4,"Schoolcraft married Jane, O-bah-bahm-wawa-ge-z...",19


In [7]:
gt_df = gt_df.groupby('gid')['s'].apply(' '.join).reset_index(drop=True)

In [8]:
pf_df.shape, gt_df.shape

((15652, 5), (1191,))

In [9]:
data = pd.concat([pf_df['Content'], gt_df], axis=0).reset_index(drop=True)

In [10]:
train, test = train_test_split(data, test_size=0.15)

In [11]:
f = open('train_poems.txt', 'w')
data = ''

for j in train:
    summary = str(j).strip()
    summary = re.sub(r'\s+', ' ', summary)
    data += summary + '\n'

f.write(data)

121515264

In [12]:
f = open('test_poems.txt', 'w')
data = ''

for j in test:
    summary = str(j).strip()
    summary = re.sub(r'\s+', ' ', summary)
    data += summary + '\n'

f.write(data)

22004108

In [13]:
datasets = load_dataset("text", data_files={"train": 'train_poems.txt', "validation": 'test_poems.txt'})

Using custom data configuration default-57a0dae4e69e56cb


Downloading and preparing dataset text/default to /home/ec2-user/.cache/huggingface/datasets/text/default-57a0dae4e69e56cb/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5...


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Dataset text downloaded and prepared to /home/ec2-user/.cache/huggingface/datasets/text/default-57a0dae4e69e56cb/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [14]:
model_checkpoint = "distilgpt2"

In [15]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

Downloading:   0%|          | 0.00/762 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [16]:
def tokenize_function(examples):
    return tokenizer(examples["text"])

In [17]:
tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])

Token indices sequence length is longer than the specified maximum sequence length for this model (66502 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1599 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1059 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (9679 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (10490 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence le

In [18]:
block_size = 128

In [19]:
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [20]:
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

In [21]:
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/336M [00:00<?, ?B/s]

In [22]:
model_name = model_checkpoint.split("/")[-1]
training_args = TrainingArguments(
    evaluation_strategy = "epoch",
    output_dir="poems-distilgpt2",
    overwrite_output_dir=True,
    logging_steps=10000,
    save_steps=10000,
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=2
)

In [23]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
)

In [24]:
trainer.train()

***** Running training *****
  Num examples = 240736
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 60184


Epoch,Training Loss,Validation Loss
1,4.6942,4.6006
2,4.6064,4.562054


Saving model checkpoint to poems-distilgpt2/checkpoint-10000
Configuration saved in poems-distilgpt2/checkpoint-10000/config.json
Model weights saved in poems-distilgpt2/checkpoint-10000/pytorch_model.bin
Saving model checkpoint to poems-distilgpt2/checkpoint-20000
Configuration saved in poems-distilgpt2/checkpoint-20000/config.json
Model weights saved in poems-distilgpt2/checkpoint-20000/pytorch_model.bin
Saving model checkpoint to poems-distilgpt2/checkpoint-30000
Configuration saved in poems-distilgpt2/checkpoint-30000/config.json
Model weights saved in poems-distilgpt2/checkpoint-30000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 43542
  Batch size = 8
Saving model checkpoint to poems-distilgpt2/checkpoint-40000
Configuration saved in poems-distilgpt2/checkpoint-40000/config.json
Model weights saved in poems-distilgpt2/checkpoint-40000/pytorch_model.bin
Saving model checkpoint to poems-distilgpt2/checkpoint-50000
Configuration saved in poems-distilgpt2/checkpoi

TrainOutput(global_step=60184, training_loss=4.70659551756125, metrics={'train_runtime': 24145.8338, 'train_samples_per_second': 19.94, 'train_steps_per_second': 2.493, 'total_flos': 1.5725883637628928e+16, 'train_loss': 4.70659551756125, 'epoch': 2.0})

In [26]:
trainer.save_model()

Saving model checkpoint to poems-distilgpt2
Configuration saved in poems-distilgpt2/config.json
Model weights saved in poems-distilgpt2/pytorch_model.bin


In [28]:
import math
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

***** Running Evaluation *****
  Num examples = 43542
  Batch size = 8


Perplexity: 95.78


In [29]:
from transformers import pipeline

chef = pipeline('text-generation', model='poems-distilgpt2', tokenizer=model_checkpoint)

loading configuration file poems-distilgpt2/config.json
Model config GPT2Config {
  "_name_or_path": "distilgpt2",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "f

In [31]:
orig = pipeline('text-generation', model=model_checkpoint, tokenizer=model_checkpoint)

loading configuration file https://huggingface.co/distilgpt2/resolve/main/config.json from cache at /home/ec2-user/.cache/huggingface/transformers/f985248d2791fcff97732e4ee263617adec1edb5429a2b8421734c6d14e39bee.422318838d1ec4e061efb4ea29671cb2a044e244dc69229682bebd7cacc81631
Model config GPT2Config {
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 6,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  

In [33]:
orig('How do we know')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'How do we know with these other options? Can you make a single example of a better solution that makes use of an existing database structure that was not created in the original file or has new features, and can be done using existing, more reliable databases'}]

In [30]:
chef('How do we know')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "How do we know but where? Where did I go? It seemed good enough that we could walk away— It wasn't even better— We'd run, but we could't. And even so, I was tired of walking away— I never"}]

In [38]:
orig('This scene')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'This scene during the match will be shown as per the script.\n\n\nIn the next scene the AI changes to the first thing to show. The AI does not seem to move up or down the side, even the left, and does not'}]

In [39]:
chef('This scene')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "This scene, all the country has, Yet only these to please the most: The country is the home of the most, The happy child of the country. My love's fair maiden, I adore To you, she loved for thee long and hard"}]

In [41]:
orig('What is Love?')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'What is Love?\u200d\u200d\u200d is a term coined by psychologist Patrick Rothbard in the late 19th century (in honor of the great philosopher) to describe how his ideas are to an average person:\n\n\n"Love is'}]

In [40]:
chef('What is Love?')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'What is Love?” His soul, like his blood, in every vein Will drain, as he will. The world does not know, He will never stay, no more; And, who knows, he will soon die. He spoke to'}]