# Prepare data for the finetuning

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
df = pd.read_parquet("hf://datasets/shahules786/PoetryFoundationData/data/train-00000-of-00001-486832872ed96d17.parquet")

print(f"dataframe columns: {df.columns}")

newyork = df[df['author'].isin(["John Ashbery", "Barbara Guest", "James Schuyler", "Kenneth Koch", "Frank O'Hara"])]
shake = df[df['author'] == 'William Shakespeare']

print(f"Shakespeare: {len(shake)} examples\nNew Yorkers: {len(newyork)} examples")
print(f"Shakespeare avg length: {np.average([len(poem) for poem in shake['content']])}\nNew Yorkers avg length: {np.average([len(poem) for poem in newyork['content']])}")

  from .autonotebook import tqdm as notebook_tqdm


dataframe columns: Index(['poem name', 'content', 'author', 'type', 'age'], dtype='object')
Shakespeare: 85 examples
New Yorkers: 81 examples
Shakespeare avg length: 1468.5058823529412
New Yorkers avg length: 1810.6049382716049


In [3]:
def process_poem(poem) :
  proc = re.sub(r'[\r\n]+', ' ', poem)
  proc = re.sub(r'\s+', ' ', proc)
  sentences = re.split(r'(?<=[.!?])\s+', proc)
  sentences = [sentence for sentence in sentences if len(sentence) > 0]
  return sentences
process_poem("this is a sentence. This is -another :SENTENCE!!!!!\nAND this is a question? again.")

['this is a sentence.',
 'This is -another :SENTENCE!!!!!',
 'AND this is a question?',
 'again.']

In [4]:
newyork_processed = [] 
for i in range(len(newyork)) :
   newyork_processed += process_poem(newyork['content'].iloc[i])
newyork_labels = [0 for i in range(len(newyork_processed))]

shake_processed = [] 
for i in range(len(shake)) :
   shake_processed += process_poem(shake['content'].iloc[i])
shake_labels = [1 for i in range(len(shake_processed))]

processed_poems = newyork_processed + shake_processed
labels = newyork_labels + shake_labels

##

print(f"Number of New Yorker sentences: {len(newyork_processed)} with avg length of {np.mean([len(sentence) for sentence in newyork_processed])} characters")
print(f"eg:")
for i in range(10) :
   print(f"   {newyork_processed[i]}")
print(f"\nNumber of Shakespearean sentences: {len(shake_processed)} with avg length of {np.mean([len(sentence) for sentence in shake_processed])} characters")
print(f"eg:")
for i in range(10) :
   print(f"   {shake_processed[i]}")

Number of New Yorker sentences: 1213 with avg length of 110.57378400659522 characters
eg:
    Is anything central?
   Orchards flung out on the land, Urban forests, rustic plantations, knee-high hills?
   Are place names central?
   Elm Grove, Adcock Corner, Story Book Farm?
   As they concur with a rush at eye level Beating themselves into eyes which have had enough Thank you, no more thank you.
   And they come on like scenery mingled with darkness The damp plains, overgrown suburbs, Places of known civic pride, of civil obscurity.
   These are connected to my version of America But the juice is elsewhere.
   This morning as I walked out of your room After breakfast crosshatched with Backward and forward glances, backward into light, Forward into unfamiliar light, Was it our doing, and was it The material, the lumber of life, or of lives We were measuring, counting?
   A mood soon to be forgotten In crossed girders of light, cool downtown shadow In this morning that has seized us aga

In [5]:
sentence_lengths = [len(poem) for poem in processed_poems]
max_length = max(sentence_lengths)
avg_length = np.mean(sentence_lengths)
print(f"Max Length = {max_length}\nAvg Length = {avg_length}")

Max Length = 2707
Avg Length = 132.8826695371367


In [6]:
perm = np.random.permutation(len(processed_poems))
shuffled_poems = np.array(processed_poems)[perm]
shuffled_labels = np.array(labels)[perm]

training_data = shuffled_poems[:-100]
training_labels = shuffled_labels[:-100]

validation_data = shuffled_poems[-100:]
validation_labels = shuffled_labels[-100:]

In [7]:
from datasets import Dataset

train_ds = Dataset.from_dict({'text': training_data, 'label': training_labels})
validation_ds = Dataset.from_dict({'text': validation_data, 'label': validation_labels})

# Perform finetuning

In [8]:
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

model = GPT2ForSequenceClassification.from_pretrained(
    'gpt2',
    num_labels=2,          # Number of classes (authors)
)
# Resize the token embeddings because we've added a padding token
model.resize_token_embeddings(len(tokenizer))

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(50257, 768)

In [9]:
def tokenize_function(examples):
    return tokenizer(
        examples['text'],
        padding='max_length',    # Pad to max_length
        truncation=True,         # Truncate sequences longer than max_length
        max_length=512           # Set max_length as per GPT-2's capacity
    )

train_ds = train_ds.map(tokenize_function, batched=True)
validation_ds = validation_ds.map(tokenize_function, batched=True)

Map: 100%|██████████| 1758/1758 [00:01<00:00, 1510.06 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 1407.92 examples/s]


In [10]:
train_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
validation_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

In [11]:
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding

training_args = TrainingArguments(
    output_dir='./discriminator_model',
    overwrite_output_dir=True,
    num_train_epochs=3,
    save_steps=10_000,
    save_total_limit=2,
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)




In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=validation_ds,
    data_collator=data_collator,
)

In [18]:
model.config.pad_token_id = tokenizer.eos_token_id

In [None]:
trainer.train()

  0%|          | 0/660 [04:50<?, ?it/s]


In [None]:
eval_results = trainer.evaluate()

print(f"Evaluation results:\n{eval_results}")

In [None]:
trainer.save_model('gpt2_discriminator')
tokenizer.save_pretrained('gpt2_discriminator_tokenizer')