# Prepare data for the finetuning

In [2]:
import pandas as pd
import numpy as np
import re

In [3]:
df = pd.read_parquet("hf://datasets/shahules786/PoetryFoundationData/data/train-00000-of-00001-486832872ed96d17.parquet")

print(f"dataframe columns: {df.columns}")

newyork = df[df['author'].isin(["John Ashbery", "Barbara Guest", "James Schuyler", "Kenneth Koch", "Frank O'Hara"])]
shake = df[df['author'] == 'William Shakespeare']

print(f"Shakespeare: {len(shake)} examples\nNew Yorkers: {len(newyork)} examples")
print(f"Shakespeare avg length: {np.average([len(poem) for poem in shake['content']])}\nNew Yorkers avg length: {np.average([len(poem) for poem in newyork['content']])}")

  from .autonotebook import tqdm as notebook_tqdm


dataframe columns: Index(['poem name', 'content', 'author', 'type', 'age'], dtype='object')
Shakespeare: 85 examples
New Yorkers: 81 examples
Shakespeare avg length: 1468.5058823529412
New Yorkers avg length: 1810.6049382716049


In [4]:
def process_poem(poem) :
  proc = re.sub(r'[\r\n]+', ' ', poem)
  proc = re.sub(r'\s+', ' ', proc)
  sentences = re.split(r'(?<=[.!?])\s+', proc)
  sentences = [sentence for sentence in sentences if len(sentence) > 0]
  return sentences
process_poem("this is a sentence. This is -another :SENTENCE!!!!!\nAND this is a question? again.")

['this is a sentence.',
 'This is -another :SENTENCE!!!!!',
 'AND this is a question?',
 'again.']

In [5]:
newyork_processed = [] 
for i in range(len(newyork)) :
   newyork_processed += process_poem(newyork['content'].iloc[i])
newyork_labels = [0 for i in range(len(newyork_processed))]

shake_processed = [] 
for i in range(len(shake)) :
   shake_processed += process_poem(shake['content'].iloc[i])
shake_labels = [1 for i in range(len(shake_processed))]

processed_poems = newyork_processed + shake_processed
labels = newyork_labels + shake_labels

##

print(f"Number of New Yorker sentences: {len(newyork_processed)} with avg length of {np.mean([len(sentence) for sentence in newyork_processed])} characters")
print(f"eg:")
for i in range(10) :
   print(f"   {newyork_processed[i]}")
print(f"\nNumber of Shakespearean sentences: {len(shake_processed)} with avg length of {np.mean([len(sentence) for sentence in shake_processed])} characters")
print(f"eg:")
for i in range(10) :
   print(f"   {shake_processed[i]}")

Number of New Yorker sentences: 1213 with avg length of 110.57378400659522 characters
eg:
    Is anything central?
   Orchards flung out on the land, Urban forests, rustic plantations, knee-high hills?
   Are place names central?
   Elm Grove, Adcock Corner, Story Book Farm?
   As they concur with a rush at eye level Beating themselves into eyes which have had enough Thank you, no more thank you.
   And they come on like scenery mingled with darkness The damp plains, overgrown suburbs, Places of known civic pride, of civil obscurity.
   These are connected to my version of America But the juice is elsewhere.
   This morning as I walked out of your room After breakfast crosshatched with Backward and forward glances, backward into light, Forward into unfamiliar light, Was it our doing, and was it The material, the lumber of life, or of lives We were measuring, counting?
   A mood soon to be forgotten In crossed girders of light, cool downtown shadow In this morning that has seized us aga

In [6]:
sentence_lengths = [len(poem) for poem in processed_poems]
max_length = max(sentence_lengths)
avg_length = np.mean(sentence_lengths)
print(f"Max Length = {max_length}\nAvg Length = {avg_length}")

Max Length = 2707
Avg Length = 132.8826695371367


In [7]:
perm = np.random.permutation(len(processed_poems))
shuffled_poems = np.array(processed_poems)[perm]
shuffled_labels = np.array(labels)[perm]

training_data = shuffled_poems[:-100]
training_labels = shuffled_labels[:-100]

validation_data = shuffled_poems[-100:]
validation_labels = shuffled_labels[-100:]

In [8]:
from datasets import Dataset

train_ds = Dataset.from_dict({'text': training_data, 'label': training_labels})
validation_ds = Dataset.from_dict({'text': validation_data, 'label': validation_labels})

# load discriminator model and evaluate

In [11]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout, Flatten
from transformers import AutoTokenizer, TFAutoModelForCausalLM, TFAutoModelForSequenceClassification, TFT5ForConditionalGeneration, pipeline, set_seed




In [12]:
# load discriminator model
model_disc = TFAutoModelForSequenceClassification.from_pretrained('gpt2_discriminator')
model_disc.config.pad_token_id = model_disc.config.eos_token_id
tokenizer_disc = AutoTokenizer.from_pretrained('gpt2_discriminator_tokenizer')
tokenizer_disc.pad_token = tokenizer_disc.eos_token




All PyTorch model weights were used when initializing TFGPT2ForSequenceClassification.

All the weights of TFGPT2ForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2ForSequenceClassification for predictions without further training.


In [13]:
# custom classification pipeline
def pipe_disc(prompts, from_logits=False) :
    inputs = tokenizer_disc(prompts, return_tensors='tf', padding=True, truncation=True)
    logits = model_disc(**inputs).logits
    if from_logits :
        predicted_class_id = tf.math.reduce_max(logits, axis=-1)
    else :
        predicted_class_id = tf.math.argmax(logits, axis=-1) # 1 is shake, 0 is ny
    return predicted_class_id

In [16]:
inputs = validation_data.tolist()
model_pred = pipe_disc(inputs)

In [None]:
correct = 0
total = 0
for i in range(len(validation_labels)) :
    total += 1
    correct += 1 if validation_labels[i] == model_pred[i] else 0
print(f"{correct} correct out of {total} total")

100 correct out of 100 total
