In [None]:
!pip install transformers[torch]
!pip install pytorch-lightning
!pip install summa

Collecting transformers[torch]
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 7.5 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 46.8 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 4.3 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 16.4 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 42.8 MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel fo

In [None]:
import re

import numpy as np
import pandas as pd
import torch
import torch.optim as optim
from summa import keywords as kd
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelWithLMHead, AutoModelForCausalLM

In [None]:
# Prefer the first CUDA GPU when one is visible; otherwise fall back to the CPU.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda', index=0)

In [None]:
# Load the poem corpus and keep only its first 5000 rows.
df = pd.read_csv('./PoetryFoundationData.csv').head(5000)

In [None]:
# Collapse every run of whitespace (including newlines) in each title to one space.
title_df = df['Title']
clean_titles = [' '.join(raw_title.split()) for raw_title in title_df]
# Independent copy of the cleaned titles; this is the list handed to the dataset later on.
poem_list = list(clean_titles)

In [None]:
def keyword_builder(keywords):
  """Render the leading keywords as a ``"Context: ..."`` prompt prefix.

  Args:
    keywords: Sequence of entries whose first element is the keyword text
      (e.g. ``(keyword, score)`` pairs). Only the first ten entries are used.

  Returns:
    ``"Context: "`` followed by each keyword plus ``", "``, and then one more
    space. An empty input therefore yields ``"Context:  "``.
  """
  # Every keyword keeps its trailing ", " separator, including the last one.
  joined = "".join(entry[0] + ", " for entry in keywords[:10])
  return "Context: " + joined + " "

In [None]:
poem_df = df['Poem']

# For every poem: normalise whitespace, extract scored keywords with summa, and
# render them as the "Context: ..." prefix that precedes the title in training.
content_list = [
  keyword_builder(kd.keywords(' '.join(poem_text.split()), scores=True))
  for poem_text in poem_df
]

In [None]:
# GPT-2 is a causal (left-to-right) language model, so it is loaded through the
# dedicated causal-LM auto class instead of the deprecated AutoModelWithLMHead.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]



Downloading:   0%|          | 0.00/523M [00:00<?, ?B/s]

In [None]:
# Put the weights on the selected device first, then build the optimizer over them.
model = model.to(device)
optimizer = optim.AdamW(params=model.parameters(), lr=3e-4)

In [None]:
# Display the token ids the tokenizer produces for the "poem: " task prefix.
tokenizer.encode("poem: ")


[7501, 368, 25, 220]

In [None]:
# Number of tokens the "poem: " task prefix encodes to.
extra_length = len(tokenizer.encode("poem: ")) 

In [None]:
class PoemDataset(Dataset):
    """Fixed-length token sequences of the form ``<keywords><init_token><title><eos>``.

    Every example is encoded with ``tokenizer.encode`` and then padded (with the
    EOS id) or truncated so that all stored tensors have exactly
    ``max_len + len(tokenizer.encode(init_token))`` tokens, which lets the
    default DataLoader collation stack them into a batch.

    Args:
        tokenizer: Object exposing ``encode(text)``, ``eos_token`` and ``eos_token_id``.
        init_token: Task marker placed between the keywords and the title.
        poem_titles: Indexable collection of titles (converted with ``str``).
        poem_keywords: Indexable collection of keyword prefixes, aligned with
            ``poem_titles`` by position.
        max_len: Token budget on top of the ``init_token`` tokens.
    """

    def __init__(self, tokenizer, init_token, poem_titles, poem_keywords, max_len):
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.eos = self.tokenizer.eos_token
        self.eos_id = self.tokenizer.eos_token_id
        self.poems = poem_titles
        # Length of the task marker in tokens. Computed here from the actual
        # init_token rather than read from a notebook-level global, so the
        # dataset stays correct for any marker.
        self.extra_length = len(self.tokenizer.encode(init_token))
        self.result = []

        for index in range(len(self.poems)):
            # Encode keywords + marker + title, with EOS closing the title.
            text = poem_keywords[index] + init_token + str(self.poems[index]) + self.eos
            tokenized = self.tokenizer.encode(text)

            # Bring the sequence to the common length and store it as a tensor.
            self.result.append(torch.tensor(self.pad_truncate(tokenized)))

    def __len__(self):
        """Return the number of encoded examples."""
        return len(self.result)

    def __getitem__(self, item):
        """Return the token tensor stored at position ``item``."""
        return self.result[item]

    def pad_truncate(self, name):
        """Pad with EOS ids or truncate ``name`` to the dataset's common length.

        Args:
            name: List of token ids for one example.

        Returns:
            A list of exactly ``max_len + extra_length`` token ids. Shorter
            inputs are right-padded with the EOS id; longer inputs are cut and
            closed with a single EOS id.
        """
        target_len = self.max_len + self.extra_length
        if len(name) < target_len:
            return name + [self.eos_id] * (target_len - len(name))
        if len(name) > target_len:
            # Keep room for the closing EOS so the result is exactly target_len long.
            return name[:max(target_len - 1, 0)] + [self.eos_id]
        return name

In [None]:
# Build the training set: keyword prefix + "poem: " marker + title, with a
# token budget (max_len) of 30.
dataset = PoemDataset(
    tokenizer=tokenizer,
    init_token="poem: ",
    poem_titles=poem_list,
    poem_keywords=content_list,
    max_len=30,
)

In [None]:
# Shuffled batches of 32 examples; a final incomplete batch is dropped.
dataloader = DataLoader(
    dataset=dataset,
    batch_size=32,
    shuffle=True,
    drop_last=True,
)

In [None]:
def train(model, optimizer, dl, epochs):
    """Fine-tune ``model`` as a language model on the batches produced by ``dl``.

    Each batch is used both as input and as labels (``model(batch, labels=batch)``),
    and the first element of the model output is taken as the loss. The loss is
    printed every 10 batches.

    Args:
        model: ``torch.nn.Module`` called as ``model(batch, labels=batch)`` whose
            output's first element is a scalar loss tensor.
        optimizer: Optimizer built over ``model``'s parameters.
        dl: Iterable of token-id tensors (e.g. a DataLoader).
        epochs: Number of full passes over ``dl``.

    Returns:
        None. ``model`` is updated in place and is left in the same
        train/eval mode it had on entry.
    """
    # Send batches to wherever the model's weights live instead of relying on
    # a notebook-level `device` variable.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    # Models loaded with `from_pretrained` start in eval mode (dropout off), so
    # training mode is enabled explicitly and the previous mode restored after.
    was_training = model.training
    model.train()
    try:
        with torch.enable_grad():
            for epoch in range(epochs):
                for idx, batch in enumerate(dl):
                    optimizer.zero_grad()
                    if target_device is not None:
                        batch = batch.to(target_device)
                    output = model(batch, labels=batch)
                    loss = output[0]
                    loss.backward()
                    optimizer.step()
                    if idx % 10 == 0:
                        print("loss: %f, %d" % (loss.item(), idx))
    finally:
        model.train(was_training)

In [None]:
# Fine-tune for three passes over the dataloader; the loss is printed every 10 batches.
train(model=model, optimizer=optimizer, dl=dataloader, epochs=3)

loss: 7.958041, 0
loss: 3.203794, 10
loss: 3.056694, 20
loss: 2.796635, 30
loss: 3.377231, 40
loss: 3.158198, 50
loss: 2.879732, 60
loss: 3.260788, 70
loss: 2.698149, 80
loss: 3.036521, 90
loss: 3.268284, 100
loss: 3.048631, 110
loss: 3.160037, 120
loss: 3.364476, 130
loss: 2.527377, 140
loss: 2.865879, 150
loss: 2.485632, 0
loss: 2.535643, 10
loss: 2.456827, 20
loss: 2.491779, 30
loss: 2.568291, 40
loss: 2.945042, 50
loss: 2.504732, 60
loss: 2.698580, 70
loss: 2.787321, 80
loss: 2.332636, 90
loss: 2.784451, 100
loss: 2.391797, 110
loss: 2.745463, 120
loss: 2.466953, 130
loss: 2.356982, 140
loss: 2.358159, 150
loss: 2.201990, 0
loss: 1.944437, 10
loss: 2.354829, 20
loss: 2.161008, 30
loss: 2.168436, 40
loss: 2.133794, 50
loss: 1.971687, 60
loss: 1.762071, 70
loss: 1.825142, 80
loss: 2.262035, 90
loss: 2.310711, 100
loss: 2.493098, 110
loss: 2.443344, 120
loss: 2.255140, 130
loss: 2.279916, 140
loss: 2.141505, 150


In [None]:
def topk(probs, n=9):
    """Sample one token id from the ``n`` highest-scoring entries of ``probs``.

    Args:
        probs: 1-D tensor of raw scores; softmax is applied here.
        n: Size of the candidate pool (the ``n`` most probable entries).

    Returns:
        The chosen index into ``probs`` as a Python ``int``. The draw is made
        with ``np.random.choice`` using the pool's renormalised probabilities.
    """
    distribution = torch.softmax(probs, dim=-1)

    # Keep only the n best candidates together with their positions.
    pool_probs, pool_ids = torch.topk(distribution, k=n)

    # Renormalise so the pool sums to one, then hand it to numpy for sampling.
    weights = (pool_probs / pool_probs.sum()).detach().cpu().numpy()
    picked = np.random.choice(n, 1, p=weights)

    return int(pool_ids[picked][0])

In [None]:
def model_infer(model, tokenizer, init_token, max_length=30):
    """Sample a continuation of ``init_token`` from ``model``, one token at a time.

    At every step the whole sequence so far is fed to the model, the logits of
    the last position are passed to ``topk`` to pick the next token, and
    generation stops as soon as the EOS token is chosen.

    Args:
        model: ``torch.nn.Module`` called as ``model(input_ids)`` whose output
            exposes ``.logits``.
        tokenizer: Object exposing ``encode``, ``decode`` and ``eos_token_id``.
        init_token: Prompt text the generation starts from.
        max_length: Maximum number of tokens to sample.

    Returns:
        The decoded prompt followed by the sampled tokens; the EOS token is
        never included.
    """
    # Work on a private list so sampled ids are only ever appended to our copy.
    result = list(tokenizer.encode(init_token))

    # Run on whatever device the model's weights live on instead of relying on
    # a notebook-level `device` variable.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    # Sample with dropout disabled, then give the model back in its prior mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # Every sampled token, including the very first one, goes through
            # the same EOS check, and at most max_length tokens are produced.
            for _ in range(max_length):
                input_ids = torch.tensor(result).unsqueeze(0).to(target_device)
                logits = model(input_ids).logits[0, -1]
                res_id = topk(logits)
                if res_id == tokenizer.eos_token_id:
                    break
                result.append(res_id)
    finally:
        model.train(was_training)

    return tokenizer.decode(result)

In [None]:
from collections import Counter
import random
results = set()

# Poem excerpt whose keywords seed the title generation.
input_poem = "    This is the dead land\
    This is cactus land\
    Here the stone images\
    Are raised, here they receive\
    The supplication of a dead man\'s hand\
    Under the twinkle of a fading star.\
    Is it like this\
    In death\'s other kingdom\
    Waking alone\
    At the hour when we are\
    Trembling with tenderness\
    Lips that would kiss\
    Form prayers to broken stone."

# Collapse all whitespace runs to single spaces before extracting keywords.
input_poem = ' '.join(input_poem.split())
seedwords = kd.keywords(input_poem, scores=True)
# keyword_builder already emits the "Context: " prefix and the trailing spacing
# used for the training examples, so only the task marker is appended here.
# The marker is written without its trailing space so that the prompt is an
# exact prefix of the training text "<keywords>poem: <title>".
seedwords = keyword_builder(seedwords) + "poem:"
print(seedwords)

Context: Context: dead land, cactus,   poem:


In [None]:
# Generate up to 10 distinct titles that do not already exist in the corpus.
known_titles = set(poem_list)  # set membership instead of scanning the list each time
results = set()

# Upper bound on sampling rounds so the cell always terminates, even if the
# model keeps producing known or repeated titles.
max_attempts = 200

for _ in range(max_attempts):
    if len(results) >= 10:
        break
    name = model_infer(model, tokenizer, seedwords).replace(seedwords, "").strip()
    # Skip empty generations, corpus titles, and titles already collected, so
    # each accepted title is printed exactly once.
    if not name or name in known_titles or name in results:
        continue
    results.add(name)
    print(name)

A Dead Land
The End Game
The Last Days of Summer
The Cactus Tree
Cactus, No, But Also
The Book of the Dead Land
The Death Land
The End Game
The Death Land
Cactus Poem
The Death Land
A Disused Graveyard
What the Cactus Sings
