# Sampling with rythmic constrains

In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline

import torch
import pickle

import numpy as np
from scipy.special import softmax
import random
import re
import pandas as pd

from tqdm.notebook import tqdm

In [2]:
tokenizer = GPT2Tokenizer.from_pretrained('Anjoe/german-poetry-gpt2-large')
model = GPT2LMHeadModel.from_pretrained('Anjoe/german-poetry-gpt2-large') 
#model.eval()

## Algorithm

The top p / top k filtered logits are considered to be a candidate. The candidates are checked with a dictionary if they could lead to the targeted rythm. If this is the case, they are stored in a list. For each entry of a list in a next round the process is repeated, while now the sequence of both tokens are checked in the dictionary. If the none of the predicted sequences is in a list, the branch will be closed and deleted.  

In [47]:
def pred_next(text,
              target_dict,
              target_dict_s,
              vocabulary,
              new_words,
              next_stress,
              rythm_df,
              new_line,
              top_p = 0.4,
              max_top_k = 100,
              min_first_branches = 10,
              sample = True,
              rand = 4):
    
    inputs = tokenizer(text,return_tensors='pt')['input_ids']
    inputs_0 = inputs
    word_lst_0 = text.split()
    word_lst = word_lst_0
    text_out = text
    text_0 = text
    
    last_tokens = []
    first = True
    max_reset = 200
    reset = 0
    num_created_outs = 0
    
    hypothesis = []
    depth = 0
    first = True
    while True:
        word_lst = text_out.split()
        
        text = text_out
        
        outputs = model(inputs)
        num_created_outs += 1

        logits= outputs.logits[:,-1,:]

        next_token_sample = torch.argsort(-logits)
        
        ################################
        # top p Filtering
        
        m = torch.nn.Softmax(dim = 1)
        probabilites_sm_sorted = torch.sort(m(logits),descending=True)
        cum_prob = 0
        top_k = 0
        probabilities_sorted = probabilites_sm_sorted[0][0].cpu().detach().numpy()
        
        while cum_prob < top_p:
            cum_prob += probabilities_sorted[top_k]
            top_k += 1
            
        top_k += 1

        ##################################    

        candidates = next_token_sample[:,:top_k].cpu().detach().numpy()[0]

        scores = []

        if first and not new_line:                           # word preceeded by space

            for idx, cand in enumerate(list(candidates)):
                
                score = min(target_dict_s[cand],1)
                scores.append(score)    
            
            
        else:
            for idx, cand in enumerate(list(candidates)):

                score = min(target_dict[str(last_tokens+[cand])],1) # word consisting out of more tokens
                if cand in new_words:                               # each predicted new word is ok
                                                                    # since it only marks the end of the previous
                    score = 1

                scores.append(score)  
        
        first = False
        
        if len(scores) == 0 or (np.sum(scores) == 0 and depth > 0) or (np.sum(scores) < min_first_branches and depth == 0):  
            k = top_k                                           # if no candidate is found within top p extend top k
            
            while k < (max_top_k/(depth*2+1)):
                cand = next_token_sample[:,k].detach().numpy()[0]
                scores.append(min(target_dict[str(last_tokens+[cand])],1))
                k += 1
                if np.sum(scores) > 0 and depth > 0:
                    break
                    
                if np.sum(scores) >= min_first_branches and depth == 0:       # minimum initial branches
                    break
            top_k = k
                    
            candidates = next_token_sample[:,:top_k].cpu().detach().numpy()[0]
  
        cand_probs = probabilities_sorted[:top_k] 
        if sum(scores) > 0:

            scores = np.asarray(scores)
            scores_sort = np.sort(-scores)

            min_0 = np.max(np.nonzero(scores_sort))+1

            
            if sample:                  # sort the tokens according to their probabilities; 
                                        # some randomness is needed to prevent repetitions at line-reset
                                                                
                tokens_delete = np.argsort(-scores)[min_0:]
                cand_probs[tokens_delete] = -float('inf')
                cand_probs *= np.random.randn(len(cand_probs))/rand+1
                scores_arg = np.argsort(-cand_probs)[:min_0]
            else:
                scores_arg = np.argsort(-scores)[:min_0]

                random.shuffle(scores_arg)                # just shuffle random; no multinomial sampling

         
            candidates_fin = candidates[scores_arg]
     
            if depth == 0:
                if 199 in candidates_fin:
                    return 1, '\n'
                if candidates_fin[0] in [12,14]:
                    return 1, tokenizer.decode(candidates_fin[0])

            if depth > 0:
                candidates_no_start = [cand for cand in candidates_fin if cand not in new_words+[199,12,14]]
            else:
                candidates_no_start = candidates_fin



            if (len(candidates_fin) - len(candidates_no_start)) > 0:    #word could have an end

                last_word = tokenizer.decode(last_tokens).lower().strip()
                if last_word in vocabulary:
                    start_stress = rythm_df.loc[(rythm_df['word'] == last_word)]['start'].values[0] 

                    if start_stress == next_stress or start_stress == 0.5:

                        return num_created_outs, last_word
        else:
            candidates_no_start = []

        if len(candidates_no_start) == 0: # close branch

            for j in range(len(hypothesis)):
                hypothesis[-1] = hypothesis[-1][:-1]
                if len(hypothesis[-1]) < 1:
                    hypothesis = hypothesis[:-1]
                    depth -= 1
                else: break

            last_tokens = []
            for token in hypothesis:
                last_tokens.append(token[-1])
            if not last_tokens:
                return num_created_outs, False
            inputs = torch.cat((inputs_0, torch.tensor([last_tokens])),1)

        else:
            depth += 1
            hypothesis.append(candidates_no_start)  # each valid token is appended to the search tree
            last_tokens = []
            for token in hypothesis:
                last_tokens.append(token[-1])
            inputs = torch.cat((inputs_0, torch.tensor([last_tokens])),1)

            text_out = tokenizer.decode(inputs[0])

    
    return reset, False

In [5]:
with open(r'rythm_beam_search/notstressed', 'rb') as f:
    lst_0 = pickle.load(f)
    
with open(r'rythm_beam_search/stressed', 'rb') as f:
    lst_1 = pickle.load(f)
    
with open(r'rythm_beam_search/notstressed_start', 'rb') as f:
    lst_0_s = pickle.load(f)
    
with open(r'rythm_beam_search/stressed_start', 'rb') as f:
    lst_1_s = pickle.load(f)
    
rythm_df = pd.read_csv('rythm_beam_search/word_rythm.csv')

two_ltr_words =['ab', 'am', 'an', 'da', 'du', 'eh', 'er', 'es', 'im', 'in', 'ja', 'je', 'ob', 'so', 'um',
                   'wo', 'zu','ha','oh','ui']
rythm_df['reject'] = rythm_df['word'].apply(lambda x: len(x) < 3 and x not in two_ltr_words if
                                             type(x) == str else True)

rythm_df = rythm_df.drop(rythm_df[rythm_df.reject == True].index)
                                
new_words = list(lst_1_s.keys()) + list(lst_0_s.keys())

## Generating the poem

The poem gets generated word by word. If the Algorithm is not able to finish a line, it gets resetted. At the end it is possible to check how many token generations had been necessary in order to create the four lines

In [49]:
text = '''Nur durch das Morgentor des Schönen
Drangst du in der Erkenntnis Land.
An höhern Glanz sich zu gewöhnen,
Übt sich am Reize der Verstand.
'''

first_stress = 1

len_metrum = 2
len_verse = 10
vocabulary = list(rythm_df['word'])
new_line = True

num_created_outs = 0
resets_new_line = 0
lines = text.split('\n')

lines_0 = len(lines)

while len(lines) <= 8:  
    
    words = re.sub('[\W_]+', ' ', lines[-1]).split()
    num_syll = 0
    if words:
        for word in words:
            num_syll += rythm_df.loc[(rythm_df['word'] == word.lower())]['num_syll'].values[0]


    if num_syll % len_metrum == 0:
        next_stress = first_stress     
    else:
        next_stress = (1-first_stress)**2
                      
    if next_stress == 1: 
        target_dict = lst_1
        target_dict_s = lst_1_s
    
    else:
        target_dict = lst_0
        target_dict_s = lst_0_s
        
    target_dict_s[14] = 1     # .
    target_dict_s[12] = 1     # ,

    if num_syll > len_verse - 4:
        target_dict_s[199] = 1  # \n
        
    else:
        target_dict_s[199] = 0

    num_outs, next_word = pred_next(text,
                          target_dict,
                          target_dict_s,
                          vocabulary,
                          new_words,
                          next_stress, 
                          rythm_df,
                          new_line,
                          top_p = 0.25,            # top p value
                          max_top_k = 130,         # maximum to which the top k value will be extended
                                                   # when there are too few branches
                          min_first_branches = 10, # minimum of initial branches/beams
                          sample = True,           # tokens with higher probability will be prefered
                          rand = 5)                # randomize, larger value is less random

    num_created_outs += num_outs
    
    if next_word == '\n':
        new_line = True
    else:
        new_line = False
    
    if next_word: 
        text += ' ' + next_word
    else:
        text = '\n'.join(text.split('\n')[:-1]) +'\n'
        resets_new_line += 1
        new_line = True

    lines = text.split('\n')    
    if len(lines) > lines_0:
        lines_0 += 1
        print('result')
        print(text)
        print('number of generated outputs:')
        print(num_created_outs)
        print('number of resetted lines:')
        print(resets_new_line)

print('result')
print(text)
print('number of generated outputs:')
print(num_created_outs)
print('number of resetted lines:')
print(resets_new_line)

result
Nur durch das Morgentor des Schönen
Drangst du in der Erkenntnis Land.
An höhern Glanz sich zu gewöhnen,
Übt sich am Reize der Verstand.
 leb frohlockend darum eile 

number of generated outputs:
15
number of resetted lines:
0
result
Nur durch das Morgentor des Schönen
Drangst du in der Erkenntnis Land.
An höhern Glanz sich zu gewöhnen,
Übt sich am Reize der Verstand.
 leb frohlockend darum eile 
 wer geduld get raget tugendreichem 

number of generated outputs:
57
number of resetted lines:
1
result
Nur durch das Morgentor des Schönen
Drangst du in der Erkenntnis Land.
An höhern Glanz sich zu gewöhnen,
Übt sich am Reize der Verstand.
 leb frohlockend darum eile 
 wer geduld get raget tugendreichem 
 in vollendung seines hobbit . 

number of generated outputs:
72
number of resetted lines:
1
result
Nur durch das Morgentor des Schönen
Drangst du in der Erkenntnis Land.
An höhern Glanz sich zu gewöhnen,
Übt sich am Reize der Verstand.
 leb frohlockend darum eile 
 wer geduld get rag

## Algorithm used in the poem generator
Below the same experiment is done with the algorithm that is used in the poem generator. It randomly generates n sequences and checks if one of them fullfills the metrical constrains. Therefore a lot more token need to be generated. However the output is not constrained by metric dictionaries and the sampling can be done according to other criteria as well. Therefore the linguistic quality of the produced output is usually better.   

In [1]:
'''
for the sampling refer to https://huggingface.co/transformers/v3.1.0/_modules/transformers/generation_utils.html
if temperature != 1.0:
    scores = scores / temperature
# Top-p/top-k filtering
next_token_logscores = top_k_top_p_filtering(scores, top_k=top_k, top_p=top_p)
# Sample
probs = F.softmax(next_token_logscores, dim=-1)

'''
import include_dir

from gpt_poet import gpt_poet_analysis

text = '''Nur durch das Morgentor des Schönen
Drangst du in der Erkenntnis Land.
An höhern Glanz sich zu gewöhnen,
Übt sich am Reize der Verstand.
'''
num_syll = 10
target_rythm = [1,0]

num_created_outs = 0
resets_new_line = 0
lines = text.split('\n')
while len(lines) <= 8:
    lines = text.split('\n')
    num_outs, next_line = gpt_poet_analysis(text,
                                            target_rythm,
                                            num_syll,
                                            tollerance = 4,
                                            require_last = True,   # if the last stress of the output should match
                                            num_branches = 5,   # number of branches to try per iteration
                                            LLM = 'GPT2-large')
    num_created_outs += num_outs
    if next_line: 
        text += ' ' + next_line
    else:
        resets_new_line += 1
    print('result')
    print(text)
    print('number of generated outputs:')
    print(num_created_outs)
    print('number of resetted lines:')
    print(resets_new_line)

  VERSION_SPEC = originalTextFor(_VERSION_SPEC)("specifier")
  MARKER_EXPR = originalTextFor(MARKER_EXPR())("marker")


start generating


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


result
Nur durch das Morgentor des Schönen
Drangst du in der Erkenntnis Land.
An höhern Glanz sich zu gewöhnen,
Übt sich am Reize der Verstand.
 Was das Reich der Geister brachte weiß

number of generated outputs:
400
number of resetted lines:
0
start generating


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


result
Nur durch das Morgentor des Schönen
Drangst du in der Erkenntnis Land.
An höhern Glanz sich zu gewöhnen,
Übt sich am Reize der Verstand.
 Was das Reich der Geister brachte weiß
 ich Ich heute , aber ich durfte schaun !

number of generated outputs:
1100
number of resetted lines:
0
start generating


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

result
Nur durch das Morgentor des Schönen
Drangst du in der Erkenntnis Land.
An höhern Glanz sich zu gewöhnen,
Übt sich am Reize der Verstand.
 Was das Reich der Geister brachte weiß
 ich Ich heute , aber ich durfte schaun !
 Als du noch in deinem Dome sangst ,

number of generated outputs:
2500
number of resetted lines:
0
start generating


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


result
Nur durch das Morgentor des Schönen
Drangst du in der Erkenntnis Land.
An höhern Glanz sich zu gewöhnen,
Übt sich am Reize der Verstand.
 Was das Reich der Geister brachte weiß
 ich Ich heute , aber ich durfte schaun !
 Als du noch in deinem Dome sangst ,
 War es jetzt bist du und dämmernd steigt

number of generated outputs:
3000
number of resetted lines:
0
start generating


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

result
Nur durch das Morgentor des Schönen
Drangst du in der Erkenntnis Land.
An höhern Glanz sich zu gewöhnen,
Übt sich am Reize der Verstand.
 Was das Reich der Geister brachte weiß
 ich Ich heute , aber ich durfte schaun !
 Als du noch in deinem Dome sangst ,
 War es jetzt bist du und dämmernd steigt

number of generated outputs:
4600
number of resetted lines:
1
