In [38]:
import re

import nltk
from nltk import tokenize
from nltk.tree import Tree
import spacy
import benepar

nlp = spacy.load("en_core_web_md")
nlp.add_pipe("benepar", config={"model":"benepar_en3"})

<benepar.integrations.spacy_plugin.BeneparComponent at 0x19b0e142100>

In [29]:
doc = nlp("Kalki, final avatar (incarnation) of the Hindu god Vishnu, who is yet to appear")
sent = list(doc.sents)[0]
print(sent._.parse_string)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


(NP (NP (NNP Kalki)) (, ,) (NP (NP (JJ final) (NN avatar)) (-LRB- -LRB-) (NP (NN incarnation)) (-RRB- -RRB-) (PP (IN of) (NP (NP (NP (DT the) (JJ Hindu) (NN god)) (NP (NNP Vishnu))) (, ,) (SBAR (WHNP (WP who)) (S (VP (VBZ is) (ADVP (RB yet)) (S (VP (TO to) (VP (VB appear)))))))))))


In [30]:
print(sent._.labels)

('NP',)


In [31]:
list((list(sent._.children)[1])._.children)

[]

In [32]:
tree = Tree.fromstring(sent._.parse_string)
print(tree.pretty_print())

            NP                                                                                                   
   _________|_________________________                                                                            
  |    |                              NP                                                                         
  |    |          ____________________|______________________                                                     
  |    |         |           |        |        |             PP                                                  
  |    |         |           |        |        |     ________|___________________                                 
  |    |         |           |        |        |    |                            NP                              
  |    |         |           |        |        |    |              ______________|________                        
  |    |         |           |        |        |    |             |              |  

In [33]:
temp1 = tree[0]
temp2 = tree[1]
temp3 = tree[-1]
temp1.pretty_print()
temp2.pretty_print()
temp3.pretty_print()

  NP 
  |   
 NNP 
  |   
Kalki

 , 
 |  
 , 

                            NP                                                                         
        ____________________|______________________                                                     
       |           |        |        |             PP                                                  
       |           |        |        |     ________|___________________                                 
       |           |        |        |    |                            NP                              
       |           |        |        |    |              ______________|________                        
       |           |        |        |    |             |              |       SBAR                    
       |           |        |        |    |             |              |    ____|____                   
       |           |        |        |    |             |              |   |         S                 
       |     

In [34]:
#split at rightmost NP or VP

def get_flattened(t):
    sent_str_final = None
    if t is not None:
        sent_str = [" ".join(x.leaves()) for x in list(t)]
        sent_str_final = [" ".join(sent_str)]
        sent_str_final = sent_str_final[0]
    return sent_str_final

In [35]:
def get_rvp_nvp(parse_tree, last_np = None, last_vp = None):

    if len(parse_tree.leaves()) == 1:
        return last_np, last_vp
    last_subtree = parse_tree[-1]
    if last_subtree.label() == "NP":
        last_np = last_subtree
    elif last_subtree.label() == "VP":
        last_vp = last_subtree
    return get_rvp_nvp(last_subtree, last_np, last_vp)

In [36]:
last_np, last_vp = get_rvp_nvp(tree)
last_np_flattened = get_flattened(last_np)
last_vp_flattened = get_flattened(last_vp)
print(last_np_flattened)
print(last_vp_flattened)

the Hindu god Vishnu , who is yet to appear
appear


In [12]:
def get_termination_portion(main_string, sub_string):
    combined_sub_string = sub_string.replace(" ","")
    main_string_list = main_string.split()
    last_index = len(main_string_list)
    for i in range(last_index):
        check_string_list = main_string_list[i:]
        check_string = "".join(check_string_list)
        check_string = check_string.replace(" ","")
        if check_string == combined_sub_string:
            return " ".join(main_string_list[:i])
        
    return None

In [40]:
longest_phrase = max(last_np_flattened, last_vp_flattened)
print(longest_phrase)

the Hindu god Vishnu , who is yet to appear


In [41]:
longest_phrase = re.sub(r"-LRB-", "(", longest_phrase)
longest_phrase = re.sub(r"-RRB-", ")", longest_phrase)

In [42]:
longest_phrase

'the Hindu god Vishnu , who is yet to appear'

In [43]:
split_sentence = get_termination_portion("The old woman was sitting under a tree and sipping coffee", longest_phrase)

In [44]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

GPT2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
GPT2_model = GPT2LMHeadModel.from_pretrained('gpt2', pad_token_id=GPT2_tokenizer.eos_token_id)

In [45]:
partial_sentence = "The old woman was sitting under a tree and"
input_ids = GPT2_tokenizer.encode(partial_sentence, return_tensors='pt')
print(input_ids)
maximum_length = len(partial_sentence.split())+40

tensor([[ 464, 1468, 2415,  373, 5586,  739,  257, 5509,  290]])


In [46]:
#Activate top_k sampling and top_p sampling with only from 90% most likely words

sample_outputs = GPT2_model.generate(input_ids, do_sample=True, max_length=maximum_length, top_k=60, top_p=0.8, repetition_penalty=10.0, num_return_sequences=12)


The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [47]:
generated_sentences = []
for i,sample_output in enumerate(sample_outputs):
  decoded_sentence = GPT2_tokenizer.decode(sample_output, skip_special_tokens=True)
  final_sentence = tokenize.sent_tokenize(decoded_sentence)[0]
  generated_sentences.append(final_sentence)
  print(final_sentence)

The old woman was sitting under a tree and the man in black, his hair tied behind him.
The old woman was sitting under a tree and her hands were trembling with fear.
The old woman was sitting under a tree and waiting for the young man to come, but she turned her head towards him.
The old woman was sitting under a tree and she looked down at the ground, then turned her head back towards their home.
The old woman was sitting under a tree and staring at her watch.
The old woman was sitting under a tree and had been watching from afar.
The old woman was sitting under a tree and looked up at the sky.
The old woman was sitting under a tree and sat down, looking at me.
The old woman was sitting under a tree and the new man with his cane had come in.
The old woman was sitting under a tree and had already left for her own protection.
The old woman was sitting under a tree and looked at the snowman in front of her.
The old woman was sitting under a tree and looked out through the window.
