In [4]:
import json

In [5]:
with open(r"..\protocol example\list_responses_clean.json", "r") as f:
    list_dicts = json.load(f)

In [6]:
list_dicts


[{'response_1': 'Some kind of beetle with big wings, or it could be some kind of frog.',
  'response_2': 'Some kind of iking mask with four eyes.',
  'response_3': 'Some kind of silhouette.',
  'response_4': 'A small character with an open mouth who looks surprised and his reflection there.'},
 {'response_1': "Two characters making contact or fighting because there's blood or a leg has been cut off.",
  'response_2': 'Lungs.',
  'response_3': "A rooster's head with a crest Fighter plane, glider, jet engine fire; again, it could be a larger one sending missiles Trunk men, perhaps irrational or some kind of caterpillar with a big belly."},
 {'response_1': 'Two oddly shaped guitars with twisted necks.',
  'response_2': 'Again, two figures face to face, strangely constituted, like Picasso.',
  'response_3': 'Or two figures as one.',
  'response_4': 'Two kinds of missiles or torpedoes.',
  'response_5': 'Lungs or a butterfly.',
  'response_6': 'Just part of an old man with his cane, or an e

In [7]:
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')
import string

In [8]:
def get_noun_phrase(doc):
    """Takes a NLP spaCy doc as an input and outputs the noun phrase subtree at the highest depth within the syntax tree
    """
    # print(doc)
    list_dependencies = ["nsubj", "pobj", "attr", "agent", "csubj", "csubjpass", "npmod", "oprd", "pobj", "dative", "appos", "ROOT"]
    pos = ["NOUN", "PROPN", "NUM"]
    
    trans_dictionary = {"[": "", 
                    "]": "", 
                    ",": "",
                    ".": "",
                    "  ": "",
                    " '": "'",
                    ".":",",
                    " )": ")",
                    "( ": "("}
    
    for token in doc:
        # print(token)


        if token.dep_ in list_dependencies and token.pos_ in pos and token.pos_ != "PRON":  # remove personal pronouns?
            ancestors = [t for t in token.ancestors if t.dep_ in list_dependencies and t.pos_ != "PRON" and t.dep_ !="ROOT"]
            
            # in the above code, verify that the "ROOT" dependency acts the way we intend
            
            # print(token,"ancestors:", ancestors)
            
            len_ancestors = len(ancestors)
            # print(ancestors, len_ancestors)
            if len_ancestors > 0 :
                continue
                
            elif len_ancestors == 0 :
                
                og_ancestor = token

                final_children = [t for t in og_ancestor.children]
                
                for child in final_children:
                    grand_children = [t for t in child.children] # this way we get all the tree under the og ancestor
                    for item in grand_children:
                        if item != None:
                            final_children.append(item)
                            
                final_children = final_children + [og_ancestor]  
                
                sorted_sentence = sorted(final_children) # if needed, we can access the token's index with token.i
                response = str(sorted_sentence).replace("[", "").replace("]", "").replace(",", "").replace("  ", " ").replace(".", "").replace(" )", ")").replace("( ", "(").strip(" ") #to clean the response
                # print(response)
                return response
                # sometimes the response will be empty

    

In [9]:
def get_clean_NP_responses(list_of_dictionary_responses):
    """Takes a dictionary with responses as input 
    and outputs a dictionary with the noun phrases from those responses
    """
    clean_np_list = []
    
    for dictionary_response in list_of_dictionary_responses:
            
        clean_np_responses = {}

        for key, sentence in list(dictionary_response.items()):
            
            response_dict = {}
             
            # i will be using this to reintroduce sliced sentences into the dictionary at the end
            list_keys = list(dictionary_response.keys())
            last_key = list_keys[-1]
            key_template = str(last_key[:-1])+"{}"
            last_number = last_key[-1]

            doc = nlp(sentence)
            
            coordination = [(token, token.dep_, token.i) for token in doc if token.dep_ == "cc"] 
            
            if len(coordination) > 0 :  # slice the sentence by a token and add another response to the list
                
                # print(coordination)
                list_indexes = []
                
                for tuple in coordination:  # if there are more than one instances of coordination we'll need access to the indexes of every cc to know where to cut the sentences
                    index_cut = tuple[2]
                    list_indexes.append(index_cut)
                    
                new_sentences = []
                
                for i in range(len(list_indexes)):
                    
                    first_new_sentence = doc[:list_indexes[i]].text
                    try:
                        second_new_sentence = doc[list_indexes[i]+1:list_indexes[i+1]].text # will execute if there are more than one "cc"s
                    except:
                        second_new_sentence = doc[list_indexes[i]+1:].text
                    
                    new_sentences.append(first_new_sentence)
                    new_sentences.append(second_new_sentence)
                
                del dictionary_response[key]
                list_keys = list(dictionary_response.keys())
                last_key = list_keys[-1]
                key_template = str(last_key[:-1])+"{}"
                last_number = last_key[-1]
                
                for i in range(len(new_sentences)):
                    new_index_key = int(last_number) + i + 1 # since i will be 0 we have to add one so that we don't overwrite the last key
                    new_key = key_template.format(new_index_key)
            
                    doc = nlp(new_sentences[i])
                    # print(doc)
                    response_chopped_sentence = get_noun_phrase(doc)
                    np_key = len(list(clean_np_responses.keys())) + 1
                    clean_np_responses["response_{}".format(np_key)] = response_chopped_sentence
                    # response_dict[sentence] = response_chopped_sentence
                    
            elif len(coordination) == 0:
                # print(doc)
                response = get_noun_phrase(doc)
                np_key = len(list(clean_np_responses.keys())) + 1
                clean_np_responses["response_{}".format(np_key)] = response
                # response_dict[sentence] = response_chopped_sentence
                
        clean_np_list.append(clean_np_responses)
        
    return clean_np_list

In [10]:
get_clean_NP_responses(list_dicts)

[{'response_1': 'Some kind of beetle with big wings',
  'response_2': 'some kind of frog',
  'response_3': 'Some kind of iking mask with four eyes',
  'response_4': 'Some kind of silhouette',
  'response_5': 'A small character with an open mouth who looks surprised',
  'response_6': 'his reflection there'},
 {'response_1': 'Two characters',
  'response_2': 'blood',
  'response_3': "Two characters making contact or fighting because there 's blood",
  'response_4': None,
  'response_5': 'Lungs',
  'response_6': "A rooster 's head with a crest Fighter plane glider jet engine fire",
  'response_7': 'some kind of caterpillar with a big belly'},
 {'response_1': 'Two',
  'response_2': 'two figures',
  'response_3': None,
  'response_4': 'two figures as one',
  'response_5': 'Two kinds of missiles',
  'response_6': None,
  'response_7': 'Lungs',
  'response_8': 'a butterfly',
  'response_9': 'Just part of an old man with his cane',
  'response_10': 'an embryo',
  'response_11': 'big beast'},
 

In [11]:
import itertools

Sanity check

In [12]:
list_resp = [[v for k, v in dicts.items()] for dicts in list_dicts]
mege = list(itertools.chain(*list_resp))
len(mege)

34

In [13]:
clean_np_responses = get_clean_NP_responses(list_dicts)

list_resp = [[v for k, v in dicts.items()] for dicts in clean_np_responses]
mege = list(itertools.chain(*list_resp))
len(mege)

34

In [14]:
list_dicts

[{'response_2': 'Some kind of iking mask with four eyes.',
  'response_3': 'Some kind of silhouette.'},
 {'response_2': 'Lungs.'},
 {'response_1': 'Two oddly shaped guitars with twisted necks.',
  'response_2': 'Again, two figures face to face, strangely constituted, like Picasso.',
  'response_7': 'Some kind of big beast, you can see the inside of its body, the mandibles.'},
 {'response_2': 'Two figures under a kind of burka.',
  'response_3': "The fly's head (film)."},
 {'response_2': "Otherwise, I don't see anything else.",
  'response_3': 'Maybe two very small heads, back to back.',
  'response_4': 'The ACDC guitarist: his head with his hair.'},
 {'response_1': 'A rather bizarre dragonfly with half-destroyed wings.',
  'response_3': 'The Sandman (in Spiderman) returning to human form.',
  'response_4': "Otherwise, that's all I can see.",
  'response_7': "Finally, maybe it's some kind of extraterrestrial angel going to an island because it's got wings.",
  'response_8': "He comes do

In [15]:
clean_np_responses

[{'response_1': 'Some kind of iking mask with four eyes',
  'response_2': 'Some kind of silhouette'},
 {'response_1': 'Lungs'},
 {'response_1': 'Two', 'response_2': 'two figures', 'response_3': 'big beast'},
 {'response_1': 'Two figures under a kind of burka',
  'response_2': "The fly 's head (film)"},
 {'response_1': None,
  'response_2': 'Maybe two very small heads back to back',
  'response_3': 'The ACDC guitarist : his head with his hair'},
 {'response_1': 'half - destroyed wings',
  'response_2': 'The Sandman (in Spiderman) returning to human form',
  'response_3': None,
  'response_4': "some kind of extraterrestrial angel going to an island because it 's got wings",
  'response_5': 'a child about'},
 {'response_1': 'seahorse',
  'response_2': 'Prussian army soldiers with spikes on their helmets',
  'response_3': None},
 {'response_1': 'A microscope view with the right dyes',
  'response_2': "Two species of insect foraging on what 's there",
  'response_3': "A fly 's head with som

In [16]:
sentence = "I see some kind of iking mask with four eyes or a frog."
sentence

'I see some kind of iking mask with four eyes or a frog.'

In [17]:
def get_np(sentence):
    
    doc = nlp(sentence)

    coordination = [(token, token.dep_, token.i) for token in doc if token.dep_ == "cc"]
    
    if len(coordination) > 0 :  # slice the sentence by a token and add another response to the list
        
        # print(coordination)
        list_indexes = []
        
        for tuple in coordination:  # if there are more than one instances of coordination we'll need access to the indexes of every cc to know where to cut the sentences
            index_cut = tuple[2]
            list_indexes.append(index_cut)
            
        new_sentences = []
        
        for i in range(len(list_indexes)):
            
            first_new_sentence = doc[:list_indexes[i]].text
            try:
                second_new_sentence = doc[list_indexes[i]+1:list_indexes[i+1]].text # will execute if there are more than one "cc"s
            except:
                second_new_sentence = doc[list_indexes[i]+1:].text
            
            new_sentences.append(first_new_sentence)
            new_sentences.append(second_new_sentence)
            
        # print(new_sentences)
        
        for i in range(len(new_sentences)):

            doc = nlp(new_sentences[i])
            # print(doc)
            response_chopped_sentence = get_noun_phrase(doc)
            
            # print(response_chopped_sentence)
            
            new_sentences[i] = response_chopped_sentence
                    
        # print(new_sentences)
        return new_sentences

    elif len(coordination) == 0:
        # print(doc)
        response = get_noun_phrase(doc)
        
        return response

In [18]:
get_np("I see some kind of iking mask with four eyes or a frog.")

['iking mask with four eyes', 'a frog']