In [1]:
## This code performs VerbNet parsing using Docker container
# Use Selenium environment
# Source: https://github.com/jgung/verbnet-parser
# Follow the github instructions for running Docker container on localhost
# Website version: https://verbnetparser.com/

# Import selenium and other libraries needed for parsing html and running javascript code on browser
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup

from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import sentence_embeds_processing as sep
from sentence_embeds_processing import load_set_of_sentences, tokenise_sentence

import json
import csv
import regex as re
import numpy as np

# Root for data path.
# Raw string: in a normal literal the Windows separators form invalid escape
# sequences (\S, \Y, \D), which newer Python versions warn about.
# NOTE(review): machine-specific absolute path - adjust when running elsewhere.
path_root = r"D:\Study and Projects\School Work\Year 25 - PhD\Data"

# Lists of special words
# stop_words: list of function words (plus the empty string).
stop_words = ['a', 'an', 'the', 'these', 'that', 'those', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', '']
# special_word_list: dict mapping a specific word form to its replacement string.
special_word_list = {'tshirt':'t_shirt', 'lorises':'loris', 'allocates':'allocate', 'beared':'bears', 'comeing':'coming'}

In [2]:
## Key functions for parsing VerbNet

# Try to print specific semantic role from sentence
def get_semant_ele(sentence, element):
    """Look up one semantic role in a parsed sentence.

    Args:
        sentence: mapping indexed by role name.
        element: the role name to look up.

    Returns:
        The value stored under `element`, or the string 'NONE' when the
        lookup raises KeyError.
    """
    try:
        role_value = sentence[element]
    except KeyError:
        role_value = 'NONE'
    return role_value
    

# Function to submit sentence to VerbNet browser-based app and return the html
def VerbNet_api(sentence, silent=True):
    """Submit a sentence to the locally hosted VerbNet parser page and return the page body.

    Args:
        sentence: text typed into the first <input> element of the page.
        silent: when truthy, Edge is started with the "--headless" argument so
            no browser window is shown.

    Returns:
        The Selenium WebElement for the page's <body>, looked up right after the
        submit button is clicked. The browser is deliberately left running so
        the returned element stays readable; the caller is responsible for
        closing it (the driver is reachable as `web_output.parent`).
    """
    # Local import: use Edge's own Service class rather than the Chrome one
    # imported at the top of the file.
    from selenium.webdriver.edge.service import Service as EdgeService

    # Always build the options object; previously silent=False left `options`
    # undefined and the webdriver.Edge(...) call raised NameError.
    options = Options()
    if silent: # suppress showing browser
        options.add_argument("--headless")

    # prepare web driver to access url
    # Raw string so the Windows separators are not read as escape sequences.
    driver_service = EdgeService(r'D:\Downloads\Archive\edgedriver_win64\msedgedriver.exe') # need to update this when Edge updates
    driver = webdriver.Edge(service=driver_service, options=options)
    url = "http://localhost:8080/" # need Docker running on localhost for this to work
    driver.get(url)

    # send sentence as input to form (name avoids shadowing the builtin `input`)
    input_box = driver.find_element(by=By.TAG_NAME, value="input")
    input_box.send_keys(sentence)

    # submit form
    submit_button = driver.find_element(by=By.TAG_NAME, value="button")
    submit_button.click()

    # get page body after submitting form
    web_output = driver.find_element(by=By.TAG_NAME, value="body")
    return web_output


# Modify a sentence by rewriting its likely main verb as "is <stem>ing", to help the parse when it gets stuck
def modify_sentence(sentence):
    """Return `sentence` with its second token rewritten as 'is <stem>ing'.

    The second token returned by `tokenise_sentence` is taken to be the main
    verb. Only the first occurrence is replaced, and the token is matched
    literally (regex metacharacters escaped), preferring a whole-word match so
    that e.g. "is" inside "This" is left alone.

    Args:
        sentence: the sentence text.

    Returns:
        The modified sentence, or `sentence` unchanged when it has fewer than
        two tokens or the second token is empty.
    """
    tokenised_sentence = tokenise_sentence(sentence)
    if len(tokenised_sentence) < 2 or not tokenised_sentence[1]:
        return sentence # nothing that could be treated as the verb

    likely_verb = tokenised_sentence[1]
    new_verb = 'is '+PorterStemmer().stem(likely_verb)+'ing'

    def replacement(_match):
        # Function replacement: new_verb is inserted verbatim, never parsed as a template.
        return new_verb

    literal_verb = re.escape(likely_verb)
    # First try a whole-word match ...
    alt_sentence, num_replaced = re.subn(r'(?<!\w)'+literal_verb+r'(?!\w)', replacement, sentence, count=1)
    if num_replaced == 0:
        # ... then fall back to the first plain occurrence (e.g. punctuation-like tokens).
        alt_sentence = re.sub(literal_verb, replacement, sentence, count=1)
    return alt_sentence


# Function to extract the semantic tags from the VerbNet html
def extract_tags(web_output, role_transforms):
    """Extract {semantic tag: phrase} pairs from the VerbNet parser page.

    Args:
        web_output: object exposing `get_attribute('innerHTML')` (the value
            returned by `VerbNet_api`).
        role_transforms: dict mapping a raw tag name to the name to use instead;
            tags that are not keys are kept as they are.

    Returns:
        Tuple `(phrase_tags_dict, all_results)`:
        - phrase_tags_dict maps each tag to its phrase text. If the same tag
          occurs more than once, the last occurrence wins. If there is a
          'Theme' but no 'Agent', the 'Theme' entry is renamed to 'Agent'.
        - all_results is the list of "ui basic label" elements found in the
          HTML (empty when nothing was parsed).
    """
    html = web_output.get_attribute('innerHTML')
    soup = BeautifulSoup(html, 'html.parser')
    all_results = soup.find_all(class_="ui basic label") # get a big list of everything

    # Class lists tried in order until one yields a tag element.
    tag_class_fallbacks = (
        ["ui blue large basic label", "ui blue large label", "ui grey large basic label"], # normal search
        ["detail"], # in case we don't find it using the normal search
        ["ui purple large basic label"], # last ditch case to find a tag
    )

    phrase_tags_dict = {}
    for element in all_results: # loop over elements in html
        phrase_nodes = element.find_all(class_=["ui black large basic label"]) # get phrase text

        tag_nodes = []
        for class_names in tag_class_fallbacks:
            tag_nodes = element.find_all(class_=class_names)
            if tag_nodes:
                break

        # Skip elements lacking a tag or a phrase instead of raising IndexError on [0].
        if not tag_nodes or not phrase_nodes:
            continue

        tag = tag_nodes[0].text # get just the tag text
        tag = re.sub(r',', '', tag) # remove commas from tag names (a few have them)
        phrase = phrase_nodes[0].text

        tag = role_transforms.get(tag, tag) # perform transformation of tag if needed

        # Raw string: '\d' in a normal literal is an invalid escape sequence.
        if re.search(r'\d', tag): # tags containing digits are treated as verbs (wordnet ID, per the original note)
            tag = 'Verb'

        phrase_tags_dict[tag] = phrase # store results in dictionary

    if ('Agent' not in phrase_tags_dict) and ('Theme' in phrase_tags_dict): # rename 'Theme' to 'Agent' if Agent missing
        phrase_tags_dict['Agent'] = phrase_tags_dict.pop('Theme')

    return (phrase_tags_dict, all_results)


# Load transformations for role labels from text file
def load_semantic_role_label_transforms():
    """Load the semantic-role label transformations from the VerbNet text file.

    The file is read with csv.DictReader and must have 'initial' and 'new'
    columns.

    Returns:
        dict mapping each row's 'initial' value to its 'new' value (a later row
        with the same 'initial' overrides an earlier one).
    """
    # Raw string: '\F' and '\V' are invalid escape sequences in a normal literal.
    transforms_path = path_root + r'\Frames and Structured Data\VerbNet\VerbNet_parse_transformations.txt'
    with open(transforms_path, newline='') as text_file:
        return {row['initial']: row['new'] for row in csv.DictReader(text_file)}

In [5]:
## List the datasets exposed by the sep module
# pairs selects between the paired-sentence datasets and the plain sentence lists
pairs = True
datasets = sep.available_pair_datasets if pairs==True else sep.available_nonpaired_datasets
print('Available datasets:')
# print each key next to its dataset entry
for dataset in datasets.keys():
    print(dataset, datasets[dataset])

{0: '2014 Wehbe\\Stimuli\\Chapter_9_sentences_final', 1: '2017 Anderson\\Stimuli\\stimuli_final', 2: '2018 Pereira\\Stimuli\\stimuli_243sentences', 3: '2018 Pereira\\Stimuli\\stimuli_384sentences', 4: '2020 Alice Dataset\\Stimuli\\stimuli_sentences_final', 5: 'Fodor_2023', 6: 'Fodor_2023_prelim'}
loaded Fodor_2023


In [None]:
## Load the chosen sentence set

# pick one of the keys printed by the dataset listing
dataset_name = datasets[5]

# read the sentences into a dictionary; the root path depends on the data type
# NOTE(review): file_paths_dict is not defined in any visible cell - confirm where it is created
root_key = 'data_pairs_path' if pairs == True else 'neuro_root'
sentences_dict = sep.load_set_of_sentences(dataset_name, file_paths_dict[root_key], pairs)
n = len(sentences_dict.keys()) # number of entries loaded
print('\nloaded',dataset_name,'with',n,'sentences')

# collect the sentences in a list of lists
sentences = []
if pairs==True: # sentence similarity pair data: one list per pair member
    pair_array = np.array(list(sentences_dict.values()))
    sentences.append(list(pair_array[:,0].flatten()))
    sentences.append(list(pair_array[:,1].flatten()))
else: # neuroimaging data / plain list of sentences
    sentences.append(list(sentences_dict.values()))

In [None]:
## Perform VerbNet parse of dataset
# Try restarting browser if Docker won't run; might need to try restarting the container a few times
sentence_pairs_parsed = {}
role_transforms = load_semantic_role_label_transforms() # get semantic role transformations from file


def _parse_sentence(sentence, role_transforms):
    """Parse one sentence via the web app and close the browser VerbNet_api opened for it."""
    web_output = VerbNet_api(sentence) # get html from web api
    try:
        return extract_tags(web_output, role_transforms) # extract parse results from html
    finally:
        # VerbNet_api starts a new browser per call and never closes it;
        # a WebElement's `parent` is the driver it was found with.
        web_output.parent.quit()


# Loop over all sentences
for key in list(sentences_dict.keys()):
    sentence_pair_parsed = []
    if (key%25==0):
        print('Index '+str(key))
    for sentence in sentences_dict[key][0:2]:
        print(sentence)
        parse_data, all_results = _parse_sentence(sentence, role_transforms)

        # try alternate formulation of a sentence if original version doesn't parse
        if parse_data=={}:
            alt_sentence = modify_sentence(sentence)
            parse_data, all_results = _parse_sentence(alt_sentence, role_transforms)

        sentence_pair_parsed.append(parse_data)
        print(parse_data)
    sentence_pairs_parsed[key] = sentence_pair_parsed # full data storage

# Save raw VerbNet parsed data into json file
# Use dataset_name (the chosen dataset), not `dataset`: that is the leftover
# integer loop variable from the dataset listing, and str + int raises TypeError.
# Path separators in the name are replaced so it stays a single file name.
dataset_label = dataset_name.replace('\\', '_').replace('/', '_')
json_path = "full_verbnet_pass_"+dataset_label+"_raw.json"
with open(json_path, "w") as file:
    json.dump(sentence_pairs_parsed, file)

In [29]:
## Print out key roles with values in array-like format

# List of key semantic roles to use (role name -> column index)
sem_role_dict = {'Agent':0,'Verb':1,'Adjective':2,'Patient':3,'Theme':4,'Time':5,'Manner':6,'Location':7,'Trajectory':8,'Attribute':9}
num_roles = len(sem_role_dict)
separator = '; ' # NOTE(review): not used below - the lines are written with a bare ';'


def _roles_row(parsed_sentence):
    """Return one parsed sentence's roles as a list ordered by sem_role_dict's column indices."""
    row = ['NONE'] * num_roles
    for role, role_index in sem_role_dict.items():
        row[role_index] = get_semant_ele(parsed_sentence, role) # 'NONE' when the role is absent
    return row


# Collect the roles of both sentences of every pair.
# Rows are gathered in lists and converted once (np.vstack per row is quadratic).
sent_1_rows = []
sent_2_rows = []
for sents in sentence_pairs_parsed.values():
    sent_1_rows.append(_roles_row(sents[0])) # roles for first sentence
    sent_2_rows.append(_roles_row(sents[1])) # roles for second sentence
sent_1_store = np.array(sent_1_rows, dtype=object).reshape(-1, num_roles)
sent_2_store = np.array(sent_2_rows, dtype=object).reshape(-1, num_roles)

# Build one text line per pair: every role value is followed by ';', and an
# extra ';' separates the first sentence's roles from the second's.
# Iterating the stores (not len(sentences_dict)) keeps the row count in step with what was parsed.
all_lines = []
for roles_1, roles_2 in zip(sent_1_store, sent_2_store):
    print_string_sent_1 = ''.join(value+';' for value in roles_1)
    print_string_sent_2 = ''.join(value+';' for value in roles_2)
    all_lines.append(print_string_sent_1+';'+print_string_sent_2)

# Save list of parsed data to text file
# Use dataset_name (the chosen dataset), not `dataset`: that is the leftover
# integer loop variable from the dataset listing, and str + int raises TypeError.
# Path separators in the name are replaced so it stays a single file name.
dataset_label = dataset_name.replace('\\', '_').replace('/', '_')
text_path = "full_verbnet_pass_"+dataset_label+"_array.txt"
np.savetxt(text_path, np.array(all_lines), fmt='%s')

Cord;is;NONE;NONE;NONE;NONE;NONE;NONE;NONE;strong , thick string;;A smile;is;NONE;NONE;NONE;NONE;NONE;NONE;NONE;the expression that you ... you are being friendly;
A rooster;is;NONE;NONE;NONE;NONE;NONE;NONE;NONE;an adult male chicken;;A voyage;is;NONE;NONE;NONE;NONE;NONE;NONE;NONE;a long journey on a ship or in a spacecraft;
Noon;is;NONE;NONE;NONE;in the middle of the day;NONE;NONE;NONE;twelve o'clock;;String;is;NONE;NONE;NONE;NONE;NONE;NONE;NONE;thin rope made of twiste...her or tying up parcels;
Fruit or a fruit;is;NONE;NONE;NONE;NONE;NONE;NONE;NONE;something which grows on...stance that you can eat;;NONE;is;NONE;NONE;NONE;NONE;NONE;NONE;NONE;a container or enclosed space;
An autograph;is;NONE;NONE;NONE;NONE;NONE;NONE;NONE;the signature of someone...itten for a fan to keep;;The shores or shore of a sea , lake or wide river;is;NONE;NONE;NONE;NONE;NONE;NONE;NONE;the land along the edge of it;
An automobile;is;NONE;NONE;NONE;NONE;NONE;NONE;NONE;a car;;a wizard;is;NONE;NONE;NONE;NONE;NON