Ref: 
- https://nbviewer.org/url/lope.linguistics.ntu.edu.tw/courses/python4nlp/week6-2.Working.with.Lexical.Data.2.ipynb
- https://verbs.colorado.edu/verb-index/VerbNet_Guidelines.pdf
- https://verbs.colorado.edu/~mpalmer/projects/verbnet.html
- https://www.nltk.org/_modules/nltk/corpus/reader/verbnet.html

### Clustering Subjects and Objects
We focus on author (first person pronouns I and we) and the dataset.

We do the following in subjects and objects:
1. identify "I" and "we" 
2. detect mentions of the dataset, in part or in full (detection is probably not complete yet)


In [1]:
def find_author(text):
    '''
    Tell whether a sentence contains a first-person author pronoun.

    input: full sentence (text)
    output: bool, True when "I" or "we" occurs as a whitespace-separated
            word (case-insensitive), False otherwise

    Sentence punctuation attached to a word ("We," / "I." / '"We') is
    ignored. Brackets are deliberately NOT stripped so that list markers
    such as "(i)" are not mistaken for the pronoun, and abbreviations such
    as "i.e." do not match because only the edges of a word are trimmed.
    '''
    pronouns = {"i", "we"}
    edge_punctuation = ".,;:!?\"'"
    for word in text.lower().split():
        # check the raw word first, then the word without edge punctuation
        if word in pronouns or word.strip(edge_punctuation) in pronouns:
            return True
    return False

In [2]:
find_author('we')

True

In [3]:
find_author('three geographic classifications')

False

In [4]:
# source 1: the detected named entity
# source 2: the coreference of the dataset
import re

# Regex fragments for generic words that signal a dataset mention.
# Raw strings keep "\s" a regex escape instead of an invalid string escape.
data_keywords = [
    r'data',
    r'data\s*(?:set|base)s?',
    r'corp(us|ora)',
    r'tree\s*bank',
    r'(?:train|test|validation|testing|trainings?)\s*(?:set)',
    r'collections?',
    r'benchmarks?',
    r'surveys?',
    r'samples?',
    r'stud(y|ies)',
]

# One case-insensitive alternation of all keywords, anchored on word boundaries.
data_pattern = re.compile(r'\b(' + '|'.join(data_keywords) + r')\b', flags=re.IGNORECASE)

def find_dataset(text,data_name):
    '''
    Tell whether a sentence mentions the dataset.

    input: full sentence (text) and dataset_prediction (data_name);
           data_name may be empty or None, in which case only the
           keyword pattern is used
    output: bool of find or not

    Two sources are tried in order:
    1. any whitespace-separated token of data_name occurs as a word of
       text (case-sensitive; punctuation attached to either side, such as
       "ADR," or "(CRSP)", is ignored);
    2. text matches the module-level data_pattern of generic dataset words.
    '''
    edge_punctuation = ".,;:!?\"'()[]{}<>"

    # words of the sentence, both as written and without edge punctuation
    words = text.split()
    known_words = set(words)
    known_words.update(word.strip(edge_punctuation) for word in words)
    known_words.discard("")  # a word made only of punctuation strips to ""

    # use predicted dataset names to find
    for data_name_token in (data_name or "").split(): # anything match counts
        if (data_name_token in known_words
                or data_name_token.strip(edge_punctuation) in known_words):
            return True

    # use data citation pattern to find
    return data_pattern.search(text) is not None

In [5]:
find_dataset("ADR data","ARD")

True

In [6]:
find_dataset("ADR data","ARD AMEX")

True

### Clustering Relations

In [1]:
import pandas as pd
import numpy as np

In [3]:
### new: https://verbs.colorado.edu/kest1439/
# pre-trained GloVe
# https://radimrehurek.com/gensim/models/word2vec.html

# Raw vectors file, one DataFrame row per line of text (parsed in later cells).
glove_verb_sense = pd.DataFrame()

with open("/nfs/turbo/hrg/glove_verb_sense/glove-sense450.vectors.txt") as f:
    # materialise every line, newline included
    lines = list(f)

glove_verb_sense["lines"] = lines


In [4]:
glove_verb_sense.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2844385 entries, 0 to 2844384
Data columns (total 1 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   lines   object
dtypes: object(1)
memory usage: 21.7+ MB


In [5]:
glove_verb_sense.head()

Unnamed: 0,lines
0,the -0.696553 -0.199726 0.125435 0.097190 -0.3...
1,. -0.234015 0.050176 0.169443 0.535785 -0.0603...
2,of 0.104824 -0.606487 0.208627 0.197995 -0.081...
3,and 0.022772 0.043362 0.474807 0.367868 0.0506...
4,in -0.312536 0.040338 0.555955 -0.090425 -0.48...


In [8]:
eg_vn = glove_verb_sense.lines[0].split(" ")

In [9]:
len(eg_vn)

101

In [21]:
def convert_line_to_vec(x):
    '''
    Parse the numeric part of one vectors-file line.

    input: one text line (x) of the form "<token> <v1> <v2> ...",
           optionally ending with a newline
    output: list of floats holding v1, v2, ...; an empty list when the
            line carries no values after the token

    The line is split on single spaces (not arbitrary whitespace), so only
    the first space-separated field is dropped as the token. Trailing
    whitespace and empty fields produced by repeated spaces are ignored.
    Raises ValueError if a remaining field is not a valid float.
    '''
    fields = x.rstrip().split(" ")[1:]
    return [float(field) for field in fields if field]

In [23]:
# Split every raw line into its leading token ("verb") and its numeric vector.
raw_lines = glove_verb_sense["lines"]
glove_verb_sense["verb"] = raw_lines.apply(lambda line: line.split(" ", 1)[0])
glove_verb_sense["vector"] = raw_lines.apply(convert_line_to_vec)

In [24]:
# Lookup table token -> vector, built by pairing the "verb" and "vector" columns
# row by row; if a token occurs on several rows the last row wins (dict semantics).
glove_verb_sense_dict = dict(zip(glove_verb_sense.verb, glove_verb_sense.vector))

In [None]:
# try to apply:
# https://adp.uni.edu/documents/bloomverbscognitiveaffectivepsychomotor.pdf
# https://courses.washington.edu/pharm439/Bloomstax.htm
# https://www.potsdam.edu/sites/default/files/documents/offices/ie/assessment/Action-Verb-List-For-Writing-Student-Outcomes.pdf
# https://tips.uark.edu/blooms-taxonomy-verb-chart/
# same thing above - Bloom's taxonomy
glove_verb_sense_dict

In [25]:
from numpy import dot
from numpy.linalg import norm
def cos_sim(List1,List2):
    '''
    Cosine similarity of two equal-length numeric vectors.

    input: two sequences of numbers (List1, List2)
    output: dot(List1, List2) / (norm(List1) * norm(List2));
            0.0 when either vector has zero norm, so that an all-zero or
            empty vector does not produce NaN and a divide warning
    '''
    # computed first so mismatched lengths still raise, as before
    numerator = dot(List1, List2)
    denominator = norm(List1) * norm(List2)
    if denominator == 0:
        return 0.0
    return numerator / denominator

In [26]:
def verb_sense_sim(Word1,Word2,VerbSenseDict=glove_verb_sense_dict):
    '''
    Cosine similarity between the stored vectors of two words.

    input: two words (Word1, Word2) and a word -> vector mapping
           (VerbSenseDict, defaulting to glove_verb_sense_dict)
    output: cos_sim of the two looked-up vectors

    A word missing from the mapping propagates the mapping's lookup error
    (KeyError for a plain dict).
    '''
    first_vec, second_vec = VerbSenseDict[Word1], VerbSenseDict[Word2]
    return cos_sim(first_vec, second_vec)

In [32]:
# eg - results very bad...
print(verb_sense_sim("use","have"))
print(verb_sense_sim("use","utilize"))
print(verb_sense_sim("use","report"))

0.7358243262368103
-0.2063585409985059
0.4616696176270366


In [7]:
### old
from nltk.corpus import verbnet as vn

In [8]:
vn.classids('are') # this shows that we need to use AEO first

[]

In [9]:
vn.classids('collect') # this shows that we need to use lemma

['herd-47.5.2',
 'knead-26.5',
 'obtain-13.5.2',
 'other_cos-45.4',
 'shake-22.3-2']

Bipartite setting? One "relation" can be matched to multiple verb classes. Could we then cluster the "relation" nodes based on the other group of nodes (the verb classes)?

https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.bipartite.cluster.clustering.html

http://yiling.seas.harvard.edu/wp-content/uploads/ISWC03.pdf

https://www.osti.gov/servlets/purl/816202

https://cdlib.readthedocs.io/en/latest/reference/classes/bi_node_clustering.html

Maybe we should have a subset from https://verbs.colorado.edu/verb-index/VerbNet_Guidelines.pdf

In [10]:
import pandas as pd
import numpy as np
import re

In [11]:
# ref: https://verbs.colorado.edu/verb-index/VerbNet_Guidelines.pdf
# https://docs.google.com/spreadsheets/d/18kn2z2df-M4ncUmoHPGqbs5nJyGL-k9R0d2slXdT830/edit?usp=sharing
verb_class_df = pd.read_csv("VerbNet_LF.csv")


In [12]:
verb_class_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Class Number  101 non-null    int64 
 1   Verb Type     101 non-null    object
 2   Verb Class    101 non-null    object
dtypes: int64(1), object(2)
memory usage: 2.5+ KB


In [13]:
verb_class_df.head()

Unnamed: 0,Class Number,Verb Type,Verb Class
0,9,Verbs of Putting,put-­9.1 put_spatial-­9.2 funnel-­9.3 put_dire...
1,10,Verbs of Removing,remove-­10.1 banish-­10.2 clear-­10.3 wipe_man...
2,11,Verbs of Sending and Carrying,send-­11.1 slide-­11.2 bring-­11.3 carry-­11.4...
3,12,Verbs of Exerting Force: Push/Pull Verbs,push-­12
4,13,Verbs of Change of Possession,give-­13.1 contribute-­13.2 future_having-­13....


In [14]:
#verb_class_df.astype({'Class Number': 'int'}).dtypes
# Mapping from class number to verb type, taken from the CSV table.
verb_class = (
    verb_class_df
    .drop(columns="Verb Class")
    .set_index("Class Number")
    .to_dict()['Verb Type']
)

In [15]:
verb_class # we may want to merge! - according to the context of data reference

{9: 'Verbs of Putting',
 10: 'Verbs of Removing',
 11: 'Verbs of Sending and Carrying',
 12: 'Verbs of Exerting Force: Push/Pull Verbs',
 13: 'Verbs of Change of Possession',
 14: 'Learn Verbs',
 15: 'Hold and Keep Verbs',
 16: 'Verbs of Concealment',
 17: 'Verbs of Throwing',
 18: 'Verbs of Contact by Impact',
 19: 'Poke Verbs',
 20: 'Verbs of Contact: Touch Verbs',
 21: 'Verbs of Cutting',
 22: 'Verbs of Combining and Attaching',
 23: 'Verbs of Separating and Disassembling',
 24: 'Verbs of Coloring',
 25: 'Image Creation Verbs',
 26: 'Verbs of Creation and Transformation',
 27: 'Engender Verbs',
 28: 'Calve Verbs',
 29: 'Verbs with Predicative Complements',
 30: 'Verbs of Perception',
 31: 'Psych-\xadVerbs (Verbs of Psychological State)',
 32: 'Verbs of Desire',
 33: 'Judgment Verbs',
 34: 'Verbs of Assessment',
 35: 'Verbs of Searching',
 36: 'Verbs of Social Interaction',
 37: 'Verbs of Communication',
 38: 'Verbs of Sounds Made by Animals',
 39: 'Verbs of Ingesting',
 40: 'Verbs I

In [16]:
def get_classes(classids):
    """
    Collapse long VerbNet class ids to their parent-level class numbers.

    input: iterable of VerbNet classids (long form, e.g. 'remove-10.1')
    output: set of ints, one per distinct parent class (e.g. 10)

    For each id, the text between the first and second hyphen is taken
    (this drops the leading verb name) and its first run of digits is the
    parent class number.
    """
    return {
        int(re.search(r'(\d+)', long_id.split("-")[1]).group(1))
        for long_id in classids
    }

In [17]:
# example
vn.classids('draw')

['force-59',
 'force-59-1',
 'performance-26.7-2-1',
 'remove-10.1',
 'scribble-25.2',
 'split-23.2']

In [18]:
get_classes(vn.classids('draw'))

{10, 23, 25, 26, 59}

### Convert from AEO
ref: https://github.com/lizhouf/semantic_triplets/blob/main/scr/add_aeo.py

clauses are characterized by:
- temporal organization (the order in which the subject narrates events and actions in the story), 
- evaluative description (personal assessments made by the narrator), and 
- contextual orientation (usually information provided by the narrator that helps orient the listener)

ref: Labov and Waletzky (1997) — Labov, William, and Joshua Waletzky. 1997. “Narrative Analysis: Oral Versions of Personal Experience.” Journal of Narrative & Life History 7 (1–4): 3–38.

We have: 
- Active Agency
- Passive Agency
- Possible Agency
- Evaluative Description
- Contextual Orientation

In [19]:
import spacy
nlp = spacy.load('en_core_web_sm')
from spacy.matcher import Matcher
from spacy.symbols import dobj, obj, pobj, acomp, ccomp, pcomp, xcomp, conj, acomp, ccomp, pcomp, xcomp, advmod, amod
from spacy.symbols import neg, det, aux, prep, poss, nsubj, nsubjpass, csubj, csubjpass, det, prt
from spacy.symbols import VERB, AUX, DET, ADP, ADV, ADJ, NOUN, PRON, PROPN, PART
from spacy.tokenizer import Tokenizer
from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS
from spacy.util import compile_infix_regex

In [20]:
'''
Example Key Words
'''
# Lemma lists that get_cat compares against the lemma_ of each relation token.
# Perception/cognition lemmas: together with an adjective-only object they give "Evaluation".
evaluation_verbs = ["feel","smell","taste","look","hear","see","think","know"]
# Lemmas that give "Orientation" (checked after the evaluation and imagine rules).
orientation_verbs = ["remember","bear","grow","belong"]
# Lemmas that give "Agency_Possible" (checked after the evaluation rule).
imagine_verbs = ["want","should","would","could","can","might","may"]

In [21]:
def get_cat(this_rel, this_obj):
    '''
    Assign a narrative category to a (relation, object) pair.

    input: spaCy Spans this_rel, this_obj (any iterables of tokens work;
           a token that lacks one of the attributes lemma_, dep, pos or
           tag_ is simply treated as not matching that rule)
    output: category result, one of "Evaluation", "Agency_Possible",
            "Orientation", "Agency_Passive", "Agency_Active", or "" when
            the relation contains "be" but no token tagged VERB/AUX and
            none of the other "be" rules applies

    Rules are applied in this order, first match wins:
    1. evaluation verb + adjective-only object     -> Evaluation
    2. imagine verb                                -> Agency_Possible
    3. orientation verb                            -> Orientation
    4. negated relation, or "no" in the object     -> Orientation
    5. "have": with "to" -> Agency_Passive, else   -> Orientation
    6. "be": adjective-only object -> Evaluation; a VBG token ->
       Agency_Active; more than one verb -> Agency_Passive; exactly one
       verb -> Orientation
    7. anything else                               -> Agency_Active

    The keyword lists (evaluation_verbs, orientation_verbs, imagine_verbs)
    and the spaCy symbols (neg, VERB, AUX, ADJ, NOUN, PRON, PROPN) are
    module-level names.
    '''
    rel_tokens = list(this_rel)
    obj_tokens = list(this_obj)

    # getattr with a default keeps the best-effort handling of tokens that
    # lack an attribute, without hiding unrelated errors behind a bare except
    rel_lemmas = [getattr(token, "lemma_", None) for token in rel_tokens]
    obj_pos = [getattr(token, "pos", None) for token in obj_tokens]

    # relation: fixed keyword lemmas
    rel_has_evaluation = any(lemma in evaluation_verbs for lemma in rel_lemmas)
    rel_has_imagine = any(lemma in imagine_verbs for lemma in rel_lemmas)
    rel_has_orientation = any(lemma in orientation_verbs for lemma in rel_lemmas)

    rel_has_be = "be" in rel_lemmas
    rel_has_have = "have" in rel_lemmas
    rel_has_to = "to" in rel_lemmas

    # relation: dependency label, part of speech, fine-grained tag
    rel_has_neg = any(getattr(token, "dep", None) == neg for token in rel_tokens)
    rel_num_verb = sum(
        1 for token in rel_tokens if getattr(token, "pos", None) in (VERB, AUX)
    )
    rel_has_VBG = any(getattr(token, "tag_", None) == "VBG" for token in rel_tokens)

    # object: contains the lemma "no"
    obj_has_no = any(getattr(token, "lemma_", None) == "no" for token in obj_tokens)

    # object: only adj, no NOUN+ (any nominal token cancels the adjective flag)
    obj_has_nominal = any(pos in (NOUN, PRON, PROPN) for pos in obj_pos)
    obj_is_adj = (not obj_has_nominal) and any(pos == ADJ for pos in obj_pos)

    # judge:

    # fixed words
    if rel_has_evaluation and obj_is_adj:
        return "Evaluation"
    if rel_has_imagine:
        return "Agency_Possible"
    if rel_has_orientation:
        return "Orientation"

    # neg
    if rel_has_neg or obj_has_no:
        return "Orientation"

    # have
    if rel_has_have:
        # "have to" is no longer coercive
        return "Agency_Passive" if rel_has_to else "Orientation"

    # be
    if rel_has_be:
        if obj_is_adj:
            return "Evaluation"
        if rel_has_VBG:
            return "Agency_Active"
        if rel_num_verb > 1:
            return "Agency_Passive"
        if rel_num_verb == 1:
            return "Orientation"
        # "be" present but no VERB/AUX token counted: left uncategorised
        return ""

    # if none of the above, then assign active:
    return "Agency_Active"

In [22]:
# subject is dataset
eg_rel1 = nlp('are drawn from')
eg_obj1 = nlp('National Organizations Survey')
get_cat(eg_rel1,eg_obj1)

'Agency_Passive'

In [23]:
for rel in nlp('obtain name For'):
    print(rel.lemma_,rel.pos_)

obtain VERB
name NOUN
for ADP


In [24]:
# subject is we
eg_rel2 = nlp('obtain name For')
eg_obj2 = nlp('non-ADR shares')
get_cat(eg_rel2,eg_obj2)

'Agency_Active'

In [25]:
# subject is dataset
eg_rel3 = nlp('are collected from')
eg_obj3 = nlp('major depositary bank websites')
get_cat(eg_rel3,eg_obj3)

'Agency_Passive'

In [26]:
# to do: find more examples