In [1]:
import pandas as pd
import numpy as np
import spacy
import nltk
from nltk.corpus import wordnet

In [2]:
#model='en_pytt_bertbaseuncased_lg'
# Name of the spaCy pipeline to load; the commented-out line above is an alternative model name.
model = 'en_core_web_lg'
nlp = spacy.load(model)
# The functions below query nltk's `wordnet` corpus for every synonym, antonym and
# similarity lookup, so fetch it as well; without it a fresh environment raises
# LookupError on the first wordnet call.
nltk.download('wordnet')
# Kept as the last expression so the cell still displays the result of this download.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ga75xoh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
def getSynAnt(word):
    """Collect WordNet synonym and antonym names for ``word``.

    Every lemma of every synset returned by ``wordnet.synsets(word)`` contributes
    its name to the synonym list. A lemma that has at least one antonym also
    contributes the name of its first antonym. Duplicates are kept and the
    order is the iteration order of the synsets and their lemmas.

    Args:
        word: The string to look up.

    Returns:
        tuple: ``(synonyms, antonyms)``, two lists of lemma names.
    """
    lemmas = [lemma for synset in wordnet.synsets(word) for lemma in synset.lemmas()]
    synonyms = [lemma.name() for lemma in lemmas]
    # Only the first antonym of each lemma is recorded.
    antonyms = [lemma.antonyms()[0].name() for lemma in lemmas if lemma.antonyms()]
    return synonyms, antonyms

def getWUPSimilarity(doc1, doc2):
    """Score how similar two tokens are using lemmas, WordNet synonyms and ``wup_similarity``.

    The checks run in this order:

    1. Equal lower-cased lemmas -> ``1``.
    2. Either token's text appears among the other's WordNet synonym names
       (as returned by ``getSynAnt``) -> ``0.9``.
    3. Otherwise one representative synset is chosen for each word (see
       ``_firstSynsetByPos``) and ``wup_similarity`` between the two is returned.

    Args:
        doc1: Token-like object exposing ``.text`` and ``.lemma_``.
        doc2: Token-like object exposing ``.text`` and ``.lemma_``.

    Returns:
        ``1``, ``0.9``, whatever ``wup_similarity`` returns for the chosen
        synsets, or ``None`` when either word has no synset under any of the
        four parts of speech. Callers in this file treat ``None`` as 0.
    """
    if doc1.lemma_.lower() == doc2.lemma_.lower():
        return 1

    w1 = doc1.text
    w2 = doc2.text
    # Synonymy is checked in both directions; the second lookup only runs if the first misses.
    if w2 in getSynAnt(w1)[0] or w1 in getSynAnt(w2)[0]:
        return 0.9

    syn1 = _firstSynsetByPos(w1)
    if syn1 is None:
        return None
    syn2 = _firstSynsetByPos(w2)
    if syn2 is None:
        return None
    return syn1.wup_similarity(syn2)


def _firstSynsetByPos(word):
    """Return the first synset of ``word`` under the first POS (noun, verb, adj, adv) that has any, else None."""
    for pos in (wordnet.NOUN, wordnet.VERB, wordnet.ADJ, wordnet.ADV):
        candidates = wordnet.synsets(word, pos)
        if len(candidates) > 0:
            return candidates[0]
    return None
        
        
        
def getAntonymity(doc1, doc2):
    """Score how strongly ``doc2`` opposes ``doc1`` using the WordNet antonyms of ``doc1``.

    Args:
        doc1: Token-like object exposing ``.text``; its antonyms are looked up.
        doc2: Token-like object that is compared against those antonyms.

    Returns:
        ``0`` when ``doc1`` has no antonyms, ``1`` when ``doc2.text`` is one of
        them, otherwise the sum of ``getWUPSimilarity(doc2, antonym)`` over all
        antonyms. The sum is not divided by the number of antonyms and
        duplicate antonyms are counted each time they occur, so the value can
        exceed 1.
    """
    w2 = doc2.text
    # Same antonym list the hand-written synset/lemma loop produced.
    _, antonyms = getSynAnt(doc1.text)
    if not antonyms:
        return 0
    if w2 in antonyms:
        return 1

    total = 0
    for antonym in antonyms:
        # Each antonym string is run through the spaCy pipeline to obtain a token to compare with.
        score = getWUPSimilarity(doc2, nlp(antonym)[0])
        if score is not None:  # a missing similarity contributes nothing
            total += score
    return total

def getOrigDep(word):
    """Return the dependency label of ``word``, looking through ``'conj'`` links.

    While the current token is labelled ``'conj'`` the walk moves on to its
    ``head``; the first label that is not ``'conj'`` is returned.
    """
    current = word
    while current.dep_ == 'conj':
        current = current.head
    return current.dep_


def isStopWord(w):
    """Tell whether token ``w`` is a determiner or an English stop word.

    Args:
        w: Token-like object exposing ``.pos_`` and ``.text``.

    Returns:
        bool: ``True`` if ``w.pos_`` is ``'DET'`` or the lower-cased text of
        ``w`` is in NLTK's English stop-word list, else ``False``.
    """
    if w.pos_ == 'DET':
        return True
    # Compare the token's *text*: the token object itself never equals a string,
    # so testing `w in <stop words>` could not match anything.
    return w.text.lower() in _englishStopWords()


def _englishStopWords():
    """Return NLTK's English stop words as a frozenset, reading the corpus only on first use."""
    cached = getattr(_englishStopWords, '_cache', None)
    if cached is None:
        cached = frozenset(nltk.corpus.stopwords.words('english'))
        _englishStopWords._cache = cached
    return cached


def getMax(a_list):
    """Return the element of ``a_list`` with the largest absolute value (sign kept), or 0 if it is empty."""
    return max(a_list, key=abs) if len(a_list) > 0 else 0

In [4]:
class Item(object):
    """A token bundled with its WordNet synonyms/antonyms and its describing tokens.

    Attributes are set in ``__init__``:
      token      -- the token passed in
      synonyms   -- first list returned by ``getSynAnt(token.text)``
      antonyms   -- second list returned by ``getSynAnt(token.text)``
      describers -- result of ``getDescribers(token)``
    """

    def __init__(self, token):
        self.token = token
        self.synonyms, self.antonyms = getSynAnt(self.token.text)
        self.describers = self.getDescribers(self.token)

    @staticmethod
    def _isDescriberCandidate(t):
        """True unless ``t`` is a subject, object, auxiliary, punctuation or conjunction-POS token."""
        return not (
            t.dep_.endswith('subj')
            or t.dep_.endswith('obj')
            or t.dep_.endswith('aux')
            or t.is_punct
            or t.pos_.endswith('CONJ')
        )

    def getDescribers(self, token):
        """Collect the tokens below ``token`` that describe it.

        Walks ``token.children`` transitively, following only children accepted
        by ``_isDescriberCandidate`` (a rejected child is neither collected nor
        descended into). Tokens whose ``pos_`` is ``'ADP'`` or ``'DET'`` are
        traversed but left out of the result.

        Returns:
            list: the collected tokens, in the order the stack-based walk visited them.
        """
        res = []
        tokens = [t for t in token.children if self._isDescriberCandidate(t)]
        while tokens:
            current = tokens.pop()
            res.append(current)
            tokens.extend(t for t in current.children if self._isDescriberCandidate(t))
        # Build a new list instead of removing from `res` while iterating over it:
        # in-place removal skips the element that follows each removed one, which
        # let consecutive ADP/DET tokens slip through.
        return [t for t in res if t.pos_ not in ('ADP', 'DET')]
        
        
        
class Specification(object):
    """A parsed sentence split into subject, object and verb ``Item`` lists.

    Attributes are set in ``__init__``:
      text  -- the raw string passed in
      doc   -- result of ``nlp(text)``
      subjs -- Items whose resolved dependency label ends with ``'subj'``
      objs  -- Items whose resolved dependency label ends with ``'obj'``
      verbs -- Items for remaining tokens whose ``pos_`` is ``'VERB'``
    """

    def __init__(self, text):
        self.text = text
        self.doc = nlp(text)
        self.objs = []
        self.subjs = []
        self.verbs = []
        self.loadItems()

    def loadItems(self):
        """Sort every token of ``self.doc`` into subjs/objs/verbs; other tokens are ignored.

        The dependency label is taken from ``getOrigDep``. Subject is checked
        before object, and the VERB check applies only to tokens that are neither.
        """
        for token in self.doc:
            dep = getOrigDep(token)
            if dep.endswith('subj'):
                self.subjs.append(Item(token))
            elif dep.endswith('obj'):
                self.objs.append(Item(token))
            elif token.pos_ == 'VERB':
                self.verbs.append(Item(token))

    def getAllItems(self):
        """Return a new list holding objects, then subjects, then verbs."""
        return self.objs + self.subjs + self.verbs

    def getItem(self, word_text):
        """Return the first item (in ``getAllItems`` order) whose token text equals ``word_text``, else None."""
        for item in self.getAllItems():
            # An Item stores its token under `.token`; the text lives on that token.
            if item.token.text == word_text:
                return item
        return None
        
        


In [5]:
class FeatureExtractor(object):
    """Computes pairwise similarity/antonymity features between two specifications."""

    def getOneValueSim(self, token1, token2):
        """Return similarity minus antonymity for a token pair; a ``None`` score counts as 0."""
        similarity = getWUPSimilarity(token1, token2)
        antonymity = getAntonymity(token1, token2)
        if similarity is None:
            similarity = 0
        if antonymity is None:
            antonymity = 0
        return similarity - antonymity

    def getRelationship(self, item1, item2):
        """Combine four token/describer comparisons of two items into one signed score.

        The four inputs are: token vs. token, and the ``getMax`` pick of
        describers(item1) vs. token2, token1 vs. describers(item2), and
        describers vs. describers. The magnitude is the fourth root of the
        product of ``2 ** (score + 1)`` over the four inputs; the sign is the
        sign of the product of the non-zero inputs (positive if none).
        """
        direct = self.getOneValueSim(item1.token, item2.token)
        des1_vs_item2 = getMax(
            [self.getOneValueSim(d, item2.token) for d in item1.describers]
        )
        item1_vs_des2 = getMax(
            [self.getOneValueSim(item1.token, d) for d in item2.describers]
        )
        des1_vs_des2 = getMax([
            self.getOneValueSim(d1, d2)
            for d1 in item1.describers
            for d2 in item2.describers
        ])

        # Zero scores are left out so they cannot force the sign to 0.
        sign = 1
        for score in (direct, des1_vs_item2, item1_vs_des2, des1_vs_des2):
            if score != 0:
                sign = sign * score
        sign = np.sign(sign)

        magnitude = 1
        for score in (direct, item1_vs_des2, des1_vs_item2, des1_vs_des2):
            magnitude = magnitude * pow(2, score + 1)
        return pow(magnitude, 0.25) * sign

    def updateResDict(self, res, ant_res, key, val):
        """Append ``val`` under ``key`` to ``ant_res`` if it is negative, else to ``res``; return both dicts."""
        target = ant_res if val < 0 else res
        target.setdefault(key, []).append(val)
        return res, ant_res

    #returns synonymity and antonymity of subjects and objects
    def getSubObjFeaturesFirst(self, spec1, spec2):
        """Collect ``getRelationship`` scores for role pairs of ``spec1`` x ``spec2``.

        Returns:
            tuple: ``(res, ant_res)`` dicts mapping a key such as ``'subjobj'``
            to a list of scores; negative scores go to ``ant_res``.

        NOTE(review): no ``'verbverb'`` pair is ever computed here, so that
        feature is always 0 in ``getSubObjFeaturesFinal`` -- confirm this is intended.
        """
        # (role of spec1 items, spec1 items, [(role of spec2 items, spec2 items), ...]);
        # rows and inner lists are ordered so scores are produced in a fixed sequence.
        plan = [
            ('subj', spec1.subjs, [('subj', spec2.subjs), ('verb', spec2.verbs)]),
            ('subj', spec1.subjs, [('obj', spec2.objs)]),
            ('obj', spec1.objs, [('subj', spec2.subjs), ('verb', spec2.verbs)]),
            ('obj', spec1.objs, [('obj', spec2.objs)]),
            ('verb', spec1.verbs, [('obj', spec2.objs), ('subj', spec2.subjs)]),
        ]
        res = {}
        ant_res = {}
        for role1, items1, targets in plan:
            for t1 in items1:
                for role2, items2 in targets:
                    for t2 in items2:
                        res, ant_res = self.updateResDict(
                            res, ant_res, role1 + role2, self.getRelationship(t1, t2)
                        )
        return res, ant_res

    def getSubObjFeaturesFinal(self, spec1, spec2):
        """Return one mean score per role pair, plus an ``'ant_'``-prefixed mean of the negative scores.

        Every combination of ``subj``/``obj``/``verb`` gets both keys; a pair
        with no collected scores maps to 0.
        """
        roles = ('subj', 'obj', 'verb')
        res, ant_res = self.getSubObjFeaturesFirst(spec1, spec2)
        final_res = {}
        for first in roles:
            for second in roles:
                key = first + second
                final_res[key] = np.mean(res[key]) if key in res else 0
                final_res['ant_' + key] = np.mean(ant_res[key]) if key in ant_res else 0
        return final_res
    

In [7]:
# Smoke test: two short sentences about the crude-oil price, one containing
# "decrease" and the other "rose".
r1 = 'Crude oil for April delivery traded at $37.80 a barrel, decrease 28 cents.'
r2 = 'Crude oil prices rose to $37.80 per barrel.'

# Parse each sentence and split its tokens into subject/object/verb items.
spec1 = Specification(r1)
spec2 = Specification(r2)

fe = FeatureExtractor()
# x maps each role-pair key (e.g. 'subjobj') and its 'ant_'-prefixed twin to a mean score.
x = fe.getSubObjFeaturesFinal(spec1, spec2)
for key in x.keys():
    print(key, round(x[key],2))


subjsubj 3.08
ant_subjsubj 0
subjobj 2.08
ant_subjobj 0
subjverb 2.16
ant_subjverb 0
objsubj 2.24
ant_objsubj 0
objobj 2.32
ant_objobj 0
objverb 2.1
ant_objverb 0
verbsubj 1.13
ant_verbsubj 0
verbobj 2.0
ant_verbobj -1.73
verbverb 0
ant_verbverb 0
