In [1]:
import spacy
import nltk
import numpy as np
import math
from nltk.corpus import wordnet
from itertools import product
from typing import List
# Use the GPU for spaCy when one is available (falls back to CPU otherwise).
spacy.prefer_gpu()
# nlp_bas = spacy.load(r"C:\Users\Akarsha K R\AppData\Local\Programs\Python\Python37\Lib\site-packages\en_core_web_sm\en_core_web_sm-3.4.1")
# Load the large English pipeline by its installed package name so the notebook
# is not tied to one machine's directory layout. spaCy raises OSError when the
# name cannot be resolved; only then fall back to the original absolute path.
try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    nlp = spacy.load(r"C:\Users\Akarsha K R\AppData\Local\Programs\Python\Python37\Lib\site-packages\en_core_web_lg\en_core_web_lg-3.4.1")

In [2]:
from enum import Enum
class POS(Enum):
	"""Part-of-speech names whose values are the matching WordNet POS constants."""
	# NOTE(review): no other cell visible in this notebook references POS.
	VERB = wordnet.VERB
	NOUN = wordnet.NOUN
	ADJ = wordnet.ADJ
	ADV = wordnet.ADV

In [111]:
# Global debug switch: the classes and functions below print tracing output only when it is True.
PR = False


class Word:
    """Wrapper around one spaCy token together with its WordNet lookups.

    Attributes:
        raw_word: token text exactly as it appears in the sentence.
        smallcase_word: lower-cased token text.
        lemma: spaCy lemma of the token.
        pos: spaCy coarse part-of-speech tag of the token.
        synnets: WordNet synsets found for ``raw_word`` (empty unless the tag
            is VERB, NOUN, ADJ or ADV).
        antonyms: lower-cased names of the first antonym of every lemma, of
            every synset, that has at least one antonym.
        lemma_list: WordNet lemmas gathered by ``find_lemmas`` (empty until
            that method is called).
    """

    # spaCy coarse POS tags for which a WordNet lookup is attempted.
    _WORDNET_POS_TAGS = ('VERB', 'NOUN', 'ADJ', 'ADV')

    def __init__(self, spacy_token: "spacy.tokens.Token"):
        """Capture text, lemma and POS of ``spacy_token`` and run the WordNet lookups.

        Args:
            spacy_token: object exposing ``text``, ``lemma_`` and ``pos_``
                (a spaCy token in this notebook).
        """
        self.raw_word = spacy_token.text
        self.smallcase_word = spacy_token.text.lower()
        self.lemma = spacy_token.lemma_
        self.pos = spacy_token.pos_
        self.antonyms = []
        self.synnets = []
        self.lemma_list = []

        if self.pos in self._WORDNET_POS_TAGS:
            self.get_synnets()

        self.find_antonyms()

    def get_synnets(self):
        """Store in ``self.synnets`` every WordNet synset of the raw word.

        NOTE(review): the lookup is not restricted to the spaCy POS tag, so
        synsets of other parts of speech are included as well.
        """
        if PR:
            print(f"Finding synnets for word: {self.raw_word} POS identified by spaCy: {self.pos}")
        self.synnets = wordnet.synsets(self.raw_word)
        if PR:
            print(f"Total synnets found = {len(self.synnets)}")

    def print_synnets(self):
        """Print every stored synset with its definition (only when PR is True)."""
        for synnet in self.synnets:
            if PR:
                print(f"{synnet} - {synnet.definition()}")

    def __str__(self):
        """Return ``raw_word | lemma | pos | number of synsets``."""
        return f"{self.raw_word} | {self.lemma} | {self.pos} | {len(self.synnets)}"

    def find_lemmas(self):
        """Collect the lemmas of all stored synsets into ``self.lemma_list``.

        The list is rebuilt on every call and accumulates across synsets
        (previously each synset overwrote the result of the one before it).
        The total is reported once, and only when PR is True, like the other
        diagnostics of this class.
        """
        collected = []
        for synnet in self.synnets:
            collected.extend(synnet.lemmas())
        self.lemma_list = collected
        if PR:
            print(f"Total lemmas found for word : {self.raw_word} = {len(self.lemma_list)}")

    def find_antonyms(self):
        """Append to ``self.antonyms`` the first antonym of each lemma that has one."""
        for syn in self.synnets:
            for lem in syn.lemmas():
                opposites = lem.antonyms()  # looked up once per lemma
                if opposites:
                    self.antonyms.append(opposites[0].name().lower())
	



In [115]:
class Sentence:
    """A phrase run through the spaCy pipeline and reduced to its content words.

    Stop words are kept aside in ``stop_words``, punctuation is dropped, and
    every remaining token is wrapped in a ``Word`` stored in ``word_list``.
    """

    def __init__(self, string:str):
        self.raw_sentense = string

        # Run the spaCy pipeline on the raw phrase.
        self.doc = nlp(string)

        # Dump the pipeline output (header first) when tracing is enabled.
        if PR:
            print("\nText  |  Lemma  |  POS  |  Tag  |  Dep  |  Shape  |  Alpha  |  Stop  |")
        self.disp()

        # Word wrappers for the tokens that survive filtering.
        self.word_list: List[Word] = []

        self.stop_words = []

        #TODO : add custom stop words remover -> Improve performance
        for tok in self.doc:
            if tok.is_stop:
                # Stop words are remembered but never become Word objects.
                if PR:
                    print(f'Stop word removed: {tok.text}')
                self.stop_words.append(tok)
                continue
            if tok.is_punct:
                if PR:
                    print(f"Punctuation removed : {tok.text}")
                continue
            self.word_list.append(Word(tok))

    def disp(self):
        """Print one line per token of the spaCy output (only when PR is True)."""
        for tok in self.doc:
            if not PR:
                continue
            # Text | Lemma | POS | Tag | Dep | Shape | Alpha | Stop
            columns = (
                tok.text, tok.lemma_, tok.pos_, tok.tag_,
                tok.dep_, tok.shape_, tok.is_alpha, tok.is_stop,
            )
            print(' | '.join(str(column) for column in columns))

    def get_word_set(self):
        """Return the set of lower-cased content words."""
        return {entry.smallcase_word for entry in self.word_list}

    def get_word_list(self):
        """Return a new list holding the Word objects in sentence order."""
        return list(self.word_list)

    def get_str_word_list(self):
        """Return the lower-cased content words in sentence order."""
        lowered = []
        for entry in self.word_list:
            lowered.append(entry.smallcase_word)
        return lowered

    def get_all_synnets(self):
        """Return one synset list per content word, in sentence order."""
        return [entry.synnets for entry in self.word_list]

In [165]:
class Wrapper:
    """Scores how similar two phrases are.

    The pipeline is: (1) an antonym check that short-circuits the score to 0,
    (2) a word-level score averaging spaCy vector similarity and WordNet path
    similarity, (3) a word-order score, and finally
    ``sqrt(lmbda**2 * word_order + (1 - lmbda**2) * word_level)``.

    Attributes:
        sentence1, sentence2: ``Sentence`` objects built from the two phrases.
        overall_score: final score (stays 0 when antonyms are found).
        has_antonyms: True once ``check_antonym`` finds an antonym pair.
        word_order_score: result of ``word_order_similarity``.
        vector_score: result of ``vector_similarity_score``.
        reason: ``'Antonym'`` when the score was forced to 0, else ``''``.
    """

    def __init__(self, phrase1, phrase2):
        # Create sentence objects for both phrases.
        self.sentence1 = Sentence(phrase1)
        self.sentence2 = Sentence(phrase2)
        self.overall_score = 0
        self.has_antonyms = False
        self.word_order_score = 0
        self.vector_score = 0
        self.reason = ''
        # spaCy docs already computed for single words, keyed by word text.
        self._doc_cache = {}

    def calculate_similarity_score(self, lmbda=0.6):
        """Run the full pipeline and store the results on the instance.

        Args:
            lmbda: weighting factor between 0 and 1; its square weights the
                word-order score and ``1 - lmbda**2`` the word-level score.

        Returns:
            None. Results are written to ``overall_score``, ``vector_score``,
            ``word_order_score`` and ``reason``.
        """
        # 1. Antonyms force the similarity to zero.
        self.check_antonym()
        if self.has_antonyms:
            print("Reason : Antonym , Similarity score = 0")
            self.reason = 'Antonym'
            return

        # 2. Vector + path similarity score.
        s_c = self.vector_similarity_score()
        self.vector_score = s_c

        # 3. Word order similarity score.
        s_w = self.word_order_similarity()
        self.word_order_score = s_w

        order_weight = math.pow(lmbda, 2)
        radicand = (order_weight * s_w) + ((1 - order_weight) * s_c)
        # Vector similarity can be negative; clamp so sqrt never raises a
        # math domain error in the middle of a batch run.
        score = math.sqrt(max(radicand, 0.0))
        self.overall_score = score
        print(f"Final score : {score}")

    def check_antonym(self):
        """Set ``has_antonyms`` if a word of sentence 1 is listed as an antonym of a word of sentence 2."""
        word_list1 = self.sentence1.get_word_list()
        word_list2 = self.sentence2.get_word_list()

        #TODO: Only check for VERB-VERB, NOUN-NOUN words for antonym: Improve efficiency
        for word1 in word_list1:
            for word2 in word_list2:
                if word1.smallcase_word in word2.antonyms:
                    print(f'Found antonym for word {word1.raw_word}')
                    self.has_antonyms = True

    #TODO: Compare/Calculate scores for only matching words - (sub-sub / obj-obj / verb-verb...)
    #TODO: If wordnet score is zero for any two words then dont take vector similarity (hard set score to 1.0)
    def vector_similarity_score(self):
        """Return the symmetric word-level similarity of the two sentences.

        For every word of one sentence the best match in the other sentence
        is taken, where a match is the mean of the spaCy vector similarity
        and the WordNet path similarity. The per-sentence averages of those
        best matches are then averaged.

        Returns:
            The combined score, or 0.0 when either sentence has no content
            words (previously this raised ValueError / ZeroDivisionError).
        """
        lst1 = self.sentence1.get_word_list()
        lst2 = self.sentence2.get_word_list()
        if not lst1 or not lst2:
            return 0.0

        avr_score1 = self._mean_best_match(lst1, lst2)
        avr_score2 = self._mean_best_match(lst2, lst1)

        score = (avr_score1 + avr_score2) / 2
        if PR:
            print(f"Combined word vector + Wordnet similarity score = {score}")
        return score

    def _mean_best_match(self, targets, candidates):
        """Average, over ``targets``, the best combined similarity against ``candidates``."""
        best_scores = [
            max(
                (self.spacy_vector_similarity(candidate, target)
                 + self.get_path_similarity(candidate, target)) / 2
                for candidate in candidates
            )
            for target in targets
        ]
        return sum(best_scores) / len(best_scores)

    def _doc_for(self, text):
        """Return the spaCy doc for ``text``, running the pipeline at most once per text."""
        cache = getattr(self, "_doc_cache", None)
        if cache is None:
            # Instance was created without __init__; start an empty cache.
            cache = self._doc_cache = {}
        if text not in cache:
            cache[text] = nlp(text)
        return cache[text]

    def spacy_vector_similarity(self, word1: "Word", word2: "Word"):
        """Return the spaCy ``similarity`` between the raw texts of two words."""
        doc1 = self._doc_for(word1.raw_word)
        doc2 = self._doc_for(word2.raw_word)
        score = doc1.similarity(doc2)
        if PR:
            print(f"SpaCy vector similarity (lemma) :{word1.raw_word} - {word2.raw_word} = {score}")
        return score

    def get_path_similarity(self, word1: "Word", word2: "Word"):
        """Return the highest WordNet path similarity over all synset pairs.

        Returns 0 when either word has no synsets or no pair is connected.
        """
        max_score = 0
        net1 = ''
        net2 = ''
        for synnet1 in word1.synnets:
            for synnet2 in word2.synnets:
                path_sim = wordnet.path_similarity(synnet1, synnet2)
                # path_similarity yields None when no path connects the
                # synsets; comparing None with a number would raise TypeError.
                if path_sim is not None and path_sim > max_score:
                    max_score = path_sim
                    net1 = synnet1
                    net2 = synnet2
        if PR:
            print(f"Max path similarity score {net1} , {net2} : {max_score}")
        return max_score

    #TODO: send application form, send application -> Should give maximum word order similarity. Sort in some way
    def word_order_similarity(self):
        """Return the word-order score ``1 - |V1 - V2| / |V1 + V2|``.

        Every distinct word of the two sentences gets a 1-based id in
        alphabetical order. ``V1[i]`` holds the id of the word whose first
        occurrence in sentence 1 is at position ``i`` (0 elsewhere); ``V2``
        is built the same way for sentence 2.

        Returns:
            The score, or 0.0 when neither sentence has content words.
        """
        str_word_list1 = self.sentence1.get_str_word_list()
        str_word_list2 = self.sentence2.get_str_word_list()

        # Sorting the union makes the ids deterministic; iterating a plain
        # set of strings changes order between interpreter runs.
        vocabulary = sorted(set(str_word_list1) | set(str_word_list2))
        if not vocabulary:
            return 0.0

        # Positions index into the sentences, so the vectors must be at least
        # as long as the longest sentence (repeated words make a sentence
        # longer than the vocabulary). Extra zeros do not change the norms.
        size = max(len(vocabulary), len(str_word_list1), len(str_word_list2))
        vector1 = [0] * size
        vector2 = [0] * size

        if PR:
            print(f"Union word list:{vocabulary}")

        for word_id, word in enumerate(vocabulary, start=1):
            if word in str_word_list1:
                vector1[str_word_list1.index(word)] = word_id
            if word in str_word_list2:
                vector2[str_word_list2.index(word)] = word_id
        if PR:
            print(f"Word order vectors formed :{vector1} {vector2}")

        vector_sum = np.add(vector1, vector2)
        vector_sub = np.subtract(vector1, vector2)

        # word order score Sw = 1 - ( |V1-V2| / |V1+V2| )
        wo_score = 1 - (np.linalg.norm(vector_sub) / np.linalg.norm(vector_sum))
        if PR:
            print(f"Word order score = {wo_score}")
        return wo_score

In [109]:
def wup_similarity(word1: "Word", word2: "Word"):
    """Return the highest WordNet Wu-Palmer similarity over all synset pairs.

    Args:
        word1, word2: objects exposing a ``synnets`` list of WordNet synsets.

    Returns:
        The maximum ``wordnet.wup_similarity`` over every pair of synsets, or
        0 when either word has no synsets or no pair yields a score.
    """
    max_score = 0
    if PR:
        print("*******************FINDING WUP SIMILARITY*******************")
    for synnet1 in word1.synnets:
        for synnet2 in word2.synnets:
            wup_sim = wordnet.wup_similarity(synnet1, synnet2)
            if PR:
                print(f"{synnet1} - {synnet2} : {wup_sim}")
            # A missing score comes back as None; comparing it with a number
            # would raise TypeError, so skip those pairs.
            if wup_sim is not None and wup_sim > max_score:
                max_score = wup_sim
    if PR:
        print(f"Max score : {max_score}")
    return max_score

In [164]:
# Demo: score one hand-written phrase pair with tracing enabled.
# PR is read by the classes above when they run, so setting it here turns on
# their debug prints for this cell.
PR = True
s1 = "high ranking requirements are met"
s2 = "requirements are not met or low ranking"
# Building the Wrapper runs spaCy on both phrases; the call below prints the
# outcome and stores the scores on `wr`.
wr = Wrapper(s1,s2)
wr.calculate_similarity_score()


Text  |  Lemma  |  POS  |  Tag  |  Dep  |  Shape  |  Alpha  |  Stop  |
high | high | ADJ | JJ | amod | xxxx | True | False
ranking | ranking | NOUN | NN | amod | xxxx | True | False
requirements | requirement | NOUN | NNS | nsubjpass | xxxx | True | False
are | be | AUX | VBP | auxpass | xxx | True | True
met | meet | VERB | VBN | ROOT | xxx | True | False
Finding synnets for word: high POS identified by spaCy: ADJ
Total synnets found = 18
Finding synnets for word: ranking POS identified by spaCy: NOUN
Total synnets found = 5
Finding synnets for word: requirements POS identified by spaCy: NOUN
Total synnets found = 3
Stop word removed: are
Finding synnets for word: met POS identified by spaCy: VERB
Total synnets found = 13

Text  |  Lemma  |  POS  |  Tag  |  Dep  |  Shape  |  Alpha  |  Stop  |
requirements | requirement | NOUN | NNS | nsubjpass | xxxx | True | False
are | be | AUX | VBP | auxpass | xxx | True | True
not | not | PART | RB | neg | xxx | True | True
met | meet | VERB | V

In [166]:
# Batch run: silence tracing, read the labels from the mapping file and score
# every pair among the first MAX_LABELS labels.
PR = False
import json

# How many labels (from the start of the list) take part in the pairwise comparison.
MAX_LABELS = 50

# The context manager closes the file even when JSON parsing fails.
with open('mapping_87.json') as f:
    data = json.load(f)

pair = data['Mapping']

# Keep every 'label' value that is present and not blank.
lst = []
for value in pair.values():
    vl = value.get('label')
    if vl is not None and vl not in ("", " "):
        lst.append(vl)

print(len(lst))

# One row per compared pair; turned into a DataFrame in a later cell.
df = []

# Never index past the end of the list when fewer than MAX_LABELS labels exist.
limit = min(MAX_LABELS, len(lst))
for i in range(limit):
    # j starts at i, so every label is also compared with itself.
    for j in range(i, limit):
        st1 = lst[i]
        st2 = lst[j]
        wr = Wrapper(st1, st2)
        wr.calculate_similarity_score()
        # Phrase 1, Phrase 2, reason, vector score, word order score, overall score
        row = [st1, st2, wr.reason, wr.vector_score, wr.word_order_score, wr.overall_score]
        df.append(row)



1032
Final score : 0.7211102550927979
Final score : 0.4019971040782163
Final score : 0.4323709231042373
Final score : 0.428957227988723
Final score : 0.5139000840857826
Final score : 0.4282822696063488
Final score : 0.45077379147796787
Final score : 0.45238401388519406
Final score : 0.41881621795784313
Final score : 0.47103796333289977
Final score : 0.4412553010640825
Final score : 0.3990853217015079
Final score : 0.39916584308846437
Final score : 0.4325123255405092




Final score : 0.3690592168588078
Final score : 0.42568877194207955
Final score : 0.4984764283107509
Final score : 0.41701507357272416
Final score : 0.45520372213549226
Final score : 0.39023211452919976
Final score : 0.49888278196409463
Final score : 0.43027250786901433
Final score : 0.43027250786901433
Final score : 0.3400352692030368
Final score : 0.39975214404690884
Final score : 0.4757169675918601
Final score : 0.4383275784769191
Final score : 0.462180272422927
Final score : 0.4419203977926675
Final score : 0.4598888071018222
Final score : 0.4401871042950192
Final score : 0.4401871042950192
Final score : 0.39767077073732116
Final score : 0.45888658488969947
Final score : 0.480416249930961
Final score : 0.43027250786901433
Final score : 0.4736360866437639
Final score : 0.39767077073732116
Final score : 0.48883668086043397
Final score : 0.4499373968379157
Final score : 0.40293224107929376
Final score : 0.416999398918246
Final score : 0.4385278345425819
Final score : 0.4507945571511558

In [167]:
# Build a DataFrame from the result rows collected in `df` (one row per phrase pair);
# it is written to Excel in the next cell.
import pandas as pd
from IPython.display import display
# Column order matches the order of the values in each row of `df`.
dataframe = pd.DataFrame(df, columns =['Phrase 1', 'Phrase 2', 'Reason', 'Vector + Similarity score', 'Word order score', 'Final score'])
# dataframe.show()
# display(df)

In [168]:
# Write the results to out.xlsx. The context manager saves and closes the
# workbook on exit, including when to_excel raises; ExcelWriter.save() is no
# longer available in pandas 2.x.
with pd.ExcelWriter('out.xlsx', engine='xlsxwriter') as writer:
    dataframe.to_excel(writer, sheet_name='Sheet1')