In [67]:
import csv 
import nltk
from nltk.stem import WordNetLemmatizer
import gensim.downloader as api 
from itertools import combinations 

In [68]:
nltk.download('wordnet')
model = api.load('word2vec-google-news-300')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rafailiavlachou/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [69]:
import pandas as pd 
df = pd.read_csv('../data/relations_count.csv')
df['lower_relations'] = df['relation'].apply(lambda x: x.lower().replace('_', ' '))
df = df.groupby(['lower_relations']).sum('count')
df.to_csv('../data/unique_relations.csv')

In [70]:

data_path = '../data/unique_relations.csv'
with open(data_path, 'r') as d:
    data = csv.reader(d)
    raw_relations = [row[0] for row in data][1:]

In [71]:
len(raw_relations)

1461

In [72]:
df_new = pd.read_csv(data_path)
df_new 

Unnamed: 0,lower_relations,count
0,abandon,3
1,abandonment of,1
2,abolish,2
3,accede to pressure from,1
4,accelerate,1
...,...,...
1456,worried about,1
1457,worry about,1
1458,write,21
1459,write on,1


In [73]:
lemmatizer = WordNetLemmatizer()

In [74]:
df_new['lematised'] = df_new['lower_relations'].apply(lambda x: lemmatizer.lemmatize(x, 'v'))
df_new 

Unnamed: 0,lower_relations,count,lematised
0,abandon,3,abandon
1,abandonment of,1,abandonment of
2,abolish,2,abolish
3,accede to pressure from,1,accede to pressure from
4,accelerate,1,accelerate
...,...,...,...
1456,worried about,1,worried about
1457,worry about,1,worry about
1458,write,21,write
1459,write on,1,write on


In [75]:
lemmatized_relations = [lemmatizer.lemmatize(relation, 'v') for relation in raw_relations]
mapping_1 = dict(zip(raw_relations, lemmatized_relations))
mapping_1

{'abandon': 'abandon',
 'abandonment of': 'abandonment of',
 'abolish': 'abolish',
 'accede to pressure from': 'accede to pressure from',
 'accelerate': 'accelerate',
 'accept': 'accept',
 'accept as': 'accept as',
 'accept with': 'accept with',
 'accepted as': 'accepted as',
 'accepts': 'accept',
 'access': 'access',
 'accessing': 'access',
 'accommodate': 'accommodate',
 'accompanied': 'accompany',
 'accuse': 'accuse',
 'accused': 'accuse',
 'accused in': 'accused in',
 'accused of': 'accused of',
 'achieve': 'achieve',
 'achieved': 'achieve',
 'acknowledge': 'acknowledge',
 'acquiesce': 'acquiesce',
 'acquire': 'acquire',
 'acquired': 'acquire',
 'acquit': 'acquit',
 'act as checks on': 'act as checks on',
 'acted as': 'acted as',
 'adapt': 'adapt',
 'add': 'add',
 'address': 'address',
 'addressed': 'address',
 'adjacent to': 'adjacent to',
 'administer': 'administer',
 'admire': 'admire',
 'admit': 'admit',
 'adopt': 'adopt',
 'advance': 'advance',
 'advance on': 'advance on',
 'a

In [76]:
words = list(set(lemmatized_relations))

words = [word for word in words if word in model]

similarities = {}
for word1, word2 in combinations(words, 2):
    if word1 != word2:
        similarity = model.similarity(word1, word2)
        if similarity > 0.7:
            similarities[(word1, word2)] = similarity

sorted_similarities = sorted(similarities.items(), key=lambda item: item[1], reverse=True)


for (word1, word2), similarity in sorted_similarities:
    print(f"Similarity between '{word1}' and '{word2}': {similarity:.4f}")

Similarity between 'convince' and 'persuade': 0.8929
Similarity between 'increase' and 'decrease': 0.8370
Similarity between 'safeguard' and 'protect': 0.8329
Similarity between 'sell' and 'buy': 0.8308
Similarity between 'categorize' and 'classify': 0.8047
Similarity between 'denounce' and 'condemn': 0.8003
Similarity between 'minimize' and 'mitigate': 0.7943
Similarity between 'lessen' and 'minimize': 0.7697
Similarity between 'purchase' and 'buy': 0.7640
Similarity between 'lessen' and 'reduce': 0.7530
Similarity between 'indicate' and 'suggest': 0.7501
Similarity between 'supervise' and 'oversee': 0.7463
Similarity between 'recognize' and 'acknowledge': 0.7424
Similarity between 'build' and 'construct': 0.7395
Similarity between 'vow' and 'pledge': 0.7381
Similarity between 'know' and 'tell': 0.7357
Similarity between 'think' and 'know': 0.7344
Similarity between 'reaffirm' and 'reiterate': 0.7286
Similarity between 'minimize' and 'reduce': 0.7281
Similarity between 'erode' and 'we

In [77]:
sorted_similarities

[(('convince', 'persuade'), 0.89288),
 (('increase', 'decrease'), 0.8370319),
 (('safeguard', 'protect'), 0.83287936),
 (('sell', 'buy'), 0.8308461),
 (('categorize', 'classify'), 0.8046503),
 (('denounce', 'condemn'), 0.80032325),
 (('minimize', 'mitigate'), 0.7943238),
 (('lessen', 'minimize'), 0.76968896),
 (('purchase', 'buy'), 0.7639905),
 (('lessen', 'reduce'), 0.75301445),
 (('indicate', 'suggest'), 0.7500874),
 (('supervise', 'oversee'), 0.74632823),
 (('recognize', 'acknowledge'), 0.74235857),
 (('build', 'construct'), 0.7394898),
 (('vow', 'pledge'), 0.7381013),
 (('know', 'tell'), 0.7356717),
 (('think', 'know'), 0.7344213),
 (('reaffirm', 'reiterate'), 0.72863954),
 (('minimize', 'reduce'), 0.72813696),
 (('erode', 'weaken'), 0.72559845),
 (('begin', 'commence'), 0.7244425),
 (('believe', 'say'), 0.7188932),
 (('reaffirm', 'affirm'), 0.71827835),
 (('strengthen', 'bolster'), 0.7159493),
 (('boost', 'bolster'), 0.7133502),
 (('cancel', 'postpone'), 0.71189785),
 (('undermine

In [78]:
len(sorted_similarities)

32

In [79]:

data_path = '../data/unique_relations.csv'
with open(data_path, 'r') as d:
    data = csv.reader(d)
    scores = [row[1] for row in data][1:]
relation_counts = dict(zip(raw_relations, scores))
lemma_counts = {}
for r in relation_counts:
    lemma_counts[mapping_1[r]] = relation_counts[r]

In [80]:
pairs = [i[0] for i in sorted_similarities]
mapping_2 = {}
for pair in pairs:
    mapping_2[pair[0]] = pair[1]
print(mapping_2)




{'convince': 'persuade', 'increase': 'decrease', 'safeguard': 'protect', 'sell': 'buy', 'categorize': 'classify', 'denounce': 'condemn', 'minimize': 'reduce', 'lessen': 'reduce', 'purchase': 'buy', 'indicate': 'suggest', 'supervise': 'oversee', 'recognize': 'acknowledge', 'build': 'construct', 'vow': 'pledge', 'know': 'tell', 'think': 'know', 'reaffirm': 'affirm', 'erode': 'weaken', 'begin': 'commence', 'believe': 'say', 'strengthen': 'bolster', 'boost': 'bolster', 'cancel': 'postpone', 'undermine': 'weaken', 'characterize': 'describe', 'determine': 'assess', 'despise': 'dislike', 'contend': 'argue'}


In [123]:
final = {}
for r in raw_relations:
    tmp = mapping_1[r]
    if tmp in mapping_2:
        final[r]= mapping_2[tmp]
    else:
        final[r] = tmp

In [124]:
len(final)

1461

In [126]:
unique_relations = set(final.values())
len(unique_relations) # was 1491

1306

In [83]:
df_new['processed relations'] = df_new['lower_relations'].apply(lambda x: final[x])
df_new 

Unnamed: 0,lower_relations,count,lematised,processed relations
0,abandon,3,abandon,abandon
1,abandonment of,1,abandonment of,abandonment of
2,abolish,2,abolish,abolish
3,accede to pressure from,1,accede to pressure from,accede to pressure from
4,accelerate,1,accelerate,accelerate
...,...,...,...,...
1456,worried about,1,worried about,worried about
1457,worry about,1,worry about,worry about
1458,write,21,write,write
1459,write on,1,write on,write on


In [84]:
final['recognize']

'acknowledge'

In [44]:
with open("relation_mapping.csv", "w") as f:
   for key in final.keys():
        f.write("%s,%s\n"%(key,final[key]))

Same analysis with sentence embeddings instead of work embeddings.

In [85]:
from sentence_transformers import SentenceTransformer
model2 = SentenceTransformer("all-MiniLM-L6-v2")

In [86]:
words2 = list(set(lemmatized_relations))

In [88]:
word_embeddings = model2.encode(words2)

In [143]:
from sentence_transformers import util
similarities = {}
all_pairs = []
semantic_scores = util.semantic_search(word_embeddings, word_embeddings)
for i, score in enumerate(semantic_scores):
    pair = (words2[i], words2[score[1]['corpus_id']])
    a, b = pair
    if a == b:
        print('duplicates')
    if set(pair) not in all_pairs:
        all_pairs.append(set(pair))
        similarity = score[1]['score']
        if similarity > 0.7:
            similarities[pair] = similarity

sorted_similarities2 = sorted(similarities.items(), key=lambda item: item[1], reverse=True)
print(len(sorted_similarities2))

340


In [132]:
sorted_similarities2

[(('resign', 'resign from'), 0.9724125266075134),
 (('withdraw from', 'withdraw'), 0.965355396270752),
 (('comply with', 'comply'), 0.9633784294128418),
 (('sworn in', 'sworn in as'), 0.958960235118866),
 (('emigrated to', 'emigrated from'), 0.9531527161598206),
 (('negotiate', 'negotiate with'), 0.9521523118019104),
 (('intervene', 'intervene in'), 0.95151287317276),
 (('sworn-in', 'sworn in'), 0.9486343264579773),
 (('concerned about', 'concerned with'), 0.9483494162559509),
 (('involved with', 'involved in'), 0.9464719295501709),
 (('file complaint', 'file complaint against'), 0.9460045099258423),
 (('participate in', 'participate'), 0.9457226991653442),
 (('appeal', 'appeal for'), 0.94526207447052),
 (('campaign against', 'campaigns against'), 0.9440921545028687),
 (('compete', 'compete for'), 0.9440404772758484),
 (('was born in', 'born in'), 0.940864622592926),
 (('countersue', 'countersued'), 0.9405544400215149),
 (('tour', 'tour with'), 0.940220057964325),
 (('concede', 'conced

In [133]:
pairs = [i[0] for i in sorted_similarities2]
mapping_22 = {}
for pair in pairs:
    mapping_22[pair[0]] = pair[1]

In [134]:
final2 = {}
for r in raw_relations:
    tmp= mapping_1[r]
    if tmp in mapping_22:
        final2[r]= mapping_22[tmp]
    else:
        final2[r] = tmp

In [137]:
unique_relations2 = len(set(final2.values()))
unique_relations2

1055

In [138]:
len(unique_relations) - unique_relations2

251

In [141]:
with open("relation_mapping_sentence_transformer.csv", "w") as f:
   for key in final2.keys():
        f.write("%s,%s\n"%(key,final2[key]))