In [1]:
import csv 
import nltk
from nltk.stem import WordNetLemmatizer
import gensim.downloader as api 
from itertools import combinations 

In [2]:
nltk.download('wordnet')
model = api.load('word2vec-google-news-300')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rafailiavlachou/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [29]:
import pandas as pd 
df = pd.read_csv('../data/relations_count.csv')
df['lower_relations'] = df['relation'].apply(lambda x: x.lower().replace('_', ' '))
df = df.groupby(['lower_relations']).sum('count')
df.to_csv('../data/unique_relations.csv')

In [4]:

data_path = '../data/unique_relations.csv'
with open(data_path, 'r') as d:
    data = csv.reader(d)
    raw_relations = [row[0] for row in data][1:]

In [5]:
len(raw_relations)

1461

In [6]:
df_new = pd.read_csv(data_path)
df_new 

Unnamed: 0,lower_relations,count
0,abandon,3
1,abandonment of,1
2,abolish,2
3,accede to pressure from,1
4,accelerate,1
...,...,...
1456,worried about,1
1457,worry about,1
1458,write,21
1459,write on,1


In [7]:
lemmatizer = WordNetLemmatizer()

In [8]:
df_new['lematised'] = df_new['lower_relations'].apply(lambda x: lemmatizer.lemmatize(x, 'v'))
df_new 

Unnamed: 0,lower_relations,count,lematised
0,abandon,3,abandon
1,abandonment of,1,abandonment of
2,abolish,2,abolish
3,accede to pressure from,1,accede to pressure from
4,accelerate,1,accelerate
...,...,...,...
1456,worried about,1,worried about
1457,worry about,1,worry about
1458,write,21,write
1459,write on,1,write on


In [9]:
lemmatized_relations = [lemmatizer.lemmatize(relation, 'v') for relation in raw_relations]
mapping_1 = dict(zip(raw_relations, lemmatized_relations))
mapping_1

{'abandon': 'abandon',
 'abandonment of': 'abandonment of',
 'abolish': 'abolish',
 'accede to pressure from': 'accede to pressure from',
 'accelerate': 'accelerate',
 'accept': 'accept',
 'accept as': 'accept as',
 'accept with': 'accept with',
 'accepted as': 'accepted as',
 'accepts': 'accept',
 'access': 'access',
 'accessing': 'access',
 'accommodate': 'accommodate',
 'accompanied': 'accompany',
 'accuse': 'accuse',
 'accused': 'accuse',
 'accused in': 'accused in',
 'accused of': 'accused of',
 'achieve': 'achieve',
 'achieved': 'achieve',
 'acknowledge': 'acknowledge',
 'acquiesce': 'acquiesce',
 'acquire': 'acquire',
 'acquired': 'acquire',
 'acquit': 'acquit',
 'act as checks on': 'act as checks on',
 'acted as': 'acted as',
 'adapt': 'adapt',
 'add': 'add',
 'address': 'address',
 'addressed': 'address',
 'adjacent to': 'adjacent to',
 'administer': 'administer',
 'admire': 'admire',
 'admit': 'admit',
 'adopt': 'adopt',
 'advance': 'advance',
 'advance on': 'advance on',
 'a

In [10]:
words = list(set(lemmatized_relations))

words = [word for word in words if word in model]

similarities = {}
for word1, word2 in combinations(words, 2):
    if word1 != word2:
        similarity = model.similarity(word1, word2)
        if similarity > 0.7:
            similarities[(word1, word2)] = similarity

sorted_similarities = sorted(similarities.items(), key=lambda item: item[1], reverse=True)


for (word1, word2), similarity in sorted_similarities:
    print(f"Similarity between '{word1}' and '{word2}': {similarity:.4f}")

Similarity between 'convince' and 'persuade': 0.8929
Similarity between 'increase' and 'decrease': 0.8370
Similarity between 'protect' and 'safeguard': 0.8329
Similarity between 'buy' and 'sell': 0.8308
Similarity between 'categorize' and 'classify': 0.8047
Similarity between 'condemn' and 'denounce': 0.8003
Similarity between 'minimize' and 'mitigate': 0.7943
Similarity between 'lessen' and 'minimize': 0.7697
Similarity between 'purchase' and 'buy': 0.7640
Similarity between 'lessen' and 'reduce': 0.7530
Similarity between 'indicate' and 'suggest': 0.7501
Similarity between 'supervise' and 'oversee': 0.7463
Similarity between 'recognize' and 'acknowledge': 0.7424
Similarity between 'build' and 'construct': 0.7395
Similarity between 'vow' and 'pledge': 0.7381
Similarity between 'know' and 'tell': 0.7357
Similarity between 'know' and 'think': 0.7344
Similarity between 'reaffirm' and 'reiterate': 0.7286
Similarity between 'minimize' and 'reduce': 0.7281
Similarity between 'weaken' and 'e

In [11]:
sorted_similarities

[(('convince', 'persuade'), 0.89288),
 (('increase', 'decrease'), 0.8370319),
 (('protect', 'safeguard'), 0.83287936),
 (('buy', 'sell'), 0.8308461),
 (('categorize', 'classify'), 0.8046503),
 (('condemn', 'denounce'), 0.80032325),
 (('minimize', 'mitigate'), 0.7943238),
 (('lessen', 'minimize'), 0.76968896),
 (('purchase', 'buy'), 0.7639905),
 (('lessen', 'reduce'), 0.75301445),
 (('indicate', 'suggest'), 0.7500874),
 (('supervise', 'oversee'), 0.74632823),
 (('recognize', 'acknowledge'), 0.74235857),
 (('build', 'construct'), 0.7394898),
 (('vow', 'pledge'), 0.7381013),
 (('know', 'tell'), 0.7356717),
 (('know', 'think'), 0.7344213),
 (('reaffirm', 'reiterate'), 0.72863954),
 (('minimize', 'reduce'), 0.72813696),
 (('weaken', 'erode'), 0.72559845),
 (('commence', 'begin'), 0.7244425),
 (('believe', 'say'), 0.7188932),
 (('affirm', 'reaffirm'), 0.71827835),
 (('bolster', 'strengthen'), 0.7159493),
 (('bolster', 'boost'), 0.7133502),
 (('cancel', 'postpone'), 0.71189785),
 (('undermine

In [408]:

data_path = '../data/unique_relations.csv'
with open(data_path, 'r') as d:
    data = csv.reader(d)
    scores = [row[1] for row in data][1:]
relation_counts = dict(zip(raw_relations, scores))
lemma_counts = {}
for r in relation_counts:
    lemma_counts[mapping_1[r]] = relation_counts[r]
lemma_counts

{'abandon': '3',
 'abandonment of': '1',
 'abolish': '2',
 'accede to pressure from': '1',
 'accelerate': '1',
 'accept': '1',
 'accept as': '1',
 'accept with': '1',
 'accepted as': '1',
 'access': '1',
 'accommodate': '1',
 'accompany': '2',
 'accuse': '1',
 'accused in': '1',
 'accused of': '2',
 'achieve': '2',
 'acknowledge': '8',
 'acquiesce': '1',
 'acquire': '2',
 'acquit': '8',
 'act as checks on': '1',
 'acted as': '1',
 'adapt': '1',
 'add': '1',
 'address': '1',
 'adjacent to': '1',
 'administer': '2',
 'admire': '1',
 'admit': '6',
 'adopt': '6',
 'advance': '2',
 'advance on': '1',
 'advance to': '2',
 'adversely affect': '1',
 'advise': '5',
 'advised against': '1',
 'advisor': '1',
 'advocate': '1',
 'advocate for': '8',
 'affect': '1',
 'affirm': '1',
 'against': '5',
 'aggravate': '1',
 'agree': '4',
 'agree to': '4',
 'agree to run for': '1',
 'agree to withdraw': '1',
 'agree to work': '1',
 'aim to comply with': '1',
 'alert': '1',
 'align': '3',
 'allege': '5',
 '

In [12]:
pairs = [i[0] for i in sorted_similarities]
mapping_2 = {}
for pair in pairs:
    mapping_2[pair[0]] = pair[1]
print(mapping_2)




{'convince': 'persuade', 'increase': 'decrease', 'protect': 'safeguard', 'buy': 'sell', 'categorize': 'classify', 'condemn': 'denounce', 'minimize': 'reduce', 'lessen': 'reduce', 'purchase': 'buy', 'indicate': 'suggest', 'supervise': 'oversee', 'recognize': 'acknowledge', 'build': 'construct', 'vow': 'pledge', 'know': 'think', 'reaffirm': 'reiterate', 'weaken': 'erode', 'commence': 'begin', 'believe': 'say', 'affirm': 'reaffirm', 'bolster': 'boost', 'cancel': 'postpone', 'undermine': 'weaken', 'characterize': 'describe', 'assess': 'determine', 'dislike': 'despise', 'argue': 'contend'}


In [13]:
final = {}
for r in raw_relations:
    if r in mapping_2:
        final[r]= mapping_2[r]
    else:
        final[r] = mapping_1[r]

In [15]:
unique_relations = set(final.values())
len(unique_relations) # was 1491

1310

In [17]:
df_new['processed relations'] = df_new['lower_relations'].apply(lambda x: final[x])
df_new 

Unnamed: 0,lower_relations,count,lematised,processed relations
0,abandon,3,abandon,abandon
1,abandonment of,1,abandonment of,abandonment of
2,abolish,2,abolish,abolish
3,accede to pressure from,1,accede to pressure from,accede to pressure from
4,accelerate,1,accelerate,accelerate
...,...,...,...,...
1456,worried about,1,worried about,worried about
1457,worry about,1,worry about,worry about
1458,write,21,write,write
1459,write on,1,write on,write on


In [24]:
final['recognize']

'acknowledge'

In [44]:
with open("relation_mapping.csv", "w") as f:
   for key in final.keys():
        f.write("%s,%s\n"%(key,final[key]))