In [1]:
import pickle
from tqdm import tqdm
from collections import defaultdict

In [2]:
# Read the whole source text into memory; 'r' (text mode) is open()'s default.
with open('fernald.txt') as f:
    text = f.read()

# One list item per line of the file, split on '\n' only.
lines = text.split('\n')

In [3]:
def extract_next_entry(lines,start,end):
    '''
    Extract the next entry delimited by two separator lines.

    The scan covers `lines[start:end]` (`end` itself is never read). An entry
    is the run of lines strictly between one separator line and the next.

    Returns a tuple `(entry_end, entry)`:
      * `entry_end` is the index of the closing separator, so it can be passed
        back as `start` to fetch the following entry;
      * `entry` is the list of lines between the two separators.

    When the scan reaches `end` without meeting a closing separator,
    `entry_end` is `end - 1` and the entry stops one line before `end`.
    When no opening separator is found the entry is an empty list.
    '''
    separator = '       *       *       *       *       *'

    def index_after_separator(position):
        # Index just past the first separator found in [position, end).
        for index in range(position, end):
            if lines[index] == separator:
                return index + 1
        # Nothing found: the scan stopped at `end`, or never started when
        # `position` was already at or beyond `end`.
        return max(position, end)

    entry_start = index_after_separator(start)
    entry_end = index_after_separator(entry_start) - 1
    return (entry_end, lines[entry_start:entry_end])
    

In [4]:
def _drop_trailing_punctuation(word):
    '''Remove the last character of `word` only if it is not a letter or digit.'''
    if word and not word[-1].isalnum():
        return word[:-1]
    return word


def get_synonyms(entry):
    '''
    Return the head word of an entry together with its single-word synonyms.

    `entry` must be a list of text lines. The head is read from `entry[1]`
    and the synonyms from the lines that follow a 'Synonyms:' / 'Synonym:'
    line (after skipping one line), up to the next empty line or the end of
    the entry.

    Returns a tuple `(head, syns)`:
      * `head` is the lower-cased headline; when the headline contains commas
        its stripped parts are concatenated (e.g. 'ABSTRACT, _v._' gives
        'abstract_v._'), otherwise one trailing punctuation character is
        removed (e.g. 'ABANDON.' gives 'abandon');
      * `syns` is the list of synonyms that do not contain a space.

    Raises ValueError when the entry has no head, when the head contains a
    space, or when the entry has no synonym list.
    '''
    if len(entry) < 2:
        raise ValueError('No entry head found.')

    headline = entry[1].split(',')
    if len(headline) == 1:
        # Only strip the last character when it really is punctuation, so a
        # headline without a final period is not truncated.
        head = _drop_trailing_punctuation(headline[0]).lower()
    else:
        head = ''.join(head_part.strip() for head_part in headline).lower()

    if not head:
        raise ValueError('No entry head found.')

    if ' ' in head:
        print(entry[1])
        raise ValueError('Entry head is not a single word.')

    syn_list_start = None
    for line_index, line in enumerate(entry):
        if line in ('Synonyms:', 'Synonym:'):
            # Skip 'Synonyms:' and the following empty line.
            syn_list_start = line_index + 2
            break
    if syn_list_start is None:
        raise ValueError('No synonym list found.')

    # The list ends at the next empty line, or at the end of the entry.
    syn_list_end = syn_list_start
    while syn_list_end < len(entry) and entry[syn_list_end]:
        syn_list_end += 1

    # Strip before filtering so whitespace-only tokens are dropped too.
    syns = [
        word.strip()
        for line in entry[syn_list_start:syn_list_end]
        for word in line.split(',')
        if word.strip()
    ]

    if syns:
        # Remove the punctuation that closes the list, if there is any.
        syns[-1] = _drop_trailing_punctuation(syns[-1])

    # Remove compound-word synonyms (and anything left empty).
    syns = [syn_word for syn_word in syns if syn_word and ' ' not in syn_word]

    return (head, syns)


In [5]:
# Line range of the text that is scanned for entries: first index scanned,
# and the index at which scanning stops.
partI_range = (453,22058)
part_start, part_end = partI_range

# Maps each entry head to its list of single-word synonyms.
syn_pairs = {}

nline = part_start
while nline < part_end:
    # `nline` comes back as the index of the separator closing the entry,
    # which is where the search for the next entry resumes.
    nline, entry = extract_next_entry(lines, nline, part_end)
    if not entry:
        # An empty entry means nothing more was found in the range.
        break
    head, syns = get_synonyms(entry)
    syn_pairs[head] = syns

In [9]:
def _load_word_set(path):
    '''Unpickle the object stored at `path` and return its items as a set.'''
    # NOTE: pickle.load can execute arbitrary code; only use trusted files.
    with open(path, 'rb') as f:
        return set(pickle.load(f))


adjs = _load_word_set('./words/adjs_list.pickle')
nouns = _load_word_set('./words/nouns_list.pickle')
verbs = _load_word_set('./words/verbs_list.pickle')

In [10]:

final_syn_pairs = {'A': dict(), 'N': dict(), 'V': dict()}

# (output key, vocabulary, PoS tag) in priority order: the first category
# whose test passes is the only one considered for a given head.
pos_categories = (
    ('A', adjs, 'a.'),
    ('N', nouns, 'n.'),
    ('V', verbs, 'v.'),
)

for full_head, syns in tqdm(syn_pairs.items()):
    # Heads look like 'word' or 'word_tag_...'; keep the word and the tag.
    head_parts = full_head.split('_')
    if len(head_parts) > 1:
        head, pos = head_parts[:2]
    else:
        head, pos = head_parts[0], ''

    for category, vocabulary, tag in pos_categories:
        # A head without a tag is accepted by any category it belongs to.
        if head in vocabulary and (pos == tag or not pos):
            # Keep only the synonyms known to the same vocabulary.
            select = [syn for syn in syns if syn in vocabulary]
            if select:
                final_syn_pairs[category][head] = select
            break

100%|██████████| 629/629 [00:00<00:00, 71112.89it/s]


In [11]:
# Report, for each part of speech, how many heads were kept and how many
# (head, synonym) pairs they represent.
for pos, pairs in final_syn_pairs.items():
    nb_heads = len(pairs)
    nb_pairs = sum(len(syns) for syns in pairs.values())
    print(f'PoS : {pos}, nb heads : {nb_heads}, nb pairs: {nb_pairs}')

PoS : A, nb heads : 143, nb pairs: 1450
PoS : N, nb heads : 308, nb pairs: 2519
PoS : V, nb heads : 110, nb pairs: 616


In [15]:
# Write one text file per part of speech; each line holds a head followed by
# its synonyms, separated by single spaces.
for pos, pairs in final_syn_pairs.items():
    with open(f'./words/words2graph_{pos}.txt','w',encoding='utf-8') as f:
        for head, syns in pairs.items():
            f.write(' '.join([head, *syns]) + '\n')

In [36]:
# Persist the filtered synonym dictionary (keys 'A', 'N', 'V', each mapping a
# head to its list of synonyms) as a pickle file.
with open('./words/fernald_synonyms.pickle','wb') as f:
    pickle.dump(final_syn_pairs, f)