# Synonym transformation

## Main imports and paths

In [1]:
import os
import pandas as pd

In [2]:
PATH_SOURCE = './synonym_list/source/'
FINAL_SOURCE = './synonym_list/transformed/'

DATA_FILE_NAMES = sorted(os.listdir(PATH_SOURCE))
DATA_FILE_NAMES

['Common names - Pest names.csv']

## Pest names

In [3]:
FILE_NAME = DATA_FILE_NAMES[0]
df = pd.read_csv(PATH_SOURCE + FILE_NAME)
df.sample(5)

Unnamed: 0,Pest names,synonyms,target name
2,bedstraw,"common bedstraw, catchweed bedstraw, cleavers,...",bedstraw
34,invaders,,invader
90,woodpeckers,"sapsuckers, flickers, acorn woodpecker, northe...",woodpecker
82,termites,,termite
27,gophers,pocket gopher,gopher


In [4]:
# df['set_synonyms'] = df.apply(lambda x: set([x['Pest names']]).union(set([x1.strip().lower() for x1 in x['synonyms'].split(',')]) if not pd.isna(x['synonyms']) else set()), axis = 1)
df['set_synonyms'] = df['synonyms'].apply(lambda x: set([x1.strip().lower() for x1 in x.split(',')]) if not pd.isna(x) else set())
df['set_synonyms'] = df.apply(lambda x: x['set_synonyms'].union(set([x['Pest names']])).union(set([x['target name']])), axis = 1)
df.sample(5)

Unnamed: 0,Pest names,synonyms,target name,set_synonyms
58,oakworms,,worm,"{worm, oakworms}"
29,grasshoppers,,grasshopper,"{grasshopper, grasshoppers}"
25,fungi,"mold, mildew, mushroom, conk, toadstool, decay...",fungus,"{rust, mildew, mushroom, fungus, conk, fungi, ..."
43,mealybugs,,mealybug,"{mealybug, mealybugs}"
91,woodsorrel,"oxalis, wood-sorrel, wood sorrel, woodsorrel, ...",woodsorrel,"{wood-sorrel, woodsorrel, oxalis, wood-sorrels..."


In [5]:
for i1 in range(0, df.shape[0] - 1):
    for i2 in range(i1 + 1 , df.shape[0]):
        if len(df.iloc[i1]['set_synonyms'].intersection(df.iloc[i2]['set_synonyms'])) > 0:
            print(f'Found intersection at lines {i1 + 2} and {i2 + 2}:')
            print(f'{df.iloc[i1]["Pest names"]} - {df.iloc[i1]["set_synonyms"]}')
            print(f'{df.iloc[i2]["Pest names"]} - {df.iloc[i2]["set_synonyms"]}')
            print(f'Target - {df.iloc[i1]["target name"]} - {df.iloc[i2]["target name"]}')
            print('\n')

Found intersection at lines 6 and 83:
birds - {'bird', 'birds'}
swallows - {'swallows', 'bird'}
Target - bird - bird


Found intersection at lines 7 and 59:
borers - {'borer', 'borers'}
borer - {'borer'}
Target - borer - borer


Found intersection at lines 9 and 35:
bugs - {'bugs', 'insects', 'bug'}
insects - {'bugs', 'insects', 'bug'}
Target - bug - bug


Found intersection at lines 12 and 38:
caterpillars - {'cutworm', 'hornworm', 'caterpillars', 'worm', 'looper', 'pinworm', 'oakworm', 'larva', 'fruitworm', 'earworm'}
larva - {'larva', 'worm'}
Target - worm - worm


Found intersection at lines 12 and 39:
caterpillars - {'cutworm', 'hornworm', 'caterpillars', 'worm', 'looper', 'pinworm', 'oakworm', 'larva', 'fruitworm', 'earworm'}
larvae - {'worm', 'larvae'}
Target - worm - worm


Found intersection at lines 12 and 60:
caterpillars - {'cutworm', 'hornworm', 'caterpillars', 'worm', 'looper', 'pinworm', 'oakworm', 'larva', 'fruitworm', 'earworm'}
oakworms - {'worm', 'oakworms'}
Target -

In [6]:
synonym_dict = {}
for i, r in df.iterrows():
    main_synonym = r['target name']
    synonym_list = r['set_synonyms']
    if main_synonym in synonym_dict:
        synonym_dict[main_synonym] = synonym_dict[main_synonym].union(synonym_list)
    else:
        synonym_dict[main_synonym] = synonym_list

final_df = pd.DataFrame(data = {'main_synonym': synonym_dict.keys(), 'synonym_list': [', '.join(x) for x in synonym_dict.values()]})
final_df = final_df.sort_values('main_synonym')
final_df.head(10)

Unnamed: 0,main_synonym,synonym_list
0,aphid,"aphids, aphid"
1,bat,"bat, bats"
2,bedstraw,"cleavers, common bedstraw, velcro plant, bedst..."
3,beetle,"beetle, beetles"
4,bird,"swallows, bird, birds"
5,borer,"borer, borers"
22,bristletail,"bristletail, firebrats, silverfish"
6,broom,"english broom, scotch broom, sweet broom, broo..."
7,bug,"bugs, insects, bug"
8,buttercup,"bermuda buttercup wood sorrel, weedy bermuda b..."


In [None]:
final_df.to_csv(FINAL_SOURCE + FILE_NAME, index = False)

## Saving synonym dictionary to pickle file

In [7]:
import pickle

synonym_dict_reverse = {}
for i, r in final_df.iterrows():
    synonym_list = r['synonym_list'].split(', ')
    for s in synonym_list:
        synonym_dict_reverse[s] = r['main_synonym']

PICKLE_NAME = 'synonym_pest.pickle'
with open(FINAL_SOURCE + PICKLE_NAME, 'wb') as handle:
    pickle.dump(synonym_dict_reverse, handle, protocol = pickle.HIGHEST_PROTOCOL)

In [10]:
with open(FINAL_SOURCE + PICKLE_NAME, 'rb') as handle:
    synonym_dict = pickle.load(handle)
synonym_dict

{'aphids': 'aphid',
 'aphid': 'aphid',
 'bat': 'bat',
 'bats': 'bat',
 'cleavers': 'bedstraw',
 'common bedstraw': 'bedstraw',
 'velcro plant': 'bedstraw',
 'bedstraw': 'bedstraw',
 'grip plant': 'bedstraw',
 'catchweed bedstraw': 'bedstraw',
 'beetle': 'beetle',
 'beetles': 'beetle',
 'swallows': 'bird',
 'bird': 'bird',
 'birds': 'bird',
 'borer': 'borer',
 'borers': 'borer',
 'bristletail': 'bristletail',
 'firebrats': 'bristletail',
 'silverfish': 'bristletail',
 'english broom': 'broom',
 'scotch broom': 'broom',
 'sweet broom': 'broom',
 'broom': 'broom',
 'french broom': 'broom',
 'brooms': 'broom',
 'portugese broom': 'broom',
 'bugs': 'bug',
 'insects': 'bug',
 'bug': 'bug',
 'bermuda buttercup wood sorrel': 'buttercup',
 'weedy bermuda buttercup': 'buttercup',
 'african woodsorrel': 'buttercup',
 'cape oxalis': 'buttercup',
 "nanny-goat's foot": 'buttercup',
 'buttercup': 'buttercup',
 'bermuda buttercup': 'buttercup',
 'bermuda-buttercup': 'buttercup',
 'african wood-sorrel'