# Synonym transformation

## Main imports and paths

In [None]:
import os
import pandas as pd

In [None]:
# Directory the raw synonym CSVs are read from, and the directory the
# transformed outputs are written to.
PATH_SOURCE = './synonym_list/source/'
FINAL_SOURCE = './synonym_list/transformed/'

# Later cells pick their input by position (DATA_FILE_NAMES[0] and
# DATA_FILE_NAMES[1]), so the listing is sorted to make that order stable.
DATA_FILE_NAMES = os.listdir(PATH_SOURCE)
DATA_FILE_NAMES.sort()
DATA_FILE_NAMES

## Pest names

In [None]:
# The first entry of the sorted source listing is used as the pest-name CSV.
FILE_NAME = DATA_FILE_NAMES[0]
pest_csv_path = PATH_SOURCE + FILE_NAME
df = pd.read_csv(pest_csv_path)

# Show a random handful of rows as a quick sanity check of the load.
df.sample(5)

In [None]:
# For every row, collect all names that belong together in one set: the
# comma-separated `synonyms` cell (trimmed and lower-cased) plus the row's own
# `Pest names` and `target name` values, which are added unchanged.
def _parse_synonyms(raw):
    """Split a comma-separated synonym cell into a set of normalised names.

    A missing cell (NaN) yields an empty set. Blank tokens, such as those left
    by a trailing or doubled comma, are dropped so that '' never becomes a
    synonym.
    """
    if pd.isna(raw):
        return set()
    tokens = (part.strip().lower() for part in raw.split(','))
    return {token for token in tokens if token}


df['set_synonyms'] = df['synonyms'].apply(_parse_synonyms)
df['set_synonyms'] = df.apply(
    lambda row: row['set_synonyms'] | {row['Pest names'], row['target name']},
    axis=1,
)
df.sample(5)

In [None]:
# Report every pair of rows whose synonym sets share at least one name.
# The columns are pulled into plain lists once so the quadratic pair loop
# does not rebuild a row object with df.iloc for every comparison.
pest_names = df['Pest names'].tolist()
target_names = df['target name'].tolist()
synonym_sets = df['set_synonyms'].tolist()
n_rows = len(synonym_sets)

for i1 in range(n_rows - 1):
    for i2 in range(i1 + 1, n_rows):
        shared = synonym_sets[i1] & synonym_sets[i2]
        if not shared:
            continue
        # NOTE(review): the +2 presumably turns the 0-based row position into
        # the line number in the CSV file (header on line 1) - confirm.
        print(f'Found intersection at lines {i1 + 2} and {i2 + 2}:')
        print(f'{pest_names[i1]} - {synonym_sets[i1]}')
        print(f'{pest_names[i2]} - {synonym_sets[i2]}')
        # Show the colliding names themselves, as the plant-name check does.
        print(f'Intersection - {shared}')
        print(f'Target - {target_names[i1]} - {target_names[i2]}')
        print('\n')

In [None]:
# Merge the synonym sets of all rows that share the same target name, keeping
# the targets in order of first appearance.
synonym_dict = {}
for target, names in zip(df['target name'], df['set_synonyms']):
    synonym_dict[target] = synonym_dict.get(target, set()) | names

# One output row per target; its synonyms are sorted and joined with ', '.
joined_synonyms = [', '.join(sorted(members)) for members in synonym_dict.values()]
pest_df = pd.DataFrame({
    'main_synonym': list(synonym_dict),
    'synonym_list': joined_synonyms,
}).sort_values('main_synonym')
pest_df.head(10)

In [None]:
# Write the merged pest synonyms under the same file name as the source CSV.
# The output directory is created first because to_csv does not create
# missing parent directories and would raise on a fresh checkout.
os.makedirs(FINAL_SOURCE, exist_ok=True)
pest_df.to_csv(FINAL_SOURCE + FILE_NAME, index=False)

## Plant names

In [None]:
# The second entry of the sorted source listing is used as the plant-name CSV.
# FILE_NAME and df are rebound here, replacing the pest-name values.
FILE_NAME = DATA_FILE_NAMES[1]
plant_csv_path = PATH_SOURCE + FILE_NAME
df = pd.read_csv(plant_csv_path)

# Show a random handful of rows as a quick sanity check of the load.
df.sample(5)

In [None]:
# For every row, collect all names that belong together in one set: the
# comma-separated `synonyms` cell (trimmed and lower-cased) plus the row's own
# `Plant names` and `target name` values, which are added unchanged.
def _parse_synonyms(raw):
    """Split a comma-separated synonym cell into a set of normalised names.

    A missing cell (NaN) yields an empty set. Blank tokens, such as those left
    by a trailing or doubled comma, are dropped so that '' never becomes a
    synonym.
    """
    if pd.isna(raw):
        return set()
    tokens = (part.strip().lower() for part in raw.split(','))
    return {token for token in tokens if token}


df['set_synonyms'] = df['synonyms'].apply(_parse_synonyms)
df['set_synonyms'] = df.apply(
    lambda row: row['set_synonyms'] | {row['Plant names'], row['target name']},
    axis=1,
)
df.sample(5)

In [None]:
# Report every pair of rows whose synonym sets share at least one name,
# printing both rows, the shared names and both target names.
plant_names = df['Plant names'].tolist()
target_names = df['target name'].tolist()
synonym_sets = df['set_synonyms'].tolist()
n_rows = len(synonym_sets)

for first in range(n_rows - 1):
    for second in range(first + 1, n_rows):
        shared = synonym_sets[first].intersection(synonym_sets[second])
        if not shared:
            continue
        # NOTE(review): the +2 presumably turns the 0-based row position into
        # the line number in the CSV file (header on line 1) - confirm.
        print(f'Found intersection at lines {first + 2} and {second + 2}:')
        print(f'{plant_names[first]} - {synonym_sets[first]}')
        print(f'{plant_names[second]} - {synonym_sets[second]}')
        print(f'Intersection - {shared}')
        print(f'Target - {target_names[first]} - {target_names[second]}')
        print('\n')

In [None]:
# Merge the synonym sets of all rows that share the same target name, keeping
# the targets in order of first appearance.
synonym_dict = {}
for target, names in zip(df['target name'], df['set_synonyms']):
    synonym_dict[target] = synonym_dict.get(target, set()) | names

# One output row per target; its synonyms are sorted and joined with ', '.
joined_synonyms = [', '.join(sorted(members)) for members in synonym_dict.values()]
plant_df = pd.DataFrame({
    'main_synonym': list(synonym_dict),
    'synonym_list': joined_synonyms,
}).sort_values('main_synonym')
plant_df.head(10)

In [None]:
# Write the merged plant synonyms under the same file name as the source CSV.
# The output directory is created first because to_csv does not create
# missing parent directories and would raise on a fresh checkout.
os.makedirs(FINAL_SOURCE, exist_ok=True)
plant_df.to_csv(FINAL_SOURCE + FILE_NAME, index=False)

## Saving synonym dictionary to pickle file

In [None]:
# Stack the pest and plant synonym tables row-wise into one table and print
# the shapes before and after. The original row indices are kept as they are.
print(f'Shapes of DFs: {pest_df.shape}, {plant_df.shape}')
frames_to_stack = (pest_df, plant_df)
final_df = pd.concat(frames_to_stack)
print(f'Final shape: {final_df.shape}')

# Random preview of the combined table.
final_df.sample(10)

In [None]:
# Summary statistics over the combined table: the number of distinct main
# synonyms, the total number of secondary synonyms, and how many of those are
# distinct. A secondary synonym that already appeared in an earlier row is
# reported, because it cannot map to a single main synonym.
print(f'Unique main synonyms: {final_df["main_synonym"].nunique()}')

n_synonyms = 0
s_synonyms = set()
for synonyms in final_df['synonym_list']:
    s = set(synonyms.split(', '))
    # Report only the names that were seen before, not the row's whole set,
    # and sort them so the message is the same on every run.
    repeated = s & s_synonyms
    if repeated:
        print(f'Following secondary synonym(s) have duplicates - {", ".join(sorted(repeated))}')
    n_synonyms += len(s)
    s_synonyms |= s

print(f'Number of secondary synonyms: {n_synonyms}')
print(f'Number of unique secondary synonyms: {len(s_synonyms)}')

In [None]:
import pickle

# Invert the table into a lookup from each individual synonym to its main
# synonym. If a synonym occurs in more than one row, the last row wins.
synonym_dict_reverse = {
    synonym: main
    for main, joined in zip(final_df['main_synonym'], final_df['synonym_list'])
    for synonym in joined.split(', ')
}

# Serialise the lookup next to the transformed CSVs.
PICKLE_NAME = 'synonym_pest.pickle'
pickle_path = FINAL_SOURCE + PICKLE_NAME
with open(pickle_path, 'wb') as out_file:
    pickle.dump(synonym_dict_reverse, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Read the pickle back and display it as a round-trip check.
# NOTE: pickle.load can execute arbitrary code; it is used here on the file
# this notebook has just written, never point it at untrusted input.
pickle_path = FINAL_SOURCE + PICKLE_NAME
with open(pickle_path, 'rb') as in_file:
    synonym_dict = pickle.load(in_file)
synonym_dict