-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
189 lines (165 loc) · 9.86 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
import argparse, os, sys
import shutil, glob
import subprocess
import json
import utils.biosyn
import utils.lightweight
import utils.evaluation
def main():
    """
    Parse the command-line arguments and pass them to ``router`` as a dict.

    Also sets the module-level ``base_dir`` global to the directory containing
    this file; ``router`` reads it to build every data/tmp/script path.

    Console arguments:
        -i / --input (str): Name of the dataset folder to use (e.g. 'BB4',
            'BB4-Phenotype', 'BB4-Habitat', 'BB4-Microorganism',
            'ncbi-disease', 'test_ncbi').
        -m / --method (str): Method to run: 'BioSyn' or 'Lightweight'.
        -s / --score (str, one or more): Evaluator(s) to apply:
            'Lightweight', 'BioSyn', 'Ref', 'BB4'.
        -e / --evalset (str): Set used for evaluation, 'dev' or 'test'.
            Defaults to 'test'.
        -o / --original (flag): Start from an original, non-standardized
            dataset.
        --runs (int): Number of models to train. Defaults to 1.
    """
    global base_dir
    base_dir = os.path.split(os.path.abspath(__file__))[0]

    # Construct the argument parser
    parser = argparse.ArgumentParser()

    # Add the arguments to the parser
    parser.add_argument(
        "-i", "--input", required=False,
        help="Name of dataset folder to use located in data/standardized. ex: Full Bacteria Biotope 4 as 'BB4' or a sub category as 'BB4-Phenotype', 'BB4-Habitat' or 'BB4-Microorganism', NCBI Disease Corpus as 'ncbi-disease'. "
             "Can handle a non-natively supported dataset if in a standardized format and in the right directory. 'test_ncbi' is a smaller available for testing purposes.")
    parser.add_argument(
        "-m", "--method", required=False,
        help="Specifies which method to use. Supported: 'BioSyn' or 'Lightweight'")
    parser.add_argument(
        "-s", "--score", required=False, nargs='+',
        help="Specify which evaluator(s) should be used use to determine accuracy. Supports multiple arguments: 'Lightweight', 'BioSyn', 'Ref', 'BB4'. Using 'BB4' will produce a folder containing predictions that can be evaluated through the BB4 online evaluation software.")
    parser.add_argument(
        "-e", "--evalset", default='test',
        help="Specify which set should be used use for evaluation. Supported : 'dev', 'test'. defaults to 'test' if left empty.")
    parser.add_argument(
        "-o", "--original", action='store_true',
        help="Starts the process from an original, non-standardized dataset.")
    # type=int is required: without it a value given on the command line is a
    # str and the run counter arithmetic in router() raises TypeError.
    parser.add_argument(
        "--runs", default=1, type=int,
        help="Number of models to train.")

    args = vars(parser.parse_args())
    router(args)
def _stream_subprocess(command):
    """Run ``command``, echo its stdout line by line, and return its exit code."""
    encoding = sys.stdout.encoding or 'utf-8'
    # No bufsize=1 here: line buffering is not supported on binary pipes.
    # The context manager closes the pipe and waits for the child on exit.
    with subprocess.Popen(command, stdout=subprocess.PIPE) as proc:
        for raw_line in iter(proc.stdout.readline, b''):
            sys.stdout.write(raw_line.decode(encoding, errors='replace'))
    return proc.returncode


def _standardize_original(base_dir, dataset):
    """Convert data/original/<dataset> to the standard format under tmp/<dataset>."""
    bb4_subcategory = dataset in ('BB4-Phenotype', 'BB4-Habitat', 'BB4-Microorganism')
    # BB4 subcategories are derived from the full BB4 original data.
    if bb4_subcategory:
        input_original_data = f"{base_dir}/data/original/BB4"
    else:
        input_original_data = f'{base_dir}/data/original/{dataset}'
    if not os.path.exists(input_original_data):
        raise Exception('Original data of input not found in data/original')

    # Converts original data to a standard format.
    return_code = _stream_subprocess([
        sys.executable, f'{base_dir}/utils/standardize_data.py',
        '-i', input_original_data,
        '-o', f'{base_dir}/tmp/{dataset}/',
        '-d', dataset])
    if return_code != 0:
        raise RuntimeError(f'standardize_data.py exited with code {return_code}')

    # Separates BB4 data into subcategories.
    if bb4_subcategory:
        full_dir = f'{base_dir}/tmp/bb4_full'
        shutil.move(f'{base_dir}/tmp/{dataset}/', full_dir)
        return_code = _stream_subprocess([
            sys.executable, f'{base_dir}/utils/bb4_exclude.py',
            '-i', full_dir,
            '-o', f'{base_dir}/tmp/{dataset}',
            '-s', dataset.split('BB4-')[1]])
        if return_code != 0:
            raise RuntimeError(f'bb4_exclude.py exited with code {return_code}')
        shutil.rmtree(full_dir)


def _convert_obo_to_kb(obo_path, kb_path):
    """Write one 'id||name|synonym|...' line per OBO term that has an is_a line."""
    entries = []
    cui = None
    label = None
    synonyms = []
    with open(obo_path, 'r') as obo_file:
        for line in obo_file:
            if line.startswith('id'):
                cui = line.strip().split(': ', 1)[1]
                # New term: drop synonyms collected for a previous term that
                # was never written (it had no is_a line).
                synonyms = []
            elif line.startswith('name'):
                # maxsplit=1 keeps names that themselves contain ': '.
                label = line.strip().split(': ', 1)[1]
            elif line.startswith('synonym'):
                synonyms.append(line.split('"')[1])
            elif line.startswith('is_a') and cui is not None:
                entries.append(f"{cui}||{'|'.join([label] + synonyms)}\n")
                # Only the first is_a of a term emits an entry.
                cui = None
                synonyms = []
    # Written in one go so an interrupted run cannot leave a partial file that
    # later runs would mistake for a complete knowledge base.
    if entries:
        with open(kb_path, 'w') as kb_file:
            kb_file.writelines(entries)


def _ensure_knowledge_base(base_dir, dataset):
    """Return the standardized knowledge-base file name for ``dataset``, creating the file if it is missing."""
    kb_root = f'{base_dir}/data/knowledge_base'
    os.makedirs(f'{kb_root}/standardized', exist_ok=True)

    if 'BB4' in dataset:
        kb = 'BB4_kb.txt'
        kb_path = f'{kb_root}/standardized/{kb}'
        if not os.path.exists(kb_path):
            _convert_obo_to_kb(f'{kb_root}/original/OntoBiotope_BioNLP-OST-2019.obo', kb_path)
        return kb

    kb = f'{dataset}_kb.txt'
    # Datasets whose original knowledge base has a different file name. These
    # files are copied unchanged, i.e. treated as already being in the
    # standard knowledge-base format. Any other dataset uses <dataset>_kb.txt.
    original_names = {
        'ncbi-disease': 'medic_06Jul2012.txt',
        'test_ncbi': 'test_ncbi.txt',
    }
    kb_path = f'{kb_root}/standardized/{kb}'
    if not os.path.exists(kb_path):
        shutil.copy(f'{kb_root}/original/{original_names.get(dataset, kb)}', kb_path)
    return kb


def router(args):
    """
    Handles the flow of operations from provided arguments.

    Reads the module-level ``base_dir`` global, which ``main`` sets before
    calling this function. For each of ``args['runs']`` runs:

    1. Recreates ``<base_dir>/tmp``.
    2. If ``args['original']`` is set, standardizes the original dataset into
       ``tmp/<input>`` (BB4 subcategories are first standardized from the full
       BB4 data, then filtered).
    3. If ``args['input']`` is set, makes sure the matching standardized
       knowledge base exists.
    4. If ``args['method']`` is set, loads ``config.json`` and runs the
       BioSyn or Lightweight setup/run/cleanup sequence; otherwise prints a
       message and exits.
    5. Applies each evaluator listed in ``args['score']`` to the predictions.
    6. Removes ``<base_dir>/tmp``.

    Parameters:
        args (dict): Parsed command-line arguments with the keys 'input',
            'method', 'score', 'evalset', 'original' and 'runs'.

    Raises:
        Exception: the original data folder for the input does not exist.
        RuntimeError: a preprocessing helper script exited with a non-zero code.
        ValueError: a method was requested without an input, or the method is
            not 'BioSyn' or 'Lightweight'.
        NotImplementedError: the standardized data for the input could not be
            copied from data/standardized.
    """
    dataset = args["input"]
    # Coerce here as well so the function works when 'runs' arrives as a string.
    runs = int(args.get("runs", 1))

    for _ in range(runs):
        tmp_dir = f"{base_dir}/tmp"
        # A missing tmp folder is the normal case; nothing to report.
        shutil.rmtree(tmp_dir, ignore_errors=True)
        os.mkdir(tmp_dir)

        if args["original"]:
            # Starts from original data located in data/original.
            _standardize_original(base_dir, dataset)

        input_std_data = f'{base_dir}/tmp/{dataset}'
        kb = None
        if dataset:
            # Checks knowledge base existence and recreates it if needed.
            kb = _ensure_knowledge_base(base_dir, dataset)

        method = args["method"]
        if not method:
            print('Since no --method has been called, the processed has stopped here.')
            # tmp is deliberately left in place here, as in the original flow.
            sys.exit()
        if not dataset:
            raise ValueError('--input is required when --method is given.')
        if method not in ('BioSyn', 'Lightweight'):
            raise ValueError(f"Unsupported method {method!r}. Supported: 'BioSyn' or 'Lightweight'")

        # Loads model parameters (path is relative to the working directory).
        with open('config.json', 'r') as config_file:
            params = json.load(config_file)

        # With --original the standardized data already sits in tmp/<input>;
        # copying over it would fail and be misreported as "not found".
        if not os.path.isdir(input_std_data):
            try:
                shutil.copytree(f'{base_dir}/data/standardized/{dataset}', input_std_data)
            except OSError as err:
                raise NotImplementedError(
                    f'Standardized {dataset} data not found in {base_dir}/data/standardized/{dataset}'
                ) from err

        if method == 'BioSyn':
            # setup() prepares the BioSyn environment, run() trains and
            # evaluates the model, cleanup() returns the prediction folder.
            utils.biosyn.setup(base_dir, input_std_data, kb, args)
            utils.biosyn.run(base_dir, args, params, kb)
            prediction_path = utils.biosyn.cleanup(base_dir, args, kb)
        else:
            env_path = f'{base_dir}/Biomedical-Entity-Linking/output/{dataset}'
            utils.lightweight.setup(base_dir, env_path, input_std_data, kb, args)
            utils.lightweight.run(base_dir, env_path, params, args)
            prediction_path = utils.lightweight.cleanup(base_dir, env_path, args)

        if args["score"]:
            predictions = f"{prediction_path}/standardized_predictions.txt"
            if 'Lightweight' in args["score"]:
                print(f'Lightweight accuracy evaluation = {utils.evaluation.accuracy_lightweight(predictions)}')
            if 'BioSyn' in args["score"]:
                print(f'BioSyn accuracy evaluation = {utils.evaluation.accuracy_biosyn(predictions)}')
            if 'Ref' in args["score"]:
                print(f'Ref accuracy evaluation = {utils.evaluation.accuracy_ref(predictions)}')
            if 'BB4' in args["score"]:
                entity = 'BB4' if dataset == 'BB4' else dataset.split('BB4-')[1]
                return_code = _stream_subprocess([
                    sys.executable, f'{base_dir}/utils/evaluation.py',
                    '--input', predictions,
                    '--dataset', f'{base_dir}/data/original/BB4/BioNLP-OST-2019_BB-norm_test',
                    '--entities', entity,
                    '--output', prediction_path])
                print('Bacteria Biotope 4 prediction file (.a2) available for online evaluation.')
                print(return_code)

        shutil.rmtree(tmp_dir)
    print('Done')
# Script entry point: run the command-line interface only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()