-
Notifications
You must be signed in to change notification settings - Fork 0
/
MeSHfiltering.py
222 lines (183 loc) · 9.61 KB
/
MeSHfiltering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
"""
Author: Joao F Silva
Date: 04/09/2021
This script filters MeSH codes of interest from a MeSH dictionary. MeSH codes are considered valid if they belong to a tree structure
specified in the "validTreeCodes" variable. Practically, a MeSH code is valid if it has a TreeNumber beginning with one of the codes in the list "validTreeCodes".
To tune the list of valid MeSH codes, simply adjust the "validTreeCodes" with the codes of interest.
For a list of possible Tree codes please check https://www.nlm.nih.gov/mesh/2021/download/2021NewMeshHeadingsByCategory.pdf
@params:
input_mesh_xml_file - Path to the xml file containing the full MeSH dictionary (optional)
input_mesh_json_file - Path to the json file containing the full MeSH dictionary
input_source_type - Type of the input dictionary: MeSH for MeSH descriptor sources, SCR for supplementary concept record sources
output_mesh_json_file - Destination path for the json file containing the filtered MeSH dictionary
save_xml_to_json - Flag used to define if a .json file (written to input_mesh_json_file) is to be saved with the xml content from input_mesh_xml_file converted to json
valid_mesh_tree_codes - List of MeSH initial tree codes used to filter the dictionary. Eg. D01 D02 D03
The output of this script is a .json file with a list of dicts, where each dict contains a MeSH term and its corresponding information
[
{
"DescriptorUI":,
"DescriptorName":,
"TreeNumberList": ,
"Concepts": [{"ConceptName":, "ConceptCASN1Name":, "ConceptScopeNote":, "EntryTerms":[...]}]
},
{
"DescriptorUI":,
"DescriptorName":,
"TreeNumberList": ,
"Concepts": [{"ConceptName":, "ConceptCASN1Name":, "ConceptScopeNote":, "EntryTerms":[...]}]
},
...
]
Mandatory fields:
DescriptorUI
DescriptorName
TreeNumberList
ConceptName
EntryTerms
NOTE: Some concepts do not have "ConceptCASN1Name" or "ConceptScopeNote", thus these are not mandatory. """
import json
import argparse
import xmltodict
def _as_list(value):
    """Return *value* unchanged if it is a list, otherwise wrap it in a one-element list.

    The input dictionary stores a child element that occurs once as a single
    object and a repeated child element as a list of objects; normalising here
    lets every caller use one code path for both shapes.
    """
    return value if isinstance(value, list) else [value]


def _extract_concept(concept, include_details):
    """Build the output dict for a single concept entry.

    Args:
        concept: Concept object taken from a record's ``ConceptList``.
        include_details: When true (descriptor records), also copy the optional
            ``ScopeNote`` and collect the ``String`` of every term under
            ``TermList`` into ``EntryTerms``. When false (supplementary
            records), only the name and the optional CASN1 name are kept.

    Returns:
        A dict with ``ConceptName`` and, where applicable, ``ConceptCASN1Name``,
        ``ConceptScopeNote`` and ``EntryTerms``.

    Raises:
        KeyError: If a mandatory field (``ConceptName``, or ``TermList`` when
            *include_details* is set) is missing.
    """
    concept_dict = {"ConceptName": concept["ConceptName"]["String"]}
    if "CASN1Name" in concept:
        concept_dict["ConceptCASN1Name"] = concept["CASN1Name"]
    if include_details:
        if "ScopeNote" in concept:
            concept_dict["ConceptScopeNote"] = concept["ScopeNote"]
        concept_dict["EntryTerms"] = [
            term["String"] for term in _as_list(concept["TermList"]["Term"])
        ]
    return concept_dict


def _extract_heading_mapping(mapping):
    """Build the output dict for one ``HeadingMappedTo`` entry of a supplementary record."""
    heading = {}
    try:
        referred = mapping["DescriptorReferredTo"]
        heading["HeadingMappedUI"] = referred["DescriptorUI"]
        heading["HeadingMappedName"] = referred["DescriptorName"]["String"]
    except KeyError:
        # Deliberate best effort: a mapping without a complete descriptor
        # reference is still emitted, carrying whichever fields were found.
        pass
    return heading


def _filter_descriptors(data, valid_tree_codes):
    """Return the descriptor records having a TreeNumber that starts with a valid code.

    Args:
        data: Parsed MeSH descriptor dictionary (``DescriptorRecordSet`` root).
        valid_tree_codes: Iterable of tree-code prefixes to keep, e.g. ``["D01", "D02"]``.

    Returns:
        A list of dicts with ``DescriptorUI``, ``DescriptorName``,
        ``TreeNumberList`` and ``Concepts``.
    """
    # str.startswith accepts a tuple of prefixes; build it once, outside the loop.
    prefixes = tuple(valid_tree_codes)
    records = []
    for descriptor in _as_list(data["DescriptorRecordSet"]["DescriptorRecord"]):
        record = {
            "DescriptorUI": descriptor["DescriptorUI"],
            "DescriptorName": descriptor["DescriptorName"]["String"],
        }
        try:
            tree_numbers = descriptor["TreeNumberList"]["TreeNumber"]
        except KeyError:
            # Descriptors without tree numbers get an empty string.
            tree_numbers = ""
        # The original shape (single string or list of strings) is kept in the output.
        record["TreeNumberList"] = tree_numbers

        if not any(code.startswith(prefixes) for code in _as_list(tree_numbers)):
            continue

        record["Concepts"] = [
            _extract_concept(concept, include_details=True)
            for concept in _as_list(descriptor["ConceptList"]["Concept"])
        ]
        records.append(record)
    return records


def _collect_supplemental_records(data):
    """Return every supplementary concept record in the simplified output format.

    Args:
        data: Parsed SCR dictionary (``SupplementalRecordSet`` root).

    Returns:
        A list of dicts with ``DescriptorUI``, ``DescriptorName``, optional
        ``Note``, ``Concepts`` and ``HeadingMappings``.
    """
    records = []
    for supplemental in _as_list(data["SupplementalRecordSet"]["SupplementalRecord"]):
        record = {
            "DescriptorUI": supplemental["SupplementalRecordUI"],
            "DescriptorName": supplemental["SupplementalRecordName"]["String"],
        }
        if "Note" in supplemental:
            record["Note"] = supplemental["Note"]

        record["Concepts"] = [
            _extract_concept(concept, include_details=False)
            for concept in _as_list(supplemental["ConceptList"]["Concept"])
        ]

        # A record without heading mappings yields an empty list instead of a KeyError.
        # NOTE(review): HeadingMappedToList is presumably optional in the SCR schema - confirm.
        mapped = (supplemental.get("HeadingMappedToList") or {}).get("HeadingMappedTo")
        record["HeadingMappings"] = (
            []
            if mapped is None
            else [_extract_heading_mapping(mapping) for mapping in _as_list(mapped)]
        )
        records.append(record)
    return records


def main():
    """Filter a MeSH (or SCR) dictionary and write the simplified records to a json file.

    Command-line arguments:
        --input_mesh_xml_file: Optional xml source. When given, its parsed
            content is the data that gets filtered.
        --input_mesh_json_file: json source, used when no xml source is given.
            With --save_xml_to_json it is instead the destination of the
            converted xml content.
        --save_xml_to_json: Also save the converted xml content as json.
        --input_source_type: "MeSH" (descriptor records, filtered by tree code)
            or "SCR" (supplementary concept records, all kept).
        --output_mesh_json_file: Destination of the resulting list of records.
        --valid_mesh_tree_codes: Tree-code prefixes to keep; needed for "MeSH".

    Invalid argument combinations are reported through ``parser.error`` (usage
    message and exit status 2) before any input file is read.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("--input_mesh_xml_file",
                        type=str,
                        required=False,
                        help="Directory for the original xml file with the complete MeSH dictionary.")

    # Only needed when there is no xml source, or when the converted xml must
    # be saved; this is validated after parsing.
    parser.add_argument("--input_mesh_json_file",
                        type=str,
                        required=False,
                        help="Directory for the json file with the complete MeSH dictionary.")

    parser.add_argument("--save_xml_to_json",
                        default=False,
                        action='store_true',
                        help="Whether to save a .json file with the content from the converted xml input file.")

    # Restricting the choices rejects unknown source types up front instead of
    # failing later on an unbound result list.
    parser.add_argument("--input_source_type",
                        type=str,
                        required=True,
                        choices=("MeSH", "SCR"),
                        help="MeSH for MeSH sources, SCR for supplementary concept record sources.")

    parser.add_argument("--output_mesh_json_file",
                        type=str,
                        required=True,
                        help="Directory for the json file with the filtered MeSH dictionary.")

    parser.add_argument('--valid_mesh_tree_codes',
                        default=None,
                        nargs="*",
                        type=str,
                        help='Codes from MeSH TreeNumbers considered valid, to be used in the filtering of MeSH codes. Eg: D01 D02 D03 D04')

    args = parser.parse_args()

    # Validate before touching the (potentially very large) input files.
    if not args.input_mesh_xml_file and not args.input_mesh_json_file:
        parser.error("one of --input_mesh_xml_file or --input_mesh_json_file is required")
    if args.input_mesh_xml_file and args.save_xml_to_json and not args.input_mesh_json_file:
        parser.error("--save_xml_to_json requires --input_mesh_json_file as the destination")
    if args.input_source_type == "MeSH" and args.valid_mesh_tree_codes is None:
        parser.error("--valid_mesh_tree_codes is required when --input_source_type is MeSH")

    if args.input_mesh_xml_file:
        # NOTE(review): assumes the xml dump is UTF-8 encoded - confirm against the source files.
        with open(args.input_mesh_xml_file, encoding="utf-8") as xml_file:
            data = xmltodict.parse(xml_file.read())

        if args.save_xml_to_json:
            with open(args.input_mesh_json_file, "w", encoding="utf-8") as json_file:
                json.dump(data, json_file, indent=4, sort_keys=True)
            print("Written .json file")
    else:
        with open(args.input_mesh_json_file, "r", encoding="utf-8") as json_file:
            data = json.load(json_file)

    if args.input_source_type == "MeSH":
        records = _filter_descriptors(data, args.valid_mesh_tree_codes)
    else:
        records = _collect_supplemental_records(data)

    with open(args.output_mesh_json_file, "w", encoding="utf-8") as json_file:
        json.dump(records, json_file, indent=4, sort_keys=True)
# Script entry point: run main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()