-
Notifications
You must be signed in to change notification settings - Fork 0
/
99_article_to_df_adapted.R
257 lines (233 loc) · 14.2 KB
/
99_article_to_df_adapted.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
# 99_article_to_df_adapted.R
# process a single article
# adapted code from github, https://github.com/cran/easyPubMed
# called by 99_table_articles_byAuth_adapted.R
# documentation for pubmed elements is here https://dtd.nlm.nih.gov/ncbi/pubmed/doc/out/180101/index.html
# August 2022
article_to_df_adapted <-
function(pubmedArticle)
{
#
options(warn = -1)
# Global Check!
if (class(pubmedArticle) != "character" |
regexpr("(<PubmedArticle)(.+)(\\/PubmedArticle>)", pubmedArticle) < 0 )
{
message("An error occurred - character")
return(NULL)
}
# Get started
tryCatch({
tmp.article <- custom_grep(xml_data = pubmedArticle, tag = "PubmedArticle", format = "char") # extract the article
if (is.null(tmp.article))
{
message("An error occurred - null")
return(NULL)
}
# Fetch ID string
tmp.paperID <- custom_grep(xml_data = tmp.article, tag = "ArticleIdList", format = "char")
if (is.null(tmp.paperID))
{
message("An error occurred - null ID")
return(NULL)
} else {
tmp.paperID <- gsub("[[:space:]]", "", tmp.paperID[1])
}
# Get PMID
tmp.PMID <- gsub("^(.*ArticleIdIdType=\\\"pubmed\\\")([[:space:]]|[[:alnum:]]){0,20}>", "", tmp.paperID)
tmp.PMID <- gsub("<.*$", "", tmp.PMID)
## Dates, kept as pubmed date, see https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html
dstart = str_locate(string=tmp.article, pattern='PubMedPubDate PubStatus="pubmed"')
tmp.date = str_sub(string=tmp.article, start = dstart[2]+2, end=nchar(tmp.article)) # start from pubmed date so that next end of pubmed date work
dend = str_locate(string=tmp.date, pattern='\\/PubMedPubDate')
tmp.date = str_sub(string=tmp.date, start = 1, end=dend[1]-2) # now go to next end
ystart = str_locate(string=tmp.date, pattern='\\<Year\\>') # year
year = as.numeric(str_sub(tmp.date, ystart[2]+1, ystart[2]+4))
mstart = str_locate(string=tmp.date, pattern='\\<Month\\>') # month
mend = str_locate(string=tmp.date, pattern='\\<\\/Month\\>')
month = str_sub(tmp.date, mstart[2]+1, mend[1]-1)
if(month %in% month.abb){ # convert text month to number
month = factor(month, levels=month.abb) #
}
month = as.numeric(month)
dstart = str_locate(string=tmp.date, pattern='\\<Day\\>') # day
dend = str_locate(string=tmp.date, pattern='\\<\\/Day\\>')
day = as.numeric(str_sub(tmp.date, dstart[2]+1, dend[1]-1))
if(is.na(month) == TRUE){month=6} # middle of year if month missing
if(is.na(day) == TRUE){day = 15} # middle of month if day missing
date = as.Date(ISOdate(year = year,
month = month,
day = day)) # convert day/month/year to date
# number of authors
n.authors = str_count(string=tmp.article, pattern="\\<\\/Author\\>")
# first author's surname
first.author = ''
if(n.authors > 0){
tmp.author <- custom_grep(xml_data = tmp.article, tag = "Author", format = "char")
tmp.last <- custom_grep(xml_data = tmp.author, tag = "LastName", format = "char")
first.author = ifelse(is.null(tmp.last)==TRUE, '', tmp.last)
}
# Article title
tmp.title <- custom_grep(xml_data = tmp.article, tag = "ArticleTitle", format = "char")
if (length(tmp.title) > 1){
tmp.title <- paste(tmp.title, collapse = " ", sep = " ")
} else if (length(tmp.title) < 1) {
# use my own search because of danger of "Title" appearing elsewhere
title.loc = str_locate(pattern='\\<Title\\>', tmp.article)
title.loc.end = str_locate(pattern='\\</Title\\>', tmp.article)
tmp.title = str_sub(string=tmp.article, start = title.loc[2]+1, end=title.loc.end[1]-1)
if (length(tmp.title) > 1){
tmp.title <- NA # if title and alternative title are missing
}
}
# country for first author
country = ''
author.list = custom_grep(xml_data = tmp.article, tag = "AuthorList")
if(length(author.list) > 0){
aff = custom_grep(xml_data = author.list, tag = "AffiliationInfo") # narrow down to author list and affiliation info
if(length(aff) > 0){
address = custom_grep(xml_data = aff[[1]], tag = "Affiliation")[[1]] # first author address
index = which(str_detect(address, country_list$match_words))
country = country_list$english[index][1] # convert to common English name; just take one if multiple
}
}
# Language, only want English (is excluded later so we can see numbers)
# had to use my own search because "Language" was being picked up in abstract ...
# ... specific code not working: tmp.language <- custom_grep(xml_data = tmp.article, tag = "Language", format = "char")
language.loc = str_locate_all(pattern='\\<Language\\>', tmp.article)
tmp.language <- str_sub(string = tmp.article, start=language.loc[[1]][2]+1, end=language.loc[[1]][2]+3) # extract three letter language
tmp.language = tolower(tmp.language) # make sure it is lower case
if(nchar(tmp.language)!=3){tmp.language = 'Missing'}
## Abstract & Copyright
# remove <OtherAbstract>, which can be abstract in another language - had to do whilst still in XML format
abstract.other.location.start <- str_locate_all(tmp.article, pattern='\\<OtherAbstract')[[1]]
if(length(abstract.other.location.start)>0 & is.na(abstract.other.location.start)[1] == FALSE){
n.other = nrow(abstract.other.location.start)
abstract.other.location.end <- str_locate_all(tmp.article, pattern='\\<\\/OtherAbstract\\>')[[1]]
first.string = str_sub(tmp.article, start = 1, end = abstract.other.location.start[1,1]-1)
last.string = str_sub(tmp.article, start = abstract.other.location.end[n.other,2], end = nchar(tmp.article))
tmp.article = paste(first.string, last.string) # create new article
}
# remove Copyright (assume it is at end, so discard anything after); can still be in abstract and CopyrightInformation!
abstract.copyright.location.start <- str_locate_all(tmp.article, pattern='\\<CopyrightInformation')[[1]]
if(length(abstract.copyright.location.start)>0 & is.na(abstract.copyright.location.start)[1] == FALSE){
abstract.copyright.location.end <- str_locate_all(tmp.article, pattern='\\<\\/CopyrightInformation\\>')[[1]]
first.string = str_sub(tmp.article, start = 1, end = abstract.copyright.location.start[1,1]-1)
last.string = str_sub(tmp.article, start = abstract.copyright.location.end[1,2]+1, end = nchar(tmp.article))
tmp.article = paste(first.string, last.string) # create new article
}
# get "raw" abstract from rest of entry
abstract.raw = ''
abstract.start <- str_locate(tmp.article, pattern='\\<Abstract') # first mention
if(length(abstract.start)>0 & is.na(abstract.start)[1] == FALSE){
abstract.end = str_locate_all(tmp.article, pattern='\\<\\/Abstract')[[1]]
abstract.end = abstract.end[nrow(abstract.end),] # last ending
abstract.raw = str_sub(tmp.article, start=abstract.start[1], end=abstract.end[2]+1)
}
# remove systematic review registrations, e.g., 28320459[PMID] - test17.xml
abstract.registration.location.start <- str_locate(abstract.raw, pattern='SYSTEMATIC REVIEW REGISTRATION')[1]
if(!is.na(abstract.registration.location.start)){
abstract.registration.location.end = str_locate_all(abstract.raw, pattern='\\<\\/AbstractText\\>')[[1]]
end = min(abstract.registration.location.end[abstract.registration.location.end[,2] > abstract.registration.location.start, 2]) # find the next "/AbstractText" after the systematic review
first.string = str_sub(abstract.raw, start = 1, end = abstract.registration.location.start-22) # minus 22 to also remove <AbstractText>
last.string = str_sub(abstract.raw, start = end+1, end = nchar(abstract.raw))
abstract.raw = paste(first.string, last.string) # create new article
}
# remove abstract sub-headings
sub.headings.start <- str_locate(abstract.raw, pattern='\\<AbstractText') # find first
if(is.na(sub.headings.start)[1] == FALSE){ # first remove all </AbstractText>
abstract.raw = str_remove_all(string=abstract.raw, pattern='\\<\\/AbstractText\\>')
}
next_end = FALSE
while(is.na(sub.headings.start)[1] == FALSE){
first = str_sub(abstract.raw, start = 1, end = sub.headings.start[1,1]-1)
second = str_sub(abstract.raw, start = sub.headings.start[1,2]+1, end = nchar(abstract.raw))
next.ending = str_locate(second, '>') # now find the next ending (varying because of label length)
second = str_sub(second, next.ending[1]+1, nchar(second))
# add methods start and end if sub-heading is methods
section = str_extract(abstract.raw, pattern='(?<=Label=\")\\w+\"')
section = ifelse(is.na(section) == TRUE, '', section)
section = str_remove(section, '\"')
section_start = ''
if(next_end == TRUE){
section_start = 'results_end'
next_end = FALSE
}
if(str_detect(tolower(section),c('results|findings')) == TRUE){
section_start = 'results_start'
next_end = TRUE
}
#
abstract.raw = paste(first, section_start, second)
sub.headings.start <- str_locate(abstract.raw, pattern='\\<AbstractText') # find next, if there is one
}
# now extract abstract and leave behind <CopyrightInformation>
tmp.abstract <- custom_grep(xml_data = abstract.raw, tag = "Abstract", format = "char")
if (is.null(tmp.abstract) == FALSE){
tmp.abstract <- paste(tmp.abstract, collapse = " ", sep = " ") # if it is a list
# remove Copyright, sometimes ends up in abstract
tmp.copyright <- custom_grep(xml_data = tmp.article, tag = "CopyrightInformation", format = "char")
if(is.null(tmp.copyright)==FALSE){tmp.abstract = str_remove(fixed(tmp.abstract), pattern=fixed(tmp.copyright))}
} else if (is.null(tmp.abstract) == TRUE) {tmp.abstract <- NA}
# second pass for removing copyright (as it is not always in CopyrightSection)
# strings to remove from abstract:
string.to.remove = c('\\<CopyrightInformation\\>',
'\\<\\/CopyrightInformation\\>',
'No commercial re-use',
'This is an open access article distributed under the terms of the Creative Commons Attribution (CC BY) license',
'This article is protected by copyright\\. All rights reserved\\.',
'Published [0-9][0-9][0-9][0-9]\\.',
'Copyright .? ?[0-9][0-9][0-9][0-9] (\\S+\\.?)', # catch-alls for copyright statements; " .? ?" means that there might be copyright symbol followed by a space
'Copyright .? ?[0-9][0-9][0-9][0-9] (\\S+) (\\S+\\.?)',
'Copyright .? ?[0-9][0-9][0-9][0-9] (\\S+) (\\S+) (\\S+\\.?)',
'Copyright .? ?[0-9][0-9][0-9][0-9] (\\S+) (\\S+) (\\S+) (\\S+\\.?)',
'Copyright .? ?[0-9][0-9][0-9][0-9] (\\S+) (\\S+) (\\S+) (\\S+),? (\\S+\\.?)', # for John Wiley & Sons, Inc
'Copyright .? ?[0-9][0-9][0-9][0-9] (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+\\.?)',
'Copyright .? ?[0-9][0-9][0-9][0-9] (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+\\.?)',
'Copyright .? ?[0-9][0-9][0-9][0-9] (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+\\.?)',
"Published by Elsevier Ltd.",
"This is a work of the U.S. Government and is not subject to copyright protection in the United States.",
"Foreign copyrights may apply.",
"Published by John Wiley & Sons Ltd.",
"All rights reserved.",
"all rights reserved",
"PsycINFO Database Record (c) [0-9][0-9][0-9][0-9] APA",
'This article is a US Government work and is in the public domain in the USA\\.')
string.to.remove = string.to.remove[order(-nchar(string.to.remove))] # long to short
string.to.remove = paste(string.to.remove, collapse='|')
tmp.abstract = str_remove_all(tmp.abstract, pattern=string.to.remove)
# Get Journal Abbrv
tmp.jabbrv <- custom_grep(xml_data = tmp.article, tag = "ISOAbbreviation", format = "char")
tmp.jabbrv <- ifelse(is.null(tmp.jabbrv), NA, tmp.jabbrv)
tmp.jabbrv = str_remove_all(string=tmp.jabbrv, pattern='\\.') # tidy journal names to make them a bit shorter
# Get article type, see list here https://www.nlm.nih.gov/mesh/pubtypes.html
tmp.type <- custom_grep(xml_data = tmp.article, tag = "PublicationType", format = "char")
tmp.type <- ifelse(is.null(tmp.type), NA, tmp.type)
# get mesh terms (added Feb 2023)
mesh = NULL
mesh.start = str_locate_all(string=tmp.article, pattern='\\<DescriptorName')[[1]]
mesh.end = str_locate_all(string=tmp.article, pattern='\\/DescriptorName')[[1]]
if(length(mesh.start)>0){
for (k in 1:nrow(mesh.start)){
start = str_sub(tmp.article, mesh.start[k,2], mesh.end[k,1] - 2)
start_1 = str_locate(start, '\\>')
mesh = c(mesh, str_sub(start, start_1[1]+1, nchar(start)))
}
mesh = paste(mesh, collapse = '; ')
}
if(is.null(mesh)==TRUE){mesh = ''}
# frame to output
final.mat <- data.frame(pmid = tmp.PMID,
language = tmp.language,
n.authors = n.authors,
first.author = first.author,
country = country,
title = tmp.title,
abstract = str_squish(tmp.abstract),
date = date,
type = tmp.type,
mesh = mesh,
jabbrv = tmp.jabbrv, stringsAsFactors = FALSE)
return(final.mat)
})
}