/
KWIC.R
204 lines (201 loc) · 7.8 KB
/
KWIC.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
### This file is part of 'PGRdup' package for R.
### Copyright (C) 2014-2023, ICAR-NBPGR.
#
# PGRdup is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# PGRdup is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# A copy of the GNU General Public License is available at
# https://www.r-project.org/Licenses/
#' Create a KWIC index
#'
#' \code{KWIC} creates a Keyword in Context index from PGR passport database
#' fields.
#'
#' The function generates a Keyword in Context index from a data frame of a PGR
#' passport database based on the fields(columns) stated in the arguments, using
#' \code{\link[data.table]{data.table}} package.
#'
#' The first element of vector \code{fields} is considered as the primary key or
#' identifier which uniquely identifies all rows in the data frame.
#'
#' Cleaning of the data the input fields(columns) using the
#' \code{\link[PGRdup]{DataClean}} function with appropriate arguments is
#' suggested before running this function.
#'
#' @param x A data frame from which KWIC index is to be generated.
#' @param fields A character vector with the names of fields(columns) of the
#' data frame from which KWIC index is to be generated. The first field is
#' considered as the primary key or identifier (see \strong{Details}).
#' @param min.freq Frequency of keywords are not computed if below
#' \code{min.freq}. Default is 10.
#' @return A list of class \code{KWIC} containing the following components:
#' \tabular{ll}{ \code{KWIC} \tab The KWIC index in the form of a data frame.
#' \cr \code{KeywordFreq} \tab A data frame of the keywords detected with
#' frequency greater than \code{min.freq}. \cr \code{Fields} \tab A character
#' vector with the names of the PGR database fields from which the keywords
#' were extracted. \cr }
#' @seealso \code{\link[data.table]{data.table}},
#' \code{\link[PGRdup]{print.KWIC}}
#'
#' @references Kn\enc{ü}{ue}pffer, H. 1988. "The European Barley Database of the ECP/GR:
#' An Introduction." \emph{Die Kulturpflanze} 36 (1): 135-62.
#' \doi{10.1007/BF01986957}.
#'
#' Kn\enc{ü}{ue}pffer, H., L. Frese, and M. W. M. Jongen. 1997. "Using Central Crop
#' Databases: Searching for Duplicates and Gaps." In \emph{Central Crop
#' Databases: Tools for Plant Genetic Resources Management. Report of a
#' Workshop, Budapest, Hungary, 13-16 October 1996}, edited by E. Lipman, M.
#' W. M. Jongen, T. J. L. van Hintum, T. Gass, and L. Maggioni, 67-77. Rome,
#' Italy and Wageningen, The Netherlands: International Plant Genetic
#' Resources Institute and Centre for Genetic Resources.
#'
#' @encoding UTF-8
#' @examples
#'
#' \dontshow{
#' threads_dt <- data.table::getDTthreads()
#' threads_OMP <- Sys.getenv("OMP_THREAD_LIMIT")
#' data.table::setDTthreads(2)
#'
#' data.table::setDTthreads(2)
#' Sys.setenv(`OMP_THREAD_LIMIT` = 2)
#' }
#'
#' # Load PGR passport database
#' GN <- GN1000
#'
#' # Specify as a vector the database fields to be used
#' GNfields <- c("NationalID", "CollNo", "DonorID", "OtherID1", "OtherID2")
#'
#' # Clean the data
#' GN[GNfields] <- lapply(GN[GNfields], function(x) DataClean(x))
#'
#' \dontrun{
#'
#' # Generate KWIC index
#' GNKWIC <- KWIC(GN, GNfields)
#' GNKWIC
#'
#' # Retrieve the KWIC index from the KWIC object
#' KWIC <- GNKWIC[[1]]
#'
#' # Retrieve the keyword frequencies from the KWIC object
#' KeywordFreq <- GNKWIC[[2]]
#'
#' # Show error in case of duplicates and NULL values
#' # in the primary key/ID field "NationalID"
#' GN[1001:1005,] <- GN[1:5,]
#' GN[1001,3] <- ""
#' GNKWIC <- KWIC(GN, GNfields)
#' }
#'
#' \dontshow{
#' data.table::setDTthreads(threads_dt)
#' Sys.setenv(`OMP_THREAD_LIMIT` = threads_OMP)
#' }
#'
#' @import data.table
#' @importFrom stringi stri_split_fixed
#' @export KWIC
#' @export print.KWIC
#' @rdname KWIC
KWIC <- function(x, fields, min.freq = 10) {
if (is.data.frame(x) == FALSE) {
# Check if x is a data.frame and stop if not
stop("x is not a data.frame")
}
if (is.vector(fields) == FALSE) {
# Check if fields is a vector or not
stop("fields is not a vector")
}
if (length(fields) == 1) {
# Check if more than one field is given as input and stop if not
stop("Only one field given as input")
}
if (is.element(FALSE, fields %in% colnames(x)) == TRUE) {
# Check if fields are present in x and stop if not
stop("One or more fields are missing in x")
}
#setDT(x)
x <- as.data.table(x)
# Convert the fields in x to character
for (col in fields) set(x, j = col, value = as.character(x[[col]]))
# Convert NAs to empty strings
for (j in fields) set(x, which(is.na(x[[j]])), j, "")
setDF(x)
if (is.element("", x[fields[1]]) | is.element(TRUE,
duplicated(x[fields[1]]))) {
# Check primary key/ID is unique and not NULL
stop("Primary key/ID field should be unique and not NULL\n Use PGRdup::ValidatePrimKey() to identify and rectify the aberrant records first")
}
#setDT(x)
x <- as.data.table(x)
# Create context fields
x[, KWIC := do.call(paste, c(.SD, sep = " = ")), .SDcols = fields]
x[, COMBINED := do.call(paste, .SD), .SDcols = fields]
# Create KWIC index using data.table
K <- as.list(rep(NA, length(fields)))
for (i in 1:(length(fields))) {
K[[i]] <- x[, list(KEYWORD = unlist(strsplit(get(fields[i]), " ")),
FIELD = fields[i]),
by = list(PRIM_ID = get(fields[1]), KWIC)]
K[[i]] <- K[[i]][!is.na(K[[i]]$KEYWORD), ]
}
KWIC <- rbindlist(K)
rm(K, x)
#KWIC$KEYWORD[is.na(KWIC$KEYWORD)] <- ""
set(KWIC, which(is.na(KWIC[["KEYWORD"]])), "KEYWORD", "")
KWIC <- setkey(KWIC, KEYWORD)
# Remove all '\' from KWIC
KWIC[, KWIC := gsub(pattern = "([\\])", replacement = "", x = KWIC)]
KWIC[, KEYWORD := gsub(pattern = "([\\])", replacement = "", x = KEYWORD)]
# Remove records with blank keywords
KWIC <- subset(KWIC, KEYWORD != "")
# Remove duplicate records
KWIC <- setkey(KWIC, NULL)
KWIC <- unique(KWIC)
# Add padding space in KWIC
KWIC[, KWIC := paste(" ", KWIC, " ")]
# Escape all Regex special characters
KWIC[, KEYWORD := gsub(pattern = "([.|()\\^{}+$*?]|\\[|\\])",
replacement = "\\\\\\1", x = KEYWORD)]
# Highlight keywords in KWIC
KWIC[, KWIC := mapply(gsub, pattern = paste0(" ", KEYWORD, " "),
replacement = paste0(" <<", KEYWORD, ">> "), KWIC)]
KWIC[, KWIC := gsub("^\\s+|\\s+$", "", KWIC)]
# Unescape all Regex special characters
KWIC[, KEYWORD := gsub(pattern = "\\\\(.)", replacement = "\\1", x = KEYWORD)]
# Split KWIC
KWIC[, c("KWIC_L", "KW1") := do.call(rbind.data.frame,
stri_split_fixed(KWIC, "<<", 2))][]
KWIC[, c("KWIC_KW", "KWIC_R") := do.call(rbind.data.frame,
stri_split_fixed(KW1, ">>", 2))][]
cols <- c("KWIC_L", "KWIC_KW", "KWIC_R")
KWIC[, (cols) := lapply(.SD, as.character), .SDcols = cols]
KWIC[, KW1 := NULL]
# Clean output data.frame
KWIC <- setkey(KWIC, FIELD)
KWIC <- setkey(KWIC, PRIM_ID)
setcolorder(KWIC, c("PRIM_ID", "FIELD", "KEYWORD", "KWIC", "KWIC_L",
"KWIC_KW", "KWIC_R"))
KWICIndex <- list(KWIC = NULL, KeywordFreq = NULL, Fields = fields)
#KWICIndex[[1]] <- as.data.frame(KWIC)
KWICIndex[[1]] <- setDF(KWIC)
# Get keyword freq
kwf <- as.data.frame(table(KWIC$KEYWORD))
kwf <- subset(kwf, Freq > min.freq)
kwf <- kwf[order(-kwf$Freq), ]
rownames(kwf) <- NULL
setnames(kwf, old = "Var1", new = "Keyword")
KWICIndex[[2]] <- kwf
# Set Class
class(KWICIndex) <- "KWIC"
return(KWICIndex)
}