-
Notifications
You must be signed in to change notification settings - Fork 2
/
delta.R
104 lines (71 loc) · 4.07 KB
/
delta.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#' Delta
#'
#' This function runs a *Cosine Delta* analysis (Smith and Aldridge 2011; Evert et al. 2017).
#'
#' @param q.data The questioned or disputed data, either as a corpus (the output of [create_corpus()]) or as a `quanteda` dfm (the output of [vectorize()]).
#' @param k.data The known or undisputed data, either as a corpus (the output of [create_corpus()]) or as a `quanteda` dfm (the output of [vectorize()]).
#' @param tokens The type of tokens to extract, either "word" (default) or "character".
#' @param remove_punct A logical value. FALSE (default) keeps punctuation marks.
#' @param remove_symbols A logical value. TRUE (default) removes symbols.
#' @param remove_numbers A logical value. TRUE (default) removes numbers
#' @param lowercase A logical value. TRUE (default) transforms all tokens to lower case.
#' @param n The order or size of the n-grams being extracted. Default is 1.
#' @param trim A logical value. If TRUE (default) then only the most frequent tokens are kept.
#' @param threshold A numeric value indicating how many most frequent tokens to keep if trim = TRUE. The default is 150.
#' @param features Logical with default FALSE. If TRUE, then the output will contain the features used.
#' @param cores The number of cores to use for parallel processing (the default is one).
#'
#' @references Evert, Stefan, Thomas Proisl, Fotis Jannidis, Isabella Reger, Steffen Pielström, Christof Schöch & Thorsten Vitt. 2017. Understanding and explaining Delta measures for authorship attribution. Digital Scholarship in the Humanities 32. ii4–ii16. https://doi.org/10.1093/llc/fqx023.
#' Smith, Peter W H & W Aldridge. 2011. Improving Authorship Attribution: Optimizing Burrows’ Delta Method*. Journal of Quantitative Linguistics 18(1). 63–88. https://doi.org/10.1080/09296174.2011.533591.
#' @return If features is set to FALSE then the output is a data frame containing the results of all comparisons between the Q texts and the K texts. If features is set to TRUE then the output is a list containing the results data frame and the vector of features used for the analysis.
#'
#' @examples
#' Q <- enron.sample[c(5:6)]
#' K <- enron.sample[-c(5:6)]
#' delta(Q, K)
#'
#' @export
delta <- function(q.data, k.data, tokens = "word", remove_punct = FALSE, remove_symbols = TRUE, remove_numbers = TRUE, lowercase = TRUE, n = 1, trim = TRUE, threshold = 150, features = FALSE, cores = NULL){
if(quanteda::is.corpus(q.data) & quanteda::is.corpus(k.data)){
d = vectorize(c(q.data, k.data), tokens = tokens, remove_punct = remove_punct,
remove_symbols = remove_symbols, remove_numbers = remove_numbers, lowercase = lowercase,
n = n, weighting = "rel", trim = trim, threshold = threshold)
}else if(quanteda::is.dfm(q.data) & quanteda::is.dfm(k.data)){
d <- rbind(q.data, k.data)
}else{
stop("The Q and K objects need to be either quanteda corpora or quanteda dfms.")
}
q.list <- quanteda::docnames(q.data)
k.list <- quanteda::docnames(k.data)
tests <- expand.grid(q.list, k.list, stringsAsFactors = FALSE) |>
dplyr::rename(Q = Var1, K = Var2)
z <- scale(d) |> quanteda::as.dfm()
quanteda::docvars(z) <- quanteda::docvars(d)
results <- pbapply::pbapply(tests, 1, cosine_delta, z, cl = cores)
results.table = list_to_df(results)
if(features == TRUE){
output <- list(results = results.table, features = colnames(d))
}else{
output <- results.table
}
return(output)
}
cosine_delta <- function(x, z){
a.name = as.character(x["Q"])
a = quanteda::dfm_subset(z, quanteda::docnames(z) == a.name)
a.author = quanteda::docvars(a, "author")
b.name = as.character(x["K"])
b = quanteda::dfm_subset(z, quanteda::docnames(z) == b.name)
b.author = quanteda::docvars(b, "author")
score <- quanteda.textstats::textstat_simil(a, b, method = "cosine") |> suppressMessages() |> as.numeric()
results = data.frame()
results[1,"Q"] = a.name
results[1,"K"] = b.name
if(a.author == b.author){
results[1,"target"] = TRUE
}else{
results[1,"target"] = FALSE
}
results[1,"score"] = round(score, 3)
return(results)
}