/
stat-poly-eq.R
1260 lines (1219 loc) · 53.5 KB
/
stat-poly-eq.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#' Equation, p-value, \eqn{R^2}, AIC and BIC of fitted polynomial
#'
#' \code{stat_poly_eq} fits a polynomial by default with \code{stats::lm()} but
#' alternatively using robust regression. From the fitted model it
#' generates several labels including the equation, p-value, F-value,
#' coefficient of determination (R^2), 'AIC', 'BIC', and number of observations.
#'
#' @param mapping The aesthetic mapping, usually constructed with
#' \code{\link[ggplot2]{aes}}. Only needs to be
#' set at the layer level if you are overriding the plot defaults.
#' @param data A layer specific dataset, only needed if you want to override
#' the plot defaults.
#' @param geom The geometric object to use display the data
#' @param position The position adjustment to use for overlapping points on this
#' layer
#' @param show.legend logical. Should this layer be included in the legends?
#' \code{NA}, the default, includes if any aesthetics are mapped. \code{FALSE}
#' never includes, and \code{TRUE} always includes.
#' @param inherit.aes If \code{FALSE}, overrides the default aesthetics, rather
#' than combining with them. This is most useful for helper functions that
#' define both data and aesthetics and shouldn't inherit behaviour from the
#' default plot specification, e.g. \code{\link[ggplot2]{borders}}.
#' @param ... other arguments passed on to \code{\link[ggplot2]{layer}}. This
#' can include aesthetics whose values you want to set, not map. See
#' \code{\link[ggplot2]{layer}} for more details.
#' @param na.rm a logical indicating whether NA values should be stripped before
#' the computation proceeds.
#' @param formula a formula object. Using aesthetic names \code{x} and \code{y}
#' instead of original variable names.
#' @param method function or character If character, "lm", "rlm" or the name of
#' a model fit function are accepted, possibly followed by the fit function's
#' \code{method} argument separated by a colon (e.g. \code{"rlm:M"}). If a
#' function different to \code{lm()}, it must accept as a minimum a model
#' formula through its first parameter, and have formal parameters named
#' \code{data}, \code{weights}, and \code{method}, and return a model fit
#' object of class \code{lm}.
#' @param method.args named list with additional arguments.
#' @param n.min integer Minimum number of distinct values in the explanatory
#' variable (on the rhs of formula) for fitting to the attempted.
#' @param eq.with.lhs If \code{character} the string is pasted to the front of
#' the equation label before parsing or a \code{logical} (see note).
#' @param eq.x.rhs \code{character} this string will be used as replacement for
#' \code{"x"} in the model equation when generating the label before parsing
#' it.
#' @param small.r,small.p logical Flags to switch use of lower case r and p for
#' coefficient of determination and p-value.
#' @param rsquared.conf.level numeric Confidence level for the returned
#' confidence interval. Set to NA to skip CI computation.
#' @param CI.brackets character vector of length 2. The opening and closing
#' brackets used for the CI label.
#' @param coef.digits,f.digits integer Number of significant digits to use for
#' the fitted coefficients and F-value.
#' @param coef.keep.zeros logical Keep or drop trailing zeros when formatting
#' the fitted coefficients and F-value.
#' @param rr.digits,p.digits integer Number of digits after the decimal point to
#' use for \eqn{R^2} and P-value in labels. If \code{Inf}, use exponential
#' notation with three decimal places.
#' @param label.x,label.y \code{numeric} with range 0..1 "normalized parent
#' coordinates" (npc units) or character if using \code{geom_text_npc()} or
#' \code{geom_label_npc()}. If using \code{geom_text()} or \code{geom_label()}
#' numeric in native data units. If too short they will be recycled.
#' @param label.x.npc,label.y.npc \code{numeric} with range 0..1 (npc units)
#' DEPRECATED, use label.x and label.y instead; together with a geom
#' using npcx and npcy aesthetics.
#' @param hstep,vstep numeric in npc units, the horizontal and vertical step
#' used between labels for different groups.
#' @param output.type character One of "expression", "LaTeX", "text",
#' "markdown" or "numeric".
#' @param orientation character Either "x" or "y" controlling the default for
#' \code{formula}.
#' @param parse logical Passed to the geom. If \code{TRUE}, the labels will be
#' parsed into expressions and displayed as described in \code{?plotmath}.
#' Default is \code{TRUE} if \code{output.type = "expression"} and
#' \code{FALSE} otherwise.
#'
#' @note For backward compatibility a logical is accepted as argument for
#' \code{eq.with.lhs}. If \code{TRUE}, the default is used, either
#' \code{"x"} or \code{"y"}, depending on the argument passed to \code{formula}.
#' However, \code{"x"} or \code{"y"} can be substituted by providing a
#' suitable replacement character string through \code{eq.x.rhs}.
#' Parameter \code{orientation} is redundant as it only affects the default
#' for \code{formula} but is included for consistency with
#' \code{ggplot2::stat_smooth()}.
#'
#' R option \code{OutDec} is obeyed based on its value at the time the plot
#' is rendered, i.e., displayed or printed. Set \code{options(OutDec = ",")}
#' for languages like Spanish or French.
#'
#' @details This statistic can be used to automatically annotate a plot with
#' \eqn{R^2}, adjusted \eqn{R^2} or the fitted model equation. It supports
#' linear regression, robust linear regression and median regression fitted
#' with functions \code{\link{lm}}, or \code{\link[MASS]{rlm}}. The \eqn{R^2}
#' and adjusted \eqn{R^2} annotations can be used with any linear model
#' formula. The confidence interval for \eqn{R^2} is computed with function
#' \code{\link[confintr]{ci_rsquared}} from package 'confintr'. The fitted
#' equation label is correctly generated for polynomials or quasi-polynomials
#' through the origin. Model formulas can use \code{poly()} or be defined
#' algebraically with terms of powers of increasing magnitude with no missing
#' intermediate terms, except possibly for the intercept indicated by "- 1" or
#' "-1" or \code{"+ 0"} in the formula. The validity of the \code{formula} is
#' not checked in the current implementation, and for this reason the default
#' aesthetics sets \eqn{R^2} as label for the annotation. This statistic
#' generates labels as R expressions by default but LaTeX (use TikZ device),
#' markdown (use package 'ggtext') and plain text are also supported, as well
#' as numeric values for user-generated text labels. The value of \code{parse}
#' is set automatically based on \code{output-type}, but if you assemble
#' labels that need parsing from \code{numeric} output, the default needs to
#' be overridden. This stat only generates annotation labels, the predicted
#' values/line need to be added to the plot as a separate layer using
#' \code{\link{stat_poly_line}} (or \code{\link[ggplot2]{stat_smooth}}), if
#' the default formula is overriden with an argument, it is crucial to make
#' sure that the same model formula is used in all layers. In this case it is
#' best to save the formula as an object and supply this object as argument to
#' the different statistics.
#'
#' A ggplot statistic receives as \code{data} a data frame that is not the one
#' passed as argument by the user, but instead a data frame with the variables
#' mapped to aesthetics. \code{stat_poly_eq()} mimics how \code{stat_smooth()}
#' works, except that only polynomials can be fitted. Similarly to these
#' statistics the model fits respect grouping, so the scales used for \code{x}
#' and \code{y} should both be continuous scales rather than discrete.
#'
#' With method \code{"lm"}, singularity results in terms being dropped with a
#' message if more numerous than can be fitted with a singular (exact) fit.
#' In this case or if the model results in a perfect fit due to a low
#' number of observations, estimates for various parameters are \code{NaN} or
#' \code{NA}. When this is the case the corresponding labels are set to
#' \code{character(0L)} and thus not visble in the plot.
#'
#' With methods other than \code{"lm"}, the model fit functions simply fail
#' in case of singularity, e.g., singular fits are not implemented in
#' \code{"rlm"}.
#'
#' In both cases the minimum number of observations with distinct values in
#' the explanatory variable can be set through parameter \code{n.min}. The
#' default \code{n.min = 2L} is the smallest suitable for method \code{"lm"}
#' but too small for method \code{"rlm"} for which \code{n.min = 3L} is
#' needed. Anyway, model fits with very few observations are of little
#' interest and using larger values of \code{n.min} than the default is
#' usually wise.
#'
#' @references Originally written as an answer to question 7549694 at
#' Stackoverflow but enhanced based on suggestions from users and my own
#' needs.
#'
#' @section Aesthetics: \code{stat_poly_eq()} understands \code{x} and \code{y},
#' to be referenced in the \code{formula} and \code{weight} passed as argument
#' to parameter \code{weights}. All three must be mapped to \code{numeric}
#' variables. In addition, the aesthetics understood by the geom
#' (\code{"text"} is the default) are understood and grouping respected.
#'
#' \emph{If the model formula includes a transformation of \code{x}, a
#' matching argument should be passed to parameter \code{eq.x.rhs}
#' as its default value \code{"x"} will not reflect the applied
#' transformation. In plots, transformation should never be applied to the
#' left hand side of the model formula, but instead in the mapping of the
#' variable within \code{aes}, as otherwise plotted observations and fitted
#' curve will not match. In this case it may be necessary to also pass
#' a matching argument to parameter \code{eq.with.lhs}.}
#'
#' @return A data frame, with a single row and columns as described under
#' \strong{Computed variables}. In cases when the number of observations is
#' less than \code{n.min} a data frame with no rows or columns is returned
#' rendered as an empty/invisible plot layer.
#'
#' @section Computed variables:
#' If output.type different from \code{"numeric"} the returned tibble contains
#' columns listed below. If the model fit function used does not return a value,
#' the label is set to \code{character(0L)}.
#' \describe{
#' \item{x,npcx}{x position}
#' \item{y,npcy}{y position}
#' \item{eq.label}{equation for the fitted polynomial as a character string to be parsed}
#' \item{rr.label}{\eqn{R^2} of the fitted model as a character string to be parsed}
#' \item{adj.rr.label}{Adjusted \eqn{R^2} of the fitted model as a character string to be parsed}
#' \item{rr.confint.label}{Confidence interval for \eqn{R^2} of the fitted model as a character string to be parsed}
#' \item{f.value.label}{F value and degrees of freedom for the fitted model as a whole.}
#' \item{p.value.label}{P-value for the F-value above.}
#' \item{AIC.label}{AIC for the fitted model.}
#' \item{BIC.label}{BIC for the fitted model.}
#' \item{n.label}{Number of observations used in the fit.}
#' \item{grp.label}{Set according to mapping in \code{aes}.}
#' \item{method.label}{Set according \code{method} used.}
#' \item{r.squared, adj.r.squared, p.value, n}{numeric values, from the model fit object}}
#'
#' If output.type is \code{"numeric"} the returned tibble contains columns
#' listed below. If the model fit function used does not return a value,
#' the variable is set to \code{NA_real_}.
#' \describe{
#' \item{x,npcx}{x position}
#' \item{y,npcy}{y position}
#' \item{coef.ls}{list containing the "coefficients" matrix from the summary of the fit object}
#' \item{r.squared, rr.confint.level, rr.confint.low, rr.confint.high, adj.r.squared, f.value, f.df1, f.df2, p.value, AIC, BIC, n}{numeric values, from the model fit object}
#' \item{grp.label}{Set according to mapping in \code{aes}.}
#' \item{b_0.constant}{TRUE is polynomial is forced through the origin}
#' \item{b_i}{One or columns with the coefficient estimates}}
#'
#' To explore the computed values returned for a given input we suggest the use
#' of \code{\link[gginnards]{geom_debug}} as shown in the last examples below.
#'
#' @section Alternatives: \code{stat_regline_equation()} in package 'ggpubr' is
#' a renamed but almost unchanged copy of \code{stat_poly_eq()} taken from an
#' old version of this package (without acknowledgement of source and
#' authorship). \code{stat_regline_equation()} lacks important functionality
#' and contains bugs that have been fixed in \code{stat_poly_eq()}.
#'
#' @seealso This statistics fits a model with function \code{\link[stats]{lm}},
#' function \code{\link[MASS]{rlm}} or a user supplied function returning an
#' object of class \code{"lm"}. Consult the documentation of these functions
#' for the details and additional arguments that can be passed to them by name
#' through parameter \code{method.args}.
#'
#' This \code{stat_poly_eq} statistic can return ready formatted labels
#' depending on the argument passed to \code{output.type}. This is possible
#' because only polynomial and quasy-polynomial models are supported. For
#' quantile regression \code{\link{stat_quant_eq}} should be used instead of
#' \code{stat_poly_eq} while for model II or major axis regression
#' \code{\link{stat_ma_eq}} should be used. For other types of models such as
#' non-linear models, statistics \code{\link{stat_fit_glance}} and
#' \code{\link{stat_fit_tidy}} should be used and the code for construction of
#' character strings from numeric values and their mapping to aesthetic
#' \code{label} needs to be explicitly supplied by the user.
#'
#' @family ggplot statistics for linear and polynomial regression
#'
#' @examples
#' # generate artificial data
#' set.seed(4321)
#' x <- 1:100
#' y <- (x + x^2 + x^3) + rnorm(length(x), mean = 0, sd = mean(x^3) / 4)
#' y <- y / max(y)
#' my.data <- data.frame(x = x, y = y,
#' group = c("A", "B"),
#' y2 = y * c(1, 2) + c(0, 0.1),
#' w = sqrt(x))
#'
#' # give a name to a formula
#' formula <- y ~ poly(x, 3, raw = TRUE)
#'
#' # using defaults
#' ggplot(my.data, aes(x, y)) +
#' geom_point() +
#' stat_poly_line() +
#' stat_poly_eq()
#'
#' # no weights
#' ggplot(my.data, aes(x, y)) +
#' geom_point() +
#' stat_poly_line(formula = formula) +
#' stat_poly_eq(formula = formula)
#'
#' # other labels
#' ggplot(my.data, aes(x, y)) +
#' geom_point() +
#' stat_poly_line(formula = formula) +
#' stat_poly_eq(use_label("eq"), formula = formula)
#'
#' ggplot(my.data, aes(x, y)) +
#' geom_point() +
#' stat_poly_line(formula = formula) +
#' stat_poly_eq(use_label(c("eq", "R2")), formula = formula)
#'
#' ggplot(my.data, aes(x, y)) +
#' geom_point() +
#' stat_poly_line(formula = formula) +
#' stat_poly_eq(use_label(c("R2", "R2.CI", "P", "method")), formula = formula)
#'
#' ggplot(my.data, aes(x, y)) +
#' geom_point() +
#' stat_poly_line(formula = formula) +
#' stat_poly_eq(use_label(c("R2", "F", "P", "n"), sep = "*\"; \"*"),
#' formula = formula)
#'
#' # grouping
#' ggplot(my.data, aes(x, y2, color = group)) +
#' geom_point() +
#' stat_poly_line(formula = formula) +
#' stat_poly_eq(formula = formula)
#'
#' # rotation
#' ggplot(my.data, aes(x, y)) +
#' geom_point() +
#' stat_poly_line(formula = formula) +
#' stat_poly_eq(formula = formula, angle = 90)
#'
#' # label location
#' ggplot(my.data, aes(x, y)) +
#' geom_point() +
#' stat_poly_line(formula = formula) +
#' stat_poly_eq(formula = formula, label.y = "bottom", label.x = "right")
#'
#' ggplot(my.data, aes(x, y)) +
#' geom_point() +
#' stat_poly_line(formula = formula) +
#' stat_poly_eq(formula = formula, label.y = 0.1, label.x = 0.9)
#'
#' # modifying the explanatory variable within the model formula
#' # modifying the response variable within aes()
#' formula.trans <- y ~ I(x^2)
#' ggplot(my.data, aes(x, y + 1)) +
#' geom_point() +
#' stat_poly_line(formula = formula.trans) +
#' stat_poly_eq(use_label("eq"),
#' formula = formula.trans,
#' eq.x.rhs = "~x^2",
#' eq.with.lhs = "y + 1~~`=`~~")
#'
#' # using weights
#' ggplot(my.data, aes(x, y, weight = w)) +
#' geom_point() +
#' stat_poly_line(formula = formula) +
#' stat_poly_eq(formula = formula)
#'
#' # no weights, 4 digits for R square
#' ggplot(my.data, aes(x, y)) +
#' geom_point() +
#' stat_poly_line(formula = formula) +
#' stat_poly_eq(formula = formula, rr.digits = 4)
#'
#' # manually assemble and map a specific label using paste() and aes()
#' ggplot(my.data, aes(x, y)) +
#' geom_point() +
#' stat_poly_line(formula = formula) +
#' stat_poly_eq(aes(label = paste(after_stat(rr.label),
#' after_stat(n.label), sep = "*\", \"*")),
#' formula = formula)
#'
#' # manually assemble and map a specific label using sprintf() and aes()
#' ggplot(my.data, aes(x, y)) +
#' geom_point() +
#' stat_poly_line(formula = formula) +
#' stat_poly_eq(aes(label = sprintf("%s*\" with \"*%s*\" and \"*%s",
#' after_stat(rr.label),
#' after_stat(f.value.label),
#' after_stat(p.value.label))),
#' formula = formula)
#'
#' # x on y regression
#' ggplot(my.data, aes(x, y)) +
#' geom_point() +
#' stat_poly_line(formula = formula, orientation = "y") +
#' stat_poly_eq(use_label(c("eq", "adj.R2")),
#' formula = x ~ poly(y, 3, raw = TRUE))
#'
#' # conditional user specified label
#' ggplot(my.data, aes(x, y2, color = group)) +
#' geom_point() +
#' stat_poly_line(formula = formula) +
#' stat_poly_eq(aes(label = ifelse(after_stat(adj.r.squared) > 0.96,
#' paste(after_stat(adj.rr.label),
#' after_stat(eq.label),
#' sep = "*\", \"*"),
#' after_stat(adj.rr.label))),
#' rr.digits = 3,
#' formula = formula)
#'
#' # geom = "text"
#' ggplot(my.data, aes(x, y)) +
#' geom_point() +
#' stat_poly_line(formula = formula) +
#' stat_poly_eq(geom = "text", label.x = 100, label.y = 0, hjust = 1,
#' formula = formula)
#'
#' # using numeric values
#' # Here we use columns b_0 ... b_3 for the coefficient estimates
#' my.format <-
#' "b[0]~`=`~%.3g*\", \"*b[1]~`=`~%.3g*\", \"*b[2]~`=`~%.3g*\", \"*b[3]~`=`~%.3g"
#' ggplot(my.data, aes(x, y)) +
#' geom_point() +
#' stat_poly_line(formula = formula) +
#' stat_poly_eq(formula = formula,
#' output.type = "numeric",
#' parse = TRUE,
#' mapping =
#' aes(label = sprintf(my.format,
#' after_stat(b_0), after_stat(b_1),
#' after_stat(b_2), after_stat(b_3))))
#'
#' # Inspecting the returned data using geom_debug()
#' # This provides a quick way of finding out the names of the variables that
#' # are available for mapping to aesthetics with after_stat().
#'
#' gginnards.installed <- requireNamespace("gginnards", quietly = TRUE)
#'
#' if (gginnards.installed)
#' library(gginnards)
#'
#' if (gginnards.installed)
#' ggplot(my.data, aes(x, y)) +
#' geom_point() +
#' stat_poly_line(formula = formula) +
#' stat_poly_eq(formula = formula, geom = "debug")
#'
#' if (gginnards.installed)
#' ggplot(my.data, aes(x, y)) +
#' geom_point() +
#' stat_poly_line(formula = formula) +
#' stat_poly_eq(formula = formula, geom = "debug", output.type = "numeric")
#'
#' # names of the variables
#' if (gginnards.installed)
#' ggplot(my.data, aes(x, y)) +
#' geom_point() +
#' stat_poly_line(formula = formula) +
#' stat_poly_eq(formula = formula, geom = "debug",
#' summary.fun = colnames)
#'
#' # only data$eq.label
#' if (gginnards.installed)
#' ggplot(my.data, aes(x, y)) +
#' geom_point() +
#' stat_poly_line(formula = formula) +
#' stat_poly_eq(formula = formula, geom = "debug",
#' output.type = "expression",
#' summary.fun = function(x) {x[["eq.label"]]})
#'
#' # only data$eq.label
#' if (gginnards.installed)
#' ggplot(my.data, aes(x, y)) +
#' geom_point() +
#' stat_poly_line(formula = formula) +
#' stat_poly_eq(aes(label = after_stat(eq.label)),
#' formula = formula, geom = "debug",
#' output.type = "markdown",
#' summary.fun = function(x) {x[["eq.label"]]})
#'
#' # only data$eq.label
#' if (gginnards.installed)
#' ggplot(my.data, aes(x, y)) +
#' geom_point() +
#' stat_poly_line(formula = formula) +
#' stat_poly_eq(formula = formula, geom = "debug",
#' output.type = "latex",
#' summary.fun = function(x) {x[["eq.label"]]})
#'
#' # only data$eq.label
#' if (gginnards.installed)
#' ggplot(my.data, aes(x, y)) +
#' geom_point() +
#' stat_poly_line(formula = formula) +
#' stat_poly_eq(formula = formula, geom = "debug",
#' output.type = "text",
#' summary.fun = function(x) {x[["eq.label"]]})
#'
#' # show the content of a list column
#' if (gginnards.installed)
#' ggplot(my.data, aes(x, y)) +
#' geom_point() +
#' stat_poly_line(formula = formula) +
#' stat_poly_eq(formula = formula, geom = "debug", output.type = "numeric",
#' summary.fun = function(x) {x[["coef.ls"]][[1]]})
#'
#' @export
#'
stat_poly_eq <- function(mapping = NULL, data = NULL,
geom = "text_npc",
position = "identity",
...,
formula = NULL,
method = "lm",
method.args = list(),
n.min = 2L,
eq.with.lhs = TRUE,
eq.x.rhs = NULL,
small.r = FALSE,
small.p = FALSE,
CI.brackets = c("[", "]"),
rsquared.conf.level = 0.95,
coef.digits = 3,
coef.keep.zeros = TRUE,
rr.digits = 2,
f.digits = 3,
p.digits = 3,
label.x = "left",
label.y = "top",
label.x.npc = NULL,
label.y.npc = NULL,
hstep = 0,
vstep = NULL,
output.type = NULL,
na.rm = FALSE,
orientation = NA,
parse = NULL,
show.legend = FALSE,
inherit.aes = TRUE) {
stopifnot(!any(c("formula", "data") %in% names(method.args)))
# we guess formula from orientation
if (is.null(formula)) {
if (is.na(orientation) || orientation == "x") {
formula = y ~ x
} else if (orientation == "y") {
formula = x ~ y
}
}
# we guess orientation from formula
if (is.na(orientation)) {
if (grepl("x", as.character(formula)[2])) {
orientation <- "y"
} else if (grepl("y", as.character(formula)[2])) {
orientation <- "x"
} else {
stop("The model formula should use 'x' and 'y' as variables")
}
}
# backwards compatibility
if (!is.null(label.x.npc)) {
stopifnot(grepl("_npc", geom))
label.x <- label.x.npc
}
if (!is.null(label.y.npc)) {
stopifnot(grepl("_npc", geom))
label.y <- label.y.npc
}
if (is.null(output.type)) {
if (geom %in% c("richtext", "textbox")) {
output.type <- "markdown"
} else {
output.type <- "expression"
}
}
if (is.null(parse)) {
parse <- output.type == "expression"
}
if (is.character(method)) {
if (method == "auto") {
message("Method 'auto' is equivalent to 'lm', splines are not supported.")
method <- "lm"
} else if (grepl("^rq", method)) {
stop("Method 'rq' not supported, please use 'stat_quant_eq()'.")
} else if (grepl("^lmodel2", method)) {
stop("Method 'lmodel2' not supported, please use 'stat_ma_eq()'.")
}
}
if (is.null(rsquared.conf.level) || !is.finite(rsquared.conf.level)) {
rsquared.conf.level <- 0
}
ggplot2::layer(
data = data,
mapping = mapping,
stat = StatPolyEq,
geom = geom,
position = position,
show.legend = show.legend,
inherit.aes = inherit.aes,
params =
rlang::list2(formula = formula,
method = method,
method.args = method.args,
n.min = n.min,
eq.with.lhs = eq.with.lhs,
eq.x.rhs = eq.x.rhs,
small.r = small.r,
small.p = small.p,
CI.brackets = CI.brackets,
rsquared.conf.level = rsquared.conf.level,
coef.digits = coef.digits,
coef.keep.zeros = coef.keep.zeros,
rr.digits = rr.digits,
f.digits = f.digits,
p.digits = p.digits,
label.x = label.x,
label.y = label.y,
hstep = hstep,
vstep = ifelse(is.null(vstep),
ifelse(grepl("label", geom),
0.10,
0.05),
vstep),
npc.used = grepl("_npc", geom),
output.type = output.type,
na.rm = na.rm,
orientation = orientation,
parse = parse,
...)
)
}
# Defined here to avoid a note in check --as-cran as the import from 'polynom'
# is not seen when the function is defined in-line in the ggproto object.
#' @rdname ggpmisc-ggproto
#'
#' @format NULL
#' @usage NULL
#'
poly_eq_compute_group_fun <- function(data,
scales,
method,
method.args,
formula,
n.min,
weight,
eq.with.lhs,
eq.x.rhs,
small.r,
small.p,
CI.brackets,
rsquared.conf.level,
coef.digits,
coef.keep.zeros,
rr.digits,
f.digits,
p.digits,
label.x,
label.y,
hstep,
vstep,
npc.used,
output.type,
na.rm,
orientation) {
force(data)
# parse obeys this option, but as for some labels or output types we do not
# use parse() to avoid dropping of trailing zeros, we need to manage this in
# our code in this case.
decimal.mark <- getOption("OutDec", default = ".")
if (!decimal.mark %in% c(".", ",")) {
warning("Decimal mark must be one of '.' or ',', not: '", decimal.mark, "'")
decimal.mark <- "."
}
range.sep <- c("." = ", ", "," = "; ")[decimal.mark]
if (orientation == "x") {
if (length(unique(data$x)) < n.min) {
return(data.frame())
}
} else if (orientation == "y") {
if (length(unique(data$y)) < n.min) {
return(data.frame())
}
}
output.type <- if (!length(output.type)) {
"expression"
} else {
tolower(output.type)
}
stopifnot(output.type %in%
c("expression", "text", "markdown", "numeric", "latex", "tex", "tikz"))
if (is.null(data$weight)) {
data$weight <- 1
}
if (exists("grp.label", data)) {
if (length(unique(data[["grp.label"]])) > 1L) {
warning("Non-unique value in 'data$grp.label' using group index ", data[["group"]][1], " as label.")
grp.label <- as.character(data[["group"]][1])
} else {
grp.label <- data[["grp.label"]][1]
}
} else {
# if nothing mapped to grp.label we use group index as label
grp.label <- as.character(data[["group"]][1])
}
if (is.integer(data$group)) {
group.idx <- abs(data$group[1])
} else if (is.character(data$group) &&
grepl("^(-1|[0-9]+).*$", data$group[1])) {
# likely that 'gganimate' has set the groups for scenes
# we assume first characters give the original group
group.idx <- abs(as.numeric(gsub("^(-1|[0-9]+).*$", "\\1", data$group[1])))
} else {
group.idx <- NA_integer_
}
if (length(label.x) >= group.idx) {
label.x <- label.x[group.idx]
} else if (length(label.x) > 0) {
label.x <- label.x[1]
}
if (length(label.y) >= group.idx) {
label.y <- label.y[group.idx]
} else if (length(label.y) > 0) {
label.y <- label.y[1]
}
# If method was specified as a character string, replace with
# the corresponding function. Some model fit functions themselves have a
# method parameter accepting character strings as argument. We support
# these by splitting strings passed as argument at a colon.
if (is.character(method)) {
method <- switch(method,
lm = "lm:qr",
rlm = "rlm:M",
method)
method.name <- method
method <- strsplit(x = method, split = ":", fixed = TRUE)[[1]]
if (length(method) > 1L) {
fun.method <- method[2]
method <- method[1]
} else {
fun.method <- character()
}
method <- switch(method,
lm = stats::lm,
rlm = MASS::rlm,
match.fun(method))
} else if (is.function(method)) {
fun.method <- character()
if (is.name(quote(method))) {
method.name <- as.character(quote(method))
} else {
method.name <- "function"
}
}
fun.args <- list(quote(formula),
data = quote(data),
weights = data[["weight"]])
fun.args <- c(fun.args, method.args)
if (length(fun.method)) {
fun.args[["method"]] <- fun.method
}
# some model fit functions contain code with partial matching of names!
# so we silence selectively only these warnings
withCallingHandlers({
fm <- do.call(method, args = fun.args)
fm.summary <- summary(fm)
}, warning = function(w) {
if (startsWith(conditionMessage(w), "partial match of 'coef'") ||
startsWith(conditionMessage(w), "partial argument match of 'contrasts'"))
invokeRestart("muffleWarning")
})
fm.class <- class(fm)
# allow model formula selection by the model fit method
# extract formula from fitted model if possible, but fall back on argument if needed
formula.ls <- fail_safe_formula(fm, fun.args, verbose = TRUE)
if ("fstatistic" %in% names(fm.summary)) {
f.value <- fm.summary[["fstatistic"]]["value"]
f.df1 <- fm.summary[["fstatistic"]]["numdf"]
f.df2 <- fm.summary[["fstatistic"]]["dendf"]
p.value <- stats::pf(q = f.value, f.df1, f.df2, lower.tail = FALSE)
} else {
f.value <- f.df1 <- f.df2 <- p.value <- NA_real_
}
if ("r.squared" %in% names(fm.summary)
) {
rr <- fm.summary[["r.squared"]]
if (!all(is.finite(c(f.value, f.df1, f.df2))) ||
rsquared.conf.level <= 0
) {
rr.confint.low <- rr.confint.high <- NA_real_
} else {
rr.confint <-
confintr::ci_rsquared(x = f.value,
df1 = f.df1,
df2 = f.df2,
probs = ((1 - rsquared.conf.level) / 2) * c(1, -1) + c(0, 1) )
rr.confint.low <- rr.confint[["interval"]][1]
rr.confint.high <- rr.confint[["interval"]][2]
}
} else {
rr <- rr.confint.low <- rr.confint.high <- NA_real_
}
if ("adj.r.squared" %in% names(fm.summary)) {
adj.rr <- fm.summary[["adj.r.squared"]]
} else {
adj.rr <- NA_real_
}
AIC <- AIC(fm)
BIC <- BIC(fm)
n <- length(fm.summary[["residuals"]])
coefs <- stats::coefficients(fm)
formula <- formula.ls[[1L]]
stopifnot(inherits(formula, what = "formula"))
formula.rhs.chr <- as.character(formula)[3]
forced.origin <- grepl("-[[:space:]]*1|+[[:space:]]*0", formula.rhs.chr)
if (forced.origin) {
coefs <- c(0, coefs)
}
selector <- !is.na(coefs)
coefs <- coefs[selector]
names(coefs) <- paste("b", (which(selector)) - 1, sep = "_")
if (!all(selector)) {
message("Terms dropped from model (singularity); n = ", nrow(data), " in group.")
}
if (output.type == "numeric") {
z <- tibble::tibble(coef.ls = list(summary(fm)[["coefficients"]]),
coefs = list(coef(fm)),
r.squared = rr,
rr.confint.level = rsquared.conf.level,
rr.confint.low = rr.confint.low,
rr.confint.high = rr.confint.high,
adj.r.squared = adj.rr,
f.value = f.value,
f.df1 = f.df1,
f.df2 = f.df2,
p.value = p.value,
AIC = AIC,
BIC = BIC,
n = n,
rr.label = "", # needed for default 'label' mapping
b_0.constant = forced.origin)
z <- cbind(z, tibble::as_tibble_row(coefs))
} else {
# set defaults needed to assemble the equation as a character string
if (is.null(eq.x.rhs)) {
eq.x.rhs <- build_eq.x.rhs(output.type = output.type,
orientation = orientation)
}
if (is.character(eq.with.lhs)) {
lhs <- eq.with.lhs
eq.with.lhs <- TRUE
} else if (eq.with.lhs) {
lhs <- build_lhs(output.type = output.type,
orientation = orientation)
} else {
lhs <- character(0)
}
# build equation as a character string from the coefficient estimates
eq.char <- coefs2poly_eq(coefs = coefs,
coef.digits = coef.digits,
coef.keep.zeros = coef.keep.zeros,
eq.x.rhs = eq.x.rhs,
lhs = lhs,
output.type = output.type,
decimal.mark = decimal.mark)
# build the other character strings
stopifnot(rr.digits > 0)
if (rr.digits < 2) {
warning("'rr.digits < 2' Likely information loss!")
}
stopifnot(f.digits > 0)
if (f.digits < 2) {
warning("'f.digits < 2' Likely information loss!")
}
stopifnot(p.digits > 0)
if (p.digits < 2) {
warning("'p.digits < 2' Likely information loss!")
}
if (output.type == "expression") {
rr.char <- sprintf_dm("\"%#.*f\"", rr.digits, rr, decimal.mark = decimal.mark)
adj.rr.char <- sprintf_dm("\"%#.*f\"", rr.digits, adj.rr, decimal.mark = decimal.mark)
rr.confint.chr <- paste(sprintf_dm("%#.*f",
rr.digits, rr.confint.low, decimal.mark = decimal.mark),
sprintf_dm("%#.*f",
rr.digits, rr.confint.high, decimal.mark = decimal.mark),
sep = range.sep)
if (as.logical((rsquared.conf.level * 100) %% 1)) {
conf.level.digits = 1L
} else {
conf.level.digits = 0L
}
conf.level.chr <- sprintf_dm("%.*f", conf.level.digits, rsquared.conf.level * 100, decimal.mark = decimal.mark)
AIC.char <- sprintf_dm("\"%.4g\"", AIC, decimal.mark = decimal.mark)
BIC.char <- sprintf_dm("\"%.4g\"", BIC, decimal.mark = decimal.mark)
f.value.char <- sprintf_dm("\"%#.*g\"", f.digits, f.value, decimal.mark = decimal.mark)
if (grepl("e", f.value.char)) {
f.value.char <- sprintf_dm("%#.*e", f.digits, f.value, decimal.mark = decimal.mark)
f.value.char <- paste(gsub("e", " %*% 10^{", f.value.char), "}", sep = "")
}
f.df1.char <- as.character(f.df1)
f.df2.char <- as.character(f.df2)
if (p.digits == Inf) {
p.value.char <- sprintf_dm("%#.2e", p.value, decimal.mark = decimal.mark)
p.value.char <- paste(gsub("e", " %*% 10^{", p.value.char), "}", sep = "")
} else {
p.value.char <- sprintf_dm("\"%#.*f\"", p.digits, p.value, decimal.mark = decimal.mark)
}
} else {
rr.char <- sprintf_dm("%#.*f", rr.digits, rr, decimal.mark = decimal.mark)
adj.rr.char <- sprintf_dm("%#.*f", rr.digits, adj.rr, decimal.mark = decimal.mark)
rr.confint.chr <- paste(sprintf_dm("%#.*f",
rr.digits, rr.confint.low, decimal.mark = decimal.mark),
sprintf_dm("%#.*f",
rr.digits, rr.confint.high, decimal.mark = decimal.mark),
sep = range.sep)
if (as.logical((rsquared.conf.level * 100) %% 1)) {
conf.level.digits = 1L
} else {
conf.level.digits = 0L
}
conf.level.chr <- sprintf_dm("%.*f", conf.level.digits, rsquared.conf.level * 100, decimal.mark = decimal.mark)
AIC.char <- sprintf_dm("%.4g", AIC, decimal.mark = decimal.mark)
BIC.char <- sprintf_dm("%.4g", BIC, decimal.mark = decimal.mark)
f.value.char <- sprintf_dm("%#.*g", f.digits, f.value, decimal.mark = decimal.mark)
f.df1.char <- as.character(f.df1)
f.df2.char <- as.character(f.df2)
if (p.digits == Inf) {
p.value.char <- sprintf_dm("%#.2e", p.value, decimal.mark = decimal.mark)
} else {
p.value.char <- sprintf_dm("%#.*f", p.digits, p.value, decimal.mark = decimal.mark)
}
}
# build the data frames to return
if (output.type == "expression") {
z <- tibble::tibble(eq.label = eq.char,
rr.label =
# character(0) instead of "" avoids in paste() the insertion of sep for missing labels
ifelse(is.na(rr) || is.nan(rr), character(0L),
paste(ifelse(small.r, "italic(r)^2", "italic(R)^2"),
ifelse(rr < 10^(-rr.digits) & rr != 0,
sprintf_dm("\"%.*f\"", rr.digits, 10^(-rr.digits), decimal.mark = decimal.mark),
rr.char),
sep = ifelse(rr < 10^(-rr.digits) & rr != 0,
"~`<`~",
"~`=`~"))),
adj.rr.label =
ifelse(is.na(adj.rr) || is.nan(adj.rr), character(0L),
paste(ifelse(small.r, "italic(r)[adj]^2", "italic(R)[adj]^2"),
ifelse(adj.rr < 10^(-rr.digits) & adj.rr != 0,
sprintf_dm("\"%.*f\"", rr.digits, 10^(-rr.digits), decimal.mark = decimal.mark),
adj.rr.char),
sep = ifelse(adj.rr < 10^(-rr.digits) & adj.rr != 0,
"~`<`~",
"~`=`~"))),
rr.confint.label =
paste("\"", conf.level.chr, "% CI ", CI.brackets[1], rr.confint.chr, CI.brackets[2], "\"", sep = ""),
AIC.label =
ifelse(is.na(AIC) || is.nan(AIC), character(0L),
paste("AIC", AIC.char, sep = "~`=`~")),
BIC.label =
ifelse(is.na(BIC) || is.nan(BIC), character(0L),
paste("BIC", BIC.char, sep = "~`=`~")),
f.value.label =
ifelse(is.na(f.value) || is.nan(f.value), character(0L),
paste("italic(F)[", f.df1.char,
"*\",\"*", f.df2.char,
"]~`=`~", f.value.char, sep = "")),
p.value.label =
ifelse(is.na(p.value) || is.nan(p.value), character(0L),
paste(ifelse(small.p, "italic(p)", "italic(P)"),
ifelse(p.value < 10^(-p.digits),
sprintf_dm("\"%.*f\"", p.digits, 10^(-p.digits), decimal.mark = decimal.mark),
p.value.char),
sep = ifelse(p.value < 10^(-p.digits),
"~`<`~",
"~`=`~"))),
n.label = paste("italic(n)~`=`~\"", n, "\"", sep = ""),
grp.label = grp.label,
method.label = paste("\"method: ", method.name, "\"", sep = ""),
r.squared = rr,
adj.r.squared = adj.rr,
p.value = p.value,
n = n)
} else if (output.type %in% c("latex", "tex", "text", "tikz")) {
z <- tibble::tibble(eq.label = eq.char,
rr.label =
# character(0) instead of "" avoids in paste() the insertion of sep for missing labels
ifelse(is.na(rr) || is.nan(rr), character(0L),
paste(ifelse(small.r, "r^2", "R^2"),
ifelse(rr < 10^(-rr.digits), as.character(10^(-rr.digits)), rr.char),
sep = ifelse(rr < 10^(-rr.digits), " < ", " = "))),
adj.rr.label =
ifelse(is.na(adj.rr) || is.nan(adj.rr), character(0L),
paste(ifelse(small.r, "r_{adj}^2", "R_{adj}^2"),
ifelse(adj.rr < 10^(-rr.digits), as.character(10^(-rr.digits)), adj.rr.char),
sep = ifelse(adj.rr < 10^(-rr.digits), " < ", " = "))),
rr.confint.label =
paste(conf.level.chr, "% CI ", CI.brackets[1], rr.confint.chr, CI.brackets[2], sep = ""),
AIC.label =
ifelse(is.na(AIC) || is.nan(AIC), character(0L),
paste("AIC", AIC.char, sep = " = ")),
BIC.label =
ifelse(is.na(BIC) || is.nan(BIC), character(0L),
paste("BIC", BIC.char, sep = " = ")),
f.value.label =
ifelse(is.na(f.value) || is.nan(f.value), character(0L),
paste("F_{", f.df1.char, ",", f.df2.char,
"} = ", f.value.char, sep = "")),
p.value.label =
ifelse(is.na(p.value), character(0L),
paste(ifelse(small.p, "p", "P"),
ifelse(p.value < 10^(-p.digits), as.character(10^(-p.digits)), p.value.char),
sep = ifelse(p.value < 10^(-p.digits), " < ", " = "))),
n.label = paste("n = ", n, sep = ""),
grp.label = grp.label,
method.label = paste("method: ", method.name, sep = ""),
r.squared = rr,
adj.r.squared = adj.rr,
p.value = p.value,
n = n)
} else if (output.type == "markdown") {
z <- tibble::tibble(eq.label = eq.char,
rr.label =
# character(0) instead of "" avoids in paste() the insertion of sep for missing labels
ifelse(is.na(rr) || is.nan(rr), character(0L),
paste(ifelse(small.r, "_r_<sup>2</sup>", "_R_<sup>2</sup>"),
ifelse(rr < 10^(-rr.digits), as.character(10^(-rr.digits)), rr.char),
sep = ifelse(rr < 10^(-rr.digits), " < ", " = "))),