# Main growthcleanr algorithm and other exported functions
# Main growthcleanr algorithm (cleangrowth): the adult and pediatric algorithms are in *_clean.R
# (main bulk of the algorithm) and *_support.R (supporting functions)
#' Clean growth measurements
#'
#' @param subjid Vector of unique identifiers for each subject in the database.
#' @param param Vector identifying each measurement, may be 'WEIGHTKG', 'WEIGHTLBS', 'HEIGHTCM', 'HEIGHTIN', 'LENGTHCM', or 'HEADCM'.
#' 'HEIGHTCM'/'HEIGHTIN' vs. 'LENGTHCM' only affects z-score calculations between ages 24 and 35 months (730 to 1095 days).
#' All linear measurements below 731 days of life (age 0-23 months) are interpreted as supine length, and
#' all linear measurements above 1095 days of life (age 36+ months) are interpreted as standing height.
#' Note: at the moment, all LENGTHCM will be converted to HEIGHTCM. In the future, the algorithm will be updated to consider this difference.
#' Additionally, imperial 'HEIGHTIN' and 'WEIGHTLBS' measurements are converted to
#' metric during algorithm calculations.
#' @param agedays Numeric vector containing the age in days at each measurement.
#' @param sex Vector identifying the gender of the subject, may be 'M', 'm', or 0 for males, vs. 'F', 'f' or 1 for females.
#' @param measurement Numeric vector containing the actual measurement data. Weight must be in
#' kilograms (kg), and linear measurements (height vs. length) in centimeters (cm).
#' @param recover.unit.error Indicates whether the cleaning algorithm should
#' attempt to identify unit errors (i.e., inches vs. cm, lbs vs. kg). If unit
#' errors are identified, the value will be corrected and retained within the
#' cleaning algorithm as a valid measurement. Defaults to FALSE.
#' @param sd.extreme Measurements more than sd.extreme standard deviations from
#' the mean (either above or below) will be flagged as invalid. Defaults to 25.
#' @param z.extreme Measurements with an absolute z-score greater than
#' z.extreme will be flagged as invalid. Defaults to 25.
#' @param lt3.exclude.mode Determines type of exclusion procedure to use for 1 or 2 measurements of one type without
#' matching same ageday measurements for the other parameter. Options include "default" (standard growthcleanr approach),
#' and "flag.both" (in case of two measurements of one type without matching values for the other parameter, flag both
#' for exclusion if beyond threshold)
#' @param height.tolerance.cm maximum decrease in height tolerated for sequential measurements
#' @param error.load.mincount minimum count of exclusions on a parameter before
#' considering excluding all measurements. Defaults to 2.
#' @param error.load.threshold threshold for the ratio of excluded to included
#' measurement counts that must be exceeded before excluding all measurements of either parameter. Defaults to 0.5.
#' @param sd.recenter specifies how to recenter medians. May be a data frame or
#' table with median SD-scores per day of life by gender and parameter, or "NHANES"
#' or "derive" as a character vector.
#' \itemize{
#' \item If `sd.recenter` is specified as a data set, use the data set
#' \item If `sd.recenter` is specified as "`nhanes`", use NHANES reference medians
#' \item If `sd.recenter` is specified as "`derive`", derive from input
#' \item If `sd.recenter` is not specified or `NA`:
#' \itemize{
#' \item If the input set has at least 5,000 observations, derive medians from input
#' \item If the input set has fewer than 5,000 observations, use NHANES
#' }
#' }
#'
#' If specifying a data set, columns must include param, sex, agedays, and sd.median
#' (referred to elsewhere as "modified Z-score"), and those medians will be used
#' for recentering. A summary of how the NHANES reference medians were derived is
#' available in README.md. Defaults to NA.
#' @param sdmedian.filename Name of file in which to save the sd.median data calculated on the input dataset, as CSV.
#' Defaults to "", for which this data will not be saved. Use for extracting medians for parallel processing
#' scenarios other than the built-in parallel option.
#' @param sdrecentered.filename Name of file to save re-centered data to as CSV. Defaults to "", for which this
#' data will not be saved. Useful for post-processing and debugging.
#' @param include.carryforward Determines whether Carry-Forward values are kept in the output. Defaults to FALSE.
#' @param ewma.exp Exponent to use for weighting measurements in the
#' exponentially weighted moving average calculations. Defaults to -1.5.
#' This exponent should be negative in order to weight growth measurements
#' closer to the measurement being evaluated more strongly. Exponents that are
#' further from zero (e.g. -3) will increase the relative influence of
#' measurements close in time to the measurement being evaluated compared to
#' using the default exponent.
#' @param ref.data.path Path to reference data. If not supplied, the year 2000
#' Centers for Disease Control (CDC) reference data will be used.
#' @param log.path Path to log file output when running in parallel (non-quiet mode). Default is NA. A new
#' directory will be created if necessary. Set to NA to disable log files.
#' @param parallel Determines if function runs in parallel. Defaults to FALSE.
#' @param num.batches Specify the number of batches to run in parallel. Only
#' applies if parallel is set to TRUE. Defaults to the number of workers
#' returned by the getDoParWorkers function in the foreach package.
#' @param quietly Determines if function messages are to be displayed and if log files (parallel only) are to be generated.
#' Defaults to TRUE.
#' @param adult_cutpoint Number between 18 and 20, describing ages when the
#' pediatric algorithm should not be applied (< adult_cutpoint), and the adult
#' algorithm should apply (>= adult_cutpoint). Numbers outside this range will be
#' changed to the closest number within the range. Defaults to 20.
#' @param weight_cap Positive number, describing a weight cap in kg (rounded to the
#' nearest .1, +/- .1) within the adult dataset. If there is no weight cap, set
#' to Inf. Defaults to Inf.
#' @param adult_columns_filename Name of file in which to save the original adult data, with additional output
#' columns, as CSV. Defaults to "", for which this data will not be saved. Useful
#' for post-analysis. For more information on this output, please see README.
#' @param prelim_infants TRUE/FALSE. Run the in-development release of the infants algorithm (expands the pediatric algorithm to improve performance for children 0-2 years). Not recommended for use in research. For more information regarding the logic of the algorithm, see the vignette 'Preliminary Infants Algorithm.' Defaults to FALSE.
#'
#' @return Vector of exclusion codes for each of the input measurements.
#'
#' Possible values for each code are:
#'
#' * 'Include', 'Unit-Error-High', 'Unit-Error-Low', 'Swapped-Measurements', 'Missing',
#' * 'Exclude-Carried-Forward', 'Exclude-SD-Cutoff', 'Exclude-EWMA-Extreme', 'Exclude-EWMA-Extreme-Pair',
#' * 'Exclude-Extraneous-Same-Day',
#' * 'Exclude-EWMA-8', 'Exclude-EWMA-9', 'Exclude-EWMA-10', 'Exclude-EWMA-11', 'Exclude-EWMA-12', 'Exclude-EWMA-13', 'Exclude-EWMA-14',
#' * 'Exclude-Min-Height-Change', 'Exclude-Max-Height-Change',
#' * 'Exclude-Pair-Delta-17', 'Exclude-Pair-Delta-18', 'Exclude-Pair-Delta-19',
#' * 'Exclude-Single-Outlier', 'Exclude-Too-Many-Errors', 'Exclude-Too-Many-Errors-Other-Parameter'
#' @md
#'
#' @export
#' @import data.table
#' @rawNamespace import(plyr, except = c(failwith, id, summarize, count, desc, mutate, arrange, rename, is.discrete, summarise, summarize))
#' @import foreach
#' @import doParallel
#' @import parallel
#' @importFrom utils write.csv
#' @rawNamespace import(R.utils, except = c(extract))
#' @examples
#' \donttest{
#' # Run calculation using a small subset of given data
#' df_stats <- as.data.frame(syngrowth)
#' df_stats <- df_stats[df_stats$subjid %in% unique(df_stats[, "subjid"])[1:5], ]
#'
#' clean_stats <- cleangrowth(subjid = df_stats$subjid,
#' param = df_stats$param,
#' agedays = df_stats$agedays,
#' sex = df_stats$sex,
#' measurement = df_stats$measurement)
#'
#' # Once processed you can filter data based on result value
#' df_stats <- cbind(df_stats, "clean_result" = clean_stats)
#' clean_df_stats <- df_stats[df_stats$clean_result == "Include",]
#'
#' # Parallel processing: run using 2 cores and batches
#' clean_stats <- cleangrowth(subjid = df_stats$subjid,
#' param = df_stats$param,
#' agedays = df_stats$agedays,
#' sex = df_stats$sex,
#' measurement = df_stats$measurement,
#' parallel = TRUE,
#' num.batches = 2)
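#'
#' # Optional (illustrative only): choose how re-centering medians are
#' # determined; here we force use of the NHANES reference medians
#' # (see the sd.recenter parameter above)
#' clean_stats_nhanes <- cleangrowth(subjid = df_stats$subjid,
#'                                   param = df_stats$param,
#'                                   agedays = df_stats$agedays,
#'                                   sex = df_stats$sex,
#'                                   measurement = df_stats$measurement,
#'                                   sd.recenter = "nhanes")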
#' }
cleangrowth <- function(subjid,
param,
agedays,
sex,
measurement,
recover.unit.error = FALSE,
sd.extreme = 25,
z.extreme = 25,
lt3.exclude.mode = "default",
height.tolerance.cm = 2.5,
error.load.mincount = 2,
error.load.threshold = 0.5,
sd.recenter = NA,
sdmedian.filename = "",
sdrecentered.filename = "",
include.carryforward = FALSE,
ewma.exp = -1.5,
ref.data.path = "",
log.path = NA,
parallel = FALSE,
num.batches = NA,
quietly = TRUE,
adult_cutpoint = 20,
weight_cap = Inf,
adult_columns_filename = "",
prelim_infants = FALSE) {
# avoid "no visible binding" warnings
N <- age_years <- batch <- exclude <- index <- line <- NULL
newbatch <- sd.median <- sd.orig <- tanner.months <- tbc.sd <- NULL
v <- v_adult <- whoagegrp.ht <- whoagegrp_ht <- z.orig <- NULL
z.orig_cdc <- z.orig_who <- sd.orig_cdc <- sd.orig_who <- NULL
result <- NULL
sd.orig_uncorr <- agemonths <- intwt <- fengadays <- pmagedays <- cagedays <-
unmod_zscore <- fen_wt_m <- fen_wt_l <- fen_wt_s <- cwho_cv <- ccdc_cv <-
sd.c_cdc <- sd.c_who <- sd.c <- sd.corr <- seq_win <- sd.corr_abssumdiff <-
sd.orig_abssumdiff <- orig_colnames <- ctbc.sd <- sum_sde <- no_sde <-
sum_val <- no_dup_val <- no_outliers <- no_bigdiff <- nottoofar <- nnte <-
nnte_full <- NULL
# preprocessing ----
# organize data into a dataframe along with a line "index" so the original data order can be recovered
data.all.ages <- data.table(
line = seq_along(measurement),
subjid = as.factor(subjid),
param,
agedays = as.integer(agedays),
v = ifelse(measurement == 0, NaN, measurement),
v_adult = measurement,
sex = as.integer(ifelse(
sex %in% c(0, 'm', 'M'), 0, ifelse(sex %in% c(1, 'f', 'F'), 1, NA)
))
)
# quality checks
if (!is.numeric(adult_cutpoint)){
stop("adult_cutpoint not numeric. Please enter a number between 18 and 20.")
}
if (!is.numeric(weight_cap) || weight_cap < 0){
  stop("weight_cap must be a positive number (use Inf for no weight cap).")
}
if (any(!param %in% c("LENGTHCM", "HEIGHTCM", "WEIGHTKG", "HEIGHTIN",
"WEIGHTLBS", "HEADCM"))){
cat(sprintf("[%s] Parameters included that do not match 'param' specifications. Marking as missing...\n", Sys.time()))
data.all.ages <-
data.all.ages[
!param %in% c("LENGTHCM", "HEIGHTCM", "WEIGHTKG", "HEIGHTIN",
"WEIGHTLBS", "HEADCM"), v := NA]
data.all.ages <-
data.all.ages[
!param %in% c("LENGTHCM", "HEIGHTCM", "WEIGHTKG", "HEIGHTIN",
"WEIGHTLBS", "HEADCM"), v_adult := NA]
}
# rate limit cutpoint -- min 18, max 20
cutpoint_update <-
if (adult_cutpoint < 18){
18
} else if (adult_cutpoint > 20){
20
} else {
adult_cutpoint
}
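# (e.g. adult_cutpoint = 17 is treated as 18, and adult_cutpoint = 21 as 20)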
# split by cutpoint
# for ease, data.all will refer to pediatric data; data.adult will refer to
# adult data -- copy to make sure they're separate
data.all <- copy(data.all.ages[agedays < cutpoint_update*365.25,])
data.adult <- copy(data.all.ages[agedays >= cutpoint_update*365.25,])
# TODO: ADD PARALLEL FOR ADULTS
# constants for pediatric
# enumerate the different exclusion levels
if (prelim_infants){
# different for infants
exclude.levels.peds <- c(
'Include',
'Unit-Error-High',
'Unit-Error-Low',
'Unit-Error-Possible',
'Swapped-Measurements',
'Exclude',
'Missing',
'Not cleaned',
'Exclude-Temporary-Extraneous-Same-Day',
'Exclude-Carried-Forward',
# added CF exclusions
"Exclude-1-CF-deltaZ-<0.05",
"Exclude-1-CF-deltaZ-<0.1-wholehalfimp",
"Exclude-Teen-2-plus-CF-deltaZ-<0.05",
"Exclude-Teen-2-plus-CF-deltaZ-<0.1-wholehalfimp",
'Exclude-EWMA-Extreme',
'Exclude-EWMA-Extreme-Pair',
'Exclude-SDE-Identical',
'Exclude-SDE-All-Exclude',
'Exclude-SDE-All-Extreme',
'Exclude-SDE-EWMA',
'Exclude-SDE-One-Day',
"Exclude-EWMA2-middle",
"Exclude-EWMA2-birth-WT",
"Exclude-EWMA2-birth-WT-ext",
"Exclude-EWMA2-first",
"Exclude-EWMA2-first-ext",
"Exclude-EWMA2-last",
"Exclude-EWMA2-last-high",
"Exclude-EWMA2-last-ext",
"Exclude-EWMA2-last-ext-high",
"Exclude-EWMA2-birth-HT-HC",
"Exclude-EWMA2-birth-HT-HC-ext",
"Exclude-Min-diff",
"Exclude-Max-diff",
"Exclude-2-meas->1-year",
"Exclude-2-meas-<1-year",
"Exclude-1-meas",
"Exclude-Error-load",
# old
'Exclude-Extraneous-Same-Day',
'Exclude-SD-Cutoff',
'Exclude-EWMA-8',
'Exclude-EWMA-9',
'Exclude-EWMA-10',
'Exclude-EWMA-11',
'Exclude-EWMA-12',
'Exclude-EWMA-13',
'Exclude-EWMA-14',
'Exclude-Min-Height-Change',
'Exclude-Max-Height-Change',
'Exclude-Pair-Delta-17',
'Exclude-Pair-Delta-18',
'Exclude-Pair-Delta-19',
'Exclude-Single-Outlier',
'Exclude-Too-Many-Errors',
'Exclude-Too-Many-Errors-Other-Parameter',
#new
"Exclude-Absolute-BIV",
"Exclude-Standardized-BIV",
"Exclude-Evil-Twins",
"Exclude-EWMA1-Extreme"
)
} else {
exclude.levels.peds <- c(
'Include',
'Unit-Error-High',
'Unit-Error-Low',
'Unit-Error-Possible',
'Swapped-Measurements',
'Exclude',
'Missing',
'Exclude-Temporary-Extraneous-Same-Day',
'Exclude-Carried-Forward',
'Exclude-SD-Cutoff',
'Exclude-EWMA-Extreme',
'Exclude-EWMA-Extreme-Pair',
'Exclude-Extraneous-Same-Day',
'Exclude-EWMA-8',
'Exclude-EWMA-9',
'Exclude-EWMA-10',
'Exclude-EWMA-11',
'Exclude-EWMA-12',
'Exclude-EWMA-13',
'Exclude-EWMA-14',
'Exclude-Min-Height-Change',
'Exclude-Max-Height-Change',
'Exclude-Pair-Delta-17',
'Exclude-Pair-Delta-18',
'Exclude-Pair-Delta-19',
'Exclude-Single-Outlier',
'Exclude-Too-Many-Errors',
'Exclude-Too-Many-Errors-Other-Parameter'
)
}
exclude.levels.adult <- c(
"Include",
"Exclude-Adult-BIV",
"Exclude-Adult-Hundreds",
"Exclude-Adult-Hundreds-RV",
"Exclude-Adult-Unit-Errors",
"Exclude-Adult-Unit-Errors-RV",
"Exclude-Adult-Transpositions",
"Exclude-Adult-Transpositions-RV",
"Exclude-Adult-Weight-Cap-Identical",
"Exclude-Adult-Weight-Cap",
"Exclude-Adult-Swapped-Measurements",
"Exclude-Adult-Identical-Same-Day",
"Exclude-Adult-Extraneous-Same-Day",
"Exclude-Adult-Distinct-Pairs",
"Exclude-Adult-Distinct-3-Or-More",
"Exclude-Adult-EWMA-Extreme",
"Exclude-Adult-EWMA-Extreme-RV",
"Exclude-Adult-Distinct-Ordered-Pairs",
"Exclude-Adult-EWMA-Moderate",
"Exclude-Adult-Possibly-Impacted-By-Weight-Cap",
"Exclude-Adult-Distinct-Single",
"Exclude-Adult-Too-Many-Errors"
)
exclude.levels <- base::union(exclude.levels.peds, exclude.levels.adult)
# if there's no pediatric data, no need to go through this rigamarole
if (nrow(data.all) > 0){
# pediatric: height velocity calculations and preprocessing ----
# for pediatric data, convert in and lbs to cm and kg (adult is done within algo)
data.all[param == "HEIGHTIN", v := v*2.54]
data.all[param == "HEIGHTIN", param := "HEIGHTCM"]
data.all[param == "WEIGHTLBS", v := v/2.2046226]
data.all[param == "WEIGHTLBS", param := "WEIGHTKG"]
# if parallel processing is desired, load additional modules
if (parallel) {
if (is.na(num.batches)) {
num.batches <- getDoParWorkers()
}
if (prelim_infants){
# variables needed for parallel workers
var_for_par <- c("temporary_extraneous", "valid", "swap_parameters",
"na_as_false", "ewma", "read_anthro", "as_matrix_delta",
"sd_median",
"temporary_extraneous_infants",
"get_dop", "calc_oob_evil_twins",
"calc_and_recenter_z_scores")
} else {
var_for_par <- c("temporary_extraneous", "valid", "swap_parameters",
"na_as_false", "ewma", "read_anthro", "as_matrix_delta",
"sd_median")
}
cl <- makeCluster(num.batches)
clusterExport(cl = cl, varlist = var_for_par, envir = environment())
registerDoParallel(cl)
} else {
if (is.na(num.batches))
num.batches <- 1
}
setkey(data.all, subjid)
subjid.unique <- data.all[j = unique(subjid)]
batches.all <- data.table(
subjid = subjid.unique,
batch = sample(num.batches, length(subjid.unique), replace = TRUE),
key = 'subjid'
)
data.all <- batches.all[data.all]
if (!quietly){
cat(sprintf("[%s] Begin processing pediatric data...\n", Sys.time()))
}
# load tanner height velocity data. sex variable is defined such that 0=male and 1=female
# recode column names to match syntactic style ("." rather than "_" in variable names)
tanner_ht_vel_path <- ifelse(
ref.data.path == "",
system.file(file.path("extdata", "tanner_ht_vel.csv.gz"), package = "growthcleanr"),
file.path(ref.data.path, "tanner_ht_vel.csv.gz")
)
tanner.ht.vel <- fread(tanner_ht_vel_path)
setnames(tanner.ht.vel,
colnames(tanner.ht.vel),
gsub('_', '.', colnames(tanner.ht.vel)))
setkey(tanner.ht.vel, sex, tanner.months)
# keep track of column names in the tanner data
tanner.fields <- colnames(tanner.ht.vel)
tanner.fields <- tanner.fields[!tanner.fields %in% c('sex', 'tanner.months')]
if (!prelim_infants){
who_max_ht_vel_path <- ifelse(
ref.data.path == "",
system.file(file.path("extdata", "who_ht_maxvel_3sd.csv.gz"), package = "growthcleanr"),
file.path(ref.data.path, "who_ht_maxvel_3sd.csv.gz")
)
who_ht_vel_3sd_path <- ifelse(
ref.data.path == "",
system.file(file.path("extdata", "who_ht_vel_3sd.csv.gz"), package = "growthcleanr"),
file.path(ref.data.path, "who_ht_vel_3sd.csv.gz")
)
} else {
who_max_ht_vel_path <- ifelse(
ref.data.path == "",
system.file(file.path("extdata", "who_hc_maxvel_3sd_infants.csv.gz"), package = "growthcleanr"),
file.path(ref.data.path, "who_hc_maxvel_3sd_infants.csv.gz")
)
who_ht_vel_3sd_path <- ifelse(
ref.data.path == "",
system.file(file.path("extdata", "who_hc_vel_3sd_infants.csv.gz"), package = "growthcleanr"),
file.path(ref.data.path, "who_hc_vel_3sd_infants.csv.gz")
)
}
who.max.ht.vel <- fread(who_max_ht_vel_path)
who.ht.vel <- fread(who_ht_vel_3sd_path)
setkey(who.max.ht.vel, sex, whoagegrp_ht)
setkey(who.ht.vel, sex, whoagegrp_ht)
who.ht.vel <- as.data.table(dplyr::full_join(who.ht.vel, who.max.ht.vel, by =
c('sex', 'whoagegrp_ht')))
setnames(who.ht.vel, colnames(who.ht.vel), gsub('_', '.', colnames(who.ht.vel)))
setkey(who.ht.vel, sex, whoagegrp.ht)
# keep track of column names in the who growth velocity data
who.fields <- colnames(who.ht.vel)
who.fields <- who.fields[!who.fields %in% c('sex', 'whoagegrp.ht')]
# 1. General principles
# a. All steps are done separately for each parameter unless otherwise noted
# b. All steps are done sorted by subject, parameter, and age (in days) for nonexcluded and nonmissing values only unless otherwise noted. This is very important.
# Sorting needs to be redone with each step to account for excluded and transformed values.
# c. The next value refers to the value with the next highest age for the same parameter and the same subject, and the previous value refers to the value with the
# next lowest age for the same parameter and the same subject.
# d. You will need to set up a method for keeping track of whether a value is missing or excluded (and in what step). I use variables called exc_* that are =0
# if a value is to be included, =1 if missing, and =2 or higher if it is to be excluded, with each number indicating a different step. I also set up
# parameter-specific subjid_* variables that are = to subjid for included values and are blank if the value is missing or should be excluded. These subjid_*
# variables need to be updated with each step.
# e. All steps assume that data are sorted by subjid_*, parameter, and age (in days) for nonexcluded and nonmissing values only
# unless otherwise noted. Sorting needs to be redone after any transformations or exclusions to account for excluded and
# transformed values.
# f. The next value refers to the nonexcluded nonmissing value with the next highest age for the same parameter and the same
# subject, and the previous value refers to the nonexcluded nonmissing value with the next lowest age for the same parameter
# and the same subject.
# g. exc_* should only be replaced with a higher value if exc_*==0 at the time of replacement, unless otherwise specified.
# NOTE: in the R code below exclusion is documented as a series of factor levels, where all levels occurring before 'Exclude' in the sequence are considered
# to be valid measurements. We use the built in sorting of the data.table object and subsets rather than re-sorting at each step
# to ensure that only valid measurements are used at the beginning of each step.
# Also, unlike the Stata code, the measurement parameter (weight vs. height) is recorded as a factor in the data frame, rather than as a variable name
# 2. Data set-up
# a. I always code sex as 0=Male, 1=Female, so I recoded the variable sex that way and left a variable sexorigcode the way the data was sent to me (1=Female 2=Male)
# b. Remove rows that are extraneous for subjid, param, and measurement from further analysis
# NOTE: this step is not needed -- handled automatically by "temporary extraneous" step.
# c. I generated separate variables for weight (wt) and height (ht), as well as exc_* and subjid_* variables. Set exc_*=0 if value is not missing
# and exc_*=1 if value is missing. In all future steps, exc_* should only be changed if it is 0. This helps to keep track of which step excluded a value.
# I also kept the measurement variable there and untouched because sometimes wt and ht got transformed to something else.
# d. I made tables based on CDC growth curve parameters that include data for each day that I will send separately. The LMS parameters for each day are
# cubically interpolated from the values by month available on the CDC website. Create wt and ht z-scores for each value of each parameter (Z: WtZ and HtZ).
# e. There are variables in the table labelled cdc_*_csd_pos and cdc_*_csd_neg. For each age and sex, these correspond to ½ of the absolute value of the
# difference between the median and the value with a z-score of +2 (csd_pos) or -2 (csd_neg). These can be used to generate a score similar to the z-score but with an
# important difference. The z-scores created using the LMS method account for skewness in the distribution of the parameters (particularly weight), which
# can lead to small changes in z-score with large changes in weight in subjects with very high weight, and relatively large changes in z-score for smaller
# changes in weight in subjects with low weights. The score we will create is called an SD-score (SDorig: WtSDorig and HtSDorig), calculated by
# dividing the difference between the value and the median by the SD score (use csd_pos if the value is above the median, csd_neg if the value is below the
# median). These SD-scores, rather than z-scores, now form the basis for the algorithm.
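# Illustrative sketch (not executed) of the SD-score described in (e) above,
# assuming hypothetical reference values med, csd.pos, and csd.neg for a given
# sex/parameter/age in days:
#   sd.score <- ifelse(v >= med, (v - med) / csd.pos, (v - med) / csd.neg)
# e.g. a weight of 30 kg with med = 25 and csd.pos = 5 gives an SD-score of
# (30 - 25) / 5 = 1.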
# recategorize linear parameters as 'HEIGHTCM'
# NOTE: this will be changed in future to consider this difference
data.all[param == 'LENGTHCM', param := 'HEIGHTCM']
# calculate z/sd scores
if(prelim_infants){
if (!quietly)
cat(sprintf("[%s] Calculating z-scores...\n", Sys.time()))
# removing z calculations, as they are not used
# for infants, use z and who
measurement.to.z <- read_anthro(ref.data.path, cdc.only = TRUE,
prelim_infants = TRUE)
measurement.to.z_who <- read_anthro(ref.data.path, cdc.only = FALSE,
prelim_infants = TRUE)
# calculate "standard deviation" scores
if (!quietly)
cat(sprintf("[%s] Calculating SD-scores...\n", Sys.time()))
data.all[, sd.orig_cdc := measurement.to.z(param, agedays, sex, v, TRUE)]
data.all[, sd.orig_who := measurement.to.z_who(param, agedays, sex, v, TRUE)]
# smooth z-scores/SD scores between ages 2 and 4 years using weighted scores
# older uses cdc, younger uses who
data.all$ageyears <- data.all$agedays/365.25
who_weight <- 4 - (data.all$ageyears)
cdc_weight <- (data.all$ageyears) - 2
smooth_val <- data.all$ageyears >= 2 &
data.all$ageyears <= 4 &
data.all$param != "HEADCM"
data.all[smooth_val,
sd.orig := (data.all$sd.orig_cdc[smooth_val]*cdc_weight[smooth_val] +
data.all$sd.orig_who[smooth_val]*who_weight[smooth_val])/2]
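# (worked example: at exactly 3 years cdc_weight = 1 and who_weight = 1, so
# sd.orig is the simple average of the CDC and WHO scores; at 2 years the WHO
# score gets all the weight, at 4 years the CDC score does, and the two
# weights always sum to 2, hence the division by 2)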
# otherwise use WHO for younger ages (and head circumference) and CDC for older ages
who_val <- data.all$param == "HEADCM" |
data.all$ageyears < 2
data.all[(who_val & !smooth_val) | (smooth_val & is.na(data.all$sd.orig_cdc)),
sd.orig := data.all$sd.orig_who[(who_val & !smooth_val) | (smooth_val & is.na(data.all$sd.orig_cdc))]]
cdc_val <- data.all$param != "HEADCM" &
data.all$ageyears > 4
data.all[(cdc_val & !smooth_val) | (smooth_val & is.na(data.all$sd.orig_who)),
sd.orig := data.all$sd.orig_cdc[(cdc_val & !smooth_val) | (smooth_val & is.na(data.all$sd.orig_who))]]
# NOTE: SD SCORES IN CODE ARE Z IN INFANT DOCS -- USE sd.orig ONLY
# keep the original, uncorrected, unrecentered zscores
data.all[,sd.orig_uncorr := sd.orig]
# NOTE: MAY WANT TO SUBSET HERE
# 2b: corrected z scores ----
# keep the original column names -- we're adding a ton of columns that we
# want to filter out after correction
orig_colnames <- copy(colnames(data.all))
# start by reading in fenton data
fentlms_foraga <- fread(
system.file(file.path("extdata", "fentlms_foraga.csv.gz"),
package = "growthcleanr"))
fentlms_forz <- fread(
system.file(file.path("extdata", "fentlms_forz.csv.gz"),
package = "growthcleanr"))
# add age in months
data.all[, agemonths := agedays/30.4375]
potcorr <- data.all$param == "WEIGHTKG" &
data.all$sd.orig < -2 &
data.all$agemonths < 10
# integer weight is in grams, rounded to the nearest 10
data.all[potcorr, intwt := round(v*100)*10]
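# (e.g. v = 1.234 kg gives intwt = round(123.4) * 10 = 1230 grams)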
# replace to facilitate merging with fenton curves
data.all[intwt >= 250 & intwt <=560, intwt := 570]
# merge with fenton curves
data.all <- merge(
data.all, fentlms_foraga, by = c("sex", "intwt"),
all.x = TRUE)
data.all[fengadays < 259, pmagedays := agedays + fengadays]
data.all[fengadays < 259, cagedays := pmagedays - 280]
# replace fengadays with pmagedays to facilitate merging
data.all[, fengadays := pmagedays]
# merge with fenton curves
data.all <- merge(
data.all, fentlms_forz, by = c("sex", "fengadays"),
all.x = TRUE)
# add unmodified zscore using weight in unrounded grams
data.all[, unmod_zscore :=
((v*1000/fen_wt_m)^fen_wt_l - 1)/(fen_wt_l * fen_wt_s)]
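# (this is the standard LMS z-score, z = ((X / M)^L - 1) / (L * S), with X the
# weight in grams and L, M, S taken from the Fenton reference columns)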
# read in who/cdc data
growthfile_who <- fread(
system.file(file.path("extdata", "growthfile_who.csv.gz"),
package = "growthcleanr"))
growthfile_cdc <- fread(
system.file(file.path("extdata", "growthfile_cdc_ext_infants.csv.gz"),
package = "growthcleanr"))
# merge in the who/cdc data
data.all <- merge(data.all, growthfile_who,
by.x = c("sex", "cagedays"),
by.y = c("sex", "agedays"),
all.x = TRUE)
data.all <- merge(data.all, growthfile_cdc,
by.x = c("sex", "cagedays"),
by.y = c("sex", "agedays"),
all.x = TRUE)
# adjust WHO and CDC heights based on age
data.all[, cwho_cv := v]
data.all[, ccdc_cv := v]
data.all[param == "HEIGHTCM" & agedays > 730 & cagedays <= 730,
cwho_cv := cwho_cv + 0.8]
data.all[param == "HEIGHTCM" & agedays > 730 & cagedays <= 730,
ccdc_cv := ccdc_cv + 0.7]
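# (the small additive constants above presumably convert a standing-height
# measurement to its supine-length equivalent when the corrected age falls at
# or below 2 years while the chronological age is above it)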
# create the corrected z scores
# use read anthro, but pass in different arguments
# pass in the corrected height
data.all[, sd.c_cdc :=
measurement.to.z(param, cagedays, sex, ccdc_cv, TRUE)]
data.all[, sd.c_who :=
measurement.to.z_who(param, cagedays, sex, cwho_cv, TRUE)]
# smooth using weights as in original z score creation
who_weight <- 4 - (data.all$agedays/365.25)
cdc_weight <- (data.all$agedays/365.25) - 2
smooth_val <- data.all$agedays/365.25 >= 2 &
data.all$agedays/365.25 <= 4 &
data.all$param != "HEADCM"
data.all[smooth_val,
sd.c := (sd.c_cdc[smooth_val]*cdc_weight[smooth_val] +
sd.c_who[smooth_val]*who_weight[smooth_val])/2]
# otherwise use WHO for younger ages (and head circumference) and CDC for older ages
who_val <- data.all$param == "HEADCM" |
data.all$agedays/365.25 < 2
data.all[who_val | (smooth_val & is.na(data.all$sd.c_cdc)),
sd.c := data.all$sd.c_who[who_val | (smooth_val & is.na(data.all$sd.c_cdc))]]
cdc_val <- data.all$param != "HEADCM" |
data.all$agedays/365.25 > 4
data.all[cdc_val | (smooth_val & is.na(data.all$sd.c_who)),
sd.c := data.all$sd.c_cdc[cdc_val | (smooth_val & is.na(data.all$sd.c_who))]]
# smooth corrected and uncorrected z scores
uncorrweight <- 4 - (data.all$agedays/365.25)
corrweight <- (data.all$agedays/365.25) - 2
smooth_val <- data.all$agedays/365.25 >= 2 &
data.all$agedays/365.25 <= 4
data.all[smooth_val,
sd.corr := (sd.c[smooth_val]*corrweight[smooth_val] +
sd.orig[smooth_val]*uncorrweight[smooth_val])/2]
# for < 2 & potential correction, use fenton corrected score
data.all[agedays/365.25 <= 2 & potcorr, sd.corr := sd.c]
# for > 4, use original z score
data.all[agedays/365.25 >= 4, sd.corr := sd.orig]
# for not potential corrections, use the original z score
data.all[!potcorr, sd.corr := sd.orig]
# if the who/fenton score is not available for any reason, use the
# original
data.all[is.na(sd.corr), sd.corr := sd.orig]
# check for consistent growth
examine_only <- data.all$param == "WEIGHTKG" &
data.all$subjid %in% data.all$subjid[potcorr]
tmp <- copy(data.all[examine_only,])
tmp <- tmp[order(subjid, agedays),]
tmp[, seq_win := sequence(.N), by = subjid]
# we're only looking at the first 4 values, and they need to be < 2 years
tmp <- tmp[seq_win <= 4 & (agedays/365.25) < 2,]
# don't look at subjects where there is only 1 score and there is no
# value for either
tmp <- tmp[!(is.na(sd.corr) & is.na(sd.orig)),]
tmp <- tmp[subjid %in% names(which(table(subjid) > 1)),]
# create differences, absolute sum them
tmp[, sd.corr_abssumdiff := abs(sum(sd.corr[1] - sd.corr)), by = subjid]
tmp[, sd.orig_abssumdiff := abs(sum(sd.orig[1] - sd.orig)), by = subjid]
# find subjects where corrected value needs to be replaced
sub_replace <- unique(tmp[sd.corr_abssumdiff > sd.orig_abssumdiff,
subjid])
# replace accordingly in the main dataframe
data.all[subjid %in% sub_replace, sd.corr := sd.orig]
orig_colnames <- c(orig_colnames, "sd.corr")
# remove many added columns
data.all <- data.all[, colnames(data.all) %in% orig_colnames,
with = FALSE]
} else {
# calculate z scores
if (!quietly)
cat(sprintf("[%s] Calculating z-scores...\n", Sys.time()))
measurement.to.z <- read_anthro(ref.data.path, cdc.only = TRUE)
data.all[, z.orig := measurement.to.z(param, agedays, sex, v)]
# calculate "standard deviation" scores
if (!quietly)
cat(sprintf("[%s] Calculating SD-scores...\n", Sys.time()))
data.all[, sd.orig := measurement.to.z(param, agedays, sex, v, TRUE)]
}
# sort by subjid, param, agedays
setkey(data.all, subjid, param, agedays)
# add a new convenience index for bookkeeping
data.all[, index := 1:.N]
# Mark missing values for exclusion
data.all[, exclude := factor(with(data.all, ifelse(
is.na(v) |
agedays < 0, 'Missing', 'Include'
)),
levels = exclude.levels,
ordered = TRUE)]
# also mark certain measurements to not consider
data.all[param == "HEADCM" & agedays > (3*365.25), exclude := "Not cleaned"]
# define field names needed by helper functions
ewma.fields <- c('ewma.all', 'ewma.before', 'ewma.after')
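# note (descriptive only): these fields are filled by the ewma() helper with
# exponentially weighted moving averages of the recentered SD-scores; each
# other observation is weighted roughly in proportion to
# (days between measurements)^ewma.exp (default -1.5), so measurements closer
# in time to the index measurement dominate. ewma.before and ewma.after are
# variants that, as the names suggest, omit an adjacent observation.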
# 3. SD-score recentering: Because the basis of the method is comparing SD-scores over time, we need to account for the fact that
# the mean SD-score for the population changes with age.
# a. Determine the median cdc*sd for each parameter by year of age (with sexes combined): median*sd.
# b. The median*sd should be considered to apply to midyear-age, defined as the age in days with the same value as the integer
# portion of (365.25*year + 365.25/2).
# c. Linearly interpolate median*sd for each parameter between each midyear-age, naming the interpolated values rc*sd.
# d. For ages below the first midyear-age, let rc*sd equal the median*sd for the earliest year.
# For ages above the last midyear_age, let rc*sd equal the median*sd for the last year.
# e. Subtract rcsd_* from SDorig to create the recentered SD-score. This recentered SD-score, labeled tbc*sd
# (stands for "to be cleaned") will be used for most of the rest of the analyses.
# f. In future steps I will sometimes refer to measprev and measnext which refer to the previous or next wt or ht measurement
# for which exc_*==0 for the subject and parameter, when the data are sorted by subject, parameter, and agedays. SDprev and SDnext refer to the tbc*sd of the previous or next measurement.
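# Illustrative sketch (not executed) of steps (a)-(d) above, assuming a
# data.table dt with columns param, agedays, and sd.orig (sexes combined):
#   medians <- dt[, .(sd.median = median(sd.orig, na.rm = TRUE)),
#                 by = .(param, year = floor(agedays / 365.25))]
#   medians[, midyear.age := floor(365.25 * year + 365.25 / 2)]
#   # rc*sd at any age is then the linear interpolation of sd.median between
#   # adjacent midyear.age values, held constant beyond the first and last
#   # midyear-age, e.g. approx(midyear.age, sd.median, xout = agedays, rule = 2)$y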
if (!quietly)
cat(sprintf("[%s] Re-centering data...\n", Sys.time()))
# see function definition below for explanation of the re-centering process
# returns a data table indexed by param, sex, agedays. can use NHANES reference
# data, derive from input, or use user-supplied data.
if (!is.data.table(sd.recenter)) {
# INFANTS CHANGES:
# use recentering file derived from work, independent of sex
if (prelim_infants){
infants_reference_medians_path <- ifelse(
ref.data.path == "",
system.file(file.path("extdata",
"rcfile-2023-08-15_format.csv.gz"),
package = "growthcleanr"),
file.path(ref.data.path, "rcfile-2023-08-15_format.csv.gz")
)
sd.recenter <- fread(infants_reference_medians_path)
if (!quietly)
cat(
sprintf(
"[%s] Using infants reference medians...\n",
Sys.time()
)
)
} else if ((is.character(sd.recenter) &
tolower(sd.recenter) == "nhanes") |
(!(is.character(sd.recenter) &
tolower(sd.recenter) == "derive") & (data.all[, .N] < 5000))) {
# Use NHANES medians if the string "nhanes" is specified instead of a data.table
# or if sd.recenter is not specified as "derive" and N < 5000.
nhanes_reference_medians_path <- ifelse(
ref.data.path == "",
system.file(file.path("extdata", "nhanes-reference-medians.csv.gz"), package = "growthcleanr"),
file.path(ref.data.path, "nhanes-reference-medians.csv.gz")
)
sd.recenter <- fread(nhanes_reference_medians_path)
if (!quietly)
cat(
sprintf(
"[%s] Using NHANES reference medians...\n",
Sys.time()
)
)
} else {
# Derive medians from input data
sd.recenter <- data.all[exclude < 'Exclude', sd_median(param, sex, agedays, sd.orig)]
if (!quietly)
cat(
sprintf(
"[%s] Using re-centering medians derived from input...\n",
Sys.time()
)
)
if (sdmedian.filename != "") {
write.csv(sd.recenter, sdmedian.filename, row.names = FALSE)
if (!quietly)
cat(
sprintf(
"[%s] Wrote re-centering medians to %s...\n",
Sys.time(),
sdmedian.filename
)
)
}
}
} else {
# Use specified data
if (!quietly)
cat(
sprintf(
"[%s] Using specified re-centering medians...\n",
Sys.time()
)
)
}
# ensure recentering medians are sorted correctly
setkey(sd.recenter, param, sex, agedays)
# add sd.recenter to data, and recenter
setkey(data.all, param, sex, agedays)
data.all <- sd.recenter[data.all]
setkey(data.all, subjid, param, agedays)
data.all[, tbc.sd := sd.orig - sd.median]
if (prelim_infants){
# separate out corrected and noncorrected values
data.all[, ctbc.sd := sd.corr - sd.median]
}
if (sdrecentered.filename != "") {
write.csv(data.all, sdrecentered.filename, row.names = FALSE)
if (!quietly)
cat(
sprintf(
"[%s] Wrote re-centered data to %s...\n",
Sys.time(),
sdrecentered.filename
)
)
}
# notification: ensure awareness of small subsets in data
if (!quietly) {
year.counts <- data.all[, .N, floor(agedays / 365.25)]
if (year.counts[N < 100, .N] > 0) {
cat(
sprintf(
"[%s] Note: input data has at least one age-year with < 100 subjects...\n",
Sys.time()
)
)
}
}
# safety check: treat observations where tbc.sd cannot be calculated as missing
data.all[is.na(tbc.sd), exclude := 'Missing']
if (prelim_infants){
# 4: identify the subset that doesn't need to be cleaned (nnte) ----
# flag all subjects meeting the criteria below as "no need
# to ewma"
# does the subject have SDEs (same-day extraneous measurements)?
# keep the original column names -- we're adding a ton of columns that we
# want to filter out after correction
orig_colnames <- copy(colnames(data.all))
# no SDEs
data.all[, sum_sde := .N, by = c("subjid", "param", "agedays")]
data.all[, no_sde := sum_sde == 1]
# does the subject have identical values
data.all[, sum_val := .N, by = c("subjid", "param", "v")]
data.all[, no_dup_val := sum_val == 1]
# all tbc.sd are within [-3,3] -- 0 is false
data.all[, no_outliers := sum((tbc.sd > -3 & tbc.sd < 3) |
is.na(tbc.sd)) == (.N),
by = c("subjid", "param")]
data.all[, no_outliers := no_outliers == 1]
# all max - min tbc.sd < 2.5
data.all[, no_bigdiff :=
rep((abs(max(tbc.sd, na.rm = TRUE) - min(tbc.sd, na.rm = TRUE)) < 2.5),
.N),
by = c("subjid", "param")]
# neighboring values (previous/next) can't be too far from the current value
data.all[, seq_win := sequence(.N), by = c("subjid", "param")]
data.all[, nottoofar :=
(abs(tbc.sd - dplyr::lag(tbc.sd)) < 1 | seq_win == 1) &
(abs(tbc.sd - dplyr::lead(tbc.sd)) < 1 | seq_win == .N),
by = c("subjid", "param")]
data.all[is.na(nottoofar), nottoofar := TRUE]
# cumulative: no need to ewma -- needs to work for all within a subject &
# parameter
data.all[, nnte := no_sde & no_dup_val & no_outliers & no_bigdiff & nottoofar]
# NNTE can be calculated by parameter -- but it's occasionally easier for
# calculations to require all parameters to be nnte
data.all[, nnte_full := sum(nnte) == .N, by = c("subjid", "param")]
data.all[, nnte := sum(nnte) == .N, by = c("subjid")]
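# (after these two assignments, nnte_full is TRUE only when every row for a
# subject/parameter combination meets the criteria, while nnte is TRUE only
# when every row for the subject does, across all parameters)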
# remove many added columns -- except for nnte
orig_colnames <- c(orig_colnames, "nnte", "nnte_full")
data.all <- data.all[, colnames(data.all) %in% orig_colnames,
with = FALSE]
}
# pediatric: cleanbatch (most of steps) ----
# NOTE: the rest of cleangrowth's steps are done through cleanbatch().
# optionally process batches in parallel
if (!quietly)
cat(sprintf(
"[%s] Cleaning growth data in %d batch(es)...\n",
Sys.time(),
num.batches
))
if (num.batches == 1) {
if (!prelim_infants){
ret.df <- cleanbatch(data.all,
log.path = log.path,
quietly = quietly,
parallel = parallel,
measurement.to.z = measurement.to.z,
ewma.fields = ewma.fields,
ewma.exp = ewma.exp,
recover.unit.error = recover.unit.error,
include.carryforward = include.carryforward,
sd.extreme = sd.extreme,
z.extreme = z.extreme,
exclude.levels = exclude.levels,
tanner.ht.vel = tanner.ht.vel,
who.ht.vel = who.ht.vel,
lt3.exclude.mode = lt3.exclude.mode,
error.load.threshold = error.load.threshold,
error.load.mincount = error.load.mincount)
} else {
ret.df <- cleanbatch_infants(
data.all,
log.path = log.path,
quietly = quietly,
parallel = parallel,
measurement.to.z = measurement.to.z,
ewma.fields = ewma.fields,
recover.unit.error = recover.unit.error,
include.carryforward = include.carryforward,
sd.extreme = sd.extreme,
z.extreme = z.extreme,
exclude.levels = exclude.levels,
tanner.ht.vel = tanner.ht.vel,
who.ht.vel = who.ht.vel,
lt3.exclude.mode = lt3.exclude.mode,
error.load.threshold = error.load.threshold,
error.load.mincount = error.load.mincount,
ref.data.path = ref.data.path)
}
} else {
# create log directory if necessary
if (!is.na(log.path)) {
cat(sprintf("[%s] Writing batch logs to '%s'...\n", Sys.time(), log.path))
ifelse(!dir.exists(log.path), dir.create(log.path, recursive = TRUE), FALSE)
}
if (!prelim_infants){
ret.df <- ddply(
data.all,
.(batch),
cleanbatch,
.parallel = parallel,
.paropts = list(.packages = "data.table"),
log.path = log.path,
quietly = quietly,
parallel = parallel,
measurement.to.z = measurement.to.z,
ewma.fields = ewma.fields,
ewma.exp = ewma.exp,
recover.unit.error = recover.unit.error,
include.carryforward = include.carryforward,
sd.extreme = sd.extreme,
z.extreme = z.extreme,
exclude.levels = exclude.levels,
tanner.ht.vel = tanner.ht.vel,
who.ht.vel = who.ht.vel,
lt3.exclude.mode = lt3.exclude.mode,