This repository has been archived by the owner on May 25, 2023. It is now read-only.
/
becca_plot.R
344 lines (298 loc) · 10.5 KB
/
becca_plot.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
#' @title Becca Vichniac's Quartile (Floating Bar) Chart
#'
#' @description
#' \code{becca_plot} returns a ggplot object binned quaritle performonce
#'
#' @details
#' This function builds and prints a bar graph with 4 bins per bar show MAP data
#' binned by quartile (National Percentile Rank). Bars are centered at 50th percentile
#' horizonatally
#'
#' @param .data the data frame in TEAM canoncical style (long data forms)
#' @param school_name_column column in \code{.data} with school names
#' @param cohort_name_column column in \code{.data} with cohornt names
#' @param grade_level_season_column column in \code{.data} with numeric indicating grade season (e.g., Fall 4th
#' = 3.3, Winter 4th = 3.7, Spring 4th = 4.0).
#' @param measurement_scale_column column in \code{.data} with subject
#' @param test_percentile_column = column in \code{.data} with NPR.
#' @param first_and_spring_only indicator for showing shoing Fall-to-Spring rather than Spring-to-Spring
#' @param justify_widths width justification indicator
#' @param justify_min
#' @param justify_max
#' @param entry_grades = c(-0.7, 4.3) GradeSeasons that should use Fall and Spring (i.e., Kinder =-.3 and 5th = 4.3)
#' @param color_scheme only 'KIPP Report Card'
#' @param facets = FALSE
#' @param facet_opts = FALSE
#' @param title_text = FALSE
#' @param small_n_cutoff drop a grade_level_season if less than x% of the max?
#' (useful when dealing with weird cohort histories)
#'
#' @return prints a ggplot object
#' @export
becca_plot <- function(
.data
#for munging data
,school_name_column = 'sch_abbr'
,cohort_name_column = 'cohort'
,grade_level_season_column = 'grade_level_season'
,measurement_scale_column = 'measurementscale'
,test_percentile_column = 'percentile_2011_norms'
#controls if 'extra' data points (fall, winter) get dropped
,first_and_spring_only = TRUE
,entry_grades = c(-0.8, 4.2)
,auto_justify_x = TRUE
,justify_widths = FALSE
,justify_min = NA
,justify_max = NA
,color_scheme = 'KIPP Report Card'
,facets = FALSE
,facet_opts = FALSE
,title_text = FALSE
,small_n_cutoff = .001
) {
require(dplyr)
# Changed passed dataframes' column names to those used throughout
# function
colnames(.data)[colnames(.data) == school_name_column] <- 'SCH_ABBREV'
colnames(.data)[colnames(.data) == cohort_name_column] <- 'COHORT'
colnames(.data)[colnames(.data) == grade_level_season_column] <- 'GRADE_LEVEL_SEASON'
colnames(.data)[colnames(.data) == measurement_scale_column] <- 'MEASUREMENTSCALE'
colnames(.data)[colnames(.data) == test_percentile_column] <- 'PERCENTILE_2011_NORMS'
#TRANSFORMATION 1 - TRIM
#trim down the C.data - we don't need all the columns
require(data.table)
d1 <- as.data.table(as.data.frame(.data[,c(
'SCH_ABBREV'
,'COHORT'
,'GRADE_LEVEL_SEASON'
,'MEASUREMENTSCALE'
,'PERCENTILE_2011_NORMS')]))
#all terms or first & spring only?
if (first_and_spring_only) {
#possible entry grades controlled by entry_grades parameter
#default is Fall K, Fall 5 (aka -0.7, 4.3) - only change if you need to
#add an additional entry grade (perhaps 9th?) or to take away 5th
#(eg for a fully grown KIPP school?)
d1 <- d1[with(d1, round(GRADE_LEVEL_SEASON, 1) %in% round(entry_grades,1) | GRADE_LEVEL_SEASON %% 1 == 0), ]
}
#drop small N time periods
by_grade_season <- group_by(.data, GRADE_LEVEL_SEASON)
grade_season_counts <- dplyr::summarize(
by_grade_season
,n=n()
)
biggest <- max(grade_season_counts$n)
grade_season_counts$include <- grade_season_counts$n >= small_n_cutoff * biggest
use_these <- grade_season_counts[grade_season_counts$include==TRUE, 'GRADE_LEVEL_SEASON']
d1 <- d1[d1$GRADE_LEVEL_SEASON %in% use_these$GRADE_LEVEL_SEASON, ]
#calculate quartile from test percentile
d1[,QUARTILE:=floor((PERCENTILE_2011_NORMS/25) + 1)]
#TRANSFORMATION 2 - COUNT
#calculate group level averages. Our final data set should have
#SCHOOL COHORT YEAR SUBJECT QUARTILE PCT
#There is definitely a more elegant way to do this that doesn't
#require 2 ddply calls, but this works for now
d2<-d1[,list(N_Qrtl=.N),
keyby=list(SCH_ABBREV,
COHORT,
GRADE_LEVEL_SEASON,
MEASUREMENTSCALE,
QUARTILE)][
d1[,list(.N),
keyby=list(SCH_ABBREV,
COHORT,
GRADE_LEVEL_SEASON,
MEASUREMENTSCALE)]][,PCT:=round(N_Qrtl/N*100,1)]
#add a column that indicates above/below grade level
#this simplifies bar chart creation
#set flags for above and below
d2[QUARTILE<=2, AT_GRADE_LEVEL_DUMMY:='NO']
d2[QUARTILE>=3, AT_GRADE_LEVEL_DUMMY:='YES']
#TRANSFORMATION 4 - CUSTOM ORDERING
#this was tricky (and important!) -- thanks Mike H.
d2[,ORDER:=QUARTILE]
#stage_3$ORDER <- stage_3$QUARTILE
#2 becomes placeholder
d2[QUARTILE==2, ORDER:=99]
#stage_3[stage_3$QUARTILE == 2, 'ORDER'] <- 'placeholder'
#1 becomes 2
d2[QUARTILE==1, ORDER:=2]
#stage_3[stage_3$QUARTILE == 1, 'ORDER'] <- 2
#placeholder becomes 1
d2[ORDER==99,ORDER:=1]
#stage_3[stage_3$ORDER == 'placeholder', 'ORDER'] <- 1
#finally sort by new order (so midpoint calculation works properly)
final_data <- copy(d2[order(MEASUREMENTSCALE,
SCH_ABBREV,
COHORT,
GRADE_LEVEL_SEASON,
ORDER)])
#TRANSFORMATION 5 - TWO .datas FOR CHART
#super helpful advice from: http://stackoverflow.com/questions/13734368/ggplot2-and-a-stacked-bar-chart-with-negative-values
#above
npr_above <- final_data[AT_GRADE_LEVEL_DUMMY == 'YES']
#below
npr_below <- final_data[AT_GRADE_LEVEL_DUMMY == 'NO']
#flip the sign
npr_below[, PCT:= PCT * -1]
#TRANSFORMATION 5 - CALCULATE MIDPOINTS (for chart labels)
#one df for the two quartiles above the national average...
npr_above <- npr_above[,list(N,
PCT,
AT_GRADE_LEVEL_DUMMY,
ORDER,
QUARTILE,
MIDPOINT=cumsum(PCT) - 0.5*PCT),
by=list(SCH_ABBREV,
COHORT,
GRADE_LEVEL_SEASON,
MEASUREMENTSCALE)]
#...and another for those below.
npr_below <- npr_below[,list(N,
PCT,
AT_GRADE_LEVEL_DUMMY,
ORDER,
QUARTILE,
MIDPOINT=cumsum(PCT) - 0.5*PCT),
by=list(SCH_ABBREV,
COHORT,
GRADE_LEVEL_SEASON,
MEASUREMENTSCALE)]
npr_below[,QUARTILE:=ordered(QUARTILE, levels = names(sort(-table(QUARTILE))))]
#FORMAT X AXIS LABELS
becca_x_breaks <- sort(unique(final_data$GRADE_LEVEL_SEASON))
becca_x_labels <- unlist(lapply(becca_x_breaks, fall_spring_me))
if (auto_justify_x == TRUE) {
becca_x_breaks <- c(min(becca_x_breaks) - 0.35, becca_x_breaks, max(becca_x_breaks) + 0.35)
becca_x_labels <- c('', becca_x_labels, '')
}
if (justify_widths == TRUE) {
becca_x_breaks <- c(justify_min, becca_x_breaks, justify_max)
becca_x_labels <- c('', becca_x_labels, '')
}
#PLOT PLOT PLOT PLOT
p <- ggplot() +
#top half of NPR plots
geom_bar(
data = npr_above
,aes(
x = GRADE_LEVEL_SEASON
,y = PCT
,fill = factor(QUARTILE)
,order = ORDER
)
,stat = "identity"
) +
#bottom half of NPR plots
geom_bar(
data = npr_below
,aes(
x = GRADE_LEVEL_SEASON
,y = PCT
,fill = factor(QUARTILE)
,order = ORDER
)
,stat = "identity"
) +
#labels above
geom_text(
data = npr_above
,aes(
x = GRADE_LEVEL_SEASON
,y = MIDPOINT
,label = round(PCT,0)
)
,size = 4
) +
#labels below
geom_text(
data = npr_below
,aes(
x = GRADE_LEVEL_SEASON
,y = MIDPOINT
,label = abs(round(PCT, 0))
)
,size = 4
) +
#axis labels
labs(
x = 'Grade Level'
,y = 'Percentage of Cohort'
) +
#clean out some default ggplot formatting elements
theme(
#zero out cetain formatting
panel.background = element_blank()
,plot.background = element_blank()
,panel.grid.major = element_blank()
,panel.grid.minor = element_blank()
,axis.ticks.y = element_blank()
#title and axis sizes
,title = element_text(size = rel(0.9))
,axis.title.x = element_text(size = rel(0.9))
,axis.text.y = element_blank()
,plot.margin = rep(unit(0,"null"),4)
) +
scale_x_continuous(
breaks = becca_x_breaks
,labels = becca_x_labels
) +
coord_cartesian(
xlim=c(min(becca_x_breaks),max(becca_x_breaks))
)
legend_labels = c('1st', '2nd', '3rd', '4th')
#color style?
if(color_scheme == 'KIPP Report Card') {
p <- p +
#dark gray, light gray, light orange, dark orange
scale_fill_manual(
values = c(
rgb(207, 204, 193, max = 255)
,rgb(230, 230, 230, max = 255)
,rgb(254, 188, 17, max = 255)
,rgb(247, 148, 30, max = 255)
)
,name = 'Quartiles'
,labels = legend_labels
)
} else if (color_scheme == 'Sequential Blues') {
p <- p + scale_fill_brewer(
type = "seq"
,palette = 1
)
} else {
p <- p + scale_fill_manual(
values = color_scheme
,labels = legend_labels
)
}
#title?
if (title_text != FALSE) {
p <- p +
labs(
title = title_text
)
}
#facet specified AND facet opts
if (facets != FALSE & facet_opts != FALSE) {
p <- p + eval(facet_grid(as.formula(facets), facet_opts))
#facet specified WITHOUT facet opts
} else if (facets != FALSE & facet_opts == FALSE) {
p <- p + facet_grid(as.formula(facets))
}
#no facet specified = no need to do anything (implicit)
#legend formatting stuff
p <- p + guides(
fill = guide_legend(
#make it a little smaller
title.theme = element_text(
angle = 0
,size = rel(8)
)
,keywidth = .5
,keyheight = .5
#flips the order
,reverse = TRUE)
)
p
}