In [1]:
library(tidyverse)
library(lubridate)

library(assertr)

library(ggtext)


── [1mAttaching packages[22m ────────────────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.2     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.0.4     [32m✔[39m [34mdplyr  [39m 1.0.2
[32m✔[39m [34mtidyr  [39m 1.1.2     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.0

── [1mConflicts[22m ───────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


Attaching package: ‘lubridate’


The following objects are masked from ‘package:base’:

    date, intersect, setdiff, union




## Notes


#### Legal (ISO) gender types:

* https://data.gov.uk/education-standards/sites/default/files/CL-Legal-Sex-Type-v2-0.pdf


#### For data from 2010 and all stored as % 

* need to relax sum to 100%
* 			
Symbol	Meaning		
    * '-'	Not Applicable		
    * '-'	No Entries (Table 3)		
    * 0%	Less than 0.5%		
    * ***	Fewer Than 5 Entries	

<br>&nbsp;
<h3>Error Checking &amp; Warnings</h3>

* Ideally correct errors here and write out corrected csv to file with a note
* TODO - log errors found and include error-checking code as part of pre-processing flow



<h3>Errors to Watch For</h3>

<b>Please document as not found and/or what corrected, so can trace back to original. 
Update as needed and mirror in final docs submitted with project.</b>

* "Computing" (or "Computing Studies" or "Computing (New)") ... included in list of subjects
    * need to decide if files will be excluded or included with a flag to track changes in subjects offered
* Each subject and grade listed only once per gender
* proportions of male/female add up to 1
<br />


<h3>Warning Only Needed</h3>

<b>Need only document if triggered.</b>

* All values for a subject set to "-" or 0 (rare) -> translates to NAs if read in properly

<br />

In [2]:
files_to_verify <- list.files("sta_it_402/data", pattern = "higher", full.names = T, 
                              ignore.case = T, recursive = T) #%>%
                        #as.data.frame
files_to_verify
length(files_to_verify)

In [3]:
focus_subject <- "computing"
redundant_column_flags <- c("-Passes", "-percentage*", "-COMP", "-PassesUngradedCourses")

gender_options <- c("male-", "female-", "NotKnown-", "NA-", "NotApplicable")

In [4]:
# check focus subject (typically, but not necessarily, Computing) in list of subjects

checkFocusSubjectListed <- 
    function(awardFile, glimpseContent = FALSE, listSubjects = FALSE) {
        awardData <- read_csv(awardFile, trim_ws = TRUE) %>% #, skip_empty_rows = T) # NOT skipping empty rows... :(
                            filter(rowSums(is.na(.)) != ncol(.)) %>%
                            suppressMessages
         
        print(awardFile)
        if (!exists("focus_subject") || is_null(focus_subject) || (str_trim(focus_subject) == "")) {
            focus_subject <- "computing"
            print(paste("No focus subject specified; defaulting to subjects containing: ", focus_subject))
            
        } else 
            print(paste("Search on focus subject (containing term) '", focus_subject, "'", sep = ""))
        
        if (glimpseContent)
            print(glimpse(awardData))
        
        result <- awardData %>%
            select(Subject) %>%

            filter(str_detect(Subject, regex(focus_subject, ignore_case = TRUE))) %>%
            verify(nrow(.) > 0, error_fun = just_warn) 
        
        if (!listSubjects)    
            return(nrow(result)) # comment out this row to list subject names
        else
            return(result)
    }

In [5]:
#lapply(files_to_verify, checkFocusSubjectListed, listSubjects = TRUE)
#Map(checkFocusSubjectListed, files_to_verify, listSubjects = TRUE)

as.data.frame(sapply(files_to_verify, checkFocusSubjectListed)) # call without as.data.frame if listing values


[1] "sta_it_402/data/grades/1986_Higher.csv"
[1] "Search on focus subject (containing term) 'computing'"
verification [nrow(.) > 0] failed! (1 failure)

    verb redux_fn   predicate column index value
1 verify       NA nrow(.) > 0     NA     1    NA



“assertr encountered errors”


[1] "sta_it_402/data/grades/1987_Higher.csv"
[1] "Search on focus subject (containing term) 'computing'"
verification [nrow(.) > 0] failed! (1 failure)

    verb redux_fn   predicate column index value
1 verify       NA nrow(.) > 0     NA     1    NA



“assertr encountered errors”


[1] "sta_it_402/data/grades/1988_Higher.csv"
[1] "Search on focus subject (containing term) 'computing'"
verification [nrow(.) > 0] failed! (1 failure)

    verb redux_fn   predicate column index value
1 verify       NA nrow(.) > 0     NA     1    NA



“assertr encountered errors”


[1] "sta_it_402/data/grades/1989_Higher.csv"
[1] "Search on focus subject (containing term) 'computing'"
[1] "sta_it_402/data/grades/1990_Higher.csv"
[1] "Search on focus subject (containing term) 'computing'"
[1] "sta_it_402/data/grades/1991_Higher.csv"
[1] "Search on focus subject (containing term) 'computing'"
[1] "sta_it_402/data/grades/1992_Higher.csv"
[1] "Search on focus subject (containing term) 'computing'"
[1] "sta_it_402/data/grades/1993_Higher.csv"
[1] "Search on focus subject (containing term) 'computing'"
[1] "sta_it_402/data/grades/1994_Higher.csv"
[1] "Search on focus subject (containing term) 'computing'"
[1] "sta_it_402/data/grades/1995_Higher.csv"
[1] "Search on focus subject (containing term) 'computing'"
[1] "sta_it_402/data/grades/1996_Higher.csv"
[1] "Search on focus subject (containing term) 'computing'"
[1] "sta_it_402/data/grades/1997_Higher.csv"
[1] "Search on focus subject (containing term) 'computing'"
[1] "sta_it_402/data/grades/1998_Higher.csv"
[1] "Searc

Unnamed: 0_level_0,"sapply(files_to_verify, checkFocusSubjectListed)"
Unnamed: 0_level_1,<int>
sta_it_402/data/grades/1986_Higher.csv,0
sta_it_402/data/grades/1987_Higher.csv,0
sta_it_402/data/grades/1988_Higher.csv,0
sta_it_402/data/grades/1989_Higher.csv,1
sta_it_402/data/grades/1990_Higher.csv,1
sta_it_402/data/grades/1991_Higher.csv,1
sta_it_402/data/grades/1992_Higher.csv,1
sta_it_402/data/grades/1993_Higher.csv,1
sta_it_402/data/grades/1994_Higher.csv,1
sta_it_402/data/grades/1995_Higher.csv,1


In [6]:
# check for data stored as percentages only

checkDataAsPercentageOnly <- 
    function(awardFile, glimpseContent = FALSE) {
        awardData <- read_csv(awardFile, trim_ws = TRUE) %>% #, skip_empty_rows = T) # NOT skipping empty rows... :(
                            filter(rowSums(is.na(.)) != ncol(.)) %>%
                            suppressMessages
        
        print(awardFile)
        if (glimpseContent)
            print(glimpse(awardData))
        
        if (!exists("redundant_column_flags") || is.null(redundant_column_flags)) 
            redundant_column_flags <- c("-percentage*", "-COMP", "-PassesUngradedCourses")
        
        awardData %>%
            select(-matches(c(redundant_column_flags, "all-Entries"))) %>% # "-percentage")) %>%
            select(matches(c("male-", "female-", "all-"))) %>%
            verify(ncol(.) > 0, error_fun = just_warn) %>%
        
            #head(0) - comment in and next line out to list headers remaining
            summarise(data_as_counts = (ncol(.) > 0))
    }

In [7]:
sapply(files_to_verify, checkDataAsPercentageOnly)
#Map(checkDataAsPercentageOnly, files_to_verify) #, T)


[1] "sta_it_402/data/grades/1986_Higher.csv"
[1] "sta_it_402/data/grades/1987_Higher.csv"
[1] "sta_it_402/data/grades/1988_Higher.csv"
[1] "sta_it_402/data/grades/1989_Higher.csv"
[1] "sta_it_402/data/grades/1990_Higher.csv"
[1] "sta_it_402/data/grades/1991_Higher.csv"
[1] "sta_it_402/data/grades/1992_Higher.csv"
[1] "sta_it_402/data/grades/1993_Higher.csv"
[1] "sta_it_402/data/grades/1994_Higher.csv"
[1] "sta_it_402/data/grades/1995_Higher.csv"
[1] "sta_it_402/data/grades/1996_Higher.csv"
[1] "sta_it_402/data/grades/1997_Higher.csv"
[1] "sta_it_402/data/grades/1998_Higher.csv"
[1] "sta_it_402/data/grades/1999_Higher.csv"
[1] "sta_it_402/data/grades/2000_NewHigher.csv"
[1] "sta_it_402/data/grades/2001_AdvancedHigher.csv"
[1] "sta_it_402/data/grades/2001_NewHigher.csv"
[1] "sta_it_402/data/grades/2002_AdvancedHigher.csv"
[1] "sta_it_402/data/grades/2002_NewHigher.csv"
[1] "sta_it_402/data/grades/2003_AdvancedHigher.csv"
[1] "sta_it_402/data/grades/2003_Higher.csv"
[1] "sta_it_402/data/g

“assertr encountered errors”


[1] "sta_it_402/data/grades/2014_Higher.csv"
verification [ncol(.) > 0] failed! (1 failure)

    verb redux_fn   predicate column index value
1 verify       NA ncol(.) > 0     NA     1    NA



“assertr encountered errors”


[1] "sta_it_402/data/grades/2015_AdvancedHigher.csv"
verification [ncol(.) > 0] failed! (1 failure)

    verb redux_fn   predicate column index value
1 verify       NA ncol(.) > 0     NA     1    NA



“assertr encountered errors”


[1] "sta_it_402/data/grades/2015_Higher.csv"
verification [ncol(.) > 0] failed! (1 failure)

    verb redux_fn   predicate column index value
1 verify       NA ncol(.) > 0     NA     1    NA



“assertr encountered errors”


[1] "sta_it_402/data/grades/2015_NewHigher.csv"
verification [ncol(.) > 0] failed! (1 failure)

    verb redux_fn   predicate column index value
1 verify       NA ncol(.) > 0     NA     1    NA



“assertr encountered errors”


[1] "sta_it_402/data/grades/2016_AdvancedHigher.csv"
verification [ncol(.) > 0] failed! (1 failure)

    verb redux_fn   predicate column index value
1 verify       NA ncol(.) > 0     NA     1    NA



“assertr encountered errors”


[1] "sta_it_402/data/grades/2016_Higher.csv"
verification [ncol(.) > 0] failed! (1 failure)

    verb redux_fn   predicate column index value
1 verify       NA ncol(.) > 0     NA     1    NA



“assertr encountered errors”


[1] "sta_it_402/data/grades/2017_AdvancedHigher.csv"
verification [ncol(.) > 0] failed! (1 failure)

    verb redux_fn   predicate column index value
1 verify       NA ncol(.) > 0     NA     1    NA



“assertr encountered errors”


[1] "sta_it_402/data/grades/2017_Higher.csv"
verification [ncol(.) > 0] failed! (1 failure)

    verb redux_fn   predicate column index value
1 verify       NA ncol(.) > 0     NA     1    NA



“assertr encountered errors”


[1] "sta_it_402/data/grades/2018_AdvancedHigher.csv"
verification [ncol(.) > 0] failed! (1 failure)

    verb redux_fn   predicate column index value
1 verify       NA ncol(.) > 0     NA     1    NA



“assertr encountered errors”


[1] "sta_it_402/data/grades/2018_Higher.csv"
verification [ncol(.) > 0] failed! (1 failure)

    verb redux_fn   predicate column index value
1 verify       NA ncol(.) > 0     NA     1    NA



“assertr encountered errors”


[1] "sta_it_402/data/grades/2019_AdvancedHigher.csv"
verification [ncol(.) > 0] failed! (1 failure)

    verb redux_fn   predicate column index value
1 verify       NA ncol(.) > 0     NA     1    NA



“assertr encountered errors”


[1] "sta_it_402/data/grades/2019_Higher.csv"
verification [ncol(.) > 0] failed! (1 failure)

    verb redux_fn   predicate column index value
1 verify       NA ncol(.) > 0     NA     1    NA



“assertr encountered errors”


[1] "sta_it_402/data/grades/2020_AdvancedHigher-revised-12.csv"
[1] "sta_it_402/data/grades/2020_AdvancedHigher.csv"
[1] "sta_it_402/data/grades/2020_Higher-revised-12.csv"
[1] "sta_it_402/data/grades/2020_Higher.csv"


In [8]:
# error checking - need to manually correct data if mismatch between breakdown by gender and totals found
# this case, if found, is relatively easy to fix

#TODO -include NotKnown and NA

checkDistributionByGenderErrors <- 
    function(awardFile, glimpseContent = FALSE) {
        awardData <- read_csv(awardFile, trim_ws = TRUE) %>% #, skip_empty_rows = T) # NOT skipping empty rows... :(
                            filter(rowSums(is.na(.)) != ncol(.)) %>%
                            suppressMessages
        
        print(awardFile)
        if (glimpseContent)
            print(glimpse(awardData))

        
        if (awardData %>%
                select(matches(gender_options)) %>%
                verify(ncol(.) > 0, error_fun = just_warn) %>%

                summarise(data_as_counts = (ncol(.) == 0)) == TRUE) { 
            
            awardData <- awardData %>%
                select(-NumberOfCentres) %>%
                pivot_longer(!c(Subject), names_to = "grade", values_to = "PercentageOfStudents") %>%
                separate("grade", c("gender", "grade"), extra = "merge") %>%
                mutate_at(c("gender", "grade"), as.factor) %>%
                filter((gender %in% c("all")) & (grade %in% c("Entries"))) 
        
            # building parallel structure
            return(awardData %>%
                       group_by(Subject) %>%            
                       mutate(total = -1) %>%
                       summarise(total = sum(total)) %>%
                       mutate(DataError = TRUE) # confirmation only - comment out to print al            
            )
        }
        
        
        awardData <- awardData %>%
            mutate_at(vars(starts_with("male-") | starts_with("female-") | starts_with("all-")), as.character) %>%
            mutate_at(vars(starts_with("male-") | starts_with("female-") | starts_with("all-")), parse_number) %>%
            suppressWarnings


        data_as_counts <- awardData %>%
                    select(-matches(redundant_column_flags)) %>% # "-percentage")) %>%
                    select(matches(c("male-", "female-"))) %>%

                    summarise(data_as_counts = (ncol(.) > 0)) %>%
                    as.logical


        if (data_as_counts) {

            awardData <- awardData %>%

                select(-NumberOfCentres) %>%
                mutate_at(vars(starts_with("male")), ~(. / `all-Entries`)) %>%
                mutate_at(vars(starts_with("female")), ~(. / `all-Entries`)) %>%
                select(-(starts_with("all") & !ends_with("-Entries"))) %>%

                pivot_longer(!c(Subject), names_to = "grade", values_to = "PercentageOfStudents") %>%
                separate("grade", c("gender", "grade"), extra = "merge") %>%
                mutate_at(c("gender", "grade"), as.factor) %>%
                filter(!(gender %in% c("all")) & (grade %in% c("Entries")))


        } else { # dataAsPercentageOnly

            awardData <- awardData %>%

                select(Subject, ends_with("-percentage")) %>%
                mutate_at(vars(ends_with("-percentage")), ~(. / 100)) %>%


                pivot_longer(!c(Subject), names_to = "grade", values_to = "PercentageOfStudents") %>%
                separate("grade", c("gender", "grade"), extra = "merge") %>%
                mutate_at(c("gender", "grade"), as.factor)

        } # end if-else - check for data capture approach
        

        awardData %>%

            group_by(Subject) %>%
            summarise(total = sum(PercentageOfStudents, na.rm = TRUE)) %>%
            verify((total == 1.0) | (total == 0), error_fun = just_warn) %>% 

            mutate(DataError = if_else(((total == 1.0) | (total == 0)), FALSE, TRUE)) %>%
            filter(DataError == TRUE) %>% # confirmation only - comment out to print all
            suppressMessages # ungrouping messages

}


In [9]:
data.frame(sapply(files_to_verify, checkDistributionByGenderErrors))



[1] "sta_it_402/data/grades/1986_Higher.csv"
[1] "sta_it_402/data/grades/1987_Higher.csv"
[1] "sta_it_402/data/grades/1988_Higher.csv"
[1] "sta_it_402/data/grades/1989_Higher.csv"
[1] "sta_it_402/data/grades/1990_Higher.csv"
[1] "sta_it_402/data/grades/1991_Higher.csv"
[1] "sta_it_402/data/grades/1992_Higher.csv"
[1] "sta_it_402/data/grades/1993_Higher.csv"
[1] "sta_it_402/data/grades/1994_Higher.csv"
[1] "sta_it_402/data/grades/1995_Higher.csv"
[1] "sta_it_402/data/grades/1996_Higher.csv"
[1] "sta_it_402/data/grades/1997_Higher.csv"
[1] "sta_it_402/data/grades/1998_Higher.csv"
[1] "sta_it_402/data/grades/1999_Higher.csv"
[1] "sta_it_402/data/grades/2000_NewHigher.csv"
[1] "sta_it_402/data/grades/2001_AdvancedHigher.csv"
[1] "sta_it_402/data/grades/2001_NewHigher.csv"
[1] "sta_it_402/data/grades/2002_AdvancedHigher.csv"
[1] "sta_it_402/data/grades/2002_NewHigher.csv"
[1] "sta_it_402/data/grades/2003_AdvancedHigher.csv"
[1] "sta_it_402/data/grades/2003_Higher.csv"
verification [(total =

“assertr encountered errors”


[1] "sta_it_402/data/grades/2004_AdvancedHigher.csv"
[1] "sta_it_402/data/grades/2004_Higher.csv"
verification [(total == 1) | (total == 0)] failed! (2 failures)

    verb redux_fn                   predicate column index value
1 verify       NA (total == 1) | (total == 0)     NA     7    NA
2 verify       NA (total == 1) | (total == 0)     NA     8    NA



“assertr encountered errors”


[1] "sta_it_402/data/grades/2005_AdvancedHigher.csv"
[1] "sta_it_402/data/grades/2005_Higher.csv"
[1] "sta_it_402/data/grades/2006_AdvancedHigher.csv"
[1] "sta_it_402/data/grades/2006_Higher.csv"
[1] "sta_it_402/data/grades/2007_AdvancedHigher.csv"
[1] "sta_it_402/data/grades/2007_Higher.csv"
[1] "sta_it_402/data/grades/2008_AdvancedHigher.csv"
[1] "sta_it_402/data/grades/2008_Higher.csv"
[1] "sta_it_402/data/grades/2009_AdvancedHigher.csv"
[1] "sta_it_402/data/grades/2009_Higher.csv"
[1] "sta_it_402/data/grades/2010_AdvancedHigher.csv"
[1] "sta_it_402/data/grades/2010_Higher.csv"
[1] "sta_it_402/data/grades/2011_AdvancedHigher.csv"
[1] "sta_it_402/data/grades/2011_Higher.csv"
[1] "sta_it_402/data/grades/2012_AdvancedHigher.csv"
[1] "sta_it_402/data/grades/2012_Higher.csv"
[1] "sta_it_402/data/grades/2013_AdvancedHigher.csv"
[1] "sta_it_402/data/grades/2013_Higher.csv"
[1] "sta_it_402/data/grades/2014_AdvancedHigher.csv"
[1] "sta_it_402/data/grades/2014_Higher.csv"
verification [(total

“assertr encountered errors”


[1] "sta_it_402/data/grades/2015_AdvancedHigher.csv"
[1] "sta_it_402/data/grades/2015_Higher.csv"
[1] "sta_it_402/data/grades/2015_NewHigher.csv"
[1] "sta_it_402/data/grades/2016_AdvancedHigher.csv"
verification [(total == 1) | (total == 0)] failed! (1 failure)

    verb redux_fn                   predicate column index value
1 verify       NA (total == 1) | (total == 0)     NA     7    NA



“assertr encountered errors”


[1] "sta_it_402/data/grades/2016_Higher.csv"
[1] "sta_it_402/data/grades/2017_AdvancedHigher.csv"
verification [(total == 1) | (total == 0)] failed! (1 failure)

    verb redux_fn                   predicate column index value
1 verify       NA (total == 1) | (total == 0)     NA     8    NA



“assertr encountered errors”


[1] "sta_it_402/data/grades/2017_Higher.csv"
[1] "sta_it_402/data/grades/2018_AdvancedHigher.csv"
[1] "sta_it_402/data/grades/2018_Higher.csv"
[1] "sta_it_402/data/grades/2019_AdvancedHigher.csv"
[1] "sta_it_402/data/grades/2019_Higher.csv"
verification [(total == 1) | (total == 0)] failed! (1 failure)

    verb redux_fn                   predicate column index value
1 verify       NA (total == 1) | (total == 0)     NA    38    NA



“assertr encountered errors”


[1] "sta_it_402/data/grades/2020_AdvancedHigher-revised-12.csv"
verification [ncol(.) > 0] failed! (1 failure)

    verb redux_fn   predicate column index value
1 verify       NA ncol(.) > 0     NA     1    NA



“assertr encountered errors”
`summarise()` ungrouping output (override with `.groups` argument)



[1] "sta_it_402/data/grades/2020_AdvancedHigher.csv"
[1] "sta_it_402/data/grades/2020_Higher-revised-12.csv"
verification [ncol(.) > 0] failed! (1 failure)

    verb redux_fn   predicate column index value
1 verify       NA ncol(.) > 0     NA     1    NA



“assertr encountered errors”
`summarise()` ungrouping output (override with `.groups` argument)



[1] "sta_it_402/data/grades/2020_Higher.csv"
verification [(total == 1) | (total == 0)] failed! (32 failures)

     verb redux_fn                   predicate column index value
1  verify       NA (total == 1) | (total == 0)     NA     1    NA
2  verify       NA (total == 1) | (total == 0)     NA     2    NA
3  verify       NA (total == 1) | (total == 0)     NA     3    NA
4  verify       NA (total == 1) | (total == 0)     NA     4    NA
5  verify       NA (total == 1) | (total == 0)     NA     5    NA
6  verify       NA (total == 1) | (total == 0)     NA     6    NA
7  verify       NA (total == 1) | (total == 0)     NA     7    NA
8  verify       NA (total == 1) | (total == 0)     NA     8    NA
9  verify       NA (total == 1) | (total == 0)     NA    11    NA
10 verify       NA (total == 1) | (total == 0)     NA    13    NA
11 verify       NA (total == 1) | (total == 0)     NA    14    NA
12 verify       NA (total == 1) | (total == 0)     NA    16    NA
13 verify       NA (total == 1)

“assertr encountered errors”


Unnamed: 0_level_0,sta_it_402.data.grades.1986_Higher.csv,sta_it_402.data.grades.1987_Higher.csv,sta_it_402.data.grades.1988_Higher.csv,sta_it_402.data.grades.1989_Higher.csv,sta_it_402.data.grades.1990_Higher.csv,sta_it_402.data.grades.1991_Higher.csv,sta_it_402.data.grades.1992_Higher.csv,sta_it_402.data.grades.1993_Higher.csv,sta_it_402.data.grades.1994_Higher.csv,sta_it_402.data.grades.1995_Higher.csv,⋯,sta_it_402.data.grades.2017_AdvancedHigher.csv,sta_it_402.data.grades.2017_Higher.csv,sta_it_402.data.grades.2018_AdvancedHigher.csv,sta_it_402.data.grades.2018_Higher.csv,sta_it_402.data.grades.2019_AdvancedHigher.csv,sta_it_402.data.grades.2019_Higher.csv,sta_it_402.data.grades.2020_AdvancedHigher.revised.12.csv,sta_it_402.data.grades.2020_AdvancedHigher.csv,sta_it_402.data.grades.2020_Higher.revised.12.csv,sta_it_402.data.grades.2020_Higher.csv
Unnamed: 0_level_1,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,⋯,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>
Subject,,,,,,,,,,,⋯,Classical Studies,,,,,Photography,"Accounting , Art and Design (Design) , Art and Design (Expressive) , Biology , Business Management , Chemistry , Chinese Languages , Classical Studies , Computing Science , Design and Manufacture , Drama , Economics , Engineering Science , English , French , Gaelic (Learners) , Gàidhlig , Geography , German , Graphic Communication , Health and Food Technology , History , Italian , Latin , Mathematics , Mathematics of Mechanics , Modern Studies , Music , Music Technology , Music: Portfolio , Physical Education , Physics , Religious, Moral and Philosophical Studies, Spanish , Statistics",,"Accounting , Administration and IT , Art and Design , Biology , Business Management , Care , Chemistry , Childcare and Development , Chinese Languages , Classical Studies , Computing Science , Dance , Design and Manufacture , Drama , Economics , Engineering Science , English , English for Speakers of Other Languages , Environmental Science , Fashion and Textile Technology , French , Gaelic (Learners) , Gàidhlig , Geography , German , Graphic Communication , Health and Food Technology , History , Human Biology , Italian , Latin , Mathematics , Media , Modern Studies , Music , Music Technology , Philosophy , Photography , Physical Education , Physics , Politics , Psychology , Religious, Moral and Philosophical Studies, Sociology , Spanish , Urdu","Accounting , Administration and IT , Art and Design , Biology , Business Management , Care , Chemistry , Childcare and Development , Computing Science , Design and Manufacture , Drama , Engineering Science , English , English for Speakers of Other Languages , Environmental Science , French , Gaelic (Learners) , Geography , Graphic Communication , Health and Food Technology , History , Human Biology , Mathematics , Modern Studies , Music , Music Technology , Philosophy , Physical Education , Physics , Psychology , Religious, Moral and Philosophical Studies, Spanish"
total,,,,,,,,,,,⋯,1.01,,,,,0.99,"-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1",,"-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1","1.0008703, 1.0002534, 1.0003779, 1.0021587, 1.0013479, 0.9972184, 1.0015965, 0.9966555, 1.0044416, 1.0035787, 1.0013008, 1.0028222, 1.0045021, 1.0013532, 1.0055866, 1.0009482, 0.9838710, 1.0007774, 1.0021264, 1.0009001, 1.0017168, 1.0020132, 1.0011482, 1.0018917, 1.0007868, 1.0024010, 1.0017036, 1.0004700, 1.0017906, 1.0003224, 1.0028401, 1.0013807"
DataError,,,,,,,,,,,⋯,TRUE,,,,,TRUE,"TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE",,"TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE","TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE"


In [10]:
 
# warning only - document if necessary
# double-check for subjects with values all NA - does this mean subject being excluded or no one took it?

checkSubjectsWithNoEntries <- 
    function(awardFile, glimpseContent = FALSE) {
        awardData <- read_csv(awardFile, trim_ws = TRUE) %>% #, skip_empty_rows = T) # NOT skipping empty rows... :(
                            filter(rowSums(is.na(.)) != ncol(.)) %>%
                            suppressMessages
        
        print(awardFile)
        if (glimpseContent)
            print(glimpse(awardData))
        
        bind_cols(
            awardData %>%
                mutate(row_id = row_number()) %>%
                select(row_id, Subject), 
                  
            awardData %>%
                select(-c(Subject, NumberOfCentres)) %>%
                mutate_at(vars(starts_with("male-") | starts_with("female-") | starts_with("all-")), as.character) %>%
                mutate_at(vars(starts_with("male-") | starts_with("female-") | starts_with("all-")), parse_number) %>%
                suppressWarnings %>%
 
                assert_rows(num_row_NAs, 
                    within_bounds(0, length(colnames(.)), include.upper = F), everything(), error_fun = just_warn) %>% 
                    # comment out just_warn to stop execution on fail
                summarise(column_count = length(colnames(.)),
                          count_no_entries = num_row_NAs(.)) 
                  
        ) %>% # end bind_cols
            
        filter(count_no_entries == column_count) # comment out to print all
    }

In [11]:
data.frame(sapply(files_to_verify, checkSubjectsWithNoEntries))

[1] "sta_it_402/data/grades/1986_Higher.csv"
[1] "sta_it_402/data/grades/1987_Higher.csv"
[1] "sta_it_402/data/grades/1988_Higher.csv"
Data frame row reduction 'num_row_NAs' violates predicate 'within_bounds(0, length(colnames(.)), include.upper = F)' 1 time
         verb    redux_fn
1 assert_rows num_row_NAs
                                                 predicate        column index
1 within_bounds(0, length(colnames(.)), include.upper = F) ~everything()    20
  value
1    18



“assertr encountered errors”


[1] "sta_it_402/data/grades/1989_Higher.csv"
Data frame row reduction 'num_row_NAs' violates predicate 'within_bounds(0, length(colnames(.)), include.upper = F)' 1 time
         verb    redux_fn
1 assert_rows num_row_NAs
                                                 predicate        column index
1 within_bounds(0, length(colnames(.)), include.upper = F) ~everything()    21
  value
1    18



“assertr encountered errors”


[1] "sta_it_402/data/grades/1990_Higher.csv"
Data frame row reduction 'num_row_NAs' violates predicate 'within_bounds(0, length(colnames(.)), include.upper = F)' 1 time
         verb    redux_fn
1 assert_rows num_row_NAs
                                                 predicate        column index
1 within_bounds(0, length(colnames(.)), include.upper = F) ~everything()    23
  value
1    18



“assertr encountered errors”


[1] "sta_it_402/data/grades/1991_Higher.csv"
Data frame row reduction 'num_row_NAs' violates predicate 'within_bounds(0, length(colnames(.)), include.upper = F)' 1 time
         verb    redux_fn
1 assert_rows num_row_NAs
                                                 predicate        column index
1 within_bounds(0, length(colnames(.)), include.upper = F) ~everything()    46
  value
1    18



“assertr encountered errors”


[1] "sta_it_402/data/grades/1992_Higher.csv"
Data frame row reduction 'num_row_NAs' violates predicate 'within_bounds(0, length(colnames(.)), include.upper = F)' 1 time
         verb    redux_fn
1 assert_rows num_row_NAs
                                                 predicate        column index
1 within_bounds(0, length(colnames(.)), include.upper = F) ~everything()    57
  value
1    18



“assertr encountered errors”


[1] "sta_it_402/data/grades/1993_Higher.csv"
Data frame row reduction 'num_row_NAs' violates predicate 'within_bounds(0, length(colnames(.)), include.upper = F)' 4 times
         verb    redux_fn
1 assert_rows num_row_NAs
2 assert_rows num_row_NAs
3 assert_rows num_row_NAs
4 assert_rows num_row_NAs
                                                 predicate        column index
1 within_bounds(0, length(colnames(.)), include.upper = F) ~everything()    17
2 within_bounds(0, length(colnames(.)), include.upper = F) ~everything()    73
3 within_bounds(0, length(colnames(.)), include.upper = F) ~everything()    74
4 within_bounds(0, length(colnames(.)), include.upper = F) ~everything()    75
  value
1    18
2    18
3    18
4    18



“assertr encountered errors”


[1] "sta_it_402/data/grades/1994_Higher.csv"
Data frame row reduction 'num_row_NAs' violates predicate 'within_bounds(0, length(colnames(.)), include.upper = F)' 10 times
         verb    redux_fn
1 assert_rows num_row_NAs
2 assert_rows num_row_NAs
3 assert_rows num_row_NAs
4 assert_rows num_row_NAs
5 assert_rows num_row_NAs
                                                 predicate        column index
1 within_bounds(0, length(colnames(.)), include.upper = F) ~everything()     6
2 within_bounds(0, length(colnames(.)), include.upper = F) ~everything()    11
3 within_bounds(0, length(colnames(.)), include.upper = F) ~everything()    17
4 within_bounds(0, length(colnames(.)), include.upper = F) ~everything()    54
5 within_bounds(0, length(colnames(.)), include.upper = F) ~everything()    58
  value
1    18
2    18
3    18
4    18
5    18
  [omitted 5 rows]




“assertr encountered errors”


[1] "sta_it_402/data/grades/1995_Higher.csv"
Data frame row reduction 'num_row_NAs' violates predicate 'within_bounds(0, length(colnames(.)), include.upper = F)' 20 times
         verb    redux_fn
1 assert_rows num_row_NAs
2 assert_rows num_row_NAs
3 assert_rows num_row_NAs
4 assert_rows num_row_NAs
5 assert_rows num_row_NAs
                                                 predicate        column index
1 within_bounds(0, length(colnames(.)), include.upper = F) ~everything()     1
2 within_bounds(0, length(colnames(.)), include.upper = F) ~everything()     7
3 within_bounds(0, length(colnames(.)), include.upper = F) ~everything()     9
4 within_bounds(0, length(colnames(.)), include.upper = F) ~everything()    12
5 within_bounds(0, length(colnames(.)), include.upper = F) ~everything()    14
  value
1    18
2    18
3    18
4    18
5    18
  [omitted 15 rows]




“assertr encountered errors”


[1] "sta_it_402/data/grades/1996_Higher.csv"
Data frame row reduction 'num_row_NAs' violates predicate 'within_bounds(0, length(colnames(.)), include.upper = F)' 5 times
         verb    redux_fn
1 assert_rows num_row_NAs
2 assert_rows num_row_NAs
3 assert_rows num_row_NAs
4 assert_rows num_row_NAs
5 assert_rows num_row_NAs
                                                 predicate        column index
1 within_bounds(0, length(colnames(.)), include.upper = F) ~everything()     1
2 within_bounds(0, length(colnames(.)), include.upper = F) ~everything()     3
3 within_bounds(0, length(colnames(.)), include.upper = F) ~everything()    23
4 within_bounds(0, length(colnames(.)), include.upper = F) ~everything()    33
5 within_bounds(0, length(colnames(.)), include.upper = F) ~everything()    37
  value
1    18
2    18
3    18
4    18
5    18



“assertr encountered errors”


[1] "sta_it_402/data/grades/1997_Higher.csv"
[1] "sta_it_402/data/grades/1998_Higher.csv"
[1] "sta_it_402/data/grades/1999_Higher.csv"
[1] "sta_it_402/data/grades/2000_NewHigher.csv"
[1] "sta_it_402/data/grades/2001_AdvancedHigher.csv"
[1] "sta_it_402/data/grades/2001_NewHigher.csv"
[1] "sta_it_402/data/grades/2002_AdvancedHigher.csv"
[1] "sta_it_402/data/grades/2002_NewHigher.csv"
[1] "sta_it_402/data/grades/2003_AdvancedHigher.csv"
[1] "sta_it_402/data/grades/2003_Higher.csv"
[1] "sta_it_402/data/grades/2004_AdvancedHigher.csv"
[1] "sta_it_402/data/grades/2004_Higher.csv"
[1] "sta_it_402/data/grades/2005_AdvancedHigher.csv"
[1] "sta_it_402/data/grades/2005_Higher.csv"
[1] "sta_it_402/data/grades/2006_AdvancedHigher.csv"
[1] "sta_it_402/data/grades/2006_Higher.csv"
[1] "sta_it_402/data/grades/2007_AdvancedHigher.csv"
[1] "sta_it_402/data/grades/2007_Higher.csv"
[1] "sta_it_402/data/grades/2008_AdvancedHigher.csv"
[1] "sta_it_402/data/grades/2008_Higher.csv"
[1] "sta_it_402/data/grades

Unnamed: 0_level_0,sta_it_402.data.grades.1986_Higher.csv,sta_it_402.data.grades.1987_Higher.csv,sta_it_402.data.grades.1988_Higher.csv,sta_it_402.data.grades.1989_Higher.csv,sta_it_402.data.grades.1990_Higher.csv,sta_it_402.data.grades.1991_Higher.csv,sta_it_402.data.grades.1992_Higher.csv,sta_it_402.data.grades.1993_Higher.csv,sta_it_402.data.grades.1994_Higher.csv,sta_it_402.data.grades.1995_Higher.csv,⋯,sta_it_402.data.grades.2017_AdvancedHigher.csv,sta_it_402.data.grades.2017_Higher.csv,sta_it_402.data.grades.2018_AdvancedHigher.csv,sta_it_402.data.grades.2018_Higher.csv,sta_it_402.data.grades.2019_AdvancedHigher.csv,sta_it_402.data.grades.2019_Higher.csv,sta_it_402.data.grades.2020_AdvancedHigher.revised.12.csv,sta_it_402.data.grades.2020_AdvancedHigher.csv,sta_it_402.data.grades.2020_Higher.revised.12.csv,sta_it_402.data.grades.2020_Higher.csv
Unnamed: 0_level_1,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,⋯,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>
row_id,,,20,21,23,46,57,"17, 73, 74, 75","6, 11, 17, 54, 58, 76, 77, 78, 79, 80","1, 7, 9, 12, 14, 17, 19, 30, 46, 49, 52, 54, 55, 66, 67, 68, 69, 70, 71, 72",⋯,,,,,,,,,,
Subject,,,Hebrew,Greek (Optional Paper III)*,Greek (Optional Paper III)*,Metalwork,Music Part II*,"Latin (Optional Paper III)*, Music Part II* , Music Part III* , Music Part IV*","Gáidhlig , Greek (Optional Paper III)* , Latin (Optional Paper III)* , Craft and Design , Home Economics (Fabrics and Fashion), Music Part II* , Music Part III* , Music Part IV* , Music Part V* , Music Part VI*","Classical Greek (Optional Paper III) , Gáidhlig , Gaelic (Learners) , Hebrew , Italian , Portuguese , Russian , Chemistry , Agricultural Science , Engineering , Horticultural Science , Navigation , Secretarial Studies (Audio-typewriting), Music (Practical Harmony)* , Music Part II* , Music Part III* , Music Part IV* , Music Part V* , Music Part VI* , Music",⋯,,,,,,,,,,
column_count,,,18,18,18,18,18,"18, 18, 18, 18","18, 18, 18, 18, 18, 18, 18, 18, 18, 18","18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18",⋯,,,,,,,,,,
count_no_entries,,,18,18,18,18,18,"18, 18, 18, 18","18, 18, 18, 18, 18, 18, 18, 18, 18, 18","18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18",⋯,,,,,,,,,,
