# Changing Adobe Digital Editions annotations to text
This notebook takes annotation files from Adobe Digital Editions and turns them into a text document with page numbers.

First, get the needed library and set up a few variables.

In [1]:
library(XML)

# ---- Input files ----
# Annotation files to convert; each one is written out as its own markdown file.
fileNames <- c(
    "c:/Users/ahipp/Dropbox/PERSONAL/WRITING/AAA - OAKS - UNTOLD STORIES/READING.NOTES_REFS/byTopicAndAuthor/Oak Seed Dispersal-5-1.pdf.annot",
    "c:/Users/ahipp/Dropbox/PERSONAL/WRITING/AAA - OAKS - UNTOLD STORIES/READING.NOTES_REFS/byTopicAndAuthor/Oak Seed Dispersal-5-2.pdf.annot"
)

# Table of contents: a tab-delimited file with two columns, 'chapter' and
# 'startPage'. Comment out this assignment if there is no table of contents.
toc <- "c:/Users/ahipp/Dropbox/PERSONAL/WRITING/AAA - OAKS - UNTOLD STORIES/READING.NOTES_REFS/byTopicAndAuthor/OakSeedDispersal.toc.tsv"

# Currently unused:
# fileOut <- 'c:/Users/ahipp/Dropbox/PERSONAL/WRITING/AAA - OAKS - UNTOLD STORIES/READING.NOTES_REFS/byTopicAndAuthor/Steele_OakSeedDispersal_readingNotes.md'

# ---- Page numbering ----
# Offset subtracted from each annotation's page number.
pDiff <- 17
# sprintf() format for page numbers; the width is the number of digits to pad
# to (e.g. "%03d" when the highest page number is below 1000).
pagePad <- "%03d"

# ---- Output formatting ----
# Text placed before each page number; use "" for no prefix.
pagePrefix <- "Page "
# Separator between the page number and the annotation text.
pageDelim <- " --- "
# Text placed before each table-of-contents entry ("# " is a markdown heading).
tocPrefix <- "# "
# Appended to every annotation line: two trailing spaces and a newline.
lineSuffix <- "  \n"

"package 'XML' was built under R version 4.1.3"


Then, read in and format the XML from Adobe:

In [5]:
# Convert every annotation file in `fileNames` to markdown.
#
# For each input file, one line is produced per annotation, in the form
#   <pagePrefix><page><pageDelim><annotation text><lineSuffix>
# where <page> is the page number parsed from the annotation title, minus
# `pDiff`, formatted with `pagePad`. If `toc` is defined, chapter headings
# from that file are interleaved at their start pages. The result is written
# next to the input as '<input>.md'.

# Walk a nested list along `path`, returning NULL as soon as a level is
# missing or is not a list (`$` would raise an error on an atomic node).
pluckPath <- function(node, path) {
    for (nm in path) {
        if (!is.list(node)) {
            return(NULL)
        }
        node <- node[[nm]]
    }
    node
}

# The TOC does not depend on the annotation file, so read and check it once.
hasToc <- exists('toc')
if (hasToc) {
    dat.toc <- read.delim(toc)
    stopifnot(
        "toc file must have columns 'chapter' and 'startPage'" =
            all(c('chapter', 'startPage') %in% names(dat.toc))
    )
}

for (i in fileNames) {
    dat <- xmlToList(xmlParse(i))

    # Pull the annotated text and the title (which carries the page number)
    # out of every top-level node. lapply() always returns a list, so nodes
    # lacking either field show up as NULL and can be filtered reliably.
    dat.text <- lapply(dat, pluckPath, path = c('target', 'fragment', 'text'))
    dat.page <- lapply(dat, pluckPath, path = 'title')
    keep <- !(vapply(dat.text, is.null, logical(1)) |
              vapply(dat.page, is.null, logical(1)))
    dat.text <- as.character(unlist(dat.text[keep], use.names = FALSE))
    dat.title <- as.character(unlist(dat.page[keep], use.names = FALSE))
    stopifnot(
        "each annotation must have exactly one text and one title" =
            length(dat.text) == length(dat.title)
    )

    # Page number: take what precedes the first comma of the title, drop the
    # 'Page ' label, convert to integer and apply the offset. Titles that do
    # not parse become NA (with a coercion warning).
    dat.pageNum <- dat.title |>
        strsplit(split = ',', fixed = TRUE) |>
        vapply(FUN = function(x) x[1], FUN.VALUE = character(1)) |>
        gsub(pattern = 'Page ', replacement = '', fixed = TRUE) |>
        as.integer()
    dat.pageNum <- dat.pageNum - pDiff
    dat.page <- sprintf(fmt = pagePad, dat.pageNum)

    # paste0() recycles zero-length arguments to "", which would emit one
    # bogus line for a file with no usable annotations; guard against that.
    dat.out <- if (length(dat.text) > 0) {
        paste0(pagePrefix, dat.page, pageDelim, dat.text, lineSuffix)
    } else {
        character(0)
    }

    if (hasToc) {
        dat.toc.out <- if (nrow(dat.toc) > 0) {
            # paste() (not paste0) keeps the space separator after tocPrefix.
            paste(
                tocPrefix,
                paste0(pagePrefix,
                       sprintf(fmt = pagePad, dat.toc$startPage),
                       pageDelim,
                       dat.toc$chapter)
            )
        } else {
            character(0)
        }

        # Order by numeric page rather than sorting the formatted strings:
        # string sorting is locale dependent, alphabetises annotations within
        # a page, and can put a chapter heading after annotations on its own
        # start page. order() is stable, so annotations keep file order
        # within a page; the second key puts headings first on ties.
        isToc <- rep(c(FALSE, TRUE),
                     times = c(length(dat.out), length(dat.toc.out)))
        ord <- order(c(dat.pageNum, dat.toc$startPage), !isToc)
        dat.out <- c(dat.out, dat.toc.out)[ord]
    }

    # For now, each input is written to its own markdown file.
    writeLines(dat.out, paste0(i, '.md'))
} # close i