In [41]:
#Splits a pdf file with lecture notes into chapters.
#For this code to work, the tex file needs to include a table
#of contents, so that a .toc file is present. The code uses
#this file to retrieve the page numbers of the chapters.

import os
import re

import PyPDF2 as pdf

def extract_pages(inputpdf, fpage, lpage):
    '''
    Copies a run of consecutive pages into a new pdf writer.
    inputpdf: a PyPDF2 PdfFileReader object
    fpage: page number (counted from 1) of the first page to be extracted
    lpage: page number (counted from 1) at which extraction stops;
           this page itself is NOT extracted
    Returns:
    output: PyPDF2 PdfFileWriter object containing pages fpage, ..., lpage-1
    '''
    writer = pdf.PdfFileWriter()
    # getPage is called with fpage-1, ..., lpage-2, i.e. the stop index is
    # exclusive.
    first_index = fpage - 1
    stop_index = lpage - 1
    for page in map(inputpdf.getPage, range(first_index, stop_index)):
        writer.addPage(page)
    return writer

def chapter_page_nums(tocfile):
    r'''
    Uses a LaTeX .toc file to retrieve page numbers of chapters.

    A line is used only if it contains at least two brace-enclosed
    integers; the second one is taken as the page number. In a typical
    entry such as
        \contentsline {chapter}{\numberline {1}Title}{5}{chapter.1}
    the first integer is the chapter number and the second is the page.
    Lines with fewer than two such integers (blank lines, language
    switching commands, entries numbered like {1.1}) are skipped instead
    of raising an IndexError.

    tocfile: path to the .toc file
    Returns:
    page_nums: a list of chapter page numbers, in file order
    '''
    braced_int = re.compile(r'\{(\d+)\}')
    page_nums = []
    with open(tocfile, 'r') as foo:
        for line in foo:
            numbers = braced_int.findall(line)
            if len(numbers) < 2:
                # Not a numbered top-level entry: nothing to record.
                continue
            page_nums.append(int(numbers[1]))
    return page_nums


def split_chapters(inputpdf, outfile_name, chapter_page_nums, shift):
    '''
    Writes one pdf file per chapter.
    inputpdf: a PyPDF2 PdfFileReader object
    outfile_name: template of names of pdf files that are to be produced;
                  the pdf file of chapter n will be named outfile_name_n.pdf
    chapter_page_nums: a list of page numbers at which chapters start; the
                  last entry only marks where the final chapter ends, so
                  no file is produced for it
    shift: number added to every page number before extraction
           (used to skip over toc pages)
    Returns: None

    '''
    boundaries = chapter_page_nums[:]
    print(boundaries)
    # Each chapter runs from its own start page up to the start of the next.
    consecutive = zip(boundaries, boundaries[1:])
    for number, (start, stop) in enumerate(consecutive, start=1):
        chapter_pdf = extract_pages(inputpdf, start + shift, stop + shift)
        target = outfile_name + "_" + str(number) + ".pdf"
        with open(target, "wb") as stream:
            chapter_pdf.write(stream)
            
def notes_splitter(texfile, outfile_name, shift=2):
    '''
    Splits the pdf file belonging to a LaTeX file into chapter pdf files.

    The .toc and .pdf files are looked up under the same path as texfile,
    with only the extension replaced.

    texfile: source LaTeX file
    outfile_name: template of names of pdf files that are to be produced;
                  the pdf file of chapter n will be named outfile_name_n.pdf
    shift: number of beginning pages to skip (used to skip over toc pages)
    Returns: None

    '''
    # splitext removes only the final extension, so dots elsewhere in the
    # path (e.g. "./notes/ch.1.tex") do not truncate the base name.
    texfile_root = os.path.splitext(texfile)[0]
    tocfile = texfile_root + '.toc'
    pdffile = texfile_root + '.pdf'
    # The reader is handed to split_chapters, so the stream stays open until
    # that call returns and is closed afterwards (also on errors).
    with open(pdffile, 'rb') as pdf_stream:
        inpdf = pdf.PdfFileReader(pdf_stream)
        ch_page_list = chapter_page_nums(tocfile)
        # End marker for the last chapter: one past the last page, expressed
        # in the same numbering as the toc entries (i.e. before adding shift).
        ch_page_list.append(inpdf.getNumPages() - shift + 1)
        split_chapters(inpdf, outfile_name, ch_page_list, shift)

In [42]:
import shutil

def split(infile, outpath, outfile_name, shift=2):
    '''
    Splits the lecture notes built from infile into chapter pdf files.

    infile: source LaTeX file
    outpath: prefix prepended verbatim to outfile_name; when it is a
             directory it must end with a path separator. This function
             neither creates nor clears it.
    outfile_name: template of names of pdf files that are to be produced
    shift: number of beginning pages to skip; forwarded to notes_splitter
    Returns: None
    '''
    notes_splitter(infile, outpath + outfile_name, shift=shift)
        
# Absolute, machine-specific locations -- adjust before running elsewhere.
# LaTeX source of the lecture notes; passed to split() as infile.
notes_file = "/Users/bb1/Box Sync/Courses/U_Buffalo/2018-09-mth309/lecture notes/mth309_lecture_notes.tex"
# Output location; split() concatenates it directly with out_notes,
# so the trailing "/" is required.
outpath_notes = "/Users/bb1/Box Sync/Courses/U_Buffalo/2018-09-mth309/Sphinx/_static/"

# Name template for the produced pdf files; passed to split() as outfile_name.
out_notes = 'mth309_notes'
# Passed to split() as its shift argument.
shift=2

# Positional arguments for split(), in signature order.
notes = [notes_file, outpath_notes, out_notes, shift]


split(*notes)

[1, 5, 9, 17, 19, 25, 33, 42, 48, 51, 56, 63, 68, 76, 85, 90, 104, 113, 114, 115, 116, 118, 119, 130, 140, 152, 153, 154, 155, 156, 157, 167]
