In [1]:
import os
import re

import pypdf as pdf


    
#Splitting pdf files with lecture notes into chapters.
#For this code to work, the tex file needs to include a table
#of contents, so that a .toc file is present. The code uses
#this file to retrieve the page numbers of chapters.



def extract_pages(inputpdf, fpage, lpage):
    '''
    Copies a run of consecutive pages into a new pdf writer.

    inputpdf: reader object whose `pages` attribute can be indexed
              (a pypdf PdfReader in this notebook)
    fpage: 1-based number of the first page to copy
    lpage: 1-based number of the page at which copying stops; this page
           itself is NOT copied (split_chapters passes the first page of
           the following chapter here)
    Returns:
    writer: pdf.PdfWriter object holding pages fpage, ..., lpage-1
    '''
    writer = pdf.PdfWriter()
    # Page numbers are 1-based, positions in `pages` are 0-based.
    zero_based_positions = range(fpage - 1, lpage - 1)
    for position in zero_based_positions:
        writer.add_page(inputpdf.pages[position])
    return writer

def chapter_page_nums(tocfile):
    r'''
    Uses a LaTeX .toc file to retrieve page numbers of chapters.

    A line counts as a chapter entry when it contains at least two brace
    groups made up of digits only, for example
        \contentsline {chapter}{\numberline {1}Title}{3}{chapter.1}
    where the first group is the chapter number and the second one the page.
    Lines with fewer such groups (blank lines, language markers, entries
    numbered like 1.1, unnumbered entries) are skipped instead of raising
    IndexError.

    NOTE(review): a title that itself contains a group such as {2} would
    still be misread as the page number; verify against the actual .toc.

    tocfile: path of the .toc file
    Returns:
    page_nums: a list of chapter page numbers (ints), in file order
    '''
    page_nums = []
    with open(tocfile, 'r') as toc:
        for line in toc:
            digit_groups = re.findall(r'\{(\d+)\}', line)
            if len(digit_groups) < 2:
                # Not a numbered top-level entry: nothing to record.
                continue
            page_nums.append(int(digit_groups[1]))
    return page_nums


def split_chapters(inputpdf, outfile_name, chapter_page_nums):
    '''
    Writes one pdf file per chapter.

    inputpdf: reader object for the whole document; passed on unchanged
              to extract_pages
    outfile_name: template of names of pdf files that are to be produced;
                  the pdf file of chapter n will be named outfile_name_n.pdf
    chapter_page_nums: a list of first-page numbers of the chapters followed
                       by one closing entry; each pair of consecutive entries
                       delimits one chapter, so k entries give k-1 files
    Returns: None

    '''
    # Inside this function the parameter hides the module-level function
    # of the same name; work on a copy under a distinct local name.
    boundaries = chapter_page_nums[:]
    print(boundaries)
    numbered_spans = enumerate(zip(boundaries, boundaries[1:]), start=1)
    for chapter_no, (first_page, next_first_page) in numbered_spans:
        chapter_pdf = extract_pages(inputpdf, first_page, next_first_page)
        with open(outfile_name + "_" + str(chapter_no) + ".pdf", "wb") as sink:
            chapter_pdf.write(sink)
            
def notes_splitter(texfile, outfile_name, shift=0):
    '''
    Splits the pdf compiled from a LaTeX file into one pdf per chapter.

    The .toc and .pdf files are looked up next to the source file, under
    the same name with only the final extension replaced.

    texfile: source LaTeX file
    outfile_name: template of names of pdf files that are to be produced;
                  the pdf file of chapter n will be named outfile_name_n.pdf
    shift: number added to every page number read from the .toc file
           (presumably the offset between printed page numbers and
           positions in the pdf -- TODO confirm)
    Returns: None

    '''
    # splitext removes only the last extension, so dots in directory names
    # or earlier in the file name no longer truncate the path.
    texfile_root = os.path.splitext(texfile)[0]
    tocfile = texfile_root + '.toc'
    pdffile = texfile_root + '.pdf'
    # Keep the pdf open while chapters are written, then close it reliably.
    with open(pdffile, 'rb') as pdf_stream:
        inpdf = pdf.PdfReader(pdf_stream)
        ch_page_list = [page + shift for page in chapter_page_nums(tocfile)]
        # Closing entry one past the last page, so the final chapter
        # runs to the end of the document.
        ch_page_list.append(len(inpdf.pages) + 1)
        split_chapters(inpdf, outfile_name, ch_page_list)

In [2]:
import shutil

def split(infile, outpath, outfile_name, shift=0):
    '''
    Splits the notes compiled from infile into chapter pdf files in outpath.

    infile: source LaTeX file
    outpath: directory receiving the chapter files, with or without a
             trailing path separator; it is neither created nor cleared
             here, so it must already exist
    outfile_name: stem of the produced files; chapter n is written to
                  outpath/outfile_name_n.pdf
    shift: forwarded unchanged to notes_splitter
    Returns: None
    '''
    # join supplies the separator when outpath lacks one; plain string
    # concatenation would glue the stem onto the directory name.
    notes_splitter(infile, os.path.join(outpath, outfile_name), shift=shift)
        
# LaTeX source of the lecture notes and the directory that receives
# the per-chapter pdf files.
notes_file = "/Users/bb/Library/CloudStorage/Box-Box/Lecture_Notes/MTH_727_notes/mth727_lecture_notes.tex"
outpath_notes = "/Users/bb/Library/CloudStorage/Box-Box/Courses/U_Buffalo/2024-09-mth727/notes/"

# Stem of the generated file names.
out_notes = 'mth727_notes'

# Arguments of split() in order: infile, outpath, outfile_name, shift.
# The shift of 2 is added to every page number read from the .toc file
# (presumably two pdf pages precede printed page 1 -- TODO confirm).
notes = [notes_file, outpath_notes, out_notes, 2]

split(infile=notes[0], outpath=notes[1], outfile_name=notes[2], shift=notes[3])

[3, 5, 8, 13, 20, 26, 30, 38, 45, 50, 55, 60, 66, 70, 75, 79, 82, 88, 93, 97, 99, 103, 111, 112]
