In [1]:
# imports
import os
from langchain_text_splitters import MarkdownHeaderTextSplitter, CharacterTextSplitter, RecursiveCharacterTextSplitter

In [34]:
# Step 1: Function to read input documents
def read_document(file_path):
    ''' 
    Reads a document based on its extension.
    Returns its content and a boolean indicating if it is a markdown file.
    '''
    _, file_extension = os.path.splitext(file_path)
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.read(), file_extension == '.md'


In [41]:
# Step 2: Combined chunking strategy function
def split_strategy(text, chunk_size, chunk_overlap, is_markdown, strategy):
    '''
    Splits the text using the specified chunking strategy.
    Return a list of text chunks.
    '''
    if 'markdown' in strategy and is_markdown:
        headers_to_split_on = [
            ("#", "Header 1"),
            ("##", "Header 2"),
            ('###', 'Header 3')]
        md_splitter = MarkdownHeaderTextSplitter(
            headers_to_split_on=headers_to_split_on, strip_headers=False)
        text = md_splitter.split_text(text)
        if strategy == 'markdown':
            print(f"hello hello")
            return [doc.page_content for doc in text]
        strategy = strategy.split('-')[1]  # Get 'fixed' or 'recursive'

    if strategy == 'fixed':
        print(f"came here")
        splitter = CharacterTextSplitter(
            separator=SEPARATOR, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    elif strategy == 'recursive':
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    else:
        raise ValueError(f"Unsupported chunking strategy: {strategy}")

    if is_markdown:
        documents = splitter.split_documents(text)
        return [doc.page_content for doc in documents]
    else:
        return splitter.split_text(text)


In [38]:
# Global parameters
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 100
SEPARATOR = "\n\n" # other options are: "\n" or "",""

def process_document(file_path, strategy):
    '''
    Processes the document and splits it into chunks using the specified strategy.
    Returns a list of text chunks.
    '''
    document_content, is_md = read_document(file_path)
    
    chunks = split_strategy(
        document_content, 
        chunk_size=CHUNK_SIZE, 
        chunk_overlap=CHUNK_OVERLAP, 
        is_markdown=is_md, 
        strategy=strategy)
    return chunks


In [44]:
file_path = 'Employee guide.pdf.md'
strategies = ['fixed', 'recursive', 'markdown', 'markdown-fixed', 'markdown-recursive']
chunks = process_document(file_path, strategy='')

ValueError: Unsupported chunking strategy: 

In [40]:
(chunks)

['Note: ClearPeaks at its own discretion, may change, delete, suspend or discontinue parts or the policy in its entirety,\nat any time without prior notice.',
 '# WORKING AT CLEARPEAKS  \n**June 23**',
 '## CONTENT TABLE  \n1. Introduction\n1.1Welcome\n1.2 Company Values\n2. Employee definition and status\n2.1 Probationary Period for New Employees\n3. Employee Policies\n3.1 Equal Employee Opportunity\n3.2 Employee Background Check\n3.3 New Employee Orientation\n3.4 Personnel Records and Administration\n3.5 Change of Personal Data\n3.6 Safety\n3.7 Health & Absence\n3.8 Employee Requiring Medical Attention\n3.9 Visitors in the Workplace\n4. Standards of Conduct\n4.1 General Guidelines\n4.2 Attendance and Punctuality\n4.3 Work Schedule\n4.4 Meal and Break Periods\n4.5 Hybrid Working Model\n4.6 Absence and Lateness\n4.7 Unscheduled Absence\n4.8 Voice mail, out-of-office message, e-mail signatures, communication, etc.\n4.9 Harassment Policy\n4.10 Violent Behaviour in the Workplace\n4.11 Con