In [1]:
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chapter and subtopic patterns
chapter_splitter = 'chapter end -------------------------------------'
subtopic_pattern = re.compile(r'(\d+(\.\d+)+)')

# Custom function for splitting by chapters first
def split_by_chapters(text):
    return text.split(chapter_splitter)

# Custom function for splitting by subtopics within each chapter
def split_by_subtopics(text):
    subtopic_splits = subtopic_pattern.split(text)
    # Remove empty elements and return cleaned list
    return [subtopic.strip() for subtopic in subtopic_splits if subtopic.strip()]

# Define your text splitter with RecursiveCharacterTextSplitter
class CustomTextSplitter(RecursiveCharacterTextSplitter):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
    
    # Overriding the split_text method
    def split_text(self, text):
        # Step 1: Split by chapters
        chapters = split_by_chapters(text)
        
        # Step 2: For each chapter, split by subtopics
        split_texts = []
        for chapter in chapters:
            subtopics = split_by_subtopics(chapter)
            split_texts.extend(subtopics)
        
        return split_texts

# Sample text (assume this is the content of your file)
text = """
Chapter 1. Introduction
This is the introduction.
1.1 Overview
Details of the overview.
chapter end -------------------------------------
Chapter 2. Literature Review
This is the literature review.
2.1 Previous Work
Details of previous work.
chapter end -------------------------------------
"""

# Create an instance of the custom splitter
custom_splitter = CustomTextSplitter()

# Call split_text to split by chapters and subtopics
split_texts = custom_splitter.split_text(text)

# Print the result
for i, split in enumerate(split_texts):
    print(f"Split {i + 1}: {split}\n")


Split 1: Chapter 1. Introduction
This is the introduction.

Split 2: 1.1

Split 3: .1

Split 4: Overview
Details of the overview.

Split 5: Chapter 2. Literature Review
This is the literature review.

Split 6: 2.1

Split 7: .1

Split 8: Previous Work
Details of previous work.



In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
import re

# Define your subtopic and chapter separators
subtopic_pattern = re.compile(r'(\d+(\.\d+)+)')
chapter_separator = 'chapter end -------------------------------------'

# Custom RecursiveCharacterTextSplitter with regex patterns for subtopics and chapters
class CustomTextSplitter(RecursiveCharacterTextSplitter):
    def __init__(self, **kwargs):
        # Initialize with any other parameters, and add your separators
        super().__init__(separators=[chapter_separator], **kwargs)
        self.subtopic_pattern = subtopic_pattern

    def split_text(self, text):
        # First, split by chapters
        texts = super().split_text(text)
        documents = []
        
        # For each chapter, split by subtopic using the subtopic regex
        chapter_number = 1
        for chapter in texts:
            subtopic_splits = self._split_by_subtopic(chapter, chapter_number)
            documents.extend(subtopic_splits)
            chapter_number += 1
        
        return documents

    def _split_by_subtopic(self, text, chapter_number):
        # Use the subtopic regex to split text
        matches = list(self.subtopic_pattern.finditer(text))
        if not matches:
            # No subtopics found, return the full text as a single Document
            return [Document(page_content=text.strip(), metadata={"chapter": chapter_number})]
        
        subtopics = []
        start_idx = 0
        subtopic_number = 1
        
        for match in matches:
            end_idx = match.start()
            if start_idx != end_idx:
                subtopics.append(Document(
                    page_content=text[start_idx:end_idx].strip(),
                    metadata={"chapter": chapter_number, "subtopic": subtopic_number}
                ))
            start_idx = end_idx
            subtopic_number += 1
            
        # Append the remaining part as a subtopic
        subtopics.append(Document(
            page_content=text[start_idx:].strip(),
            metadata={"chapter": chapter_number, "subtopic": subtopic_number}
        ))
        
        return subtopics

# Example usage
text_splitter = CustomTextSplitter(chunk_size=1000, chunk_overlap=100)
document = """
Chapter 1
1.1 Introduction
Some introduction content...
1.2 Subtopic
Subtopic content...

chapter end -------------------------------------

Chapter 2
2.1 Another Subtopic
Content here...
"""

splits = text_splitter.split_text(document)
for idx, split in enumerate(splits):
    print(f"Chunk {idx + 1}:")
    print(f"Page Content:\n{split.page_content}\n")
    print(f"Metadata:\n{split.metadata}\n")


Chunk 1:
Page Content:
1 Problem Solving
Short Introduction
Problem solving is the process of solving complex problems. This unit introduces the methodologies to WyateK=)acue-lale mmo) ge)e)(=laam-lale mye) \(-mimlam-lam=liceaah'i-maar-lalalelm
Students’ Learning Outcomes
1. Problem Solving Steps @ Defining a problem @ Understanding a problem @ Planning a solution @ Defining candid solutions @ Selecting the best solution
2. Flowcharts @ Defining a flowchart @ Explaining the importance of a flowchart for problem solving @ Determining the requirements for a flowchart @ Using flowchart symbols @ Drawing flowcharts for sample problems
3. Algorithm
@ Defining an algorithm Describing role of algorithm in problem solving Formulating an algorithm Writing algorithms for sample problems Understanding efficiency of algorithms Differentiating between algorithms and flowcharts
4. Test data @ Understanding the concept of test data @ Describing importance of testing @ Understanding types of test case

  document = """
