In [1]:
from pathlib import Path

from rich import print

In [4]:
# Recursively gather every PDF found below <parent of the working directory>/data.
test_files = [pdf_path for pdf_path in Path.cwd().parent.glob('data/**/*.pdf')]
# Bare last expression so the notebook displays the collected paths.
test_files

[PosixPath('/home/thankgod/2025/tiiqu/textract-tiquu/data/JSR_1778.pdf'),
 PosixPath('/home/thankgod/2025/tiiqu/textract-tiquu/data/JSR_1774.pdf'),
 PosixPath('/home/thankgod/2025/tiiqu/textract-tiquu/data/What_is_Sustainability-1.pdf'),
 PosixPath('/home/thankgod/2025/tiiqu/textract-tiquu/data/JSR_1775.pdf')]

# PDF to Markdown conversion


In [5]:
from marker.config.parser import ConfigParser
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict

# Settings handed to marker's ConfigParser; "output_format" requests markdown output.
config = {
    "output_format": "markdown",
    # NOTE(review): this key/value pair looks like a placeholder left over from
    # example code — confirm it is intentional, otherwise remove it.
    "ADDITIONAL_KEY": "VALUE"
}
config_parser = ConfigParser(config)

# Assemble the PDF converter from the pieces derived from the parsed config.
# NOTE(review): the saved output of this cell is a CUDA OutOfMemoryError, so the
# cell did not complete on the recorded run — presumably the models created by
# create_model_dict() did not fit in the free GPU memory; confirm before re-running.
converter = PdfConverter(
    config=config_parser.generate_config_dict(),
    artifact_dict=create_model_dict(),
    processor_list=config_parser.get_processors(),
    renderer=config_parser.get_renderer(),
    llm_service=config_parser.get_llm_service()
)

# Convert one PDF, selected by its position in test_files.
# NOTE(review): test_files comes from a glob, whose ordering is presumably not
# guaranteed across machines — consider selecting the file by name instead of index.
rendered_file = converter(filepath=str(test_files[2]))

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 7.62 GiB of which 3.00 MiB is free. Process 23341 has 5.65 GiB memory in use. Including non-PyTorch memory, this process has 1.86 GiB memory in use. Of the allocated memory 1.76 GiB is allocated by PyTorch, and 10.30 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

# Headers and Text Extraction


In [None]:
import re


def extract_headers(markdown_text, max_level=10):
    """Collect ATX-style markdown headers (lines such as ``## Title``) from text.

    Each line is stripped of surrounding whitespace and then matched against
    "1..max_level '#' characters, at least one whitespace character, then the
    title". Lines with no whitespace after the '#' run (e.g. ``#tag``) are
    ignored, as are lines whose '#' run is longer than ``max_level``.

    Args:
        markdown_text: Markdown source as a single string.
        max_level: Deepest header level to accept (number of '#' characters).
            Defaults to 10.

    Returns:
        A list of ``{'level': int, 'title': str}`` dicts in document order.

    Raises:
        ValueError: If ``max_level`` is smaller than 1.
    """
    if max_level < 1:
        raise ValueError("max_level must be at least 1")

    # Compile once instead of re-parsing the pattern for every line.
    header_pattern = re.compile(rf'^(#{{1,{max_level}}})\s+(.+)$')

    headers = []
    for line in markdown_text.split('\n'):
        match = header_pattern.match(line.strip())
        if match:
            level = len(match.group(1))  # Number of # symbols
            title = match.group(2).strip()
            headers.append({'level': level, 'title': title})

    return headers

# Block Chunking


In [None]:
def count_words(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    tokens = text.split()
    return len(tokens)

# NOTE(review): `block2` is not defined in any cell visible here — confirm it is
# created earlier in the notebook, otherwise this fails on a fresh kernel.
count_words(block2)

In [None]:
def split_into_sentences(text):
    """Split text into sentences using a basic regex.

    The text is cut at runs of whitespace that directly follow a sentence
    terminator ('.', '!' or '?'). The terminator stays attached to its
    sentence, so re-joining the pieces with spaces yields readable text, and
    a period that is not followed by whitespace (e.g. in "3.14") does not
    trigger a split.

    This is a heuristic: abbreviations followed by a space (e.g. "e.g. ")
    are still treated as sentence ends.

    Args:
        text: Input string.

    Returns:
        List of non-empty sentences, each stripped of surrounding whitespace.
    """
    # Look-behind keeps the punctuation on the left-hand piece; only the
    # whitespace after it is consumed by the split.
    sentences = re.split(r'(?<=[.!?])\s+', text)
    return [s.strip() for s in sentences if s.strip()]

# Split the sample text and list the resulting sentences, numbered from 1.
# NOTE(review): `block2` is not defined in any cell visible here — confirm it is
# created earlier in the notebook, otherwise this fails on a fresh kernel.
sentences = split_into_sentences(block2)

# `print` here is rich's print (imported at the top of the notebook), not the builtin.
for i, sentence in enumerate(sentences, 1):
    print(f"{i}: {sentence}")

In [None]:
def cosine_similarity(vec1, vec2):
    """Calculate cosine similarity between two vectors.

    Args:
        vec1: First vector (any 1-D sequence accepted by ``np.dot``).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (norm(vec1) * norm(vec2))``, or ``0.0`` when
        either vector has zero norm (the ratio is undefined there and would
        otherwise evaluate to NaN with a runtime warning).
    """
    # NOTE(review): `np` is not imported in any cell visible here — confirm
    # `import numpy as np` exists in the notebook's import cell.
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        return 0.0
    return np.dot(vec1, vec2) / norm_product


In [None]:
def semantic_chunk(text, min_words=100, max_words=250, similarity_threshold=0.7, model=None):
    """
    Split text into semantic chunks.

    Sentences are appended to the current chunk until it holds at least
    ``min_words`` words. From then on, a sentence starts a new chunk when
    adding it would push the chunk past ``max_words``, or when its cosine
    similarity to the mean embedding of the current chunk is below
    ``similarity_threshold``; otherwise it is appended.

    Args:
        text: Input text to chunk
        min_words: Minimum words per chunk (the final chunk may be shorter)
        max_words: Maximum words per chunk; it can be exceeded by a sentence
            that is added while the chunk is still below ``min_words``
        similarity_threshold: Similarity threshold for splitting
        model: Optional pre-loaded encoder exposing ``encode(sentences)``.
            When None, a ``SentenceTransformer('all-MiniLM-L6-v2')`` is
            created on every call; pass a shared instance to avoid reloading it.

    Returns:
        List of chunk strings, each made of its sentences joined by single
        spaces. Empty list when the text contains no sentences.
    """
    # Nothing to chunk: return before loading the model or encoding.
    if not text or not text.strip():
        return []

    # Split into sentences
    sentences = split_into_sentences(text)
    if not sentences:
        return []

    # Initialize model only when the caller did not supply one.
    # NOTE(review): `SentenceTransformer` and `np` are not imported in any cell
    # visible here — confirm they are imported in the notebook's import cell.
    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')
    sentence_embeddings = model.encode(sentences)

    chunks = []
    current_chunk = []
    current_embeddings = []
    current_word_count = 0

    for sentence, embedding in zip(sentences, sentence_embeddings):
        sentence_words = count_words(sentence)

        if not current_chunk or current_word_count < min_words:
            # Under the minimum (or nothing collected yet): must add. The
            # empty-chunk check prevents flushing an empty chunk or averaging
            # zero embeddings when min_words <= 0.
            extend_current = True
        elif current_word_count + sentence_words > max_words:
            # Adding would exceed the maximum: split now.
            extend_current = False
        else:
            # Decision zone (between min and max): compare the new sentence
            # to the current chunk's average embedding.
            chunk_avg_embedding = np.mean(current_embeddings, axis=0)
            similarity = cosine_similarity(embedding, chunk_avg_embedding)
            extend_current = similarity >= similarity_threshold

        if not extend_current:
            # Close the current chunk; the sentence below opens the next one.
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_embeddings = []
            current_word_count = 0

        current_chunk.append(sentence)
        current_embeddings.append(embedding)
        current_word_count += sentence_words

    # Add final chunk if it exists
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

# Chunk the sample text with a similarity threshold of 0.4, lower than the
# function's default of 0.7.
# NOTE(review): `block1` is not defined in any cell visible here — confirm it is
# created earlier in the notebook, otherwise this fails on a fresh kernel.
chunks = semantic_chunk(block1,similarity_threshold=0.4)
#chunks

# For each chunk, show its word count and a preview of its first 100 characters.
# The "..." suffix is appended unconditionally, even when the chunk is shorter.
for i, chunk in enumerate(chunks):
    print(f"Chunk {i+1}: {count_words(chunk)} words")
    print(chunk[:100] + "...")
    print()