# Simple Chunking

In [1]:
import os

In [2]:
def simple_chunk_document(document, chunk_size):
    """Split a document into consecutive, non-overlapping chunks of words.

    The text is tokenized with ``str.split()`` (any run of whitespace acts
    as a separator) and the tokens are regrouped ``chunk_size`` at a time,
    joined back together with single spaces.

    Args:
        document: Text to split.
        chunk_size: Number of words per chunk; the final chunk may be shorter.

    Returns:
        List of chunk strings in document order. Empty when the document
        contains no words.
    """
    tokens = document.split()
    return [
        ' '.join(tokens[offset:offset + chunk_size])
        for offset in range(0, len(tokens), chunk_size)
    ]

def simple_chunk_files(input_folder, output_folder, chunk_size):
    """Chunk every file in ``input_folder`` and save the result in ``output_folder``.

    Each regular file directly inside ``input_folder`` (sub-directories are
    skipped) is read as UTF-8 text, split with ``simple_chunk_document`` and
    written to ``<output_folder>/<stem>_simple_chunk.txt``, where ``<stem>``
    is the input file name without its extension.

    Args:
        input_folder: Directory containing the files to chunk.
        output_folder: Directory that receives the chunk files; created
            (including missing parents) when it does not exist yet.
        chunk_size: Number of words per chunk, forwarded to
            ``simple_chunk_document``.

    Returns:
        None. Prints a completion message once all files are processed.
    """
    # exist_ok=True creates the folder only when needed, without the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs(output_folder, exist_ok=True)

    # Process each file in the input folder
    for filename in os.listdir(input_folder):
        input_file_path = os.path.join(input_folder, filename)
        if os.path.isfile(input_file_path):
            # 'utf-8-sig' drops a leading byte-order mark if the file has one
            with open(input_file_path, 'r', encoding='utf-8-sig') as file:
                document = file.read()

            # Get chunks for the document
            chunks = simple_chunk_document(document, chunk_size)

            # The chunk list is stored as its Python repr (str(list)), not one
            # chunk per line. Inputs sharing a stem map to the same output file.
            output_file_path = os.path.join(output_folder, f'{os.path.splitext(filename)[0]}_simple_chunk.txt')
            with open(output_file_path, 'w', encoding='utf-8-sig') as file:
                file.write(str(chunks))

    print("Chunking completed. Chunks stored in", output_folder)

In [3]:
# Folder whose files are read as UTF-8 text and chunked.
# Change this to the path of your input folder
input_folder = r"C:\Users\adykh\Desktop\subs_db\subtitles\subtitle_demo\cleaned_subtitle_demo"
# Folder that receives one "<name>_simple_chunk.txt" file per input file.
# Change this to the path of your output folder
output_folder = r"C:\Users\adykh\Desktop\subs_db\subtitles\subtitle_demo\simple_chunks"

In [77]:
chunk_size = 10  # Words per chunk (whitespace-separated); change this to your desired chunk size

In [78]:
# Chunk every file in input_folder and write the results to output_folder.
simple_chunk_files(input_folder, output_folder, chunk_size)

Chunking completed. Chunks stored in C:\Users\adykh\Desktop\subs_db\subtitles\subtitles_demo\simple_chunks


# Overlapping Chunks

In [73]:
def overlapping_chunk_document(document, chunk_size=500, overlap=100):
    """Split a document into word chunks that share ``overlap`` words.

    The text is tokenized with ``str.split()``. Chunks start every
    ``chunk_size - overlap`` words and hold up to ``chunk_size`` words each,
    joined with single spaces, so consecutive chunks repeat ``overlap`` words.

    A chunk is emitted for every start position inside the document, so the
    last chunk(s) may be shorter than ``chunk_size`` and can consist only of
    words already present at the end of the previous chunk.

    Args:
        document: Text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must be
            smaller than ``chunk_size``.

    Returns:
        List of chunk strings in document order. Empty when the document
        contains no words.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # A non-positive step would never advance the start index, which made
    # the original while-loop run forever.
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    words = document.split()
    step = chunk_size - overlap
    # Slicing clips at the end of the list, so no explicit min() is needed.
    return [
        ' '.join(words[start:start + chunk_size])
        for start in range(0, len(words), step)
    ]

def overlap_chunk_files(input_folder, output_folder, chunk_size, overlap):
    """Chunk every file in ``input_folder`` with overlap and save the result.

    Each regular file directly inside ``input_folder`` (sub-directories are
    skipped) is read as UTF-8 text, split with ``overlapping_chunk_document``
    and written to ``<output_folder>/<stem>_overlapping_chunk.txt``, where
    ``<stem>`` is the input file name without its extension.

    Args:
        input_folder: Directory containing the files to chunk.
        output_folder: Directory that receives the chunk files; created
            (including missing parents) when it does not exist yet.
        chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared by consecutive chunks.

    Returns:
        None. Prints a completion message once all files are processed.
    """
    # exist_ok=True creates the folder only when needed, without the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs(output_folder, exist_ok=True)

    # Process each file in the input folder
    for filename in os.listdir(input_folder):
        input_file_path = os.path.join(input_folder, filename)
        if os.path.isfile(input_file_path):
            # 'utf-8-sig' drops a leading byte-order mark if the file has one
            with open(input_file_path, 'r', encoding='utf-8-sig') as file:
                document = file.read()

            # Get chunks for the document
            chunks = overlapping_chunk_document(document, chunk_size, overlap)

            # The chunk list is stored as its Python repr (str(list)), not one
            # chunk per line. Inputs sharing a stem map to the same output file.
            output_file_path = os.path.join(output_folder, f'{os.path.splitext(filename)[0]}_overlapping_chunk.txt')
            with open(output_file_path, 'w', encoding='utf-8-sig') as file:
                file.write(str(chunks))

    print("Chunking completed. Chunks stored in", output_folder)

In [62]:
# NOTE: this cell reassigns the same input_folder / output_folder names used by
# the simple-chunking cells earlier in the notebook; re-run it before calling
# overlap_chunk_files so the overlapping chunks go to their own folder.
# Change this to the path of your input folder
input_folder = r"C:\Users\adykh\Desktop\subs_db\subtitles\subtitle_demo\cleaned_subtitle_demo"
# Folder that receives one "<name>_overlapping_chunk.txt" file per input file.
# Change this to the path of your output folder
output_folder = r"C:\Users\adykh\Desktop\subs_db\subtitles\subtitle_demo\overlapping_chunks"

In [63]:
chunk_size = 10  # Words per chunk; change this to your desired chunk size
# Words repeated between consecutive chunks. Keep this below chunk_size so that
# each chunk starts later in the document than the previous one.
overlap = 3  # Change this to your desired overlap size

In [74]:
# Chunk every file in input_folder with overlap and write the results to output_folder.
overlap_chunk_files(input_folder, output_folder, chunk_size, overlap)

Chunking completed. Chunks stored in C:\Users\adykh\Desktop\subs_db\subtitles\subtitles_demo\overlapping_chunks
