In [1]:
import os

def read_and_split_paragraphs(data_dir):
    """Read every ``.txt`` file directly inside ``data_dir`` and split it on newlines.

    Each file is decoded as UTF-8 and split with ``str.split('\\n')``, so every
    line becomes one "paragraph". Blank lines are kept as empty strings, and a
    trailing newline yields a final empty string.

    Args:
        data_dir: Path of the directory to scan (not recursive).

    Returns:
        A ``(texts, filenames)`` tuple of two parallel lists: ``texts[k]`` is the
        list of paragraphs read from ``filenames[k]``. Entries are ordered by
        file name.

    Raises:
        OSError: If ``data_dir`` cannot be listed or a file cannot be opened.
    """
    texts = []
    filenames = []
    # os.listdir returns entries in arbitrary order; sorting makes the result
    # reproducible across runs and machines.
    for filename in sorted(os.listdir(data_dir)):
        if not filename.endswith('.txt'):
            continue
        with open(os.path.join(data_dir, filename), 'r', encoding='utf-8') as file:
            texts.append(file.read().split('\n'))
        filenames.append(filename)
    return texts, filenames

def save_paragraphs(texts, filenames, output_dir):
    """Write every paragraph of every document to its own UTF-8 text file.

    For each ``(paragraphs, filename)`` pair, a directory named after the file
    (extension removed) is created under ``output_dir``. Paragraph number ``k``
    (1-based) is stripped of surrounding whitespace and written to
    ``paragraph_k.txt`` inside that directory.

    Args:
        texts: Iterable of paragraph lists, one list per source file.
        filenames: Iterable of source file names, parallel to ``texts``.
        output_dir: Directory under which the per-file directories are created.
    """
    for doc_paragraphs, doc_name in zip(texts, filenames):
        stem, _extension = os.path.splitext(doc_name)
        doc_dir = os.path.join(output_dir, stem)
        os.makedirs(doc_dir, exist_ok=True)

        for number, text in enumerate(doc_paragraphs, start=1):
            target = os.path.join(doc_dir, f'paragraph_{number}.txt')
            with open(target, 'w', encoding='utf-8') as out:
                out.write(text.strip())

def process_data_folders(base_dir):
    """Run the split-and-save pipeline for the 'train' and 'validation' folders.

    For each of the two folders, the files in ``<base_dir>/<folder>`` are read
    with ``read_and_split_paragraphs`` and the result is handed to
    ``save_paragraphs``, which writes into ``<base_dir>/<folder>_processed``.

    Args:
        base_dir: Directory that contains the input folders and receives the
            ``*_processed`` output folders.
    """
    for split_name in ('train', 'validation'):
        source_dir = os.path.join(base_dir, split_name)
        target_dir = os.path.join(base_dir, f'{split_name}_processed')
        # The output folder is created up front, before any input is read.
        os.makedirs(target_dir, exist_ok=True)

        split_texts, split_filenames = read_and_split_paragraphs(source_dir)
        save_paragraphs(split_texts, split_filenames, target_dir)

# Root directory handed to the pipeline, as a path relative to the working directory.
base_data_dir = 'data'
# Kick off processing for every dataset folder under the root.
process_data_folders(base_data_dir)

In [6]:
import os

In [7]:
def read_and_split_paragraphs(data_dir):
    """Read every ``.txt`` file directly inside ``data_dir`` and group its lines into paragraphs.

    Grouping rules, applied line by line:
      * a blank (whitespace-only) line closes the current paragraph;
      * a line that starts with a space continues the current paragraph
        (joined with a single space);
      * any other non-blank line closes the current paragraph and starts a new one.

    Lines are stripped of surrounding whitespace before being stored.

    Args:
        data_dir: Path of the directory to scan (not recursive).

    Returns:
        A ``(texts, filenames)`` tuple of two parallel lists: ``texts[k]`` is the
        list of paragraphs found in ``filenames[k]``. Entries are ordered by
        file name.

    Raises:
        OSError: If ``data_dir`` cannot be listed or a file cannot be opened.
    """
    texts = []
    filenames = []
    # os.listdir returns entries in arbitrary order; sorting makes the result
    # reproducible across runs and machines.
    for filename in sorted(os.listdir(data_dir)):
        if not filename.endswith('.txt'):
            continue

        paragraphs = []
        current_paragraph = ''
        with open(os.path.join(data_dir, filename), 'r', encoding='utf-8') as file:
            for line in file:
                stripped_line = line.strip()
                if not stripped_line:
                    # Blank line: flush whatever has been collected so far.
                    if current_paragraph:
                        paragraphs.append(current_paragraph)
                        current_paragraph = ''
                elif line.startswith(' ') and current_paragraph:
                    # Indented continuation of the paragraph in progress.
                    current_paragraph += ' ' + stripped_line
                else:
                    # Either an unindented line, or an indented line with nothing
                    # to continue. The latter must start a paragraph cleanly
                    # instead of being prefixed with a stray space.
                    if current_paragraph:
                        paragraphs.append(current_paragraph)
                    current_paragraph = stripped_line

        if current_paragraph:  # the file did not end with a blank line
            paragraphs.append(current_paragraph)
        texts.append(paragraphs)
        filenames.append(filename)
    return texts, filenames

In [9]:
def save_paragraphs(texts, filenames, output_dir):
    """Persist each paragraph as a separate file, grouped by source document.

    A sub-directory of ``output_dir`` is created per source file, named after
    the file without its extension. Inside it, the paragraphs are written in
    order as ``paragraph_1.txt``, ``paragraph_2.txt``, and so on, each stripped
    of leading and trailing whitespace and encoded as UTF-8.

    Args:
        texts: Iterable of paragraph lists, one list per source file.
        filenames: Iterable of source file names, parallel to ``texts``.
        output_dir: Directory that receives the per-document sub-directories.
    """
    def _dump(path, paragraph):
        # One paragraph per file, with surrounding whitespace removed.
        with open(path, 'w', encoding='utf-8') as handle:
            handle.write(paragraph.strip())

    for paragraphs, filename in zip(texts, filenames):
        destination = os.path.join(output_dir, os.path.splitext(filename)[0])
        os.makedirs(destination, exist_ok=True)

        counter = 0
        for paragraph in paragraphs:
            counter += 1  # file numbering is 1-based
            _dump(os.path.join(destination, f'paragraph_{counter}.txt'), paragraph)

In [10]:
def process_data_folders(base_dir, folders=('train', 'validation')):
    """Split and save the paragraphs of every dataset folder under ``base_dir``.

    For each name in ``folders``, the files in ``<base_dir>/<name>`` are read
    with ``read_and_split_paragraphs`` and written by ``save_paragraphs`` into
    ``<base_dir>/<name>_processed``.

    Args:
        base_dir: Directory that contains the input folders and receives the
            ``*_processed`` output folders.
        folders: Names of the sub-folders to process. Defaults to
            ``('train', 'validation')``. A single name may be passed as a
            plain string.
    """
    if isinstance(folders, str):
        # A bare string would otherwise be iterated character by character.
        folders = (folders,)

    for folder in folders:
        data_dir = os.path.join(base_dir, folder)
        output_dir = os.path.join(base_dir, f'{folder}_processed')

        # Read before creating the output folder, so a failure while reading
        # does not leave an empty ``*_processed`` directory behind.
        texts, filenames = read_and_split_paragraphs(data_dir)

        os.makedirs(output_dir, exist_ok=True)
        save_paragraphs(texts, filenames, output_dir)

In [11]:
# Root directory handed to the pipeline, as a path relative to the working directory.
base_data_dir = 'data'
# Run the pipeline using the function definitions from the cells above.
process_data_folders(base_data_dir)

### Test problem-3043

In [1]:
import os

def read_and_split_paragraphs(file_path):
    """Group the lines of a single text file into paragraphs.

    Grouping rules, applied line by line:
      * a blank (whitespace-only) line closes the current paragraph;
      * a line that starts with a space continues the current paragraph
        (joined with a single space);
      * any other non-blank line closes the current paragraph and starts a new one.

    Lines are stripped of surrounding whitespace before being stored.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        A ``(texts, filenames)`` tuple. When ``file_path`` is an existing file,
        ``texts`` holds one list of paragraphs and ``filenames`` holds the
        file's base name. Otherwise both lists are empty.
    """
    texts = []
    filenames = []
    if not os.path.isfile(file_path):
        # Missing path (or a directory): report "nothing found" rather than raising.
        return texts, filenames

    paragraphs = []
    current_paragraph = ''
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            stripped_line = line.strip()
            if not stripped_line:
                # Blank line: flush whatever has been collected so far.
                if current_paragraph:
                    paragraphs.append(current_paragraph)
                    current_paragraph = ''
            elif line.startswith(' ') and current_paragraph:
                # Indented continuation of the paragraph in progress.
                current_paragraph += ' ' + stripped_line
            else:
                # Either an unindented line, or an indented line with nothing
                # to continue. The latter must start a paragraph cleanly
                # instead of being prefixed with a stray space.
                if current_paragraph:
                    paragraphs.append(current_paragraph)
                current_paragraph = stripped_line

    if current_paragraph:  # the file did not end with a blank line
        paragraphs.append(current_paragraph)
    texts.append(paragraphs)
    filenames.append(os.path.basename(file_path))
    return texts, filenames

# Set the path of the file used to spot-check the paragraph splitter
test_file_path = 'data/train/problem-3043.txt'

# Run the function under test on that single file
texts, filenames = read_and_split_paragraphs(test_file_path)

# Print the results: the file name followed by its paragraphs, numbered from 1
for filename, paragraphs in zip(filenames, texts):
    print(f"Filename: {filename}")
    for i, paragraph in enumerate(paragraphs, 1):
        print(f"Paragraph {i}: {paragraph}")


Filename: problem-3043.txt
Paragraph 1: Did you expect anything else but downvotes on this sub if you dare to stray from the Western MSM cool-aid narrative? lol.
Paragraph 2: Of course, that's what they always do. That is why they were and are war mongering around the world, creating conflicts where there were none.
Paragraph 3: Phippines literally tried to be chinas ally and china doubled down on south china sea claims. China has themselves to blame.
Paragraph 4: What would scare the shit out of you is the Chinese military starting up a war with Taiwan, similar to the one Russia has started with Ukraine. The US has already stated that they would get involved directly with a conflict over Taiwan, so it’s best to keep the Chinese military at bay through deterrence, in case they decide to get too ballsy.
Paragraph 5: " The US already had limited access to five sites under the Enhanced Defence Cooperation Agreement (EDCA) - the new additions and expanded access, according to a statement f