In [1]:
import os

def read_and_split_paragraphs(data_dir):
    """Read every ``.txt`` file in ``data_dir`` and split its content on newlines.

    Args:
        data_dir: Directory scanned (non-recursively) for names ending in
            ``.txt``.

    Returns:
        A ``(texts, filenames)`` tuple of parallel lists, ordered by file
        name. ``texts[i]`` holds the newline-separated pieces of
        ``filenames[i]``; blank lines (and a trailing newline) show up as
        empty strings.

    Any ``OSError`` raised while listing the directory or opening a file is
    propagated to the caller.
    """
    texts = []
    filenames = []
    # os.listdir() yields entries in arbitrary order; sorting keeps the
    # result reproducible across runs and machines.
    for filename in sorted(os.listdir(data_dir)):
        if not filename.endswith('.txt'):
            continue
        with open(os.path.join(data_dir, filename), 'r', encoding='utf-8') as file:
            texts.append(file.read().split('\n'))
        filenames.append(filename)
    return texts, filenames

def save_paragraphs(texts, filenames, output_dir):
    """Write each paragraph of each document to its own text file.

    The paragraphs of a document named ``name.txt`` are written to
    ``output_dir/name/paragraph_1.txt``, ``paragraph_2.txt`` and so on,
    each with leading and trailing whitespace removed.

    Args:
        texts: One list of paragraph strings per document.
        filenames: Source file names, parallel to ``texts``.
        output_dir: Directory under which the per-document folders are
            created.
    """
    for source_name, doc_paragraphs in zip(filenames, texts):
        stem, _extension = os.path.splitext(source_name)
        target_dir = os.path.join(output_dir, stem)
        os.makedirs(target_dir, exist_ok=True)

        # Paragraph files are numbered starting at 1.
        for number, text in enumerate(doc_paragraphs, start=1):
            target_path = os.path.join(target_dir, f'paragraph_{number}.txt')
            with open(target_path, 'w', encoding='utf-8') as out:
                out.write(text.strip())

def process_data_folders(base_dir, folders=('train', 'validation')):
    """Split the documents of each dataset folder into per-paragraph files.

    For every ``folder`` the contents of ``base_dir/<folder>`` are read with
    ``read_and_split_paragraphs`` and the result is written below
    ``base_dir/<folder>_processed`` with ``save_paragraphs``.

    Args:
        base_dir: Directory that contains the dataset folders.
        folders: Names of the sub-folders to process. Defaults to the
            ``train`` and ``validation`` splits. A single name may be given
            as a plain string.
    """
    if isinstance(folders, str):
        # Guard against iterating over the characters of a lone folder name.
        folders = (folders,)

    for folder in folders:
        data_dir = os.path.join(base_dir, folder)
        output_dir = os.path.join(base_dir, f'{folder}_processed')

        # Read first: if the input folder is missing this fails before an
        # empty output directory is left behind.
        texts, filenames = read_and_split_paragraphs(data_dir)

        os.makedirs(output_dir, exist_ok=True)
        save_paragraphs(texts, filenames, output_dir)

# Dataset root, given relative to the notebook's working directory.
base_data_dir = 'data'
# Split the dataset folders under the root into per-paragraph files.
process_data_folders(base_data_dir)

In [6]:
import os

In [7]:
def read_and_split_paragraphs(data_dir):
    """Read every ``.txt`` file in ``data_dir`` and group its lines into paragraphs.

    Lines are grouped as follows:

    * a blank (whitespace-only) line closes the current paragraph;
    * a non-blank line that starts with a space character continues the
      current paragraph, joined to it with a single space;
    * any other non-blank line closes the current paragraph and starts a
      new one.

    Only a leading space counts as indentation; a tab-indented line starts
    a new paragraph.

    Args:
        data_dir: Directory scanned (non-recursively) for names ending in
            ``.txt``.

    Returns:
        A ``(texts, filenames)`` tuple of parallel lists, ordered by file
        name. ``texts[i]`` is the list of paragraph strings of
        ``filenames[i]``; no paragraph is empty or carries surrounding
        whitespace.
    """
    texts = []
    filenames = []
    # os.listdir() yields entries in arbitrary order; sorting keeps the
    # result reproducible across runs and machines.
    for filename in sorted(os.listdir(data_dir)):
        if not filename.endswith('.txt'):
            continue

        paragraphs = []
        parts = []  # stripped lines of the paragraph currently being built
        with open(os.path.join(data_dir, filename), 'r', encoding='utf-8') as file:
            for line in file:
                stripped_line = line.strip()
                continues_paragraph = bool(stripped_line) and line.startswith(' ')
                if parts and not continues_paragraph:
                    # Blank line or unindented line: the open paragraph ends here.
                    paragraphs.append(' '.join(parts))
                    parts = []
                if stripped_line:
                    # Joining the parts later avoids the stray leading space
                    # an indented line would otherwise add to an empty paragraph.
                    parts.append(stripped_line)
        if parts:  # flush the final paragraph
            paragraphs.append(' '.join(parts))

        texts.append(paragraphs)
        filenames.append(filename)
    return texts, filenames

In [9]:
def save_paragraphs(texts, filenames, output_dir):
    """Store every paragraph of every document as a separate file.

    Each document gets its own folder inside ``output_dir``, named after the
    source file without its extension. Inside it the paragraphs are saved,
    whitespace-trimmed, as ``paragraph_1.txt``, ``paragraph_2.txt``, ...

    Args:
        texts: One list of paragraph strings per document.
        filenames: Source file names, parallel to ``texts``.
        output_dir: Parent directory for the per-document folders.
    """
    for doc_index in range(min(len(texts), len(filenames))):
        folder_name = os.path.splitext(filenames[doc_index])[0]
        doc_folder = os.path.join(output_dir, folder_name)
        os.makedirs(doc_folder, exist_ok=True)

        position = 0
        for content in texts[doc_index]:
            position += 1  # file names are 1-based
            destination = os.path.join(doc_folder, f'paragraph_{position}.txt')
            with open(destination, 'w', encoding='utf-8') as handle:
                handle.write(content.strip())

In [10]:
def process_data_folders(base_dir, folders=('train', 'validation')):
    """Split the documents of each dataset folder into per-paragraph files.

    For every ``folder`` the contents of ``base_dir/<folder>`` are read with
    ``read_and_split_paragraphs`` and the result is written below
    ``base_dir/<folder>_processed`` with ``save_paragraphs``.

    Args:
        base_dir: Directory that contains the dataset folders.
        folders: Names of the sub-folders to process. Defaults to the
            ``train`` and ``validation`` splits. A single name may be given
            as a plain string.
    """
    if isinstance(folders, str):
        # Guard against iterating over the characters of a lone folder name.
        folders = (folders,)

    for folder in folders:
        data_dir = os.path.join(base_dir, folder)
        output_dir = os.path.join(base_dir, f'{folder}_processed')

        # Read first: if the input folder is missing this fails before an
        # empty output directory is left behind.
        texts, filenames = read_and_split_paragraphs(data_dir)

        os.makedirs(output_dir, exist_ok=True)
        save_paragraphs(texts, filenames, output_dir)

In [11]:
# Dataset root, given relative to the notebook's working directory.
base_data_dir = 'data'
# Split the dataset folders under the root into per-paragraph files.
process_data_folders(base_data_dir)

### Test problem-1033

In [12]:
import os

def read_and_split_paragraphs(file_path):
    """Group the lines of a single text file into paragraphs.

    Lines are grouped as follows:

    * a blank (whitespace-only) line closes the current paragraph;
    * a non-blank line that starts with a space character continues the
      current paragraph, joined to it with a single space;
    * any other non-blank line closes the current paragraph and starts a
      new one.

    Only a leading space counts as indentation; a tab-indented line starts
    a new paragraph.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        A ``(texts, filenames)`` tuple. For an existing file both lists have
        one element: the list of paragraph strings and the file's base name.
        If ``file_path`` is not an existing regular file, both lists are
        empty and no error is raised.
    """
    texts = []
    filenames = []
    if not os.path.isfile(file_path):
        return texts, filenames

    paragraphs = []
    parts = []  # stripped lines of the paragraph currently being built
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            stripped_line = line.strip()
            continues_paragraph = bool(stripped_line) and line.startswith(' ')
            if parts and not continues_paragraph:
                # Blank line or unindented line: the open paragraph ends here.
                paragraphs.append(' '.join(parts))
                parts = []
            if stripped_line:
                # Joining the parts later avoids the stray leading space an
                # indented line would otherwise add to an empty paragraph.
                parts.append(stripped_line)
    if parts:  # flush the final paragraph
        paragraphs.append(' '.join(parts))

    texts.append(paragraphs)
    filenames.append(os.path.basename(file_path))
    return texts, filenames

# Path of the single document used for this spot check
test_file_path = 'data/train/problem-1033.txt'

# Run the single-file splitter on the sample document
texts, filenames = read_and_split_paragraphs(test_file_path)

# Print the file name followed by each paragraph, numbered from 1
for filename, paragraphs in zip(filenames, texts):
    print(f"Filename: {filename}")
    for i, paragraph in enumerate(paragraphs, 1):
        print(f"Paragraph {i}: {paragraph}")


Filename: problem-1033.txt
Paragraph 1: WARSAW, Feb 9 (Reuters) - Poland is to close a key border crossing with Belarus until further notice, the Polish interior minister said on Thursday, as relations between Warsaw and Minsk sink to new lows.
Paragraph 2: The already tense relations between Poland and Belarus were further strained on Wednesday when a journalist of Polish origin was sentenced to eight years in prison by a Belarusian court in a trial Warsaw says was politically motivated.
Paragraph 3: "Due to the important interest of state security, I decided to suspend until further notice from 1200 on Feb. 10 this year traffic at the Polish-Belarusian border crossing in Bobrowniki," Mariusz Kaminski wrote on Twitter. Bobrowniki, more than 200 km northeast of Warsaw, is one of the main crossing points between Poland and Belarus. Kaminski also said that as a result of the jailing of journalist Andrzej Poczobut he would apply for further people connected with Belarusian President Alexa