In [49]:
from datasets import load_from_disk, concatenate_datasets

# 加载多个数据集
dataset1 = load_from_disk("/mnt/file2/changye/dataset/casual_formal_pair_ACL80k/casual_formal_pair_ACL80k_part1")
dataset2 = load_from_disk("/mnt/file2/changye/dataset/casual_formal_pair_ACL80k/casual_formal_pair_ACL80k_part2")
dataset3 = load_from_disk("/mnt/file2/changye/dataset/casual_formal_pair_ACL80k/casual_formal_pair_ACL80k_part3")
dataset4 = load_from_disk("/mnt/file2/changye/dataset/casual_formal_pair_ACL80k/casual_formal_pair_ACL80k_part4")


# 合并数据集
combined_dataset = concatenate_datasets([dataset1, dataset2,dataset3,dataset4])

print(combined_dataset)


Dataset({
    features: ['directory', 'filename', 'formal_text', 'casual_text'],
    num_rows: 40000
})


In [50]:
train_test_split = combined_dataset.train_test_split(test_size=0.2, seed=42)

# 再将临时集拆分为验证集和测试集（各占 50%，即总数据的 10%）
val_test_split = train_test_split['test'].train_test_split(test_size=0.5, seed=42)

# 最终数据集
final_dataset = {
    'train': train_test_split['train'],
    'val': val_test_split['train'],
    'test': val_test_split['test']
}

# 打印拆分后的数据集信息
print("Train dataset size:", len(final_dataset['train']))
print("Validation dataset size:", len(final_dataset['val']))
print("Test dataset size:", len(final_dataset['test']))

Train dataset size: 32000
Validation dataset size: 4000
Test dataset size: 4000


In [51]:
output_dir = "/mnt/file2/changye/dataset/casual_formal_pair_ACL80k"  # 设置保存目录
final_dataset['train'].save_to_disk(f"{output_dir}/train")
final_dataset['val'].save_to_disk(f"{output_dir}/val")
final_dataset['test'].save_to_disk(f"{output_dir}/test")


Saving the dataset (1/1 shards): 100%|██████████| 32000/32000 [00:00<00:00, 126576.67 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 4000/4000 [00:00<00:00, 107768.01 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 4000/4000 [00:00<00:00, 116918.47 examples/s]


In [52]:
final_dataset

{'train': Dataset({
     features: ['directory', 'filename', 'formal_text', 'casual_text'],
     num_rows: 32000
 }),
 'val': Dataset({
     features: ['directory', 'filename', 'formal_text', 'casual_text'],
     num_rows: 4000
 }),
 'test': Dataset({
     features: ['directory', 'filename', 'formal_text', 'casual_text'],
     num_rows: 4000
 })}

In [1]:
from datasets import load_from_disk
test_dataset=load_from_disk("/mnt/file2/changye/dataset/ACL_clear/test")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(test_dataset)

Dataset({
    features: ['directory', 'filename', 'texts'],
    num_rows: 1811
})


In [3]:
import hashlib
def generate_text_hash(text: str) -> str:
    """
    Generate a unique identifier for the given text using SHA-256.

    Args:
        text (str): Input text.

    Returns:
        str: Unique hash for the text.
    """
    hash_object = hashlib.sha256(text.encode('utf-8'))
    return hash_object.hexdigest()
test_list=[]
for item in test_dataset:
    for test in item['texts']:
        test_list.append({
            'directory': item['directory'],
            'filename': item['filename'],
            'text': test,
            'hash': generate_text_hash(item['directory']+item['filename']+test)
        })

In [12]:
print((test_list[0].keys()))

dict_keys(['directory', 'filename', 'text', 'hash'])


In [29]:
def clean_text_english(text):
    import re
    # 移除多余的换行符，将多行文本合并为单段落
    text = re.sub(r'\s*\n\s*', ' ', text)
    # 替换多个连续空格为单个空格
    text = re.sub(r'\s{2,}', ' ', text)
    # 确保标点符号和单词之间的空格规范化
    text = re.sub(r'\s*([.,!?;:])\s*', r'\1 ', text)
    # 修复句号后需要的空格
    text = re.sub(r'([.!?])([A-Za-z])', r'\1 \2', text)
    # 去除首尾空格
    text = text.strip()
    return text

# 示例
text = """  This is   an example   text.  
It contains   unnecessary   spaces and  line breaks.  Here is a new line. 
Another sentence.  """




In [36]:
import re
def split_and_process_texts(text_list, max_length=200):
    updated_text_list = []

    for text in text_list:
        text['text']=clean_text_english(text['text'])
        if len(text['text']) > max_length:
            # 按标点分割文本
            sentences = re.split(r'(?<=[.!?])\s+', text['text'])
            current_chunk = ""
            for sentence in sentences:
                # 如果当前段落加上句子长度超出限制，将当前段落加入结果，并重新开始
                if len(current_chunk) + len(sentence) > max_length:
                    updated_text_list.append({
                        'directory': text['directory'],
                        'filename': text['filename'],
                        'text': current_chunk.strip(),
                        'hash': generate_text_hash(text['directory'] + text['filename'] + current_chunk.strip())
                    })
                    current_chunk = sentence
                else:
                    current_chunk += " " + sentence

            # 添加最后的剩余部分
            if current_chunk:
                updated_text_list.append({
                    'directory': text['directory'],
                    'filename': text['filename'],
                    'text': current_chunk.strip(),
                    'hash': generate_text_hash(text['directory'] + text['filename'] + current_chunk.strip())
                })
        else:
            # 如果文本长度小于等于限制，直接加入结果
            updated_text_list.append({
                'directory': text['directory'],
                'filename': text['filename'],
                'text': text['text'].strip(),
                'hash': generate_text_hash(text['directory'] + text['filename'] + text['text'].strip())
            })

    return updated_text_list



# 筛选和拆分
filtered_list = split_and_process_texts(test_list, max_length=1500)



In [31]:
count=0
for text in test_list:
    if len(text['text'])>50 and len(text['text'])<400:
        count+=1
print(count)

42257


In [38]:
print(len(filtered_list))
formal_list=[]
for item in filtered_list:
    if len(item['text'])>2000 or len(item['text'])<75:
        continue
    formal_list.append(item)
print(len(formal_list))

97974
81294


In [42]:
import torch
from datasets import Dataset
import hashlib


# 将数据转换为 Hugging Face 数据集格式
dataset = Dataset.from_dict({
    'directory': [entry['directory'] for entry in formal_list],
    'filename': [entry['filename'] for entry in formal_list],
    'text': [entry['text'] for entry in formal_list],
    'hash': [entry['hash'] for entry in formal_list]
})

# 显示数据集
print("Original Hugging Face dataset:")
print(dataset)

# 保存数据集到磁盘（使用 Arrow 格式）
dataset.save_to_disk("clear_ACL_sentences80k")



Original Hugging Face dataset:
Dataset({
    features: ['directory', 'filename', 'text', 'hash'],
    num_rows: 81294
})


Saving the dataset (1/1 shards): 100%|██████████| 81294/81294 [00:00<00:00, 864396.95 examples/s]


In [47]:
new_data=load_from_disk("/mnt/file2/changye/NLPFINAL/casual_formal_pair_ACL80k")
print(new_data[0]['formal_text'])

After achieving remarkable successes in Machine Translation (Sutskever et al. , 2014; Cho et al. , 2014), neural networks with the encoder-decoder architectures (a. k. a sequence-to-sequence models, Seq2Seq) have been proven to be a functioning method to model short-text conversations (Vinyals and Le, 2015; Shang et al. , 2015), where the corresponding task is often called Neural Response Generation. The advantage of applying Seq2Seq models to conversation generation is that the training procedure can be performed end-to-end in an unsupervised manner, based on human-generated conversational utterances (typically query-response pairs mined from social networks). One of the potential applications of such neural response generators is to improve the capability of existing conversational interfaces (informally also known as chatbots) by enabling them to go beyond predefined tasks and chat with human users in an open domain.
