# Train data preprocessing

In [16]:
import os
import json
from collections import defaultdict
import glob

# Directory holding the raw CMU_DoG training conversations (one JSON file each).
directory = 'F://project/Project/fine-tuning-z1/datasets-CMU_DoG-master/Conversations/train/'

# Accumulates the converted records from every input file.
all_conversations = []

# glob returns paths in file-system order, which is not guaranteed to be stable.
# Sort them so the per-file index embedded in each record's "ID" is reproducible.
json_files = sorted(glob.glob(os.path.join(directory, '*.json')))
if not json_files:
    # A wrong path would otherwise silently produce an empty dataset.
    print(f"Warning: no .json files found in {directory}")

# Convert each conversation file in turn.
for i, json_file in enumerate(json_files):
    with open(json_file, 'r', encoding='utf-8') as f:
        conversation_data = json.load(f)

    # Group this file's utterances by their docIdx value, keeping file order.
    dialogues = defaultdict(list)
    for item in conversation_data['history']:
        dialogues[item["docIdx"]].append({
            "text": item["text"],
            "uid": item["uid"]
        })

    # Turn every group with at least two utterances into a context/response record.
    conversations = []
    for idx, dialogue in dialogues.items():
        if len(dialogue) >= 2:
            conversation = {
                "Context": dialogue[0]["text"],
                "Response": dialogue[1]["text"],
                # Utterances after the first two, keyed "context/0", "context/1", ...
                # The loop variable is not called `i` so it cannot be confused
                # with the file index used in "ID" below.
                "Extra Contexts": {
                    f"context/{turn}": entry["text"]
                    for turn, entry in enumerate(dialogue[2:])
                },
                "Other features": {
                    "context_author": dialogue[0]["uid"],
                    "response_author": dialogue[1]["uid"],
                    "subreddit": "MovieDiscussions",
                    "thread_id": f"conversation_{idx}",
                    # <file index>_conv_<docIdx>
                    "ID": f"{i}_conv_{idx}"
                }
            }
            conversations.append(conversation)

    # Add this file's records to the overall list.
    all_conversations.extend(conversations)

# Save all converted records to a single JSON file.
output_file = './datasets-CMU_DoG/conversations_train.json'
# Create the output directory if needed so the write works on a fresh checkout.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(all_conversations, f, ensure_ascii=False, indent=4)

print(f"Saved conversations to {output_file}")


Saved conversations to ./datasets-CMU_DoG/conversations_train.json


# Test data preprocessing

In [19]:
import os
import json
from collections import defaultdict
import glob

# Directory holding the raw CMU_DoG test conversations (one JSON file each).
directory = 'F://project/Project/fine-tuning-z1/datasets-CMU_DoG-master/Conversations/test/'

# Accumulates the converted records from every input file.
all_conversations = []

# glob returns paths in file-system order, which is not guaranteed to be stable.
# Sort them so the per-file index embedded in each record's "ID" is reproducible.
json_files = sorted(glob.glob(os.path.join(directory, '*.json')))
if not json_files:
    # A wrong path would otherwise silently produce an empty dataset.
    print(f"Warning: no .json files found in {directory}")

# Convert each conversation file in turn.
for i, json_file in enumerate(json_files):
    with open(json_file, 'r', encoding='utf-8') as f:
        conversation_data = json.load(f)

    # Group this file's utterances by their docIdx value, keeping file order.
    dialogues = defaultdict(list)
    for item in conversation_data['history']:
        dialogues[item["docIdx"]].append({
            "text": item["text"],
            "uid": item["uid"]
        })

    # Turn every group with at least two utterances into a context/response record.
    conversations = []
    for idx, dialogue in dialogues.items():
        if len(dialogue) >= 2:
            conversation = {
                "Context": dialogue[0]["text"],
                "Response": dialogue[1]["text"],
                # Utterances after the first two, keyed "context/0", "context/1", ...
                # The loop variable is not called `i` so it cannot be confused
                # with the file index used in "ID" below.
                "Extra Contexts": {
                    f"context/{turn}": entry["text"]
                    for turn, entry in enumerate(dialogue[2:])
                },
                "Other features": {
                    "context_author": dialogue[0]["uid"],
                    "response_author": dialogue[1]["uid"],
                    "subreddit": "MovieDiscussions",
                    "thread_id": f"conversation_{idx}",
                    # <file index>_conv_<docIdx>
                    "ID": f"{i}_conv_{idx}"
                }
            }
            conversations.append(conversation)

    # Add this file's records to the overall list.
    all_conversations.extend(conversations)

# Save all converted records to a single JSON file.
output_file = './datasets-CMU_DoG/conversations_test.json'
# Create the output directory if needed so the write works on a fresh checkout.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(all_conversations, f, ensure_ascii=False, indent=4)

print(f"Saved conversations to {output_file}")


Saved conversations to ./datasets-CMU_DoG/conversations_test.json


# Make conversation pairs

In [5]:
import json
import os
import random

def read_json_file(filepath):
    """Open one UTF-8 encoded JSON file and return its parsed contents."""
    with open(filepath, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

def generate_random_pairs(conversation):
    """Build consecutive (prompt, reply) pairs from one conversation record.

    Despite the name, nothing here is random. The turns are chained in the
    order Context -> Response -> context/0 -> context/1 -> ... and every
    adjacent couple of turns becomes one pair.

    Args:
        conversation: dict with the keys "Context" and "Response", plus
            "Extra Contexts", a dict mapping "context/<n>" to the n-th
            follow-up turn. A missing or empty "Extra Contexts" is treated
            as having no follow-up turns.

    Returns:
        A list of (conv, response) tuples in conversation order. The first
        element is always (Context, Response).

    Raises:
        KeyError: if "Context" or "Response" is missing.
    """
    def turn_order(key):
        # Order "context/<n>" keys by the integer n. A plain string sort puts
        # "context/10" before "context/2" and scrambles conversations that
        # have more than ten follow-up turns.
        try:
            return (0, int(key.rsplit("/", 1)[-1]), key)
        except ValueError:
            # Keys without a numeric suffix go last, in alphabetical order.
            return (1, 0, key)

    context = conversation["Context"]
    response = conversation["Response"]
    extra_contexts = conversation.get("Extra Contexts") or {}

    # The full chain of turns, in the order they were spoken.
    turns = [context, response]
    turns.extend(extra_contexts[key] for key in sorted(extra_contexts, key=turn_order))

    # Pair every turn with the one that follows it.
    return list(zip(turns, turns[1:]))

def format_pairs(pairs):
    """Convert (conv, response) tuples into dicts with a 1-based sequential "ID"."""
    return [
        {"ID": number, "conv": prompt, "response": reply}
        for number, (prompt, reply) in enumerate(pairs, start=1)
    ]

def save_to_json(data, output_file):
    """Write `data` to `output_file` as indented UTF-8 JSON.

    Missing parent directories of `output_file` are created first, so the
    call does not fail with FileNotFoundError when the target folder does
    not exist yet. Non-ASCII characters are written as-is, not escaped.

    Args:
        data: any JSON-serialisable object.
        output_file: destination path; an existing file is overwritten.
    """
    parent_dir = os.path.dirname(output_file)
    if parent_dir:  # empty when output_file is a bare file name
        os.makedirs(parent_dir, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

def main(input_file, output_file):
    """Read conversation records, expand them into pairs and save the result.

    Loads the list of records from `input_file`, collects the pairs produced
    by `generate_random_pairs` for each record in order, numbers them with
    `format_pairs`, writes them to `output_file` and prints "done".
    """
    records = read_json_file(input_file)

    # Flatten the per-record pair lists into one list, keeping record order.
    collected = [
        pair
        for record in records
        for pair in generate_random_pairs(record)
    ]

    save_to_json(format_pairs(collected), output_file)
    print("done")
    
# Source: the converted training conversations. Destination: the flat pair list.
input_file = './datasets-CMU_DoG/conversations_train.json'
output_file = './datasets-CMU_DoG/pairs/train_pairs.json'

# Build and save the training pairs.
main(input_file=input_file, output_file=output_file)

done


In [6]:
import json
import os
import random

def read_json_file(filepath):
    """Open one UTF-8 encoded JSON file and return its parsed contents."""
    with open(filepath, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

def generate_random_pairs(conversation):
    """Build consecutive (prompt, reply) pairs from one conversation record.

    Despite the name, nothing here is random. The turns are chained in the
    order Context -> Response -> context/0 -> context/1 -> ... and every
    adjacent couple of turns becomes one pair.

    Args:
        conversation: dict with the keys "Context" and "Response", plus
            "Extra Contexts", a dict mapping "context/<n>" to the n-th
            follow-up turn. A missing or empty "Extra Contexts" is treated
            as having no follow-up turns.

    Returns:
        A list of (conv, response) tuples in conversation order. The first
        element is always (Context, Response).

    Raises:
        KeyError: if "Context" or "Response" is missing.
    """
    def turn_order(key):
        # Order "context/<n>" keys by the integer n. A plain string sort puts
        # "context/10" before "context/2" and scrambles conversations that
        # have more than ten follow-up turns.
        try:
            return (0, int(key.rsplit("/", 1)[-1]), key)
        except ValueError:
            # Keys without a numeric suffix go last, in alphabetical order.
            return (1, 0, key)

    context = conversation["Context"]
    response = conversation["Response"]
    extra_contexts = conversation.get("Extra Contexts") or {}

    # The full chain of turns, in the order they were spoken.
    turns = [context, response]
    turns.extend(extra_contexts[key] for key in sorted(extra_contexts, key=turn_order))

    # Pair every turn with the one that follows it.
    return list(zip(turns, turns[1:]))

def format_pairs(pairs):
    """Convert (conv, response) tuples into dicts with a 1-based sequential "ID"."""
    return [
        {"ID": number, "conv": prompt, "response": reply}
        for number, (prompt, reply) in enumerate(pairs, start=1)
    ]

def save_to_json(data, output_file):
    """Write `data` to `output_file` as indented UTF-8 JSON.

    Missing parent directories of `output_file` are created first, so the
    call does not fail with FileNotFoundError when the target folder does
    not exist yet. Non-ASCII characters are written as-is, not escaped.

    Args:
        data: any JSON-serialisable object.
        output_file: destination path; an existing file is overwritten.
    """
    parent_dir = os.path.dirname(output_file)
    if parent_dir:  # empty when output_file is a bare file name
        os.makedirs(parent_dir, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

def main(input_file, output_file):
    """Read conversation records, expand them into pairs and save the result.

    Loads the list of records from `input_file`, collects the pairs produced
    by `generate_random_pairs` for each record in order, numbers them with
    `format_pairs`, writes them to `output_file` and prints "done".
    """
    records = read_json_file(input_file)

    # Flatten the per-record pair lists into one list, keeping record order.
    collected = [
        pair
        for record in records
        for pair in generate_random_pairs(record)
    ]

    save_to_json(format_pairs(collected), output_file)
    print("done")
    
# Source: the converted test conversations. Destination: the flat pair list.
input_file = './datasets-CMU_DoG/conversations_test.json'
output_file = './datasets-CMU_DoG/pairs/test_pairs.json'

# Build and save the test pairs.
main(input_file=input_file, output_file=output_file)

done
