In [1]:
import pandas as pd
import json
import os
import openai
import time
import logging
from dotenv import load_dotenv
from loguru import logger
import tqdm
import random
from typing import List, Dict, Any

# Load environment variables via python-dotenv before the API key is read
load_dotenv()
# os.environ[...] raises KeyError when OPENAI_API_KEY is not set
openai.api_key = os.environ["OPENAI_API_KEY"]

In [3]:
# Input: PersonaChat training file handed to parse_personachat in a later cell
dataset_path = "../data/Persona-Chat/personachat/train_self_original.txt"

# Output JSON location — not referenced in the visible cells; presumably
# written by a later step (TODO confirm)
output_path = "results/personachat.json"

In [5]:
def parse_personachat(file_path: str, limit=None) -> List[Dict[str, Any]]:
    """
    Parse a PersonaChat text file into a list of episodes.

    Every non-empty line is expected to look like ``<turn number> <content>``.
    A turn number of 1 starts a new episode. Content beginning with
    ``your persona:`` contributes one persona sentence; any other content that
    contains a tab contributes one dialogue turn built from its first two
    tab-separated fields (partner utterance, own reply). All other lines are
    ignored.

    Args:
        file_path: Path of the file to read (decoded as UTF-8).
        limit: Maximum number of episodes to return. ``None`` reads the whole
            file; ``0`` or a negative value returns an empty list.

    Returns:
        A list of dicts shaped like
        ``{'persona': [str, ...], 'history': [{'partner': str, 'me': str}, ...]}``.
        Episodes that contain no persona line are not emitted.

    Raises:
        ValueError: If the token before the first space of a line is not an
            integer.
    """
    persona_prefix = 'your persona:'
    dialogues = []

    # The limit used to be checked only after an episode had been appended, so
    # limit=0 (or a negative limit) still produced one episode.
    if limit is not None and limit <= 0:
        return dialogues

    current_persona = []
    current_history = []

    with open(file_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue

            # Split "<turn number> <content>"; lines without a space are skipped.
            number_text, separator, content = line.partition(' ')
            if not separator:
                continue

            idx = int(number_text)

            # Turn number 1 marks the beginning of a new episode.
            if idx == 1:
                # Flush the previous episode, if one has been collected.
                if current_persona:
                    dialogues.append({
                        'persona': current_persona,
                        'history': current_history,
                    })

                    # Stop as soon as the requested number of episodes is reached.
                    if limit is not None and len(dialogues) >= limit:
                        return dialogues

                # Fresh buffers, so the lists stored above are never mutated again.
                current_persona = []
                current_history = []

            if content.startswith(persona_prefix):
                # Strip only the leading marker; str.replace would also delete
                # the phrase if it occurred again inside the sentence.
                current_persona.append(content[len(persona_prefix):].strip())
            elif '\t' in content:
                # Fields after the first two (e.g. candidate replies) are dropped.
                partner_msg, my_reply = content.split('\t')[:2]
                current_history.append({'partner': partner_msg, 'me': my_reply})

        # End of file reached without hitting the limit: flush the last episode.
        if current_persona:
            dialogues.append({'persona': current_persona, 'history': current_history})

    return dialogues

In [7]:
# Load only the first 10 episodes
data = parse_personachat(dataset_path, limit=10)

print(f"取得したデータセット数: {len(data)}")
separator = "-" * 30
print(separator)

# Print every loaded episode for a visual check
for set_no, episode in enumerate(data, start=1):
    print(f"【Set {set_no}】")
    print("[Persona]")
    for trait in episode['persona']:
        print(f"  - {trait}")
    print("[Dialogue]")
    for exchange in episode['history']:
        print(f"  User: {exchange['partner']}")
        print(f"  You : {exchange['me']}")
    print(separator)

取得したデータセット数: 10
------------------------------
【Set 1】
[Persona]
  - i like to remodel homes.
  - i like to go hunting.
  - i like to shoot a bow.
  - my favorite holiday is halloween.
[Dialogue]
  User: hi , how are you doing ? i am getting ready to do some cheetah chasing to stay in shape .
  You : you must be very fast . hunting is one of my favorite hobbies .
  User: i am ! for my hobby i like to do canning or some whittling .
  You : i also remodel homes when i am not out bow hunting .
  User: that is neat . when i was in high school i placed 6th in 100m dash !
  You : that is awesome . do you have a favorite season or time of year ?
  User: i do not . but i do have a favorite meat since that is all i eat exclusively .
  You : what is your favorite meat to eat ?
  User: i would have to say its prime rib . do you have any favorite foods ?
  You : i like chicken or macaroni and cheese .
  User: do you have anything planned for today ? i think i am going to do some canning .
  You : 