In [1]:
import pandas as pd
import json
import os
import openai
import time
import logging
from dotenv import load_dotenv
from loguru import logger
import tqdm

# Load environment variables
load_dotenv()
# Raises KeyError if OPENAI_KEY is not present in the environment.
openai.api_key = os.environ["OPENAI_KEY"]


In [2]:
# NOTE(review): the "recoginized" spelling is part of the on-disk paths built from this name; do not correct it here without renaming the directories.
dataset_name = "daily_dialogue_with_recoginized_concept_raw"  # Name of the dataset to process
# dataset_name = "empathetic_dialogues_llm"

# NOTE(review): this points at the same file as oie_prompt_template_file_path below — looks like a copy-paste; confirm the intended preference-analysis template path.
pa_prompt_template_file_path = "../prompt_templates/oie_template.txt"
oie_prompt_template_file_path = "../prompt_templates/oie_template.txt"
oie_few_shot_example_file_path = "../few_shot_examples/example/oie_few_shot_examples.txt"

In [3]:
# Load the data directly from the parquet file
# NOTE(review): absolute, machine-specific path; consider deriving it from a configurable data directory.
parquet_path = f'/home/y-aida/Programs/preference-kg/data/raw/{dataset_name}/train.parquet'

# Read into a DataFrame and preview the first two rows
df = pd.read_parquet(parquet_path, engine='pyarrow')
print("\n--- データの詳細 ---")
print(df.head(2))



--- データの詳細 ---
                                              dialog  \
0  [Say , Jim , how about going for a few beers a...   
1  [Can you do push-ups ? ,  Of course I can . It...   

                              act                         emotion  \
0  [3, 4, 2, 2, 2, 3, 4, 1, 3, 4]  [0, 0, 0, 0, 0, 0, 4, 4, 4, 4]   
1              [2, 1, 2, 2, 1, 1]              [0, 0, 6, 0, 0, 0]   

                                             mention          source  \
0  [[dinner, beer], [fitness, know], [will, mean,...  [dinner, beer]   
1  [[push], [cake, push, believe, minute, piece],...          [push]   

                          target  
0  [fun, dance, sound, exercise]  
1           [everyday, exercise]  


## Empathetic Dialogues LLM

In [15]:

# Cell 3
# Extract only the `conversations` column and take the first 3 rows
# NOTE(review): the frame previewed above has columns dialog/act/emotion/mention/source/target and no
# `conversations` column; this cell presumably needs dataset_name = "empathetic_dialogues_llm" — confirm
# before running the notebook top to bottom.
example_df = df[['conversations']].head(3)

print("\n--- conversations列のみを抽出（3件） ---")
print(example_df)
print(f"\nデータ型: {example_df['conversations'].dtype}")
print(f"\nサイズ: {example_df.shape}")

# Show the first record in full
print("\n--- 最初の1件の詳細 ---")
print(example_df['conversations'].iloc[0])


--- conversations列のみを抽出（3件） ---
                                       conversations
0  [{'content': 'I remember going to see the fire...
1  [{'content': ' it feels like hitting to blank ...
2  [{'content': 'Hi how are you doing today', 'ro...

データ型: object

サイズ: (3, 1)

--- 最初の1件の詳細 ---
[{'content': 'I remember going to see the fireworks with my best friend. It was the first time we ever spent time alone together. Although there was a lot of people, we felt like the only people in the world.', 'role': 'user'}
 {'content': 'Was this a friend you were in love with, or just a best friend?', 'role': 'assistant'}
 {'content': 'This was a best friend. I miss her.', 'role': 'user'}
 {'content': 'Where has she gone?', 'role': 'assistant'}
 {'content': 'We no longer talk.', 'role': 'user'}
 {'content': 'Oh was this something that happened because of an argument?', 'role': 'assistant'}]


### Data Processing

In [None]:
def openai_chat_completion(model, system_prompt, temperature=0, max_tokens=512, max_retries=5, retry_delay=5):
    """Send one system-prompt-only chat completion request and return the reply text.

    Args:
        model: Model name forwarded to the chat completions endpoint.
        system_prompt: Text sent as the single message, with role "system".
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Completion token limit forwarded to the API.
        max_retries: Number of attempts before the last error is re-raised.
            Pass None to retry indefinitely.
        retry_delay: Seconds to sleep between failed attempts.

    Returns:
        The ``content`` of the first choice's message.

    Raises:
        ValueError: If ``system_prompt`` is None.
        Exception: The last API error once ``max_retries`` attempts have failed.
    """
    if system_prompt is None:
        # The previous code sent messages=None here and then retried the
        # failing request forever; fail fast instead.
        raise ValueError("system_prompt must not be None")
    messages = [{"role": "system", "content": system_prompt}]

    attempt = 0
    while True:
        attempt += 1
        try:
            response = openai.chat.completions.create(
                model=model, messages=messages, temperature=temperature, max_tokens=max_tokens
            )
            break
        except Exception as e:
            # Surface the failure instead of silently sleeping.
            logging.warning(f"OpenAI API call failed (attempt {attempt}): {e}")
            if max_retries is not None and attempt >= max_retries:
                raise
            time.sleep(retry_delay)

    content = response.choices[0].message.content
    logging.debug(f"Model: {model}\nPrompt:\n {messages}\n Result: {content}")
    return content

In [None]:
# System prompt for the free-text preference report; it asks for a three-section report written in Japanese.
# NOTE(review): a later cell rebinds `system_prompt` to an English JSON-output prompt, and
# analyze_conversation reads whichever value was assigned last in the kernel.
# NOTE(review): the prompt asks for 800-1000 Japanese characters while analyze_conversation caps the
# completion at max_tokens=512 — the report may be cut off; confirm the limit is sufficient.
system_prompt = """You are an expert in user preference analysis. Your task is to analyze conversations between a user and an assistant to identify and report on user preferences.

## Task Overview
You will receive conversations in JSON format containing exchanges between 'user' and 'assistant' in English. Your goal is to create a comprehensive user preference analysis report based on these conversations.

## Key Definitions
**Preference (嗜好)**: A long-term, stable evaluative tendency that is distinct from temporary emotions. While emotions like "joy" or "anger" are transient reactions to specific situations, preferences are persistent tendencies such as "liking specific foods" or "preferring certain activities." Preferences represent stable patterns of likes and dislikes that do not fluctuate dramatically with situations.

**Emotion vs. Preference**: Emotions are temporary psychological/physiological reactions, whereas preferences are enduring tendencies toward objects or activities.

## Analysis Requirements

### Step 1: Conversation Analysis
First, analyze the conversation itself:
- Identify the main topics discussed
- Note the context and situational factors
- Observe the user's language patterns and expressions

### Step 2: Preference Identification
Analyze user preferences from both perspectives:

**Explicit Preferences (顕在的嗜好)**:
- Directly stated likes/dislikes
- Clear expressions of preference
- Obvious choices made by the user

**Latent Preferences (潜在的嗜好)**:
- Implied preferences through behavior patterns
- Indirect indicators of likes/dislikes
- Underlying tendencies not directly stated

### Step 3: Summary Report
For each identified preference, provide a concise statement in the following format:
"[対象(Target)]の[側面(Aspect)]に対し、[条件(Conditions)]という条件下で[嗜好内容(Preference)]という嗜好がある"

Example: "食べ物の味に対し、健康を意識する条件下で、野菜を好む嗜好がある"

## Important Notes
- If no preference information is found in the conversation, clearly state: "この会話からは嗜好情報を特定できません"
- Focus on stable, long-term preferences, not temporary emotional reactions
- Distinguish between situational reactions and underlying preference patterns
- Provide evidence from the conversation to support identified preferences

## Output Format
1. **会話分析** (Conversation Analysis)
2. **嗜好分析** (Preference Analysis)
   - 顕在的嗜好 (Explicit Preferences)
   - 潜在的嗜好 (Latent Preferences)
3. **嗜好サマリー** (Preference Summary) - List each preference in the specified format

**Character Limit**: Keep your response between 800-1000 Japanese characters. Be concise but comprehensive."""

In [None]:
# Cell 6: generate a preference analysis report for one conversation
def analyze_conversation(conversation_data, model="gpt-4o-mini", temperature=0, max_tokens=512,
                         max_retries=5, retry_delay=5):
    """Ask the chat model for a free-text preference report on one conversation.

    The conversation is serialised to JSON and sent as the user message; the
    module-level ``system_prompt`` is sent as the system message.

    Args:
        conversation_data: JSON-serialisable conversation data (a list).
        model: OpenAI model name.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Completion token limit forwarded to the API.
        max_retries: Number of attempts before the last API error is re-raised,
            so that callers wrapping this in try/except can record the failure.
            Pass None to retry indefinitely.
        retry_delay: Seconds to sleep between failed attempts.

    Returns:
        The report text (``content`` of the first choice's message).

    Raises:
        Exception: The last API error once ``max_retries`` attempts have failed.
    """
    # Serialise the conversation, keeping non-ASCII characters readable
    conversation_json = json.dumps(conversation_data, ensure_ascii=False, indent=2)

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"以下の会話を分析してください:\n\n{conversation_json}"},
    ]

    attempt = 0
    while True:
        attempt += 1
        try:
            response = openai.chat.completions.create(
                model=model,
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
            )
            break
        except Exception as e:
            logger.error(f"API呼び出しエラー: {e}")
            # Give up after max_retries instead of looping forever on a
            # failure that repeats on every attempt.
            if max_retries is not None and attempt >= max_retries:
                raise
            time.sleep(retry_delay)

    return response.choices[0].message.content

In [None]:
# Cell 7: analyse every conversation in example_df and collect the reports
rule = "=" * 80
print(rule, "嗜好分析レポート生成開始", rule, sep="\n")

results = []
last_row_label = len(example_df) - 1

for idx, conversation in example_df['conversations'].items():
    print(f"\n{rule}\n会話 #{idx + 1} の分析\n{rule}\n")

    # Convert an ndarray value into a plain Python list
    if hasattr(conversation, 'tolist'):
        conversation = conversation.tolist()

    # Echo the input conversation
    print("【入力会話】")
    print(json.dumps(conversation, ensure_ascii=False, indent=2))
    print("\n")

    # Generate the preference analysis report
    print("【嗜好分析レポート】")
    record = {'conversation_id': idx, 'conversation': conversation}
    try:
        report = analyze_conversation(conversation)
        print(report)

        # Keep the successful result
        results.append({**record, 'analysis_report': report})

        # Character count: every non-ASCII character is counted
        japanese_chars = sum(1 for c in report if ord(c) > 127)
        print(f"\n文字数: {japanese_chars} 文字")

    except Exception as e:
        print(f"エラーが発生しました: {e}")
        results.append({**record, 'analysis_report': f"エラー: {e}"})

    print("\n")

    # Pause between requests to respect API limits
    if idx < last_row_label:
        time.sleep(2)

print(rule, "全ての分析が完了しました", rule, sep="\n")

# Collect the results into a DataFrame
results_df = pd.DataFrame(results)
print(f"\n処理件数: {len(results_df)}")

嗜好分析レポート生成開始

会話 #1 の分析

【入力会話】
[
  {
    "content": "I remember going to see the fireworks with my best friend. It was the first time we ever spent time alone together. Although there was a lot of people, we felt like the only people in the world.",
    "role": "user"
  },
  {
    "content": "Was this a friend you were in love with, or just a best friend?",
    "role": "assistant"
  },
  {
    "content": "This was a best friend. I miss her.",
    "role": "user"
  },
  {
    "content": "Where has she gone?",
    "role": "assistant"
  },
  {
    "content": "We no longer talk.",
    "role": "user"
  },
  {
    "content": "Oh was this something that happened because of an argument?",
    "role": "assistant"
  }
]


【嗜好分析レポート】
1. **会話分析**  
この会話では、ユーザーが友人との思い出を語っています。特に花火を見に行った経験が強調されており、その時の感情や状況が描写されています。ユーザーは友人を懐かしむ気持ちを表現しており、過去の関係についての感情が中心テーマです。

2. **嗜好分析**  
   - **顕在的嗜好 (Explicit Preferences)**:  
     - 友人との思い出に対し、特別な感情を持っていることが明示されています。「彼女が恋人ではなく、親友である」との発言から、親友との関係を大切に思っていることがわか

In [None]:
# Cell 8: save the results to a CSV file
output_csv_path = f'/home/y-aida/Programs/preference-kg/results/reports/{dataset_name}/preference_analysis_results_example.csv'

# Create the parent directory if it does not exist; to_csv does not create it
os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)

# Write the CSV
results_df.to_csv(output_csv_path, index=False, encoding='utf-8')

logger.success(f"結果をCSVに保存しました: {output_csv_path}")
print(f"\n保存先: {output_csv_path}")
print(f"保存件数: {len(results_df)}件")

# Report the file size (`os` is already imported in the first cell)
file_size = os.path.getsize(output_csv_path) / 1024  # in KB
print(f"ファイルサイズ: {file_size:.2f} KB")

[32m2025-10-26 03:03:48.687[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [32m[1m結果をCSVに保存しました: /home/y-aida/Programs/preference-kg/data/processed/preference_analysis_results.csv[0m



保存先: /home/y-aida/Programs/preference-kg/data/processed/preference_analysis_results.csv
保存件数: 3件
ファイルサイズ: 5.38 KB


## Daily Dialogue with Recognized Concept Raw

In [19]:
# Extract only the `dialog` column and take the first 3 rows
# (this rebinds `example_df`, which the Empathetic Dialogues section also assigns)
example_df = df[['dialog']].head(3)

print("\n--- dialog列のみを抽出（3件） ---")
print(example_df)
print(f"\nデータ型: {example_df['dialog'].dtype}")
print(f"\nサイズ: {example_df.shape}")

# Show the first record in full
print("\n--- 最初の1件の詳細 ---")
print(example_df['dialog'].iloc[0])


--- dialog列のみを抽出（3件） ---
                                              dialog
0  [Say , Jim , how about going for a few beers a...
1  [Can you do push-ups ? ,  Of course I can . It...
2  [Can you study with the radio on ? ,  No , I l...

データ型: object

サイズ: (3, 1)

--- 最初の1件の詳細 ---
['Say , Jim , how about going for a few beers after dinner ? '
 ' You know that is tempting but is really not good for our fitness . '
 ' What do you mean ? It will help us to relax . '
 " Do you really think so ? I don't . It will just make us fat and act silly . Remember last time ? "
 " I guess you are right.But what shall we do ? I don't feel like sitting at home . "
 ' I suggest a walk over to the gym where we can play singsong and meet some of our friends . '
 " That's a good idea . I hear Mary and Sally often go there to play pingpong.Perhaps we can make a foursome with them . "
 ' Sounds great to me ! If they are willing , we could ask them to go dancing with us.That is excellent exercise and fun , t

### Preference Analysis

In [31]:
# System prompt for the JSON-output preference analysis (English); rebinds the `system_prompt` assigned in the earlier section.
# NOTE(review): the prompt describes the input as JSON exchanges between 'user' and 'assistant', but the
# `dialog` column passed in this section is a plain list of utterance strings without roles — confirm the
# model is expected to infer the speakers.
system_prompt = """You are an expert in user preference analysis. Your task is to analyze conversations between a user and an assistant to identify and report on user preferences.

## Task Overview
You will receive conversations in JSON format containing exchanges between 'user' and 'assistant' in English. Your goal is to create a comprehensive user preference analysis report based on these conversations.

## Key Definitions
**Preference**: A long-term, stable evaluative tendency that is distinct from temporary emotions. While emotions like "joy" or "anger" are transient reactions to specific situations, preferences are persistent tendencies such as "liking specific foods" or "preferring certain activities." Preferences represent stable patterns of likes and dislikes that do not fluctuate dramatically with situations.

**Emotion vs. Preference**: Emotions are temporary psychological/physiological reactions, whereas preferences are enduring tendencies toward objects or activities.

## Analysis Requirements

### Step 1: Conversation Analysis
First, analyze the conversation itself:
- Identify the main topics discussed
- Note the context and situational factors
- Observe the user's language patterns and expressions

### Step 2: Preference Identification
Analyze user preferences from both perspectives:

**Explicit Preferences**:
- Directly stated likes/dislikes
- Clear expressions of preference
- Obvious choices made by the user

**Latent Preferences**:
- Implied preferences through behavior patterns
- Indirect indicators of likes/dislikes
- Underlying tendencies not directly stated

### Step 3: Summary Report
For each identified preference, provide a concise statement in the following format:
"The user has a preference for [Preference Content] regarding [Aspect] of [Target], under the condition of [Conditions]"

Example: "The user has a preference for vegetables regarding taste of food, under the condition of health consciousness"

## Important Notes
- If no preference information is found in the conversation, clearly state: "No preference information can be identified from this conversation"
- Focus on stable, long-term preferences, not temporary emotional reactions
- Distinguish between situational reactions and underlying preference patterns
- Provide evidence from the conversation to support identified preferences

## Output Format
You must output your analysis in JSON format with the following structure:

```json
{
  "conversation_analysis": "Your analysis of the conversation including main topics, context, and language patterns",
  "preference_analysis": {
    "explicit_preferences": "Directly stated preferences with evidence from the conversation",
    "latent_preferences": "Implied preferences based on behavior patterns and indirect indicators"
  },
  "preference_summary": [
    "The user has a preference for [Preference Content] regarding [Aspect] of [Target], under the condition of [Conditions]",
    "Additional preference statements..."
  ]
}
```

## Important Guidelines
- **conversation_analysis**: 2-3 sentences describing the conversation's main topics and context
- **preference_analysis**: Separate explicit and latent preferences with supporting evidence (2-3 sentences each)
- **preference_summary**: Array of concise preference statements in the specified format
- If no preferences are found, use: `"preference_summary": ["No preference information can be identified from this conversation"]`
- Keep total output between 800-1000 characters
- Output must be valid JSON and in English"""

In [None]:
# Generate a preference analysis report for one conversation
def analyze_conversation(conversation_data, model="gpt-4o-mini", temperature=0, max_tokens=256,
                         max_retries=5, retry_delay=5):
    """Ask the chat model for a JSON preference report on one conversation.

    This redefines the earlier free-text ``analyze_conversation``. The
    conversation is serialised to JSON and sent as the user message; the
    module-level ``system_prompt`` is sent as the system message, and the API
    is asked for a JSON object response.

    Args:
        conversation_data: JSON-serialisable conversation data (a list).
        model: OpenAI model name.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Completion token limit forwarded to the API. A reply cut
            off by this limit will not parse as JSON and yields the fallback
            report below.
        max_retries: Number of attempts before the last API error is re-raised,
            so that callers wrapping this in try/except can record the failure.
            Pass None to retry indefinitely.
        retry_delay: Seconds to sleep between failed attempts.

    Returns:
        The parsed report (dict). If the reply cannot be parsed, a placeholder
        dict with the same three keys is returned instead.

    Raises:
        Exception: The last API error once ``max_retries`` attempts have failed.
    """
    # Serialise the conversation, keeping non-ASCII characters readable
    conversation_json = json.dumps(conversation_data, ensure_ascii=False, indent=2)

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Please analyze the following conversation:\n\n{conversation_json}"},
    ]

    attempt = 0
    while True:
        attempt += 1
        try:
            response = openai.chat.completions.create(
                model=model,
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
                response_format={"type": "json_object"},
            )
            break
        except Exception as e:
            logger.error(f"API呼び出しエラー: {e}")
            # Give up after max_retries instead of looping forever on a
            # failure that repeats on every attempt.
            if max_retries is not None and attempt >= max_retries:
                raise
            time.sleep(retry_delay)

    # Parse the JSON reply; TypeError covers a reply whose content is None
    try:
        return json.loads(response.choices[0].message.content)
    except (json.JSONDecodeError, TypeError) as e:
        logger.error(f"JSON解析エラー: {e}")
        return {
            "conversation_analysis": "Error parsing response",
            "preference_analysis": {
                "explicit_preferences": "",
                "latent_preferences": ""
            },
            "preference_summary": ["Error occurred during analysis"]
        }

In [33]:
# Analyse every dialog in example_df and flatten each JSON report into CSV-friendly columns
rule = "=" * 80
print(rule, "嗜好分析レポート生成開始", rule, sep="\n")

results = []
last_row_label = len(example_df) - 1

for idx, conversation in example_df['dialog'].items():
    print(f"\n{rule}\n会話 #{idx + 1} の分析\n{rule}\n")

    # Convert an ndarray value into a plain Python list
    if hasattr(conversation, 'tolist'):
        conversation = conversation.tolist()

    # Echo the input conversation
    print("【入力会話】")
    print(json.dumps(conversation, ensure_ascii=False, indent=2))
    print("\n")

    # The conversation is stored as a compact JSON string in both outcomes
    conversation_text = json.dumps(conversation, ensure_ascii=False)

    # Generate the preference analysis report
    print("【嗜好分析レポート】")
    try:
        report = analyze_conversation(conversation)
        print(json.dumps(report, ensure_ascii=False, indent=2))

        # Flatten the explicit/latent analysis into one text field
        pref = report['preference_analysis']
        analysis_text = f"Explicit: {pref['explicit_preferences']}\nLatent: {pref['latent_preferences']}"

        # One summary statement per line
        summary_text = "\n".join(report['preference_summary'])

        # Row layout matches the CSV columns
        results.append({
            'conversation_id': idx,
            'conversation': conversation_text,
            'conversation_analysis': report['conversation_analysis'],
            'preference_analysis': analysis_text,
            'preference_summary': summary_text,
        })

        # Length of the serialised report
        total_chars = len(json.dumps(report, ensure_ascii=False))
        print(f"\n文字数: {total_chars} 文字")

    except Exception as e:
        print(f"エラーが発生しました: {e}")
        results.append({
            'conversation_id': idx,
            'conversation': conversation_text,
            'conversation_analysis': f"エラー: {e}",
            'preference_analysis': "",
            'preference_summary': "",
        })

    print("\n")

    # Pause between requests to respect API limits
    if idx < last_row_label:
        time.sleep(2)

print(rule, "全ての分析が完了しました", rule, sep="\n")

# Collect the results into a DataFrame
results_df = pd.DataFrame(results)
print(f"\n処理件数: {len(results_df)}")
print(f"\n列: {list(results_df.columns)}")

嗜好分析レポート生成開始

会話 #1 の分析

【入力会話】
[
  "Say , Jim , how about going for a few beers after dinner ? ",
  " You know that is tempting but is really not good for our fitness . ",
  " What do you mean ? It will help us to relax . ",
  " Do you really think so ? I don't . It will just make us fat and act silly . Remember last time ? ",
  " I guess you are right.But what shall we do ? I don't feel like sitting at home . ",
  " I suggest a walk over to the gym where we can play singsong and meet some of our friends . ",
  " That's a good idea . I hear Mary and Sally often go there to play pingpong.Perhaps we can make a foursome with them . ",
  " Sounds great to me ! If they are willing , we could ask them to go dancing with us.That is excellent exercise and fun , too . ",
  " Good.Let ' s go now . ",
  " All right . "
]


【嗜好分析レポート】
{
  "conversation_analysis": "The conversation revolves around the decision of whether to go for beers after dinner or engage in a healthier activity. The context h

In [34]:
# Cell 8: write the flattened results to a CSV file
output_csv_path = f'/home/y-aida/Programs/preference-kg/notebooks/results/reports/{dataset_name}/preference_analysis_results_example.csv'

# Make sure the target directory exists before writing
output_dir = os.path.dirname(output_csv_path)
os.makedirs(output_dir, exist_ok=True)

# Write the CSV
results_df.to_csv(output_csv_path, index=False, encoding='utf-8')

logger.success(f"結果をCSVに保存しました: {output_csv_path}")
print(f"\n保存先: {output_csv_path}")
print(f"保存件数: {len(results_df)}件")

# Report the size of the written file in KB
file_size = os.path.getsize(output_csv_path) / 1024
print(f"ファイルサイズ: {file_size:.2f} KB")

[32m2025-10-27 14:49:35.807[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m10[0m - [32m[1m結果をCSVに保存しました: /home/y-aida/Programs/preference-kg/notebooks/results/reports/daily_dialogue_with_recoginized_concept_raw/preference_analysis_results_example.csv[0m



保存先: /home/y-aida/Programs/preference-kg/notebooks/results/reports/daily_dialogue_with_recoginized_concept_raw/preference_analysis_results_example.csv
保存件数: 3件
ファイルサイズ: 3.99 KB


## OIE

In [5]:
# Path of the report CSV to load; same location the preceding save cell writes `output_csv_path` to.
report_csv_path = f'/home/y-aida/Programs/preference-kg/notebooks/results/reports/{dataset_name}/preference_analysis_results_example.csv'

In [6]:
# Load the saved report CSV and print the analysis fields of every row
rule = "=" * 80
print(rule)
print("CSVファイルの読み込みと表示")
print(rule)

# Read the CSV file
df_analysis = pd.read_csv(report_csv_path)

print(f"\n読み込んだファイル: {report_csv_path}")
print(f"行数: {len(df_analysis)}")
print(f"列: {list(df_analysis.columns)}\n")

# (heading, column) pairs, in the order they are printed for each row
report_sections = (
    ("\n【Conversation Analysis】", 'conversation_analysis'),
    ("\n【Preference Analysis】", 'preference_analysis'),
    ("\n【Preference Summary】", 'preference_summary'),
)

for row_label, row in df_analysis.iterrows():
    print(f"\n{rule}")
    print(f"会話 #{row['conversation_id'] + 1}")
    print(rule)
    for heading, column in report_sections:
        print(heading)
        print(row[column])
    print()

print(rule)
print("表示完了")
print(rule)

CSVファイルの読み込みと表示

読み込んだファイル: /home/y-aida/Programs/preference-kg/notebooks/results/reports/daily_dialogue_with_recoginized_concept_raw/preference_analysis_results_example.csv
行数: 3
列: ['conversation_id', 'conversation', 'conversation_analysis', 'preference_analysis', 'preference_summary']


会話 #1

【Conversation Analysis】
The conversation revolves around the decision of whether to go for beers after dinner or engage in a healthier activity. The context highlights a concern for fitness and the desire to socialize, leading to a discussion about alternatives like going to the gym and dancing.

【Preference Analysis】
Explicit: The user explicitly states a preference against drinking beer due to fitness concerns, saying it will 'just make us fat and act silly.' This indicates a clear dislike for activities that compromise health.
Latent: The user shows an implied preference for socializing in healthier environments, as they suggest going to the gym and playing pingpong instead of drinking. Thi

In [7]:
# Extract only the `preference_summary` column and take the first 3 rows
df_oie = df_analysis[['preference_summary']].head(3)

print("\n--- preference_summary列のみを抽出（3件） ---")
print(df_oie)
print(f"\nデータ型: {df_oie['preference_summary'].dtype}")
print(f"\nサイズ: {df_oie.shape}")

# Show the first record in full
print("\n--- 最初の1件の詳細 ---")
print(df_oie['preference_summary'].iloc[0])


--- preference_summary列のみを抽出（3件） ---
                                  preference_summary
0  The user has a preference against drinking alc...
1  The user has a preference for exercise regardi...
2  The user has a preference for background music...

データ型: object

サイズ: (3, 1)

--- 最初の1件の詳細 ---
The user has a preference against drinking alcohol regarding health and fitness, under the condition of wanting to maintain a healthy lifestyle.
The user has a preference for engaging in social activities that promote fitness regarding leisure activities, under the condition of wanting to avoid sedentary options.


In [20]:
# Read the OIE prompt template and the few-shot examples.
# `with` closes each file handle; UTF-8 is stated explicitly so the result
# does not depend on the machine's locale.
with open(oie_prompt_template_file_path, 'r', encoding='utf-8') as template_file:
    oie_prompt_template_str = template_file.read()
with open(oie_few_shot_example_file_path, 'r', encoding='utf-8') as examples_file:
    oie_few_shot_example_str = examples_file.read()

In [22]:
def extract_oie_triples(
        input_text_str: str,
        few_shot_examples_str: str,
        prompt_template_str: str,
        entities_hint: str = None,
        relations_hint: str = None,
        model: str = "gpt-4o-mini",
        max_tokens: int = 512,
        max_retries: int = 5,
        retry_delay: float = 5,
    ):
    """Extract triples from a text with an OIE prompt sent to the chat model.

    The template is filled via ``str.format`` with the fields
    ``few_shot_examples``, ``input_text``, ``entities_hint`` and
    ``relations_hint``; an instruction to answer as JSON with a ``triplets``
    key is appended before the request is sent.

    Args:
        input_text_str: Text to extract triples from.
        few_shot_examples_str: Few-shot examples inserted into the template.
        prompt_template_str: Prompt template.
        entities_hint: Optional entity hint; must be given together with
            ``relations_hint``.
        relations_hint: Optional relation hint; must be given together with
            ``entities_hint``.
        model: OpenAI model name.
        max_tokens: Completion token limit forwarded to the API.
        max_retries: Number of attempts before the last API error is re-raised.
            Pass None to retry indefinitely.
        retry_delay: Seconds to sleep between failed attempts.

    Returns:
        The value stored under ``triplets`` in the reply, or an empty list when
        the key is missing or the reply is not a JSON object.

    Raises:
        ValueError: If exactly one of the two hints is provided.
        Exception: The last API error once ``max_retries`` attempts have failed.
    """
    # The two hints are only meaningful as a pair
    if (entities_hint is None) != (relations_hint is None):
        raise ValueError("entities_hint と relations_hint は両方指定するか、両方Noneにする必要があります")

    # Build the prompt
    filled_prompt = prompt_template_str.format(
        few_shot_examples=few_shot_examples_str,
        input_text=input_text_str,
        entities_hint=entities_hint or "",
        relations_hint=relations_hint or "",
    )

    # Ask explicitly for JSON output
    filled_prompt += "\n\nPlease output your response in JSON format with a 'triplets' key containing an array of triplets."

    messages = [{"role": "user", "content": filled_prompt}]

    attempt = 0
    while True:
        attempt += 1
        try:
            response = openai.chat.completions.create(
                model=model,
                messages=messages,
                temperature=0,
                max_tokens=max_tokens,
                response_format={"type": "json_object"},
            )
            break
        except Exception as e:
            logger.error(f"API呼び出しエラー: {e}")
            # Give up after max_retries instead of looping forever on a
            # failure that repeats on every attempt.
            if max_retries is not None and attempt >= max_retries:
                raise
            time.sleep(retry_delay)

    # Parse the JSON reply; TypeError covers a reply whose content is None
    try:
        result = json.loads(response.choices[0].message.content)
    except (json.JSONDecodeError, TypeError) as e:
        logger.error(f"JSON解析エラー: {e}")
        return []

    # Only a JSON object can carry the `triplets` key
    if not isinstance(result, dict):
        return []
    return result.get("triplets", [])

In [23]:
# Extract triples from every preference_summary
print("=" * 80)
print("OIE トリプル抽出開始")
print("=" * 80)

oie_results = []


# `tqdm` is imported as a module in the first cell, so the progress bar is tqdm.tqdm
for idx, input_text in enumerate(tqdm.tqdm(df_oie['preference_summary'], desc="トリプル抽出中")):
    print(f"\n処理中: {idx + 1}/{len(df_oie)}")

    # Extract the triples (the template variable is `oie_prompt_template_str`)
    oie_triples = extract_oie_triples(input_text, oie_few_shot_example_str, oie_prompt_template_str)

    # Keep the result; conversation_id here is the row position within df_oie
    oie_results.append({
        'conversation_id': idx,
        'preference_summary': input_text,
        'triplets': oie_triples,
        'triplet_count': len(oie_triples)
    })

    print(f"抽出されたトリプル数: {len(oie_triples)}")

    # Pause between requests to respect API limits
    if idx < len(df_oie) - 1:
        time.sleep(1)

print("\n" + "=" * 80)
print("トリプル抽出完了")
print("=" * 80)

# Save the results to a JSON file
output_json_path = f'/home/y-aida/Programs/preference-kg/notebooks/results/reports/{dataset_name}/oie_triplets.json'

# Create the parent directory if it does not exist
os.makedirs(os.path.dirname(output_json_path), exist_ok=True)

# Write the JSON file
with open(output_json_path, 'w', encoding='utf-8') as f:
    json.dump(oie_results, f, ensure_ascii=False, indent=2)

logger.success(f"トリプルをJSONに保存しました: {output_json_path}")
print(f"\n保存先: {output_json_path}")
print(f"保存件数: {len(oie_results)}件")
print(f"総トリプル数: {sum(r['triplet_count'] for r in oie_results)}個")

# Report the file size
file_size = os.path.getsize(output_json_path) / 1024  # in KB
print(f"ファイルサイズ: {file_size:.2f} KB")

OIE トリプル抽出開始


トリプル抽出中:   0%|          | 0/3 [00:00<?, ?it/s]


処理中: 1/3
抽出されたトリプル数: 6
抽出されたトリプル数: 6


トリプル抽出中:  33%|███▎      | 1/3 [00:03<00:06,  3.14s/it]


処理中: 2/3
抽出されたトリプル数: 6
抽出されたトリプル数: 6


トリプル抽出中:  67%|██████▋   | 2/3 [00:06<00:03,  3.18s/it]


処理中: 3/3


トリプル抽出中: 100%|██████████| 3/3 [00:08<00:00,  2.67s/it]
[32m2025-10-27 17:32:20.044[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m43[0m - [32m[1mトリプルをJSONに保存しました: /home/y-aida/Programs/preference-kg/notebooks/results/reports/daily_dialogue_with_recoginized_concept_raw/oie_triplets.json[0m
トリプル抽出中: 100%|██████████| 3/3 [00:08<00:00,  2.67s/it]
[32m2025-10-27 17:32:20.044[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36m<module>[0m:[36m43[0m - [32m[1mトリプルをJSONに保存しました: /home/y-aida/Programs/preference-kg/notebooks/results/reports/daily_dialogue_with_recoginized_concept_raw/oie_triplets.json[0m


抽出されたトリプル数: 3

トリプル抽出完了

保存先: /home/y-aida/Programs/preference-kg/notebooks/results/reports/daily_dialogue_with_recoginized_concept_raw/oie_triplets.json
保存件数: 3件
総トリプル数: 15個
ファイルサイズ: 2.42 KB


In [8]:
# Use Open Information Extraction (OIE) to extract triples from a preference_summary
def extract_triplets_from_preferences(preference_summary_text: str, model="gpt-4o-mini",
                                      max_tokens=256, max_retries=5, retry_delay=5):
    """Extract (subject, predicate, object) triples from preference statements.

    Args:
        preference_summary_text: Text of the preference_summary.
        model: OpenAI model name.
        max_tokens: Completion token limit forwarded to the API.
        max_retries: Number of attempts before the last API error is re-raised.
            Pass None to retry indefinitely.
        retry_delay: Seconds to sleep between failed attempts.

    Returns:
        The value stored under ``triplets`` in the reply, or an empty list when
        the key is missing or the reply is not a JSON object.

    Raises:
        Exception: The last API error once ``max_retries`` attempts have failed.
    """

    prompt = f"""Extract knowledge graph triplets from the following preference statements.
Each triplet should be in the format: (subject, predicate, object)

Preference statements:
{preference_summary_text}

Output format: Return a JSON array of triplets, where each triplet is an array of [subject, predicate, object].
Example: {{"triplets": [["user", "prefers", "vegetables"], ["user", "likes", "healthy food"]]}}
"""

    messages = [{"role": "user", "content": prompt}]

    attempt = 0
    while True:
        attempt += 1
        try:
            response = openai.chat.completions.create(
                model=model,
                messages=messages,
                temperature=0,
                max_tokens=max_tokens,
                response_format={"type": "json_object"},
            )
            break
        except Exception as e:
            logger.error(f"API呼び出しエラー: {e}")
            # Give up after max_retries instead of looping forever on a
            # failure that repeats on every attempt.
            if max_retries is not None and attempt >= max_retries:
                raise
            time.sleep(retry_delay)

    # Parse the JSON reply; TypeError covers a reply whose content is None
    try:
        result = json.loads(response.choices[0].message.content)
    except (json.JSONDecodeError, TypeError) as e:
        logger.error(f"JSON解析エラー: {e}")
        return []

    # Only a JSON object can carry the `triplets` key
    if not isinstance(result, dict):
        return []
    return result.get("triplets", [])

# Smoke test: run the triple extraction on the first preference_summary only
separator = "=" * 80
print(separator)
print("トリプル抽出テスト")
print(separator)

first_preference_summary = df_oie['preference_summary'].iloc[0]
print("\n【入力】")
print(first_preference_summary)

# The heading is printed before the API call, as a progress cue
print("\n【抽出されたトリプル】")
triplets = extract_triplets_from_preferences(first_preference_summary)
for i, triplet in enumerate(triplets, start=1):
    print(f"{i}. {triplet}")

print("\n" + separator)

トリプル抽出テスト

【入力】
The user has a preference against drinking alcohol regarding health and fitness, under the condition of wanting to maintain a healthy lifestyle.
The user has a preference for engaging in social activities that promote fitness regarding leisure activities, under the condition of wanting to avoid sedentary options.

【抽出されたトリプル】
1. ['user', 'prefers against', 'drinking alcohol']
2. ['drinking alcohol', 'regarding', 'health and fitness']
3. ['user', 'prefers', 'engaging in social activities that promote fitness']
4. ['engaging in social activities that promote fitness', 'regarding', 'leisure activities']
5. ['user', 'wants to', 'maintain a healthy lifestyle']
6. ['user', 'wants to', 'avoid sedentary options']

1. ['user', 'prefers against', 'drinking alcohol']
2. ['drinking alcohol', 'regarding', 'health and fitness']
3. ['user', 'prefers', 'engaging in social activities that promote fitness']
4. ['engaging in social activities that promote fitness', 'regarding', 'leisure a