In [1]:
# 批量返回结果解析（OK 与 ERROR JSONL）
import os, re, json
from typing import Optional, List, Any, Dict
import pandas as pd
from ast import literal_eval

# Input file paths: both batch result files live in the same directory.
_batch_return_dir = os.path.join('.', '中间文件', 'batch_return_files')
ok_path = os.path.join(_batch_return_dir, 'output_202509081250.jsonl')
err_path = os.path.join(_batch_return_dir, 'error_202509081251.jsonl')


# Opening fence plus an optional info string.  A language tag is consumed only
# when it ends the fence line, so tags such as ```python or ```json5 do not
# leak into the payload; a bare "json" is additionally accepted inline
# (e.g. ```json {...} ```).  The generic alternative must come first, otherwise
# "json" would match the prefix of "json5" and leave the "5" in the payload.
_FENCE_OPEN = r"```(?:[A-Za-z0-9_+#.-]+(?=[ \t]*\r?\n)|json)?\s*"
_CLOSED_FENCE_RE = re.compile(_FENCE_OPEN + r"([\s\S]*?)```", re.IGNORECASE)
_UNCLOSED_FENCE_RE = re.compile(r"\s*" + _FENCE_OPEN + r"([\s\S]*)", re.IGNORECASE)


def extract_code_block_text(text: str) -> str:
    """Return the payload of the first triple-backtick code block in *text*.

    Args:
        text: Raw text, possibly wrapping its payload in a fenced code block
            with or without a language tag.

    Returns:
        The stripped content of the first closed fenced block if there is one.
        If *text* starts with an opening fence that is never closed, the
        stripped remainder after that fence.  Otherwise *text* stripped.
        Non-string input is returned as ``str(text)``, and ``None`` as ``''``.
    """
    if not isinstance(text, str):
        return '' if text is None else str(text)

    closed = _CLOSED_FENCE_RE.search(text)
    if closed:
        return closed.group(1).strip()

    # No closing fence: drop a leading opening fence so the payload can still
    # be parsed by the caller.
    unclosed = _UNCLOSED_FENCE_RE.match(text)
    if unclosed:
        return unclosed.group(1).strip()

    return text.strip()


def _coerce_responsibilities(obj: Any) -> Optional[List[str]]:
    """Return the responsibilities held by *obj* as stripped strings, else None.

    *obj* may be the list itself or a dict whose ``'responsibilities'`` value
    is a list; anything else yields None.
    """
    if isinstance(obj, dict):
        obj = obj.get('responsibilities')
    if isinstance(obj, list):
        return [str(item).strip() for item in obj]
    return None


def parse_responsibilities_from_content(content: Any) -> Optional[List[str]]:
    """Convert a model ``content`` value into a list of responsibility strings.

    Accepted forms:
    - an already-parsed list, or a dict with a ``'responsibilities'`` list;
    - JSON wrapped in a code fence: ```json {"responsibilities": [...]} ```;
    - a bare JSON / Python-literal string holding such a dict or a plain list.

    Args:
        content: The raw content value; may be None.

    Returns:
        The list items converted with ``str`` and stripped, or None when
        *content* is None or no responsibilities list can be parsed from it.
    """
    if content is None:
        return None
    if isinstance(content, (list, dict)):
        return _coerce_responsibilities(content)

    text = extract_code_block_text(str(content))

    # Strict JSON first, then literal_eval as a fallback for Python-style
    # literals (single quotes etc.).  literal_eval only evaluates literals.
    for loader in (json.loads, literal_eval):
        try:
            parsed = loader(text)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Not parseable by this loader; json.JSONDecodeError is a ValueError.
            continue
        result = _coerce_responsibilities(parsed)
        if result is not None:
            return result

    return None


In [2]:
# Parse the OK JSONL file into batch_ok_df, one row per non-blank line.
# Columns are declared up front so that an input with no records still yields
# a frame with the expected columns instead of a column-less one.
ok_columns = [
    'custom_id',
    'status_code',
    'model',
    'prompt_tokens',
    'completion_tokens',
    'total_tokens',
    'raw_content',
    'responsibilities',
]
ok_rows = []
with open(ok_path, 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue
        obj = json.loads(line)
        # `or {}` also covers keys that are present but null, which a
        # `.get(key, {})` default does not.
        resp = obj.get('response') or {}
        body = resp.get('body') or {}
        choices = body.get('choices') or []
        content = None
        if choices:
            # Only the first choice is read.
            content = ((choices[0] or {}).get('message') or {}).get('content')
        usage = body.get('usage') or {}
        ok_rows.append({
            'custom_id': obj.get('custom_id'),
            'status_code': resp.get('status_code'),
            'model': body.get('model'),
            'prompt_tokens': usage.get('prompt_tokens'),
            'completion_tokens': usage.get('completion_tokens'),
            'total_tokens': usage.get('total_tokens'),
            'raw_content': content,
            'responsibilities': parse_responsibilities_from_content(content),
        })

batch_ok_df = pd.DataFrame(ok_rows, columns=ok_columns)
# Nullable Int64 keeps missing or non-numeric token counts as <NA>.
for col in ['prompt_tokens', 'completion_tokens', 'total_tokens']:
    batch_ok_df[col] = pd.to_numeric(batch_ok_df[col], errors='coerce').astype('Int64')

len(batch_ok_df), batch_ok_df.head(3)


FileNotFoundError: [Errno 2] No such file or directory: '.\\中间文件\\batch_return_files\\output_202509081250.jsonl'

In [None]:
# Parse the ERROR JSONL file into batch_err_df, one row per non-blank line.
# Columns are declared up front so that an input with no records still yields
# a frame that has 'custom_id' and 'error_code' columns.
err_columns = ['custom_id', 'status_code', 'model', 'error_code', 'error_message']
err_rows = []
with open(err_path, 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue
        obj = json.loads(line)
        # `or {}` also covers a key that is present but null, which a
        # `.get(key, {})` default does not.
        resp = obj.get('response') or {}
        body = resp.get('body') or {}
        # The error object is looked up at the top level first, then in the body.
        error_obj = obj.get('error') or body.get('error') or {}
        err_rows.append({
            'custom_id': obj.get('custom_id'),
            'status_code': resp.get('status_code'),
            'model': body.get('model'),
            'error_code': error_obj.get('code'),
            'error_message': error_obj.get('message'),
        })

batch_err_df = pd.DataFrame(err_rows, columns=err_columns)
len(batch_err_df), batch_err_df.head(3)


In [None]:
# Preview: row counts and a few sample rows.
print('OK数:', len(batch_ok_df))
print('ERROR数:', len(batch_err_df))


def _is_non_empty_list(value):
    """Return True when *value* is a list with at least one element."""
    return isinstance(value, list) and len(value) > 0


# Number of OK rows whose responsibilities parsed to a non-empty list.
non_empty = batch_ok_df['responsibilities'].map(_is_non_empty_list)
print('OK中解析出职责的条数:', non_empty.sum())

batch_ok_df.head(5), batch_err_df.head(5)


In [None]:
# Parse the base request JSONL file into batch_req_df, one row per non-blank line.
# The path is named like ok_path / err_path, and columns are declared up front
# so that an input with no records still yields a frame with a 'custom_id'
# column to merge on.
req_path = os.path.join('.', '中间文件', 'zhipu_batches_dedup', 'batch_responsibilities_zhipu_dedup_009.jsonl')
req_columns = ['custom_id', 'req_model', 'req_temperature', 'req_max_tokens', 'req_system', 'req_user']
req_rows = []
with open(req_path, 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue
        obj = json.loads(line)
        body = obj.get('body') or {}
        messages = body.get('messages') or []
        # Keep the first non-null system and user message contents so each
        # request can be traced back to its prompt.
        sys_text = None
        user_text = None
        for m in messages:
            role = m.get('role')
            if role == 'system' and sys_text is None:
                sys_text = m.get('content')
            elif role == 'user' and user_text is None:
                user_text = m.get('content')
        req_rows.append({
            'custom_id': obj.get('custom_id'),
            'req_model': body.get('model'),
            'req_temperature': body.get('temperature'),
            'req_max_tokens': body.get('max_tokens'),
            'req_system': sys_text,
            'req_user': user_text,
        })

batch_req_df = pd.DataFrame(req_rows, columns=req_columns)
len(batch_req_df), batch_req_df.head(3)


In [None]:
# Left-join the OK and ERROR results onto the requests by custom_id.
# Some custom_ids appear in only one of the OK / ERROR files.
req_with_ok = pd.merge(
    batch_req_df, batch_ok_df, on='custom_id', how='left', suffixes=('', '_ok')
)
merged_all = pd.merge(
    req_with_ok, batch_err_df, on='custom_id', how='left', suffixes=('', '_err')
)

# Series.count() is the number of non-NA values in the column.
print('总请求数:', len(batch_req_df))
print('匹配到OK的数:', merged_all['status_code'].count())
print('匹配到ERROR的数:', merged_all['error_code'].count())

merged_all.head(5)


In [None]:
# Export a random preview of at most 100 merged rows.  The sample size is
# capped at the frame length because DataFrame.sample (without replacement)
# raises ValueError when asked for more rows than exist.
merged_all.sample(min(100, len(merged_all))).to_excel('preview.xlsx')

In [None]:
# Display the (rows, columns) shape of the merged frame.
merged_all.shape