In [1]:
import pandas as pd
df = pd.read_parquet('matched_all.parquet')

In [5]:
import re
import html
from typing import List


def clean_description(text: object) -> str:
    """Clean raw HTML-ish job description text.

    - Trim whitespace/newlines at both ends
    - Replace \xa0 and &nbsp; with spaces
    - Remove common HTML tags; keep line breaks where sensible
    - Collapse repeated spaces/newlines
    """
    if text is None:
        return ""
    if not isinstance(text, str):
        try:
            text = str(text)
        except Exception:
            return ""

    s = text.replace("\xa0", " ")
    s = s.replace("&nbsp;", " ")
    # Decode common HTML entities (e.g., &amp;, &lt;, &gt;)
    s = html.unescape(s)

    # Preserve structural breaks for <br> and <p>, then strip remaining tags
    s = re.sub(r"<br\s*/?>", "\n", s, flags=re.IGNORECASE)
    s = re.sub(r"</?p[^>]*>", "\n", s, flags=re.IGNORECASE)
    s = re.sub(r"</?(div|span|ul|ol|li|strong|em|font)[^>]*>", "\n", s, flags=re.IGNORECASE)
    s = re.sub(r"<[^>]+>", " ", s)  # any remaining tags

    # Normalize whitespace
    s = s.replace("\r", "\n")
    s = re.sub(r"\t+", " ", s)
    s = re.sub(r"\n+", "\n", s)
    s = re.sub(r"[ \u3000]{2,}", " ", s)

    return s.strip()


def extract_responsibilities(text: object) -> List[str]:
    """Extract ordered list items (n. or n、) for job responsibilities.

    Rules:
    - Find the first ordered list that starts with a numeric marker like "1." or "1、".
    - Collect subsequent items until a new ordered list restart is detected (the next "1."/"1、").
    - Ignore everything after the restart.
    """
    if text is None:
        return []
    if not isinstance(text, str):
        try:
            text = str(text)
        except Exception:
            return []

    content = text

    # Match numbers like 1. or 1、 (1-2 digits), not preceded by a digit
    marker_pattern = re.compile(r"(?<!\d)(\d{1,2})[\.、]\s*")
    matches = list(marker_pattern.finditer(content))
    if not matches:
        return []

    # Find the first occurrence of marker == 1
    first_idx = None
    for idx, m in enumerate(matches):
        if m.group(1) == "1":
            first_idx = idx
            break
    if first_idx is None:
        return []

    items: List[str] = []
    for i in range(first_idx, len(matches)):
        current = matches[i]
        # Stop if a new list restarts at 1 (excluding the very first 1)
        if i > first_idx and current.group(1) == "1":
            break
        start_pos = current.end()
        end_pos = matches[i + 1].start() if i + 1 < len(matches) else len(content)

        raw = content[start_pos:end_pos]
        # Trim leading bullets/punctuations and trailing sentence punctuation
        raw = re.sub(r"^[：:;；\-\s·•]+", "", raw)
        cleaned = raw.strip()
        if cleaned:
            cleaned = re.sub(r"[；;。\s]+$", "", cleaned)
            items.append(cleaned)

    return items


In [None]:
# 生成用于批量调用的大模型请求 JSONL（200条）#百炼
import os, json

system_prompt = "请提取岗位描述信息中的岗位职责，返回Python列表。若岗位描述信息中没有岗位职责，返回none。"

out_dir = os.path.join('.', '中间文件')
os.makedirs(out_dir, exist_ok=True)
out_path = os.path.join(out_dir, 'batch_responsibilities_200.jsonl')

with open(out_path, 'w', encoding='utf-8') as f:
    for _, row in df_sample.reset_index().iterrows():
        original_idx = int(row['index']) if 'index' in row else _
        desc = row.get('岗位描述', '')
        user_text = clean_description(desc) if pd.notna(desc) else ""
        payload = {
            "custom_id": f"resp-{original_idx:06d}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": "qwen-flash",
                "temperature": 0,
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_text}
                ]
            }
        }
        f.write(json.dumps(payload, ensure_ascii=False) + "\n")

print('写入完成:', out_path)
# 预览前两行
with open(out_path, 'r', encoding='utf-8') as f:
    for _ in range(2):
        print(f.readline().rstrip())


In [None]:
# 生成用于批量调用的大模型请求 JSONL（200条）#智谱
import os, json

system_prompt = "请提取岗位描述信息中的岗位职责或任务，输出{\"responsibilities\":[\"…\",\"…\"]}。若岗位描述信息中没有岗位职责，输出{\"responsibilities\":[]}。"

out_dir = os.path.join('.', '中间文件')
os.makedirs(out_dir, exist_ok=True)
out_path = os.path.join(out_dir, 'batch_responsibilities_200_zhipu.jsonl')

with open(out_path, 'w', encoding='utf-8') as f:
    for _, row in df_sample.reset_index().iterrows():
        original_idx = int(row['index']) if 'index' in row else _
        desc = row.get('岗位描述', '')
        user_text = clean_description(desc) if pd.notna(desc) else ""
        payload = {
            "custom_id": f"resp-{original_idx:06d}",
            "method": "POST",
            "url": "/v4/chat/completions",
            "body": {
                "model": "glm-4-flash",
                "temperature": 0.1,
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_text}
                ]
            }
        }
        f.write(json.dumps(payload, ensure_ascii=False) + "\n")

print('写入完成:', out_path)
# 预览前两行
with open(out_path, 'r', encoding='utf-8') as f:
    for _ in range(2):
        print(f.readline().rstrip())


In [None]:
# 记录 custom_id 与原始行号映射，便于回写
map_path = os.path.join(out_dir, 'batch_responsibilities_200_map.csv')
(
    df_sample.reset_index()[['index']]
      .rename(columns={'index': 'row_index'})
      .assign(custom_id=lambda d: d['row_index'].apply(lambda x: f'resp-{int(x):06d}'))
      .to_csv(map_path, index=False, encoding='utf-8')
)
print('映射写入完成:', map_path)
!powershell -NoProfile -Command "Get-Content -TotalCount 3 -Path \"$env:CD\\中间文件\\batch_responsibilities_200_map.csv\" | % {$_}"


In [None]:
# 解析返回的 JSONL
ret_path = os.path.join(out_dir, '471b56b6-c1e4-4710-910d-949d98a544d1_1757233593021_success.jsonl')
rows = []
with open(ret_path, 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue
        obj = json.loads(line)
        custom_id = obj.get('custom_id')
        # 优先从 choices[0].message.content 取内容
        content = None
        try:
            content = obj['response']['body']['choices'][0]['message']['content']
        except Exception:
            content = None
        rows.append({'custom_id': custom_id, 'raw_content': content})

import pandas as pd
ret_df = pd.DataFrame(rows)
ret_df.head(3)


In [None]:
# 结果归一化：将raw_content解析为Python列表或None
from ast import literal_eval

def normalize_to_list_or_none(text: object):
    if text is None:
        return None
    if isinstance(text, list):
        return text
    if not isinstance(text, str):
        text = str(text)
    s = text.strip()
    # 允许两种常见形式：1) Python 列表字符串 2) JSON 数组字符串
    # 优先尝试literal_eval（兼容单引号），失败再尝试json.loads
    try:
        val = literal_eval(s)
        if isinstance(val, list):
            return [str(x).strip() for x in val]
    except Exception:
        pass
    try:
        val = json.loads(s)
        if isinstance(val, list):
            return [str(x).strip() for x in val]
    except Exception:
        pass
    # 有时模型会返回"none"、"null"等
    if s.lower() in {"none", "null", "无", "没有"}:
        return None
    return None

ret_df['岗位职责_列表'] = ret_df['raw_content'].map(normalize_to_list_or_none)
ret_df.head(3)


In [None]:
# 合并回200条样本（通过mapping）
map_df = pd.read_csv(os.path.join(out_dir, 'batch_responsibilities_200_map.csv'), encoding='utf-8')
merged = (map_df.merge(ret_df[['custom_id','岗位职责_列表']], on='custom_id', how='left')
                .merge(df_sample.reset_index()[['index','岗位描述','岗位描述_清洗','岗位职责列表']], left_on='row_index', right_on='index', how='left'))

# 新列命名：模型抽取 vs 规则抽取
merged = merged.rename(columns={'岗位职责_列表': '岗位职责_模型抽取', '岗位职责列表': '岗位职责_规则抽取'})

# 预览成功解析条数
print('模型抽取非空条数:', merged['岗位职责_模型抽取'].map(lambda x: isinstance(x, list) and len(x) > 0).sum())
print('总条数:', len(merged))
merged.head(5)


In [None]:
# 导出预览到 Excel
preview_cols = ['row_index', 'custom_id', '岗位描述', '岗位描述_清洗', '岗位职责_规则抽取', '岗位职责_模型抽取']
excel_path = os.path.join('.', 'responsibility_preview.xlsx')
(
    merged[preview_cols]
        .sort_values('row_index')
        .to_excel(excel_path, index=False)
)
print('Excel 写入完成:', excel_path)


In [None]:
# 提取每个请求的输入/输出 token 数量到 DataFrame
usage_rows = []
with open(ret_path, 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue
        obj = json.loads(line)
        custom_id = obj.get('custom_id')
        resp = obj.get('response', {})
        status_code = resp.get('status_code')
        body = resp.get('body', {}) or {}
        usage = body.get('usage', {}) or {}
        model = body.get('model')
        usage_rows.append({
            'custom_id': custom_id,
            'status_code': status_code,
            'model': model,
            'prompt_tokens': usage.get('prompt_tokens'),
            'completion_tokens': usage.get('completion_tokens'),
            'total_tokens': usage.get('total_tokens')
        })

usage_df = pd.DataFrame(usage_rows)
for col in ['prompt_tokens', 'completion_tokens', 'total_tokens']:
    usage_df[col] = pd.to_numeric(usage_df[col], errors='coerce').astype('Int64')

print('行数:', len(usage_df))
usage_df.head(5)


In [None]:
# 重写 batch_responsibilities_200.jsonl 为简化schema（仅 custom_id + body），温度=1
import os, json

# 准备数据与函数（兜底）
if 'pd' not in globals():
    import pandas as pd
if 'df_sample' not in globals():
    if 'df' not in globals():
        df = pd.read_parquet('matched_all.parquet')
    df_sample = df[['岗位描述']].head(200).copy()
if 'clean_description' not in globals():
    def clean_description(text):
        if text is None:
            return ''
        return str(text).strip()

system_prompt = "请提取岗位描述信息中的岗位职责，返回Python列表。若岗位描述信息中没有岗位职责，返回none。"

out_dir = os.path.join('.', '中间文件')
os.makedirs(out_dir, exist_ok=True)
out_path = os.path.join(out_dir, 'batch_responsibilities_200_doubao.jsonl')

with open(out_path, 'w', encoding='utf-8') as f:
    for i, row in df_sample.reset_index().iterrows():
        original_idx = int(row['index']) if 'index' in row else i
        desc = row.get('岗位描述', '')
        user_text = clean_description(desc) if pd.notna(desc) else ''
        obj = {
            "custom_id": f"resp-{original_idx:06d}",
            "body": {
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_text}
                ],
                "temperature": 0,
                "thinking ":{"type":"disabled"}
            }
        }
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")

print('已重写:', out_path)
with open(out_path, 'r', encoding='utf-8') as f:
    for _ in range(2):
        print(f.readline().rstrip())


In [None]:
print(usage_df['prompt_tokens'].sum())
print(usage_df['completion_tokens'].sum())
comsumption = usage_df['prompt_tokens'].sum()/1000*0.00015/2 + usage_df['completion_tokens'].sum()/1000*0.0015/2
print('comsumption:', comsumption)
print('estimated total comsumption:', df.shape[0]/200*comsumption)

In [None]:
usage_df.sort_values('prompt_tokens', ascending=False).head(5)

In [1]:
# 全量清洗并按5万条切分导出为智谱JSONL，带进度条
import os, json, math
from tqdm import tqdm

# 兜底导入
if 'pd' not in globals():
    import pandas as pd  # type: ignore

# 读取全量数据（若未读取）
if 'df' not in globals():
    df = pd.read_parquet('matched_all.parquet')

# 清洗函数（若之前未定义，则提供兜底版本）
if 'clean_description' in globals():
    clean_fn = clean_description
else:
    import re, html
    def clean_fn(text: object) -> str:
        if text is None:
            return ''
        if not isinstance(text, str):
            try:
                text = str(text)
            except Exception:  # noqa: BLE001
                return ''
        s = text.replace('\xa0', ' ').replace('&nbsp;', ' ')
        s = html.unescape(s)
        s = re.sub(r"<br\s*/?>", "\n", s, flags=re.IGNORECASE)
        s = re.sub(r"</?p[^>]*>", "\n", s, flags=re.IGNORECASE)
        s = re.sub(r"</?(div|span|ul|ol|li|strong|em|font)[^>]*>", "\n", s, flags=re.IGNORECASE)
        s = re.sub(r"<[^>]+>", " ", s)
        s = s.replace("\r", "\n")
        s = re.sub(r"\t+", " ", s)
        s = re.sub(r"\n+", "\n", s)
        s = re.sub(r"[ \u3000]{2,}", " ", s)
        return s.strip()

# 生成清洗列
df_all = df.copy()
df_all['岗位描述_清洗'] = df_all['岗位描述'].map(lambda x: clean_fn(x) if pd.notna(x) else '')
df_all.dropna(subset=['岗位描述_清洗'], inplace=True)
# 输出目录（新建文件夹）
base_out_dir = os.path.join('.', '中间文件', 'zhipu_batches')
os.makedirs(base_out_dir, exist_ok=True)

# 智谱请求体参数
system_prompt = (
    "请提取岗位描述信息中的**岗位职责或任务**，输出{\"responsibilities\":[\"…\",\"…\"]}。"
    "若岗位描述信息中**没有**岗位职责，输出{\"responsibilities\":[]}。注意：不要提取学历、年龄、技能、经验等**任职要求**。"
)

# 切分参数：每个文件 50,000 条
chunk_size = 50_000
n_rows = len(df_all)
import math as _math
n_chunks = _math.ceil(n_rows / chunk_size) if n_rows > 0 else 0

# 使用 reset_index 保留原始行号，custom_id 使用原始行号格式化
with_index = df_all.reset_index()

# 进度条
with tqdm(total=n_rows, desc='Writing Zhipu JSONL') as pbar:
    for chunk_idx in range(n_chunks):
        start = chunk_idx * chunk_size
        end = min((chunk_idx + 1) * chunk_size, n_rows)
        chunk = with_index.iloc[start:end][['index', '岗位描述_清洗']]

        out_path = os.path.join(base_out_dir, f'batch_responsibilities_zhipu_{chunk_idx + 1:03d}.jsonl')
        with open(out_path, 'w', encoding='utf-8') as f:
            # 使用 itertuples(name=None) 以获得更快的迭代
            for original_idx, user_text in chunk.itertuples(index=False, name=None):
                payload = {
                    "custom_id": f"resp-{int(original_idx):06d}",
                    "method": "POST",
                    "url": "/v4/chat/completions",
                    "body": {
                        "model": "glm-4-flash",
                        "temperature": 0.1,
                        "messages": [
                            {"role": "system", "content": system_prompt},
                            {"role": "user", "content": user_text or ''}
                        ]
                    }
                }
                f.write(json.dumps(payload, ensure_ascii=False) + "\n")
                pbar.update(1)

print(f'总记录数: {n_rows}, 总文件数: {n_chunks}, 输出目录: {base_out_dir}')

# 预览前两个文件的首行
import glob as _glob
preview_files = sorted(_glob.glob(os.path.join(base_out_dir, '*.jsonl')))[:2]
for fp in preview_files:
    print('预览:', fp)
    with open(fp, 'r', encoding='utf-8') as f:
        print(f.readline().rstrip())



Writing Zhipu JSONL: 100%|██████████| 5028331/5028331 [00:36<00:00, 139447.01it/s]

总记录数: 5028331, 总文件数: 101, 输出目录: .\中间文件\zhipu_batches
预览: .\中间文件\zhipu_batches\batch_responsibilities_zhipu_001.jsonl
{"custom_id": "resp-000000", "method": "POST", "url": "/v4/chat/completions", "body": {"model": "glm-4-flash", "temperature": 0.1, "messages": [{"role": "system", "content": "请提取岗位描述信息中的**岗位职责或任务**，输出{\"responsibilities\":[\"…\",\"…\"]}。若岗位描述信息中**没有**岗位职责，输出{\"responsibilities\":[]}。注意：不要提取学历、年龄、技能、经验等**任职要求**。"}, {"role": "user", "content": "1、性别：男2、年龄：35岁以下3、教育背景：医学相关专业、贸易、管理等专业，本科优先。 4、工作经验 2年以上医疗器械、耗材产品采购经验者 5、技能与素质（1）熟悉医疗器械管理条例；（2）大学专或本科学历,医学、药学相关专业 ；（3）Windows系统熟练，精通Excel。（4）良好的学习能力和沟通能力。（5）有相关工作经验 6、工作概要（1） 负责医疗器械、耗材的采购，库存管理，订单制定，退换货处理。 （2）与公司上游商业客户和公司内部各部门的沟通协调，支持公司销售目标的达成。"}]}}
预览: .\中间文件\zhipu_batches\batch_responsibilities_zhipu_002.jsonl
{"custom_id": "resp-050000", "method": "POST", "url": "/v4/chat/completions", "body": {"model": "glm-4-flash", "temperature": 0.1, "messages": [{"role": "system", "content": "请提取岗位描述信息中的**岗位职责或任务**，输出{\"responsibilities\":[\"




In [6]:
# 去重导出（智谱）：仅导出唯一的 岗位描述_清洗，并写出回写映射
import os, json, math
from tqdm import tqdm
import pandas as pd

# 若不存在 df_all，则构建（含 岗位描述_清洗）
if 'df_all' not in globals() or '岗位描述_清洗' not in df_all.columns:
    if 'df' not in globals():
        df = pd.read_parquet('matched_all.parquet')
    # 使用已定义的 clean_description，否则提供兜底
    if 'clean_description' in globals():
        clean_fn = clean_description
    else:
        import re, html
        def clean_fn(text: object) -> str:
            if text is None:
                return ''
            if not isinstance(text, str):
                try:
                    text = str(text)
                except Exception:
                    return ''
            s = text.replace('\xa0', ' ').replace('&nbsp;', ' ')
            s = html.unescape(s)
            s = re.sub(r"<br\s*/?>", "\n", s, flags=re.IGNORECASE)
            s = re.sub(r"</?p[^>]*>", "\n", s, flags=re.IGNORECASE)
            s = re.sub(r"</?(div|span|ul|ol|li|strong|em|font)[^>]*>", "\n", s, flags=re.IGNORECASE)
            s = re.sub(r"<[^>]+>", " ", s)
            s = s.replace("\r", "\n")
            s = re.sub(r"\t+", " ", s)
            s = re.sub(r"\n+", "\n", s)
            s = re.sub(r"[ \u3000]{2,}", " ", s)
            return s.strip()
    df_all = df.copy()
    df_all['岗位描述_清洗'] = df_all['岗位描述'].map(lambda x: clean_fn(x) if pd.notna(x) else '')

# 1) 生成去重ID：按首次出现顺序 factorize
with_index = df_all.reset_index()[['index', '岗位描述_清洗']]
codes, uniques = pd.factorize(with_index['岗位描述_清洗'], sort=False)
with_index['desc_id'] = pd.Series(codes, index=with_index.index).map(lambda c: f'desc-{int(c):07d}')

# 2) 写出映射：原始行号 -> desc_id；唯一描述表 desc_id -> 文本
dedup_dir = os.path.join('.', '中间文件', 'zhipu_batches_dedup')
os.makedirs(dedup_dir, exist_ok=True)

row2desc_path = os.path.join(dedup_dir, 'map_row_to_desc_id.parquet')
with_index[['index', 'desc_id']].to_parquet(row2desc_path, index=False)

unique_df = pd.DataFrame({
    'desc_id': [f'desc-{i:07d}' for i in range(len(uniques))],
    '岗位描述_清洗': uniques
})
unique_path = os.path.join(dedup_dir, 'unique_cleaned_descriptions.parquet')
unique_df.to_parquet(unique_path, index=False)

print('唯一描述数:', len(unique_df), '映射文件:', row2desc_path)

# # 3) 导出去重后的请求为 Zhipu 批处理 JSONL（每文件 50,000 条）
# system_prompt = (
#     "请提取岗位描述信息中的**岗位职责或任务**，输出{\"responsibilities\":[\"…\",\"…\"]}。"
#     "若岗位描述信息中**没有**岗位职责，输出{\"responsibilities\":[]}。注意：不要提取学历、年龄、技能、经验等**任职要求**。"
# )

# chunk_size = 50_000
# n_rows = len(unique_df)
# base_out_dir = dedup_dir

# with tqdm(total=n_rows, desc='Writing Dedup Zhipu JSONL') as pbar:
#     for chunk_idx in range(math.ceil(n_rows / chunk_size)):
#         start = chunk_idx * chunk_size
#         end = min((chunk_idx + 1) * chunk_size, n_rows)
#         chunk = unique_df.iloc[start:end][['desc_id', '岗位描述_清洗']]

#         out_path = os.path.join(base_out_dir, f'batch_responsibilities_zhipu_dedup_{chunk_idx + 1:03d}.jsonl')
#         with open(out_path, 'w', encoding='utf-8') as f:
#             for desc_id, user_text in chunk.itertuples(index=False, name=None):
#                 payload = {
#                     "custom_id": desc_id,
#                     "method": "POST",
#                     "url": "/v4/chat/completions",
#                     "body": {
#                         "model": "glm-4-flash",
#                         "temperature": 0.1,
#                         "messages": [
#                             {"role": "system", "content": system_prompt},
#                             {"role": "user", "content": user_text or ''}
#                         ]
#                     }
#                 }
#                 f.write(json.dumps(payload, ensure_ascii=False) + "\n")
#                 pbar.update(1)

# print(f'去重导出完成。唯一数: {n_rows}, 输出目录: {base_out_dir}')

# # 4) 预览首个文件首行
# import glob
# preview_files = sorted(glob.glob(os.path.join(base_out_dir, 'batch_responsibilities_zhipu_dedup_*.jsonl')))[:1]
# for fp in preview_files:
#     print('预览:', fp)
#     with open(fp, 'r', encoding='utf-8') as f:
#         print(f.readline().rstrip())


唯一描述数: 3813824 映射文件: .\中间文件\zhipu_batches_dedup\map_row_to_desc_id.parquet


In [None]:
merged_before = pd.read_parquet('id_responsibility_match.parquet')
merged_before.rename(columns={'custom_id':'desc_id'},inplace=True)
x = pd.merge(with_index,merged_before,on='desc_id',how='left')
x.drop(['desc_id', 'index', '岗位描述_清洗'],axis=1,inplace=True)
df_all.reset_index(drop=True,inplace=True)
x.reset_index(drop=True,inplace=True)
df_out = pd.concat([df_all,x],axis=1)
df_out.drop(['name_raw_norm', 'name_std_norm', '岗位描述_清洗'],axis=1,inplace=True)
df_out.to_parquet('matched_all_with_responsibility.parquet')

In [1]:
# 使用 ZhipuAiClient 批量上传去重后的 JSONL 文件
import os, glob
import pandas as pd
from zai import ZhipuAiClient

client = ZhipuAiClient(api_key="569c5512417849eca12a693b5dbb562b.RbV8Ms2ChOI2JvAi")  # TODO: 替换为你的API Key

upload_dir = os.path.join('.', '中间文件', 'zhipu_batches_dedup')
files = sorted(glob.glob(os.path.join(upload_dir, 'batch_responsibilities_zhipu_dedup_*.jsonl')))
print('待上传文件数:', len(files))

records = []
for fp in files:
    fo = client.files.create(file=open(fp, 'rb'), purpose='batch')
    # 兼容对象字段名
    rec = {
        'local_path': fp,
        'id': getattr(fo, 'id', None) or fo.get('id') if isinstance(fo, dict) else None,
        'filename': getattr(fo, 'filename', None) or fo.get('filename') if isinstance(fo, dict) else os.path.basename(fp),
        'bytes': getattr(fo, 'bytes', None) or fo.get('bytes') if isinstance(fo, dict) else None,
        'created_at': getattr(fo, 'created_at', None) or fo.get('created_at') if isinstance(fo, dict) else None,
        'object': getattr(fo, 'object', None) or fo.get('object') if isinstance(fo, dict) else None,
        'purpose': getattr(fo, 'purpose', None) or fo.get('purpose') if isinstance(fo, dict) else 'batch',
        'status': getattr(fo, 'status', None) or fo.get('status') if isinstance(fo, dict) else None,
    }
    print('已上传:', rec['filename'], '->', rec['id'])
    records.append(rec)

manifest = pd.DataFrame(records)
manifest_path = os.path.join(upload_dir, 'upload_manifest.parquet')
manifest.to_parquet(manifest_path, index=False)
print('清单写入:', manifest_path)
manifest.head(3)


待上传文件数: 76
已上传: batch_responsibilities_zhipu_dedup_002.jsonl -> None
已上传: batch_responsibilities_zhipu_dedup_003.jsonl -> None
已上传: batch_responsibilities_zhipu_dedup_004.jsonl -> None
已上传: batch_responsibilities_zhipu_dedup_005.jsonl -> None
已上传: batch_responsibilities_zhipu_dedup_006.jsonl -> None
已上传: batch_responsibilities_zhipu_dedup_007.jsonl -> None
已上传: batch_responsibilities_zhipu_dedup_008.jsonl -> None
已上传: batch_responsibilities_zhipu_dedup_009.jsonl -> None
已上传: batch_responsibilities_zhipu_dedup_010.jsonl -> None
已上传: batch_responsibilities_zhipu_dedup_011.jsonl -> None
已上传: batch_responsibilities_zhipu_dedup_012.jsonl -> None
已上传: batch_responsibilities_zhipu_dedup_013.jsonl -> None
已上传: batch_responsibilities_zhipu_dedup_014.jsonl -> None
已上传: batch_responsibilities_zhipu_dedup_015.jsonl -> None
已上传: batch_responsibilities_zhipu_dedup_016.jsonl -> None
已上传: batch_responsibilities_zhipu_dedup_017.jsonl -> None
已上传: batch_responsibilities_zhipu_dedup_018.jsonl -> None
已上传

Unnamed: 0,local_path,id,filename,bytes,created_at,object,purpose,status
0,.\中间文件\zhipu_batches_dedup\batch_responsibilit...,,batch_responsibilities_zhipu_dedup_002.jsonl,,,,batch,
1,.\中间文件\zhipu_batches_dedup\batch_responsibilit...,,batch_responsibilities_zhipu_dedup_003.jsonl,,,,batch,
2,.\中间文件\zhipu_batches_dedup\batch_responsibilit...,,batch_responsibilities_zhipu_dedup_004.jsonl,,,,batch,


In [None]:
from zai import ZhipuAiClient

client = ZhipuAiClient(api_key="569c5512417849eca12a693b5dbb562b.RbV8Ms2ChOI2JvAi")
# 创建批处理任务
batch = client.batches.create(
    input_file_id="1757305452_063ec1b5f40a42fb8bf8673a28354df6",
    endpoint="/v4/chat/completions",
    auto_delete_input_file=True,
)
print(batch)

Batch(id='batch_1964952575041208320', completion_window=None, created_at=1757316175600, endpoint='/v4/chat/completions', input_file_id='1757305452_063ec1b5f40a42fb8bf8673a28354df6', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=None, failed_at=None, finalizing_at=None, in_progress_at=None, metadata=None, output_file_id=None, request_counts=BatchRequestCounts(completed=None, failed=None, total=50000))


In [1]:
import pandas as pd
import time
from zai import ZhipuAiClient

sha = pd.read_excel(r'中间文件\code.xlsx')
sha = sha['sha'].to_list()

client = ZhipuAiClient(api_key="569c5512417849eca12a693b5dbb562b.RbV8Ms2ChOI2JvAi")

# 循环创建批处理任务
for file_id in sha:
    batch = client.batches.create(
        input_file_id=file_id,
        endpoint="/v4/chat/completions",
        auto_delete_input_file=True,
    )
    print(f"Created batch for file_id: {file_id}")
    print(batch)
    time.sleep(2)  # 停顿2秒

Created batch for file_id: 1757305516_af9613a996a547c7b49c51dc55d995e1
Batch(id='batch_1964959304710619136', completion_window=None, created_at=1757317780078, endpoint='/v4/chat/completions', input_file_id='1757305516_af9613a996a547c7b49c51dc55d995e1', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=None, failed_at=None, finalizing_at=None, in_progress_at=None, metadata=None, output_file_id=None, request_counts=BatchRequestCounts(completed=None, failed=None, total=50000))
Created batch for file_id: 1757305579_99e88ffc0b5c483689ed75989df9a72f
Batch(id='batch_1964959315162443776', completion_window=None, created_at=1757317782570, endpoint='/v4/chat/completions', input_file_id='1757305579_99e88ffc0b5c483689ed75989df9a72f', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=None, 

In [1]:
from zai import ZhipuAiClient
import pandas as pd

client = ZhipuAiClient(api_key="569c5512417849eca12a693b5dbb562b.RbV8Ms2ChOI2JvAi")  # 填写您自己的APIKey

# client.batches.list返回了SyncCursorPage
batch_list = client.batches.list(limit=100)

batch_records = []
# 直接遍历batch_list中的每个batch对象
for batch in batch_list:
    # 将每个batch对象转换为字典并添加到列表中
    batch_dict = {
        'id': getattr(batch, 'id', None),
        'object': getattr(batch, 'object', None),
        'endpoint': getattr(batch, 'endpoint', None),
        'errors': getattr(batch, 'errors', None),
        'input_file_id': getattr(batch, 'input_file_id', None),
        'completion_window': getattr(batch, 'completion_window', None),
        'status': getattr(batch, 'status', None),
        'output_file_id': getattr(batch, 'output_file_id', None),
        'error_file_id': getattr(batch, 'error_file_id', None),
        'created_at': getattr(batch, 'created_at', None),
        'in_progress_at': getattr(batch, 'in_progress_at', None),
        'expires_at': getattr(batch, 'expires_at', None),
        'finalizing_at': getattr(batch, 'finalizing_at', None),
        'completed_at': getattr(batch, 'completed_at', None),
        'failed_at': getattr(batch, 'failed_at', None),
        'expired_at': getattr(batch, 'expired_at', None),
        'cancelling_at': getattr(batch, 'cancelling_at', None),
        'cancelled_at': getattr(batch, 'cancelled_at', None),
        'request_counts': getattr(batch, 'request_counts', None),
        'metadata': getattr(batch, 'metadata', None)
    }
    batch_records.append(batch_dict)

# 创建DataFrame
batch_df = pd.DataFrame(batch_records)
print(f"共获取到 {len(batch_df)} 个batch")
batch_df

共获取到 81 个batch


Unnamed: 0,id,object,endpoint,errors,input_file_id,completion_window,status,output_file_id,error_file_id,created_at,in_progress_at,expires_at,finalizing_at,completed_at,failed_at,expired_at,cancelling_at,cancelled_at,request_counts,metadata
0,batch_1964669396686340096,batch,,,1757248602_676026d46e654084a179b75869f468f1,24h,completed,1757261956_53692c0b80934e1c8fd1d68b980e8d38,,1757248660000,1757248801000,,1757261761000,1757261956000,,,,,"BatchRequestCounts(completed=200, failed=0, to...",
1,batch_1964675124331413504,batch,,,1757249776_74823f2dd9ad4ade9c316c11dd006b5f,24h,completed,1757261980_e5a0bebcb0cb440e95360bf369c91554,1757261980_f7d11716a49d4601b2d11e6546426a48,1757250026000,1757250795000,,1757261821000,1757261980000,,,,,"BatchRequestCounts(completed=49995, failed=5, ...",
2,batch_1964675146516004864,batch,,,1757249886_00f4203c334f4c468a7fd9fd3613d53f,24h,completed,1757262000_17a62c21472341a18448063718c03744,1757262000_be9bbeac0c6849a59fc76cf3d41258a4,1757250031000,1757250800000,,1757261821000,1757262000000,,,,,"BatchRequestCounts(completed=49908, failed=92,...",
3,batch_1964676078912012288,batch,,,1757250233_171a269149e0484abfa6d5c0c64abe19,24h,completed,1757262024_3f6a2a15a85c4343acd9c1d4c6fdae6e,1757262024_237d0299aca94d98a3f58d604a746743,1757250253000,1757250800000,,1757261821000,1757262024000,,,,,"BatchRequestCounts(completed=49960, failed=40,...",
4,batch_1964897795249676288,batch,,,1757303090_9c01feb553cf4c9f991139cca4395171,24h,completed,1757304020_d2bb46308620477bbd333bf94437ff45,1757304020_8f7474c7dd454cf28f305260cacdc87f,1757303115000,1757303560000,,1757303940000,1757304020000,,,,,"BatchRequestCounts(completed=49996, failed=4, ...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,batch_1964959697921380352,batch,/v4/chat/completions,,1757307959_b50970c69d9c4807a8b8305f1a8ede46,,completed,1757423704_aba74796cc534b7798c3d5d03ffd13ea,1757423704_2e58f674a1b2487d91a3eaec97711a85,1757317873000,1757369646000,,1757423640000,1757423704000,,,,,"BatchRequestCounts(completed=49999, failed=1, ...",
77,batch_1964959708458516480,batch,/v4/chat/completions,,1757308023_a2a70d2351fe4425afc448a5cd1b4146,,completed,1757423710_2519cf250aa64e628a10a5fd7c0b7f52,1757423710_d11d30c6a4d245d9be49ae798ffaeefc,1757317876000,1757369646000,,1757423640000,1757423710000,,,,,"BatchRequestCounts(completed=49999, failed=1, ...",
78,batch_1964959718992912384,batch,/v4/chat/completions,,1757308087_2494de23a51a4ab686be7658da401771,,completed,1757423716_61d35bf9f9154e5ba31603da0f8156c7,,1757317878000,1757369644000,,1757423640000,1757423716000,,,,,"BatchRequestCounts(completed=50000, failed=0, ...",
79,batch_1964959729463209984,batch,/v4/chat/completions,,1757308152_7ba5d596670b48e9861f6eb44ccac49b,,completed,1757423723_fb098e8d3df14152ba8aebd63e24e617,1757423723_720aaccd3834421f939f6508d45fece2,1757317881000,1757369643000,,1757423640000,1757423723000,,,,,"BatchRequestCounts(completed=49999, failed=1, ...",


In [5]:
batch_df['code'] = batch_df['id'].str[6:].astype(int)
batch_df_download = batch_df[batch_df['code']>=1964897795249676288]
batch_df_download = batch_df_download[batch_df_download['status']=='completed']

In [6]:
batch_df_download.sort_values('created_at')

Unnamed: 0,id,object,endpoint,errors,input_file_id,completion_window,status,output_file_id,error_file_id,created_at,...,expires_at,finalizing_at,completed_at,failed_at,expired_at,cancelling_at,cancelled_at,request_counts,metadata,code
4,batch_1964897795249676288,batch,,,1757303090_9c01feb553cf4c9f991139cca4395171,24h,completed,1757304020_d2bb46308620477bbd333bf94437ff45,1757304020_8f7474c7dd454cf28f305260cacdc87f,1757303115000,...,,1757303940000,1757304020000,,,,,"BatchRequestCounts(completed=49996, failed=4, ...",,1964897795249676288
5,batch_1964900191938686976,batch,,,1757303415_ec18e87111344fe59ff7deadac20b57f,24h,completed,1757306906_7b02b9dcecc44e29929f646d9a98368b,1757306906_00f8496a84e14f01b549af7f15bbe5cc,1757303686000,...,,1757306820000,1757306906000,,,,,"BatchRequestCounts(completed=49999, failed=1, ...",,1964900191938686976
6,batch_1964900232416727040,batch,,,1757303475_89160190f49c47a3b2ed1de1ede572da,24h,completed,1757306929_4ad0957d12834d608bb611f75be2be91,,1757303696000,...,,1757306820000,1757306930000,,,,,"BatchRequestCounts(completed=50000, failed=0, ...",,1964900232416727040
7,batch_1964900313258266624,batch,,,1757303537_97414ae4a3ed4255bee5980bf14b0362,24h,completed,1757306953_bceae0498f8a441a9d4d1d8eec221306,1757306953_a6114225f5eb46be86d12d33ce9a8884,1757303715000,...,,1757306820000,1757306953000,,,,,"BatchRequestCounts(completed=49999, failed=1, ...",,1964900313258266624
8,batch_1964900330912755712,batch,,,1757303597_04627b5bf19b4b649576ef7294a9788b,24h,completed,1757306975_2ddf74c34dd149cdbb3a0991756d6bc4,1757306975_4ff616a38908455fb01fcfca402a2ffa,1757303719000,...,,1757306760000,1757306975000,,,,,"BatchRequestCounts(completed=49999, failed=1, ...",,1964900330912755712
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,batch_1964959697921380352,batch,/v4/chat/completions,,1757307959_b50970c69d9c4807a8b8305f1a8ede46,,completed,1757423704_aba74796cc534b7798c3d5d03ffd13ea,1757423704_2e58f674a1b2487d91a3eaec97711a85,1757317873000,...,,1757423640000,1757423704000,,,,,"BatchRequestCounts(completed=49999, failed=1, ...",,1964959697921380352
77,batch_1964959708458516480,batch,/v4/chat/completions,,1757308023_a2a70d2351fe4425afc448a5cd1b4146,,completed,1757423710_2519cf250aa64e628a10a5fd7c0b7f52,1757423710_d11d30c6a4d245d9be49ae798ffaeefc,1757317876000,...,,1757423640000,1757423710000,,,,,"BatchRequestCounts(completed=49999, failed=1, ...",,1964959708458516480
78,batch_1964959718992912384,batch,/v4/chat/completions,,1757308087_2494de23a51a4ab686be7658da401771,,completed,1757423716_61d35bf9f9154e5ba31603da0f8156c7,,1757317878000,...,,1757423640000,1757423716000,,,,,"BatchRequestCounts(completed=50000, failed=0, ...",,1964959718992912384
79,batch_1964959729463209984,batch,/v4/chat/completions,,1757308152_7ba5d596670b48e9861f6eb44ccac49b,,completed,1757423723_fb098e8d3df14152ba8aebd63e24e617,1757423723_720aaccd3834421f939f6508d45fece2,1757317881000,...,,1757423640000,1757423723000,,,,,"BatchRequestCounts(completed=49999, failed=1, ...",,1964959729463209984


In [7]:
import os
from zai import ZhipuAiClient

client = ZhipuAiClient(api_key="569c5512417849eca12a693b5dbb562b.RbV8Ms2ChOI2JvAi")  # 填写您自己的APIKey

# 创建目标文件夹
output_dir = "中间文件/batch_return_files"
os.makedirs(output_dir, exist_ok=True)

# 遍历batch_df_download中的每个batch，下载结果文件
for index, row in batch_df_download.iterrows():
    output_file_id = row['output_file_id']
    batch_id = row['id']
    
    if output_file_id:
        try:
            # 下载文件内容
            content = client.files.content(output_file_id)
            
            # 构建输出文件名
            output_filename = f"batch_result_{batch_id}.jsonl"
            output_path = os.path.join(output_dir, output_filename)
            
            # 写入文件
            content.write_to_file(output_path)
            print(f"已下载: {batch_id} -> {output_filename}")
            
        except Exception as e:
            print(f"下载失败 {batch_id}: {e}")
    else:
        print(f"跳过 {batch_id}: 没有output_file_id")

已下载: batch_1964897795249676288 -> batch_result_batch_1964897795249676288.jsonl
已下载: batch_1964900191938686976 -> batch_result_batch_1964900191938686976.jsonl
已下载: batch_1964900232416727040 -> batch_result_batch_1964900232416727040.jsonl
已下载: batch_1964900313258266624 -> batch_result_batch_1964900313258266624.jsonl
已下载: batch_1964900330912755712 -> batch_result_batch_1964900330912755712.jsonl
已下载: batch_1964900355129081856 -> batch_result_batch_1964900355129081856.jsonl
已下载: batch_1964900388576428032 -> batch_result_batch_1964900388576428032.jsonl
已下载: batch_1964900680410988544 -> batch_result_batch_1964900680410988544.jsonl
已下载: batch_1964900903157637120 -> batch_result_batch_1964900903157637120.jsonl
已下载: batch_1964950777622560768 -> batch_result_batch_1964950777622560768.jsonl
已下载: batch_1964950805544579072 -> batch_result_batch_1964950805544579072.jsonl
已下载: batch_1964951432877051904 -> batch_result_batch_1964951432877051904.jsonl
已下载: batch_1964951474803712000 -> batch_result_batch