In [None]:
import pandas as pd
# Load the full dataset from the parquet file in the working directory; later cells read the
# job-description column ('岗位描述') from this frame.
df = pd.read_parquet('matched_all.parquet')

In [None]:
import re
import html
from typing import List


def clean_description(text: object) -> str:
    """Clean raw HTML-ish job description text.

    Steps, in order:
    - Decode HTML character references (``&amp;``, ``&lt;``, ``&nbsp;``, ``&#160;`` ...).
    - Replace every non-breaking space (``\\xa0``) with a regular space.
    - Turn ``<br>``, ``<p>`` and a fixed set of common tags into newlines;
      replace any other tag with a space.
    - Normalise ``\\r`` to ``\\n``, tabs to spaces, and collapse runs of
      newlines and of spaces / ideographic spaces.
    - Strip leading and trailing whitespace.

    Args:
        text: The raw description. ``None`` yields ``""``; any non-string value
            is converted with ``str()`` first.

    Returns:
        The cleaned text, or ``""`` when the input is ``None`` or cannot be
        converted to a string.
    """
    if text is None:
        return ""
    if not isinstance(text, str):
        try:
            text = str(text)
        except Exception:
            return ""

    # Decode entities BEFORE normalising NBSP: html.unescape turns "&nbsp;",
    # "&#160;" and "&#xA0;" into "\xa0", so replacing "\xa0" afterwards catches
    # every spelling instead of only the literal "&nbsp;" form.
    s = html.unescape(text)
    s = s.replace("\xa0", " ")

    # Preserve structural breaks for <br> and <p>, then strip remaining tags
    s = re.sub(r"<br\s*/?>", "\n", s, flags=re.IGNORECASE)
    s = re.sub(r"</?p[^>]*>", "\n", s, flags=re.IGNORECASE)
    s = re.sub(r"</?(div|span|ul|ol|li|strong|em|font)[^>]*>", "\n", s, flags=re.IGNORECASE)
    s = re.sub(r"<[^>]+>", " ", s)  # any remaining tags

    # Normalize whitespace
    s = s.replace("\r", "\n")
    s = re.sub(r"\t+", " ", s)
    s = re.sub(r"\n+", "\n", s)
    s = re.sub(r"[ \u3000]{2,}", " ", s)

    return s.strip()


def extract_responsibilities(text: object) -> List[str]:
    """Pull the first numbered list ("1." / "1、" style) out of a description.

    The scan starts at the first marker whose number is 1 and keeps taking
    items until another marker numbered 1 shows up, which is treated as the
    start of a different list; everything from there on is ignored.

    Args:
        text: The description. ``None`` yields ``[]``; non-strings are
            converted with ``str()``.

    Returns:
        The item texts in order, with leading bullet/colon characters and
        trailing ``；``/``;``/``。``/whitespace removed. Empty when no marker
        numbered 1 exists.
    """
    if text is None:
        return []
    if not isinstance(text, str):
        try:
            text = str(text)
        except Exception:
            return []

    # One or two digits followed by "." or "、", not glued to a preceding digit.
    numbered = list(re.finditer(r"(?<!\d)(\d{1,2})[\.、]\s*", text))

    # Position (within `numbered`) of the first marker whose number is 1.
    start_at = next(
        (pos for pos, hit in enumerate(numbered) if hit.group(1) == "1"),
        None,
    )
    if start_at is None:
        return []

    # boundaries[k] is where the text of marker k stops: the start of the next
    # marker, or the end of the string for the last marker.
    boundaries = [hit.start() for hit in numbered[1:]] + [len(text)]

    results: List[str] = []
    for offset, hit in enumerate(numbered[start_at:]):
        # A second "1" means a new list has begun; stop collecting.
        if offset and hit.group(1) == "1":
            break
        segment = text[hit.end():boundaries[start_at + offset]]
        # Drop leading bullets/punctuation, then surrounding whitespace.
        segment = re.sub(r"^[：:;；\-\s·•]+", "", segment).strip()
        if not segment:
            continue
        # Drop trailing sentence punctuation.
        results.append(re.sub(r"[；;。\s]+$", "", segment))

    return results


In [None]:
# Build the batch-request JSONL (200-row sample) for the Bailian platform.
import os, json

system_prompt = "请提取岗位描述信息中的岗位职责，返回Python列表。若岗位描述信息中没有岗位职责，返回none。"

out_dir = os.path.join('.', '中间文件')
os.makedirs(out_dir, exist_ok=True)
out_path = os.path.join(out_dir, 'batch_responsibilities_200.jsonl')

# One JSON object per line; custom_id carries the sample's original index label.
sample_rows = df_sample.reset_index()
with open(out_path, 'w', encoding='utf-8') as sink:
    for position, record in sample_rows.iterrows():
        if 'index' in record:
            original_idx = int(record['index'])
        else:
            original_idx = position
        raw_desc = record.get('岗位描述', '')
        if pd.notna(raw_desc):
            user_text = clean_description(raw_desc)
        else:
            user_text = ""
        chat_messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_text},
        ]
        request_body = {
            "model": "qwen-flash",
            "temperature": 0,
            "messages": chat_messages,
        }
        payload = {
            "custom_id": f"resp-{original_idx:06d}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": request_body,
        }
        sink.write(json.dumps(payload, ensure_ascii=False) + "\n")

print('写入完成:', out_path)
# Preview the first two lines of the file just written.
with open(out_path, 'r', encoding='utf-8') as preview:
    print(preview.readline().rstrip())
    print(preview.readline().rstrip())


In [None]:
# Build the batch-request JSONL (200-row sample) for Zhipu.
import os, json

system_prompt = "请提取岗位描述信息中的岗位职责或任务，输出{\"responsibilities\":[\"…\",\"…\"]}。若岗位描述信息中没有岗位职责，输出{\"responsibilities\":[]}。"

out_dir = os.path.join('.', '中间文件')
os.makedirs(out_dir, exist_ok=True)
out_path = os.path.join(out_dir, 'batch_responsibilities_200_zhipu.jsonl')

# One JSON object per line; custom_id carries the sample's original index label.
sample_rows = df_sample.reset_index()
with open(out_path, 'w', encoding='utf-8') as sink:
    for position, record in sample_rows.iterrows():
        if 'index' in record:
            original_idx = int(record['index'])
        else:
            original_idx = position
        raw_desc = record.get('岗位描述', '')
        if pd.notna(raw_desc):
            user_text = clean_description(raw_desc)
        else:
            user_text = ""
        chat_messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_text},
        ]
        request_body = {
            "model": "glm-4-flash",
            "temperature": 0.1,
            "messages": chat_messages,
        }
        payload = {
            "custom_id": f"resp-{original_idx:06d}",
            "method": "POST",
            "url": "/v4/chat/completions",
            "body": request_body,
        }
        sink.write(json.dumps(payload, ensure_ascii=False) + "\n")

print('写入完成:', out_path)
# Preview the first two lines of the file just written.
with open(out_path, 'r', encoding='utf-8') as preview:
    print(preview.readline().rstrip())
    print(preview.readline().rstrip())


In [None]:
# Record the custom_id <-> original row index mapping so model results can be
# joined back onto the sample later.
map_path = os.path.join(out_dir, 'batch_responsibilities_200_map.csv')
(
    df_sample.reset_index()[['index']]
      .rename(columns={'index': 'row_index'})
      .assign(custom_id=lambda d: d['row_index'].apply(lambda x: f'resp-{int(x):06d}'))
      .to_csv(map_path, index=False, encoding='utf-8')
)
print('映射写入完成:', map_path)

# Preview the first three lines in pure Python. The previous PowerShell call
# relied on `$env:CD`, which is not a real environment variable, and only
# worked on Windows.
with open(map_path, 'r', encoding='utf-8') as f:
    for line_no, line in enumerate(f):
        if line_no >= 3:
            break
        print(line.rstrip('\n'))


In [None]:
# Parse the JSONL file returned by the batch job into (custom_id, raw_content) rows.
ret_path = os.path.join(out_dir, '471b56b6-c1e4-4710-910d-949d98a544d1_1757233593021_success.jsonl')
rows = []
with open(ret_path, 'r', encoding='utf-8') as result_file:
    # Blank lines are skipped.
    for raw_line in (ln for ln in result_file if ln.strip()):
        record = json.loads(raw_line)
        request_id = record.get('custom_id')
        # The reply text lives at response.body.choices[0].message.content;
        # any failure along that path leaves the content as None.
        try:
            reply = record['response']['body']['choices'][0]['message']['content']
        except Exception:
            reply = None
        rows.append({'custom_id': request_id, 'raw_content': reply})

import pandas as pd
ret_df = pd.DataFrame(rows)
ret_df.head(3)


In [None]:
# 结果归一化：将raw_content解析为Python列表或None
from ast import literal_eval

def _coerce_to_str_list(value):
    """Return ``value`` as a list of stripped strings, or ``None`` if it is not list-like.

    A dict is unwrapped through its ``"responsibilities"`` key first, which is
    the object shape requested by the JSON-style prompts in this notebook.
    """
    if isinstance(value, dict):
        value = value.get("responsibilities")
    if isinstance(value, list):
        return [str(x).strip() for x in value]
    return None


def normalize_to_list_or_none(text: object):
    """Normalise a raw model reply into a list of strings, or ``None``.

    Accepted reply shapes:
    - a Python list literal (single or double quotes),
    - a JSON array,
    - an object ``{"responsibilities": [...]}`` (Python or JSON syntax),
    - any of the above wrapped in a markdown code fence (```` ``` ```` with an
      optional language tag on the first line).

    Args:
        text: The reply. ``None`` yields ``None``; a ``list`` is returned
            unchanged; other non-strings are converted with ``str()``.

    Returns:
        A list of stripped strings, or ``None`` when nothing list-like can be
        parsed (including replies such as ``none`` / ``null``).
    """
    if text is None:
        return None
    if isinstance(text, list):
        return text
    if not isinstance(text, str):
        text = str(text)
    s = text.strip()

    # Unwrap a surrounding markdown code fence, dropping an optional language tag.
    if len(s) >= 6 and s.startswith("```") and s.endswith("```"):
        inner = s[3:-3]
        head, sep, rest = inner.partition("\n")
        tag = head.strip()
        if sep and (not tag or tag.isalnum()):
            s = rest.strip()
        else:
            s = inner.strip()

    # Try literal_eval first (tolerates single quotes), then json.loads
    # (tolerates true/false/null). Parsing is best-effort: a failure just
    # moves on to the next parser.
    for parser in (literal_eval, json.loads):
        try:
            parsed = parser(s)
        except Exception:
            continue
        result = _coerce_to_str_list(parsed)
        if result is not None:
            return result

    # Anything else ("none", "null", free text, ...) counts as no list.
    return None

# Parse each raw model reply into a list (or None) and store it in a new column of ret_df.
ret_df['岗位职责_列表'] = ret_df['raw_content'].map(normalize_to_list_or_none)
# Display the first rows as a quick check of the parsed column.
ret_df.head(3)


In [None]:
# Join the model output back onto the 200-row sample through the mapping file.
map_df = pd.read_csv(os.path.join(out_dir, 'batch_responsibilities_200_map.csv'), encoding='utf-8')
model_part = ret_df[['custom_id','岗位职责_列表']]
sample_part = df_sample.reset_index()[['index','岗位描述','岗位描述_清洗','岗位职责列表']]

# mapping -> model replies (by custom_id) -> sample rows (by original row index)
with_model = map_df.merge(model_part, on='custom_id', how='left')
merged = with_model.merge(sample_part, left_on='row_index', right_on='index', how='left')

# Rename so the two extraction sources are distinguishable: model vs rule-based.
merged = merged.rename(columns={'岗位职责_列表': '岗位职责_模型抽取', '岗位职责列表': '岗位职责_规则抽取'})

# Report how many rows have a non-empty list from the model.
has_items = merged['岗位职责_模型抽取'].map(lambda x: isinstance(x, list) and len(x) > 0)
print('模型抽取非空条数:', has_items.sum())
print('总条数:', len(merged))
merged.head(5)


In [None]:
# Export a side-by-side preview to Excel, ordered by original row index.
preview_cols = ['row_index', 'custom_id', '岗位描述', '岗位描述_清洗', '岗位职责_规则抽取', '岗位职责_模型抽取']
excel_path = os.path.join('.', 'responsibility_preview.xlsx')
preview_table = merged[preview_cols].sort_values('row_index')
preview_table.to_excel(excel_path, index=False)
print('Excel 写入完成:', excel_path)


In [None]:
# Collect each request's prompt/completion/total token counts into a DataFrame.
usage_columns = ['custom_id', 'status_code', 'model',
                 'prompt_tokens', 'completion_tokens', 'total_tokens']
usage_rows = []
with open(ret_path, 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue
        obj = json.loads(line)
        custom_id = obj.get('custom_id')
        # `response`, `body` and `usage` may each be absent or null; normalise
        # all three to {} so one malformed record cannot abort the whole pass.
        resp = obj.get('response') or {}
        status_code = resp.get('status_code')
        body = resp.get('body') or {}
        usage = body.get('usage') or {}
        model = body.get('model')
        usage_rows.append({
            'custom_id': custom_id,
            'status_code': status_code,
            'model': model,
            'prompt_tokens': usage.get('prompt_tokens'),
            'completion_tokens': usage.get('completion_tokens'),
            'total_tokens': usage.get('total_tokens')
        })

# Passing `columns` keeps the schema stable even when the result file is empty,
# so the numeric conversion below never hits a missing column.
usage_df = pd.DataFrame(usage_rows, columns=usage_columns)
for col in ['prompt_tokens', 'completion_tokens', 'total_tokens']:
    # Nullable Int64 keeps missing counts as <NA> instead of forcing floats.
    usage_df[col] = pd.to_numeric(usage_df[col], errors='coerce').astype('Int64')

print('行数:', len(usage_df))
usage_df.head(5)


In [None]:
# Write the 200-row sample as a simplified batch JSONL (custom_id + body only)
# to batch_responsibilities_200_doubao.jsonl, with temperature 0 and thinking disabled.
import os, json

# Fallbacks so this cell can run even if earlier cells were not executed.
if 'pd' not in globals():
    import pandas as pd
if 'df_sample' not in globals():
    if 'df' not in globals():
        df = pd.read_parquet('matched_all.parquet')
    # Fallback sample: the first 200 rows, description column only.
    df_sample = df[['岗位描述']].head(200).copy()
if 'clean_description' not in globals():
    def clean_description(text):
        """Minimal fallback cleaner: None -> '', otherwise str(text) stripped."""
        if text is None:
            return ''
        return str(text).strip()

system_prompt = "请提取岗位描述信息中的岗位职责，返回Python列表。若岗位描述信息中没有岗位职责，返回none。"

out_dir = os.path.join('.', '中间文件')
os.makedirs(out_dir, exist_ok=True)
out_path = os.path.join(out_dir, 'batch_responsibilities_200_doubao.jsonl')

with open(out_path, 'w', encoding='utf-8') as f:
    for i, row in df_sample.reset_index().iterrows():
        original_idx = int(row['index']) if 'index' in row else i
        desc = row.get('岗位描述', '')
        user_text = clean_description(desc) if pd.notna(desc) else ''
        obj = {
            "custom_id": f"resp-{original_idx:06d}",
            "body": {
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_text}
                ],
                "temperature": 0,
                # The key must be exactly "thinking"; the previous "thinking "
                # (trailing space) was a different key, so the option was never applied.
                "thinking": {"type": "disabled"}
            }
        }
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")

print('已重写:', out_path)
# Preview the first two lines of the file just written.
with open(out_path, 'r', encoding='utf-8') as f:
    for _ in range(2):
        print(f.readline().rstrip())


In [None]:
# Token totals over the parsed batch results.
prompt_total = usage_df['prompt_tokens'].sum()
completion_total = usage_df['completion_tokens'].sum()
print(prompt_total)
print(completion_total)

# Cost of this batch: tokens / 1000 * per-1k rate / 2 for prompt and completion,
# then scaled from the 200-row sample up to every row of df.
prompt_cost = prompt_total/1000*0.00015/2
completion_cost = completion_total/1000*0.0015/2
comsumption = prompt_cost + completion_cost
print('comsumption:', comsumption)
print('estimated total comsumption:', df.shape[0]/200*comsumption)

In [None]:
# Show the five requests with the largest prompt token counts.
usage_df.sort_values('prompt_tokens', ascending=False).head(5)

In [None]:
# Clean the full dataset and export it as Zhipu JSONL in chunks of 50,000 rows, with a progress bar
import os, json, math

# Prefer tqdm for progress; if it cannot be imported, fall back to per-chunk progress prints
try:
    from tqdm import tqdm  # type: ignore
except Exception:  # noqa: BLE001
    tqdm = None  # type: ignore

# Fallback import in case pandas was not imported by an earlier cell
if 'pd' not in globals():
    import pandas as pd  # type: ignore

# Load the full dataset (only if it has not been loaded yet)
if 'df' not in globals():
    df = pd.read_parquet('matched_all.parquet')

# Cleaning function: reuse clean_description when it exists in the kernel,
# otherwise define a fallback version here
if 'clean_description' in globals():
    clean_fn = clean_description
else:
    import re, html
    def clean_fn(text: object) -> str:
        """Fallback cleaner: decode entities, turn tags into newlines/spaces, collapse whitespace.

        Returns '' for None or for values that cannot be converted with str().
        """
        if text is None:
            return ''
        if not isinstance(text, str):
            try:
                text = str(text)
            except Exception:  # noqa: BLE001
                return ''
        # Normalise non-breaking spaces, then decode the remaining HTML entities
        s = text.replace('\xa0', ' ').replace('&nbsp;', ' ')
        s = html.unescape(s)
        # <br>, <p> and a fixed set of common tags become newlines; any other tag becomes a space
        s = re.sub(r"<br\s*/?>", "\n", s, flags=re.IGNORECASE)
        s = re.sub(r"</?p[^>]*>", "\n", s, flags=re.IGNORECASE)
        s = re.sub(r"</?(div|span|ul|ol|li|strong|em|font)[^>]*>", "\n", s, flags=re.IGNORECASE)
        s = re.sub(r"<[^>]+>", " ", s)
        # Whitespace normalisation: \r -> \n, tabs -> space, collapse newline and space runs
        s = s.replace("\r", "\n")
        s = re.sub(r"\t+", " ", s)
        s = re.sub(r"\n+", "\n", s)
        s = re.sub(r"[ \u3000]{2,}", " ", s)
        return s.strip()

# Add the cleaned-description column on a copy of df (missing values become '').
df_all = df.copy()
df_all['岗位描述_清洗'] = df_all['岗位描述'].map(
    lambda value: clean_fn(value) if pd.notna(value) else ''
)

# Output directory for the chunked files (created if absent).
base_out_dir = os.path.join('.', '中间文件', 'zhipu_batches')
os.makedirs(base_out_dir, exist_ok=True)

# System prompt sent with every Zhipu request.
system_prompt = (
    "请提取岗位描述信息中的岗位职责或任务，输出{\"responsibilities\":[\"…\",\"…\"]}。"
    "若岗位描述信息中没有岗位职责，输出{\"responsibilities\":[]}。"
)

# Chunking: 50,000 requests per output file.
chunk_size = 50_000
n_rows = len(df_all)
import math as _math
if n_rows > 0:
    n_chunks = _math.ceil(n_rows / chunk_size)
else:
    n_chunks = 0

# Progress bar, only when tqdm imported successfully and there is something to write.
use_tqdm = tqdm is not None and n_rows > 0
if use_tqdm:
    pbar = tqdm(total=n_rows, desc='Writing Zhipu JSONL')
else:
    pbar = None

# reset_index exposes the original index as a column; it becomes the custom_id.
with_index = df_all.reset_index()

for chunk_idx, start in enumerate(range(0, n_rows, chunk_size)):
    end = min(start + chunk_size, n_rows)
    chunk = with_index.iloc[start:end][['index', '岗位描述_清洗']]

    out_path = os.path.join(base_out_dir, f'batch_responsibilities_zhipu_{chunk_idx + 1:03d}.jsonl')
    with open(out_path, 'w', encoding='utf-8') as sink:
        # Plain tuples (name=None) are iterated instead of namedtuples.
        for original_idx, user_text in chunk.itertuples(index=False, name=None):
            chat_messages = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_text or ''},
            ]
            request_body = {
                "model": "glm-4-flash",
                "temperature": 0.1,
                "messages": chat_messages,
            }
            payload = {
                "custom_id": f"resp-{int(original_idx):06d}",
                "method": "POST",
                "url": "/v4/chat/completions",
                "body": request_body,
            }
            sink.write(json.dumps(payload, ensure_ascii=False) + "\n")
            if use_tqdm:
                pbar.update(1)

    # Per-chunk notice, so progress is visible even without tqdm.
    print(f'写入完成: {out_path}  [{start}-{end})')

if use_tqdm and pbar is not None:
    pbar.close()

print(f'总记录数: {n_rows}, 总文件数: {n_chunks}, 输出目录: {base_out_dir}')

# Preview the first line of the first two output files.
import glob as _glob
jsonl_pattern = os.path.join(base_out_dir, '*.jsonl')
preview_files = sorted(_glob.glob(jsonl_pattern))[:2]
for fp in preview_files:
    print('预览:', fp)
    with open(fp, 'r', encoding='utf-8') as preview:
        print(preview.readline().rstrip())



In [2]:
import os

from zai import ZhipuAiClient

# Never hard-code the API key in the notebook: read it from the environment.
# NOTE(review): the key that was previously committed in this cell should be
# treated as leaked and revoked.
client = ZhipuAiClient(api_key=os.environ["ZAI_API_KEY"])

# Upload the batch file. os.path.join keeps the path portable across
# platforms, and the `with` block closes the file handle after the upload.
batch_file_path = os.path.join("中间文件", "zhipu_batches", "batch_responsibilities_zhipu_003.jsonl")
with open(batch_file_path, "rb") as batch_file:
    file_object = client.files.create(
        file=batch_file,
        purpose="batch"
    )
print(file_object)

FileObject(id='1757250233_171a269149e0484abfa6d5c0c64abe19', bytes=57815255, created_at=1757250233, filename='batch_responsibilities_zhipu_003.jsonl', object='file', purpose='batch', status=None, status_details=None)


0.0.3.4
