# 加载数据，导出输入json

In [1]:
# 2022 data pipeline: load the postings and keep only rows that have a ticker symbol.
import pandas as pd

# A forward-slash path resolves on both Windows and POSIX; the previous
# backslash literal only worked on Windows.
df = pd.read_parquet('中间文件/data_after_2022.parquet')
df.dropna(subset=['ticker_symbol'], inplace=True)
# Independent copy, so later changes to df_all do not touch df.
df_all = df.copy()

In [None]:
# Deduplicated export (Zhipu): export only the unique values of 岗位描述_清洗 and
# write the mapping files needed to join the model results back to the rows.
import os, json, math
from tqdm import tqdm
import pandas as pd

# Build df_all (including the 岗位描述_清洗 column) if it does not exist yet
if 'df_all' not in globals() or '岗位描述_清洗' not in df_all.columns:
    if 'df' not in globals():
        df = pd.read_parquet('matched_all.parquet')
    # Use clean_description if it is already defined; otherwise fall back to the local cleaner below
    if 'clean_description' in globals():
        clean_fn = clean_description
    else:
        import re, html
        def clean_fn(text: object) -> str:
            """Strip HTML markup and normalise whitespace; returns '' for None or unconvertible input."""
            if text is None:
                return ''
            if not isinstance(text, str):
                try:
                    text = str(text)
                except Exception:
                    return ''
            # Non-breaking spaces (raw and as the &nbsp; entity) become plain spaces
            s = text.replace('\xa0', ' ').replace('&nbsp;', ' ')
            s = html.unescape(s)
            # <br>, <p> and the listed block/inline tags become newlines; any other tag becomes a space
            s = re.sub(r"<br\s*/?>", "\n", s, flags=re.IGNORECASE)
            s = re.sub(r"</?p[^>]*>", "\n", s, flags=re.IGNORECASE)
            s = re.sub(r"</?(div|span|ul|ol|li|strong|em|font)[^>]*>", "\n", s, flags=re.IGNORECASE)
            s = re.sub(r"<[^>]+>", " ", s)
            s = s.replace("\r", "\n")
            # Collapse tab runs, newline runs, and runs of ASCII / full-width spaces
            s = re.sub(r"\t+", " ", s)
            s = re.sub(r"\n+", "\n", s)
            s = re.sub(r"[ \u3000]{2,}", " ", s)
            return s.strip()
    df_all = df.copy()
    # Missing descriptions are mapped to the empty string instead of being cleaned
    df_all['岗位描述_清洗'] = df_all['position_responsibilities'].map(lambda x: clean_fn(x) if pd.notna(x) else '')

# 1) Build the dedup IDs: factorize in order of first appearance
with_index = df_all.reset_index()[['index', '岗位描述_清洗']]
codes, uniques = pd.factorize(with_index['岗位描述_清洗'], sort=False)
# desc_id is "desc-" followed by the factorize code zero-padded to 7 digits
with_index['desc_id'] = pd.Series(codes, index=with_index.index).map(lambda c: f'desc-{int(c):07d}')

# 2) Write the mappings: original row index -> desc_id, and the unique table desc_id -> text
dedup_dir = os.path.join('.', '中间文件', 'zhipu_batches_dedup')
os.makedirs(dedup_dir, exist_ok=True)

row2desc_path = os.path.join(dedup_dir, 'map_row_to_desc_id2022.parquet')
with_index[['index', 'desc_id']].to_parquet(row2desc_path, index=False)

# Position i in `uniques` corresponds to factorize code i, so these IDs match with_index['desc_id']
unique_df = pd.DataFrame({
    'desc_id': [f'desc-{i:07d}' for i in range(len(uniques))],
    '岗位描述_清洗': uniques
})
unique_path = os.path.join(dedup_dir, 'unique_cleaned_descriptions2022.parquet')
unique_df.to_parquet(unique_path, index=False)

print('唯一描述数:', len(unique_df), '映射文件:', row2desc_path)

# 3) Export the deduplicated requests as Zhipu batch JSONL files (50,000 requests per file)
system_prompt = (
    "请提取岗位描述信息中的**岗位职责或任务**，输出{\"responsibilities\":[\"…\",\"…\"]}。"
    "若岗位描述信息中**没有**岗位职责，输出{\"responsibilities\":[]}。注意：不要提取学历、年龄、技能、经验等**任职要求**。"
)

chunk_size = 50_000
n_rows = len(unique_df)
base_out_dir = dedup_dir

with tqdm(total=n_rows, desc='Writing Dedup Zhipu JSONL') as pbar:
    for chunk_idx in range(math.ceil(n_rows / chunk_size)):
        start = chunk_idx * chunk_size
        end = min((chunk_idx + 1) * chunk_size, n_rows)
        chunk = unique_df.iloc[start:end][['desc_id', '岗位描述_清洗']]

        # Output files are numbered from 001
        out_path = os.path.join(base_out_dir, f'batch_responsibilities_zhipu_dedup_2022_{chunk_idx + 1:03d}.jsonl')
        with open(out_path, 'w', encoding='utf-8') as f:
            for desc_id, user_text in chunk.itertuples(index=False, name=None):
                # One request per line; custom_id carries the desc_id so results can be joined back
                payload = {
                    "custom_id": desc_id,
                    "method": "POST",
                    "url": "/v4/chat/completions",
                    "body": {
                        "model": "glm-4-flash",
                        "temperature": 0.1,
                        "messages": [
                            {"role": "system", "content": system_prompt},
                            {"role": "user", "content": user_text or ''}
                        ]
                    }
                }
                f.write(json.dumps(payload, ensure_ascii=False) + "\n")
                pbar.update(1)

print(f'去重导出完成。唯一数: {n_rows}, 输出目录: {base_out_dir}')

# 4) Preview the first line of the first exported file
import glob
preview_files = sorted(glob.glob(os.path.join(base_out_dir, 'batch_responsibilities_zhipu_dedup_2022_*.jsonl')))[:1]
for fp in preview_files:
    print('预览:', fp)
    with open(fp, 'r', encoding='utf-8') as f:
        print(f.readline().rstrip())


In [None]:
# Upload the deduplicated JSONL request files with ZhipuAiClient and record the
# metadata returned for each upload in a parquet manifest.
import glob
import os

import pandas as pd
from zai import ZhipuAiClient

# Credentials must not live in the notebook: read the key from the environment.
# NOTE(review): an API key was previously committed in this cell; it should be revoked.
api_key = os.environ.get("ZHIPUAI_API_KEY")
if not api_key:
    raise RuntimeError("ZHIPUAI_API_KEY environment variable is not set")
client = ZhipuAiClient(api_key=api_key)


def _file_field(file_obj, name, default=None):
    """Read `name` from an upload response that may be an object or a dict.

    Returns `default` when the field is absent or None.
    """
    if isinstance(file_obj, dict):
        value = file_obj.get(name)
    else:
        value = getattr(file_obj, name, None)
    return default if value is None else value


upload_dir = os.path.join('.', '中间文件', 'zhipu_batches_dedup')
files = sorted(glob.glob(os.path.join(upload_dir, 'batch_responsibilities_zhipu_dedup_2022_*.jsonl')))
print('待上传文件数:', len(files))

records = []
for fp in files:
    # Close the handle as soon as the upload call returns.
    with open(fp, 'rb') as fh:
        fo = client.files.create(file=fh, purpose='batch')
    # The previous `a or b if cond else c` expressions parsed as
    # `(a or b) if cond else c`, so every field of a non-dict response was
    # replaced by its fallback (None for `id`). _file_field handles both shapes.
    rec = {
        'local_path': fp,
        'id': _file_field(fo, 'id'),
        'filename': _file_field(fo, 'filename', os.path.basename(fp)),
        'bytes': _file_field(fo, 'bytes'),
        'created_at': _file_field(fo, 'created_at'),
        'object': _file_field(fo, 'object'),
        'purpose': _file_field(fo, 'purpose', 'batch'),
        'status': _file_field(fo, 'status'),
    }
    print('已上传:', rec['filename'], '->', rec['id'])
    records.append(rec)

manifest = pd.DataFrame(records)
manifest_path = os.path.join(upload_dir, 'upload_manifest_2022.parquet')
manifest.to_parquet(manifest_path, index=False)
print('清单写入:', manifest_path)
manifest.head(3)


# 批量下载，保存

In [None]:
# List the batches known to the account and tabulate their attributes.
import os

from zai import ZhipuAiClient
import pandas as pd

# Credentials must not live in the notebook: read the key from the environment.
# NOTE(review): an API key was previously committed in this cell; it should be revoked.
api_key = os.environ.get("ZHIPUAI_API_KEY")
if not api_key:
    raise RuntimeError("ZHIPUAI_API_KEY environment variable is not set")
client = ZhipuAiClient(api_key=api_key)

# client.batches.list returns a SyncCursorPage that can be iterated directly.
# NOTE(review): only limit=100 is requested; confirm whether iterating the page
# also fetches later pages when more than 100 batches exist.
batch_list = client.batches.list(limit=100)

# Attributes copied from each batch object, in output column order.
# Attributes missing on the object are stored as None.
BATCH_FIELDS = (
    'id',
    'object',
    'endpoint',
    'errors',
    'input_file_id',
    'completion_window',
    'status',
    'output_file_id',
    'error_file_id',
    'created_at',
    'in_progress_at',
    'expires_at',
    'finalizing_at',
    'completed_at',
    'failed_at',
    'expired_at',
    'cancelling_at',
    'cancelled_at',
    'request_counts',
    'metadata',
)

batch_records = [
    {field: getattr(batch, field, None) for field in BATCH_FIELDS}
    for batch in batch_list
]

# One row per batch
batch_df = pd.DataFrame(batch_records)
print(f"共获取到 {len(batch_df)} 个batch")
batch_df

In [None]:
# Select the completed batches whose numeric id is at or above the threshold.
# The numeric part of the id is taken from character offset 6 onward
# (ids in the captured output look like "batch_1966328748601114624").
MIN_BATCH_CODE = 1966328748601114624

# Explicit int64: plain `int` can map to a 32-bit dtype on some platforms,
# which cannot hold these 19-digit values.
batch_df['code'] = batch_df['id'].str[6:].astype('int64')

# Build the selection in one expression so no in-place operation runs on a
# filtered slice (which triggers SettingWithCopyWarning).
batch_df_download = batch_df[
    (batch_df['code'] >= MIN_BATCH_CODE) & (batch_df['status'] == 'completed')
].drop_duplicates(subset=['id'])

In [None]:
# Download the output file of every selected batch into 中间文件/batch_return_files.
import os

import pandas as pd
from zai import ZhipuAiClient

# Credentials must not live in the notebook: read the key from the environment.
# NOTE(review): an API key was previously committed in this cell; it should be revoked.
api_key = os.environ.get("ZHIPUAI_API_KEY")
if not api_key:
    raise RuntimeError("ZHIPUAI_API_KEY environment variable is not set")
client = ZhipuAiClient(api_key=api_key)

# Create the target folder
output_dir = "中间文件/batch_return_files"
os.makedirs(output_dir, exist_ok=True)

for _, row in batch_df_download.iterrows():
    output_file_id = row['output_file_id']
    batch_id = row['id']

    # A missing id may surface as None or as NaN; NaN is truthy, so test it explicitly.
    if pd.isna(output_file_id) or not output_file_id:
        print(f"跳过 {batch_id}: 没有output_file_id")
        continue

    output_filename = f"batch_result_{batch_id}_2022.jsonl"
    output_path = os.path.join(output_dir, output_filename)
    try:
        # Fetch the file content and write it to disk
        content = client.files.content(output_file_id)
        content.write_to_file(output_path)
    except Exception as e:
        # Best effort: report the failure and continue with the next batch
        print(f"下载失败 {batch_id}: {e}")
    else:
        print(f"已下载: {batch_id} -> {output_filename}")

# 解析，并导出到远程

In [1]:
from __future__ import annotations

from pathlib import Path
import json
from json import JSONDecodeError
from typing import Any, Dict, List, Tuple

import pandas as pd

# # 更友好的显示设置
# pd.set_option("display.max_columns", 100)
# pd.set_option("display.width", 160)
# pd.set_option("display.max_colwidth", 200)

# # JSONL 文件路径（相对当前工作目录）
# project_root = Path.cwd()
# jsonl_path = project_root / "中间文件" / "batch_return_files" / "batch_result_batch_1964959511140634624.jsonl"
# print(f"JSONL path: {jsonl_path}")
# assert jsonl_path.exists(), f"文件不存在: {jsonl_path}"


def _extract_json_substring(text: str) -> str | None:
    """在一行文本中提取第一个完整的 { ... } JSON 子串。"""
    start = text.find("{")
    end = text.rfind("}")
    if start != -1 and end != -1 and end > start:
        return text[start : end + 1]
    return None


def parse_json_line(line: str) -> Tuple[Dict[str, Any] | None, str | None]:
    """Decode one JSONL line.

    Returns a pair ``(obj, err)``:
      - ``(obj, None)`` when the line (or a JSON fragment inside it) decodes;
      - ``(None, message)`` when decoding fails;
      - ``(None, None)`` for a blank line.
    """
    payload = line.strip()
    if payload == "":
        return None, None

    # Drop an SSE/log style prefix such as "data: {...}".
    if payload.startswith("data:"):
        payload = payload[len("data:"):].strip()

    sample = payload[:200]

    # First attempt: the whole payload is valid JSON.
    try:
        parsed = json.loads(payload)
    except JSONDecodeError as direct_err:
        first_error = direct_err
    else:
        return parsed, None

    # Second attempt: decode a brace-delimited fragment of the payload.
    fragment = _extract_json_substring(payload)
    if not fragment:
        return None, f"JSONDecodeError: {first_error} | line sample: {sample}"

    try:
        return json.loads(fragment), None
    except JSONDecodeError as fragment_err:
        return None, f"JSONDecodeError after substring extract: {fragment_err} | line sample: {sample}"


def load_jsonl(path: Path) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """Parse a JSONL file line by line and return ``(records, errors)``.

    ``records`` holds every object that decoded successfully, in file order.
    ``errors`` holds one ``{"line_no", "error"}`` dict per line that failed to
    decode; line numbers start at 1. Blank lines are skipped silently.
    """
    records: List[Dict[str, Any]] = []
    errors: List[Dict[str, Any]] = []

    # utf-8-sig tolerates a BOM; undecodable bytes are replaced rather than raising.
    with path.open("r", encoding="utf-8-sig", errors="replace") as handle:
        for line_no, raw_line in enumerate(handle, start=1):
            parsed, problem = parse_json_line(raw_line)
            if parsed is not None:
                records.append(parsed)
                continue
            if problem:
                errors.append({"line_no": line_no, "error": problem})

    return records, errors


In [2]:
# Collect the downloaded batch result files from 中间文件/batch_return_files.
from pathlib import Path

batch_return_dir = Path("中间文件/batch_return_files")
path_list = []

if batch_return_dir.is_dir():
    # iterdir() yields entries in arbitrary order. Sort them so the parsed rows
    # come out in the same order on every run (later cells join by row
    # position), and keep only .jsonl files so stray files are not parsed.
    path_list = sorted(
        f for f in batch_return_dir.iterdir() if f.is_file() and f.suffix == ".jsonl"
    )
    print(f"Found {len(path_list)} files in batch_return_files directory")
else:
    print("batch_return_files directory not found")

# Pick the first file for a quick look (if any file exists)
if path_list:
    jsonl_path = path_list[0]
    print(f"Processing file: {jsonl_path}")
else:
    # Nothing found: fall back to a default path
    jsonl_path = Path("中间文件/batch_return_files/default.jsonl")
    print("No files found, using default path")

Found 9 files in batch_return_files directory
Processing file: 中间文件\batch_return_files\batch_result_batch_1966328748601114624_2022.jsonl


In [3]:
# Parse every result file and flatten the records into one DataFrame.
from tqdm import tqdm

all_records = []
all_errors = []

for jsonl_path in tqdm(path_list, desc="Processing files"):
    records, errors = load_jsonl(jsonl_path)

    all_records.extend(records)
    # Tag each error with its source file: line numbers alone are ambiguous
    # once errors from several files are merged into one list.
    all_errors.extend({"file": str(jsonl_path), **err} for err in errors)

print(f"Total parsed records: {len(all_records)} | Total errors: {len(all_errors)}")

# max_level=1 flattens only the first nesting level (e.g. response.status_code,
# response.body); deeper structures stay as dicts.
df = pd.json_normalize(all_records, max_level=1)
print(df.shape)
df.head(3)


Processing files:   0%|          | 0/9 [00:00<?, ?it/s]

Processing files: 100%|██████████| 9/9 [00:04<00:00,  2.11it/s]


Total parsed records: 433170 | Total errors: 0
(433170, 4)


Unnamed: 0,custom_id,id,response.status_code,response.body
0,desc-0001908,batch_1966328748601114624,200,"{'created': 1757644914, 'usage': {'completion_..."
1,desc-0001958,batch_1966328748601114624,200,"{'created': 1757644910, 'usage': {'completion_..."
2,desc-0002117,batch_1966328748601114624,200,"{'created': 1757644904, 'usage': {'completion_..."


In [4]:
# 解析大模型输出为 Python 列表列 `responsibilities`
from typing import Any, Dict, List, Tuple
from json_repair import repair_json
from tqdm import tqdm
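# NOTE: json_repair is imported unconditionally above; if the package is missing, this cell raises ImportError rather than skipping the repair step.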



def parse_content_to_responsibilities(content: Any) -> Tuple[List[str] | None, str | None]:
    """Turn a model reply into a list of responsibility strings.

    Returns ``(items, None)`` on success or ``(None, reason)`` on failure.

    Accepted inputs:
      - a dict with a list under ``"responsibilities"``: items are stringified,
        stripped, and empty ones dropped;
      - a dict with a ``"content"`` key: that value is parsed recursively;
      - a string: decoded as JSON (directly, then via an embedded ``{...}``
        fragment, then via ``repair_json``) and parsed recursively.
    """
    if content is None:
        return None, "no_content"

    if isinstance(content, str):
        stripped = content.strip()
        try:
            decoded = json.loads(stripped)
        except Exception:
            decoded = None
            # Step 1: try the brace-delimited fragment inside the string.
            fragment = _extract_json_substring(stripped)
            if fragment:
                try:
                    decoded = json.loads(fragment)
                except Exception:
                    decoded = None
            # Step 2: still nothing, so let json_repair fix the fragment (preferred) or the whole string.
            if decoded is None:
                try:
                    if repair_json is None:
                        raise RuntimeError("json_repair_not_available")
                    decoded = json.loads(repair_json(fragment or stripped))
                except Exception as repair_err:
                    return None, f"json_repair_failed:{repair_err}"
        return parse_content_to_responsibilities(decoded)

    if isinstance(content, dict):
        duties = content.get("responsibilities")
        if isinstance(duties, list):
            cleaned = [text for text in (str(item).strip() for item in duties) if text]
            return cleaned, None
        # Some replies nest the payload under a "content" key.
        if "content" in content:
            return parse_content_to_responsibilities(content["content"])
        return None, "dict_without_responsibilities"

    return None, f"unsupported_content_type:{type(content).__name__}"


def parse_responsibilities_from_body(body: Any) -> Tuple[List[str] | None, str | None]:
    """Locate the reply text inside a response body and parse it.

    Lookup order: ``choices[0].message.content``, ``choices[0].content``,
    ``choices[0].delta.content``, then the top-level ``output_text``,
    ``content`` and ``message.content``. Whatever is found (possibly None) is
    handed to ``parse_content_to_responsibilities``.

    Returns ``(None, "body_not_dict")`` when ``body`` is not a dict.
    """
    if not isinstance(body, dict):
        return None, "body_not_dict"

    text = None

    # Preferred location: the first entry of "choices".
    choice_list = body.get("choices")
    if isinstance(choice_list, list) and choice_list and isinstance(choice_list[0], dict):
        head = choice_list[0]
        message = head.get("message")
        if isinstance(message, dict):
            text = message.get("content")
        if text is None:
            text = head.get("content")
        if text is None:
            delta = head.get("delta")
            if isinstance(delta, dict):
                text = delta.get("content")

    # Other places where the text may live.
    if text is None:
        top_message = body.get("message")
        if not isinstance(top_message, dict):
            top_message = {}
        text = body.get("output_text") or body.get("content") or top_message.get("content")

    return parse_content_to_responsibilities(text)


# Only rows whose status code is 200 are parsed.
mask_ok = df["response.status_code"].eq(200)
num_ok = int(mask_ok.sum())
print(f"Parsing responsibilities for {num_ok} rows with status_code==200...")

# Register progress_apply on pandas objects so the parse shows a progress bar.
tqdm.pandas(desc="Parsing responsibilities")
parsed_series = df.loc[mask_ok, "response.body"].progress_apply(parse_responsibilities_from_body)

# Each parsed element is an (items, error) pair; split it into two columns.
df.loc[mask_ok, "responsibilities"] = parsed_series.map(lambda pair: pair[0])
df.loc[mask_ok, "responsibilities_error"] = parsed_series.map(lambda pair: pair[1])

# Summary and preview. An empty list still counts as parsed.
parsed_flags = df.loc[mask_ok, "responsibilities"].apply(lambda value: isinstance(value, list))
num_parsed = int(parsed_flags.sum())
print(f"Rows with status_code==200: {num_ok} | Parsed responsibilities: {num_parsed}")

cols = [name for name in ("custom_id", "id", "responsibilities") if name in df.columns]
df[cols].head(10)


Parsing responsibilities for 433170 rows with status_code==200...


Parsing responsibilities: 100%|██████████| 433170/433170 [00:02<00:00, 215867.81it/s]


Rows with status_code==200: 433170 | Parsed responsibilities: 433133


Unnamed: 0,custom_id,id,responsibilities
0,desc-0001908,batch_1966328748601114624,"[基于B2B业务特性，输出整合营销策略, 通过各类平台完成用户和市场数据分析挖掘，深度洞察用..."
1,desc-0001958,batch_1966328748601114624,"[根据业务需求，为业务部门提供业务解决方案咨询, 收集业务部门对系统的各项需求，对业务需求进..."
2,desc-0002117,batch_1966328748601114624,"[负责管网泵站的维护、管理、使用, 负责管网泵站的日巡检工作，确保其正常运转, 负责及时、准..."
3,desc-0002861,batch_1966328748601114624,"[通过电话和微信与家长进行顾问式销售，在充分识别家长需求后，为孩子推荐少儿课程, 日常对意向..."
4,desc-0002910,batch_1966328748601114624,"[负责仓库日常物资的验收、入库、码放、保管、盘点、对账等工作, 负责仓库日常物资的拣选、复核..."
5,desc-0002940,batch_1966328748601114624,[负责药品中试生产，非临床质量体系建设和完善，根据国家法规要求组织各部门起草制定必要的质量体...
6,desc-0003038,batch_1966328748601114624,"[ASIC或SOC的芯片FPGA原型验证, 针对黑盒netlist的时钟网络分析]"
7,desc-0003122,batch_1966328748601114624,"[负责落实区域内机械配套客户销售目标与计划，订单跟进与执行, 负责开拓区域内机械配套客户，并..."
8,desc-0003170,batch_1966328748601114624,"[全面负责员工宿舍的日常管理工作, 负责维修宿舍水电、设施, 负责男员工宿舍安全隐患检查, ..."
9,desc-0003247,batch_1966328748601114624,"[数据清洗、多因子选股模型建立和分析, 股票投资经理绩效跟踪分析]"


In [5]:
# Keep rows that parsed without error and contain at least one responsibility.
output = df[["custom_id", "id", "responsibilities", "responsibilities_error"]]
no_error = output["responsibilities_error"].isna()
has_items = output["responsibilities"].apply(lambda x: isinstance(x, list) and len(x) > 0)
# .copy() gives an independent frame, so the column assignment below does not
# write into a filtered slice of df (SettingWithCopyWarning).
output = output[no_error & has_items].copy()

# De-duplicate each list while keeping first-seen order: dict keys are unique
# and preserve insertion order. Non-list values are left unchanged.
output["responsibilities"] = output["responsibilities"].apply(
    lambda x: list(dict.fromkeys(x)) if isinstance(x, list) else x
)


In [None]:
# output[['responsibilities']].to_parquet('list_par3.parquet')

In [6]:
import pandas as pd
# Load the match results; the next cell joins them to `output` by row position.
dfx = pd.read_parquet('list_par3_with_matches.parquet')

In [7]:
# Attach the match results (dfx) to the parsed rows (output) by row position.
print(dfx.shape)
print(output.shape)

# The join below is purely positional. With different row counts, concat would
# silently pad with NaN and misalign every later row, so fail loudly instead.
if len(output) != len(dfx):
    raise ValueError(
        f"Row count mismatch: output has {len(output)} rows, dfx has {len(dfx)} rows"
    )

# errors='ignore' keeps the cell re-runnable after the column has been dropped.
output = output.reset_index(drop=True).drop(columns='responsibilities', errors='ignore')
dfx = dfx.reset_index(drop=True)

# `responsibilities` is taken from dfx rather than from output.
merged = pd.concat([output, dfx], axis=1)
merged = merged[['custom_id', 'responsibilities', 'responsibilities_match']]
merged.to_parquet('id_responsibility_match_2022.parquet')

(409627, 2)
(409627, 4)


In [16]:
# Spot check: explode 10 random rows so each list item gets its own spreadsheet row.
sample_data = merged.sample(10)

# Explode each list column separately (each alongside custom_id), then place
# the two exploded frames side by side.
x, y = (
    sample_data[['custom_id', list_col]].explode([list_col], ignore_index=True)
    for list_col in ('responsibilities', 'responsibilities_match')
)
test = pd.concat([x, y], axis=1)
test.to_excel('test.xlsx', index=False)

In [None]:
# Deduplicated export (Zhipu): rebuild the row -> desc_id mapping and the table
# of unique 岗位描述_清洗 values. The JSONL export steps that follow this block
# are commented out.
import os, json, math
from tqdm import tqdm
import pandas as pd

# Build df_all (including the 岗位描述_清洗 column) if it does not exist yet
if 'df_all' not in globals() or '岗位描述_清洗' not in df_all.columns:
    if 'df' not in globals():
        df = pd.read_parquet('matched_all.parquet')
    # Use clean_description if it is already defined; otherwise fall back to the local cleaner below
    if 'clean_description' in globals():
        clean_fn = clean_description
    else:
        import re, html
        def clean_fn(text: object) -> str:
            """Strip HTML markup and normalise whitespace; returns '' for None or unconvertible input."""
            if text is None:
                return ''
            if not isinstance(text, str):
                try:
                    text = str(text)
                except Exception:
                    return ''
            # Non-breaking spaces (raw and as the &nbsp; entity) become plain spaces
            s = text.replace('\xa0', ' ').replace('&nbsp;', ' ')
            s = html.unescape(s)
            # <br>, <p> and the listed block/inline tags become newlines; any other tag becomes a space
            s = re.sub(r"<br\s*/?>", "\n", s, flags=re.IGNORECASE)
            s = re.sub(r"</?p[^>]*>", "\n", s, flags=re.IGNORECASE)
            s = re.sub(r"</?(div|span|ul|ol|li|strong|em|font)[^>]*>", "\n", s, flags=re.IGNORECASE)
            s = re.sub(r"<[^>]+>", " ", s)
            s = s.replace("\r", "\n")
            # Collapse tab runs, newline runs, and runs of ASCII / full-width spaces
            s = re.sub(r"\t+", " ", s)
            s = re.sub(r"\n+", "\n", s)
            s = re.sub(r"[ \u3000]{2,}", " ", s)
            return s.strip()
    df_all = df.copy()
    # Missing descriptions are mapped to the empty string instead of being cleaned
    df_all['岗位描述_清洗'] = df_all['position_responsibilities'].map(lambda x: clean_fn(x) if pd.notna(x) else '')

# 1) Build the dedup IDs: factorize in order of first appearance
with_index = df_all.reset_index()[['index', '岗位描述_清洗']]
codes, uniques = pd.factorize(with_index['岗位描述_清洗'], sort=False)
# desc_id is "desc-" followed by the factorize code zero-padded to 7 digits
with_index['desc_id'] = pd.Series(codes, index=with_index.index).map(lambda c: f'desc-{int(c):07d}')

# 2) Write the mappings: original row index -> desc_id, and the unique table desc_id -> text
dedup_dir = os.path.join('.', '中间文件', 'zhipu_batches_dedup')
os.makedirs(dedup_dir, exist_ok=True)

row2desc_path = os.path.join(dedup_dir, 'map_row_to_desc_id2022.parquet')
with_index[['index', 'desc_id']].to_parquet(row2desc_path, index=False)

# Position i in `uniques` corresponds to factorize code i, so these IDs match with_index['desc_id']
unique_df = pd.DataFrame({
    'desc_id': [f'desc-{i:07d}' for i in range(len(uniques))],
    '岗位描述_清洗': uniques
})
unique_path = os.path.join(dedup_dir, 'unique_cleaned_descriptions2022.parquet')
unique_df.to_parquet(unique_path, index=False)

# print('唯一描述数:', len(unique_df), '映射文件:', row2desc_path)

# # 3) 导出去重后的请求为 Zhipu 批处理 JSONL（每文件 50,000 条）
# system_prompt = (
#     "请提取岗位描述信息中的**岗位职责或任务**，输出{\"responsibilities\":[\"…\",\"…\"]}。"
#     "若岗位描述信息中**没有**岗位职责，输出{\"responsibilities\":[]}。注意：不要提取学历、年龄、技能、经验等**任职要求**。"
# )

# chunk_size = 50_000
# n_rows = len(unique_df)
# base_out_dir = dedup_dir

# with tqdm(total=n_rows, desc='Writing Dedup Zhipu JSONL') as pbar:
#     for chunk_idx in range(math.ceil(n_rows / chunk_size)):
#         start = chunk_idx * chunk_size
#         end = min((chunk_idx + 1) * chunk_size, n_rows)
#         chunk = unique_df.iloc[start:end][['desc_id', '岗位描述_清洗']]

#         out_path = os.path.join(base_out_dir, f'batch_responsibilities_zhipu_dedup_2022_{chunk_idx + 1:03d}.jsonl')
#         with open(out_path, 'w', encoding='utf-8') as f:
#             for desc_id, user_text in chunk.itertuples(index=False, name=None):
#                 payload = {
#                     "custom_id": desc_id,
#                     "method": "POST",
#                     "url": "/v4/chat/completions",
#                     "body": {
#                         "model": "glm-4-flash",
#                         "temperature": 0.1,
#                         "messages": [
#                             {"role": "system", "content": system_prompt},
#                             {"role": "user", "content": user_text or ''}
#                         ]
#                     }
#                 }
#                 f.write(json.dumps(payload, ensure_ascii=False) + "\n")
#                 pbar.update(1)

# print(f'去重导出完成。唯一数: {n_rows}, 输出目录: {base_out_dir}')

# # 4) 预览首个文件首行
# import glob
# preview_files = sorted(glob.glob(os.path.join(base_out_dir, 'batch_responsibilities_zhipu_dedup_2022_*.jsonl')))[:1]
# for fp in preview_files:
#     print('预览:', fp)
#     with open(fp, 'r', encoding='utf-8') as f:
#         print(f.readline().rstrip())


In [None]:
# Join the per-description results back onto every original row via desc_id.
merged_before = pd.read_parquet('id_responsibility_match_2022.parquet')
merged_before = merged_before.rename(columns={'custom_id': 'desc_id'})

# The same desc_id can appear more than once in the results (e.g. a request
# file processed by two batches). A left merge would then multiply rows and the
# positional concat below would misalign, so keep one result per desc_id.
merged_before = merged_before.drop_duplicates(subset='desc_id', keep='first')

# many_to_one: many rows may share a description, but each description has one result.
x = pd.merge(with_index, merged_before, on='desc_id', how='left', validate='many_to_one')
x = x.drop(columns=['desc_id', 'index', '岗位描述_清洗']).reset_index(drop=True)

df_all.reset_index(drop=True, inplace=True)
if len(x) != len(df_all):
    raise ValueError(
        f"Row count mismatch: df_all has {len(df_all)} rows, joined results have {len(x)} rows"
    )

# The left merge keeps the row order of with_index, which was built from
# df_all, so the two frames can be joined by position.
df_out = pd.concat([df_all, x], axis=1)
df_out.drop(['岗位描述_清洗'], axis=1, inplace=True)
df_out.to_parquet('matched_all_with_responsibility_2022.parquet')

In [1]:
import pandas as pd
# Reload the final dataset written by the previous cell to inspect it.
df = pd.read_parquet('matched_all_with_responsibility_2022.parquet')

In [2]:
df

Unnamed: 0,ticker_symbol,title,employer,salary_range,annual_salary_range_start,annual_salary_range_end,working_experience,working_location,working_address,educational_requirement,published_at,language_requirements,created_at_x,position_temptations,position_responsibilities,position_requirements,responsibilities,responsibilities_match
0,002840,饲料品控科长,浙江华统肉制品股份有限公司,0.8-1万/月,96000.0,120000.0,5-7年经验,义乌,浙江华统肉制品股份有限公司,大专,2021-12-31 04:00:39,,2022-01-01 00:04:26,,1、负责制定饲料厂的生产工艺的质量控制标准，对不符合的生产工艺要求的操作纠偏，根据客户需求调...,,"[制定饲料厂的生产工艺的质量控制标准, 对不符合的生产工艺要求的操作纠偏, 根据客户需求调整...","[确定食品生产方法。, 建议变更或纠正程序。, 在加工过程中调整模具的位置。, 指导质量控制..."
1,002840,饲料设备主管,浙江华统肉制品股份有限公司,0.8-1万/月,96000.0,120000.0,5-7年经验,义乌,浙江华统肉制品股份有限公司,中专,2021-12-31 04:00:39,,2022-01-01 00:04:27,,1、负责饲料厂设备的维护与保养，保障生产的顺利进行；2、负责饲料厂设备操作的培训；3、负责饲...,,"[负责饲料厂设备的维护与保养，保障生产的顺利进行, 负责饲料厂设备操作的培训, 负责饲料厂生...","[维护生产或加工设备。, 对他人进行操作规程的培训。, 监督生产或支持人员。, 监督维护人员。]"
2,300021,市场营销内务经理,云南大禹节水有限公司,6-8千/月,72000.0,96000.0,3-4年经验,长沙,长沙市芙蓉区五一大道158号人瑞潇湘大厦2025号,本科,2021-12-31 11:27:44,,2022-01-01 00:15:37,"通讯补贴,绩效奖金,六险一金,补充医疗保险,员工旅游,专业培训,股票期权,年终奖金,餐饮补贴...",岗位职责：1、协助营销副总分解、落实各项销售指标和营销任务，组织部门人员对各区域市场营销工作...,,"[协助营销副总分解、落实各项销售指标和营销任务, 组织部门人员对各区域市场营销工作提供售前、...","[与他人合作制定或实施市场营销策略。, 监督销售或支持人员。, 与他人合作制定或实施市场营销..."
3,600036,销售代表不压工资保底6千,招商信诺人寿保险有限公司,1.5-2万/月,180000.0,240000.0,无需经验,上海,陆家嘴软件园9号楼,高中,2021-12-31 21:30:04,,2022-01-01 00:17:47,"带薪年假,五险一金,绩效奖金,全勤奖,节日福利,高温补贴,专业培训,交通补贴,学历免费提升,...",岗位不用外出，也不用自己找客户，电话量也没有很多，每天30个到50个左右的新名单。通过电话跟...,,"[通过电话和网络的方式与客户沟通, 向客户推荐合适的产品]","[与客户沟通，回答问题或解决投诉。, 向客户推荐产品或服务。]"
4,002717,营销总监,德马吉国际展览有限公司,4-5万/月,480000.0,600000.0,8-9年经验,上海,浦东东方金融园东方路2981号2楼,本科,2021-12-31 22:00:01,,2022-01-01 00:17:47,"五险一金,免费班车,员工旅游,交通补贴,餐饮补贴,出国机会,专业培训,年终奖金,股票期权,绩效奖金",1、带领公司市场部，根据公司战略要求，拓展公司的线上线下营销系统；2、能修订及执行公司战略规...,,"[带领公司市场部，根据公司战略要求，拓展公司的线上线下营销系统；, 修订及执行公司战略规划及...","[与他人合作制定或实施市场营销策略。, 制定业务或市场战略。, 制定运营策略、计划或程序。,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
597601,002528,人力资源主管,深圳英飞拓科技股份有限公司,1-1.5万·14薪,120000.0,180000.0,,深圳,深圳市龙华区观澜环观南路高新技术产业开发区英飞拓园区,,2023-02-21 00:00:00,,2023-02-21 11:29:18,,主要职责1、协助建立健全公司人力资源管理制度；2、根据公司的发展需求，搭建及落实培训体系，并...,,"[协助建立健全公司人力资源管理制度, 搭建及落实培训体系，并不断完善, 进行培训需求调研、分...","[管理人力资源活动。, 开展员工培训项目。, 负责主导员工培训项目。, 支持他人的职业发展。..."
597602,601628,人事实习生,中国人寿保险股份有限公司南京市分公司,4-6千,48000.0,72000.0,,南京,南京市-鼓楼区-汉中路180号星汉大厦15楼,,2023-02-21 00:00:00,,2023-02-21 11:29:26,,岗位职责1、利用各类招聘渠道发布招聘广告和信息，筛选候选人，与候选人沟通职位信息进行意向确认...,,[利用各类招聘渠道发布招聘广告和信息，筛选候选人，与候选人沟通职位信息进行意向确认，邀约面试...,"[协调人员招聘活动。, 协调人员招聘活动。, 维护人事档案。, 协调人员招聘活动。]"
597603,600360,人力行政专员（子公司）,吉林华微电子股份有限公司,4-5千,48000.0,60000.0,,吉林,高新区深圳街97号,,2023-02-21 00:00:00,,2023-02-21 11:29:27,,技能要求：人力资源管理任职要求：1.人力资源、企业管理、工商管理或相关专业本科及以上学历；2...,,,
597604,300191,子公司人事主管,潜能恒信能源技术股份有限公司,9千-1.2万,144000.0,1080000.0,,北京,北苑路甲13号北辰新纪元大厦2塔22层,,2023-02-21 00:00:00,,2023-02-21 11:29:29,,1、支持协助各部门进行人力资源管理；2、负责招聘、培训、绩效、薪酬福利、员工关系等模块工作的...,,"[支持协助各部门进行人力资源管理, 负责招聘、培训、绩效、薪酬福利、员工关系等模块工作的执行...","[管理人力资源活动。, 执行人力资源活动。, 监督组织成员或合作伙伴的绩效。, 管理人力资源..."
