In [1]:
import os
import re
import time
import json
import math
import logging
import warnings
import tiktoken
import pandas as pd
from edgar import *
from tqdm import tqdm
from openai import OpenAI

# Silence warnings and keep third-party logging quiet.
warnings.simplefilter("ignore")
logging.getLogger().setLevel(logging.WARNING)
logging.getLogger("openai").setLevel(logging.ERROR)

# GPT settings
# The API key is read from the environment instead of being stored in the
# notebook. The key that was previously committed here must be treated as
# leaked and revoked.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError("請先設定環境變數 OPENAI_API_KEY")
client = OpenAI(api_key=api_key)
model = "gpt-4o-mini"  # model_name = 'gpt-4o-mini-2024-07-18' 'o3-mini-2025-01-31'
max_input_tokens = 127000
max_output_tokens = 1000  # 128000=max_input_tokens+max_output_tokens
temperature = 0.3
# Client-side token budget per 60-second window, enforced inside GPT().
API_TOKEN_LIMIT_PER_MIN = 200000
# Price table; GPT() divides token counts by 1,000,000 before multiplying,
# so these figures are per one million tokens.
price = {
    "gpt-4o-mini": {"input": 0.15, "output": 0.60},
    "o3-mini": {"input": 1.10, "output": 15.00},
}
# Start of the current throttling window and tokens counted within it.
START_TIME = time.time()
used_tokens = 0
encoding = tiktoken.encoding_for_model(model)

# Company list: the first `companies_range` symbols from the S&P 500 CSV
# (one row kept per distinct Shortname), minus every ticker that
# companies.json lists under "missing_companies".
companies_range = 100
companies = pd.read_csv("sp500_companies.csv").drop_duplicates(
    subset=["Shortname"], keep="first"
)
companies = companies["Symbol"].values[:companies_range]
with open("companies.json", "r", encoding="utf-8") as f:
    data = json.load(f)
missing_companies = list(item["ticker"] for item in data["missing_companies"])
find_cik = dict((item["ticker"], item["cik"]) for item in data["find_cik"])
companies = [symbol for symbol in companies if symbol not in missing_companies]

# Parameters
set_identity("ansa ansa1019@gmail.com")
search_queries = ["IT capability", "organizational resilience"]
# Each query maps to a one-element keyword list containing itself.
keywords = dict((query, [query]) for query in search_queries)
# Root folder name -> list of 10-K item identifiers.
report_item = {"paper_7": ["7"], "paper_17": ["1A", "7"]}
report_papers = "papers.json"
financials_file = "financials.csv"
# [first_year, last_year]; later loops iterate this range inclusively.
report_year = [2014, 2023]
minlen = 1500
keyword_num = 30

# Initialise folders.
# For every root this records the report/filter/summary paths and creates the
# summary tree: <root>/summary/{report,filter}/<search_query>/<year>/.
# `exist_ok=True` replaces the separate exists-check so a directory that
# appears between the check and the creation no longer raises.
roots = list(report_item.keys())
data_folder = "data/"
define_folder = "define/"
report_folder = {}
filter_folder = {}
summary_folder = {}
summary_report_folder = {}
summary_filter_folder = {}
for root in roots:
    report_folder[root] = f"{root}/report/"
    filter_folder[root] = f"{root}/filter/"
    summary_folder[root] = f"{root}/summary/"
    summary_report_folder[root] = f"{root}/summary/report/"
    summary_filter_folder[root] = f"{root}/summary/filter/"
    os.makedirs(summary_folder[root], exist_ok=True)
    os.makedirs(summary_report_folder[root], exist_ok=True)
    os.makedirs(summary_filter_folder[root], exist_ok=True)
    for search_query in search_queries:
        # Same per-query / per-year layout under both the report and the
        # filter summary folders (report first, then filter).
        for base in (summary_report_folder[root], summary_filter_folder[root]):
            folder = base + search_query + "/"
            os.makedirs(folder, exist_ok=True)
            for x in range(report_year[0], report_year[1] + 1):
                fol = folder + str(x) + "/"
                os.makedirs(fol, exist_ok=True)

In [None]:
# def
# Save text to a file
def save_to_file(content, path):
    """Write ``content`` to ``path`` as UTF-8 text, replacing any existing file.

    Args:
        content: String to write.
        path: Destination file path.
    """
    with open(path, mode="w", encoding="utf-8") as out:
        out.write(content)


# Save data as JSON
def save_to_json(data, path):
    """Serialise ``data`` to ``path`` as UTF-8 JSON.

    Non-ASCII characters are written as-is (not escaped) and the output is
    indented by four spaces.

    Args:
        data: JSON-serialisable object.
        path: Destination file path.
    """
    with open(path, mode="w", encoding="utf-8") as out:
        out.write(json.dumps(data, ensure_ascii=False, indent=4))


def GPT(system, user):
    """Send one chat-completion request and return the reply plus a usage log.

    Before the request, a client-side throttle counts the prompt tokens of
    ``system + user`` against ``API_TOKEN_LIMIT_PER_MIN`` within a 60-second
    window tracked by the module-level ``START_TIME`` / ``used_tokens``.

    Args:
        system: Content of the system-role message.
        user: Content of the user-role message.

    Returns:
        A ``(content, log)`` tuple: the text of the first choice, and a
        two-line string reporting total tokens used and the request cost
        computed from the ``price`` table.
    """
    window_seconds = 60

    # Throttle usage
    def check_token_limit():
        # These counters are module-level state. The declaration must be made
        # here: a `global` statement in the enclosing function does not apply
        # to a nested function, and without it the assignments below would
        # make both names local and the first read would raise
        # UnboundLocalError.
        global used_tokens, START_TIME

        prompt_encoding = tiktoken.encoding_for_model(model)
        text_token_count = len(prompt_encoding.encode(system + user))
        now = time.time()
        elapsed_time = now - START_TIME
        if elapsed_time >= window_seconds:
            # The previous window is over; its tokens no longer count.
            START_TIME = now
            used_tokens = 0
        elif used_tokens + text_token_count > API_TOKEN_LIMIT_PER_MIN:
            # Budget exhausted inside the current window: wait it out.
            time.sleep(window_seconds - elapsed_time)
            START_TIME = time.time()
            used_tokens = 0
        used_tokens += text_token_count

    check_token_limit()
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
        max_tokens=max_output_tokens,
        temperature=temperature,
    )
    total_tokens_used = response.usage.total_tokens
    log = f"本次請求使用了 {total_tokens_used} tokens\n"

    # Compute cost; prices are applied per one million tokens.
    input_tokens = response.usage.prompt_tokens  # input tokens
    output_tokens = response.usage.completion_tokens  # output tokens
    input_cost = (input_tokens / 1_000_000) * price[model]["input"]
    output_cost = (output_tokens / 1_000_000) * price[model]["output"]
    total_cost = input_cost + output_cost
    log += f"本次請求費用: ${total_cost:.6f}\n"

    return response.choices[0].message.content, log

In [27]:
from datetime import datetime

# Delete every file returned by the Files API listing, printing each id first.
# WARNING: destructive; this is not limited to files created by this notebook.
files_list = client.files.list().data
for file in files_list:
    print(file.id)
    client.files.delete(file.id)

file-NYrkR5niGSs69bwT9rxdDA
file-WU8pyA9T6zUZKNKBrxhafa
file-TrCGtMWJi8z6fNzjtcLxr2
file-549B7ScC1QHMeoddsyWBeC
file-678eWPwea3ZxaYjYYsxEj7
file-748YvdMeK536g5rPMVMyYd
file-Q2soz24sTU8PhaPU99eaEh
file-ExtVFaXyjf3868vSPT6QQ9
file-UeSNMpw6pLRGRii3dY9e5m
file-NbDQ2N5o1r8VemZVH5ZYN3


In [26]:
# Upload the request file, run it through the Batch API, and print the output.
with open("test.jsonl", "rb") as f:
    batch_file = client.files.create(file=f, purpose="batch")
print(f"上傳 jsonl 檔案，file_id:{batch_file.id}")

batch = client.batches.create(
    input_file_id=batch_file.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    # NOTE(review): `root` is not set in this cell; it is whatever value the
    # most recently executed `for root in roots` loop left behind.
    metadata={"description": f"generate summaries for {root}"},
)
print(f"建立 batch 任務，batch_id:{batch.id}")

print("等待 batch 完成")
while True:
    current_batch = client.batches.retrieve(batch.id)
    status = current_batch.status
    if status == "completed":
        break
    elif status in {"failed", "cancelled", "expired"}:
        raise RuntimeError(f"Batch 任務失敗，狀態為：{status}")
    time.sleep(10)  # poll every 10 seconds

print("讀取 batch 回應")
output_file_id = current_batch.output_file_id
if output_file_id is None:
    # A completed batch has no output file when no request succeeded;
    # the failures are reported through the error file instead.
    raise RuntimeError(
        f"Batch 已完成但沒有輸出檔案，error_file_id:{current_batch.error_file_id}"
    )
file_response = client.files.content(output_file_id)
# Print the JSONL payload itself rather than the response object's repr.
print(file_response.text)

上傳 jsonl 檔案，file_id:file-WU8pyA9T6zUZKNKBrxhafa
建立 batch 任務，batch_id:batch_6818d56b3e9881909de191ea6eea8bce
等待 batch 完成


KeyboardInterrupt: 

In [None]:
# Prompt templates, filled in with str.format by the summary cells.
# system_prompt placeholders: {search_query}, {search_prompt}.
# The summary cells also pass `minlen` and `maxlen`, but neither template
# contains those placeholders, so the values have no effect on the prompts.
system_prompt = """
You are an assistant tasked with identifying and summarizing disclosures related to "{search_query}" in 10-K annual reports. Your objective is to produce technically accurate, high-fidelity summaries by selectively extracting key statements and structured information from the source text. Follow these criteria:

1. Focus exclusively on identifying content directly or indirectly related to "{search_query}", especially {search_prompt}.
2. Prefer extracting original sentence structures or minimally modified phrases from the source text.
3. Avoid commentary, interpretation, or speculative reasoning not present in the original disclosure.
4. Maintain a formal, technical tone aligned with SEC filing standards.
5. Do not include section headings, bullet points, or markdown formatting.
6. Use continuous prose in plain text, without line breaks between sentences.
7. Preserve extractive fidelity by aligning summary content with source-level phrasing and structure.

Evaluation Metrics:
- Factual Consistency: >90%
- Semantic Density: High
- Extractive Fidelity: Measured by Sentence Overlap, Cosine Similarity, and ROUGE-L
"""
# user_prompt placeholders: {definition}, {text}, {search_query}, {search_prompt}.
user_prompt = """
Definition:
\"\"\"{definition}\"\"\"

Text:
\"\"\"{text}\"\"\"

Instruction:
Identify and extract sentences or phrases that are directly or indirectly related to "{search_query}", including but not limited to: {search_prompt}.

Then, generate a faithful summary by selectively reusing or lightly rephrasing content from the source:
- Do not invent, paraphrase extensively, or add any interpretation.
- Avoid bullet points, headings, markdown formatting, and direct quotations.
- Do not include introductions, conclusions, or restate definitions.
- Use a formal, technical tone consistent with regulatory documents.
- Return plain text only, in a single block of continuous prose, limited to 500 words.
- Retain original wordings and factual alignment as much as possible to ensure traceability.

Return only the summary text, ready for evaluation and vectorization.
"""

In [None]:
# Single-report summary
# NOTE(review): `normalize`, `definition` and `search_prompt` are not defined
# in any cell shown in this notebook; they must already exist in the kernel.
log = ""
search_query = search_queries[0]
log += search_query + "\n----------\n"
input_file = "BAC.txt"
output_file = input_file.split(".")[0] + "_summary.txt"
with open(input_file, "r", encoding="utf-8") as f:
    # Collapse every whitespace run to a single space.
    text = re.sub(r"[\s\n]+", " ", normalize(f.read()))
# Split right after each sentence terminator without consuming the following
# space, so concatenating the pieces reproduces `text` exactly. (Consuming
# the whitespace glued neighbouring sentences together inside each chunk.)
sentences = list(filter(None, re.split(r"(?<=[。．\.])", text)))
text_tokens = len(encoding.encode(text))
prompt_vars_base = {
    "text": "",
    "minlen": minlen,
    "maxlen": minlen * 2,
    "definition": definition[search_query],
    "search_query": search_query,
    "search_prompt": search_prompt[search_query],
}
# Token cost of the prompts with an empty text; the remainder of the input
# budget is what the report text may occupy.
system_base = system_prompt.format(**prompt_vars_base)
user_base = user_prompt.format(**prompt_vars_base)
max_text_tokens = max_input_tokens - len(encoding.encode(system_base + user_base))
if text_tokens <= max_text_tokens:
    chunks = [text]
else:
    # Aim for roughly equal chunks, each below the per-request text budget.
    n_chunks = math.ceil(text_tokens / max_text_tokens)
    target_chunk_tokens = text_tokens // n_chunks
    sentence_tokens_list = [
        (sentence, len(encoding.encode(sentence))) for sentence in sentences
    ]
    chunks = []
    current_chunk = ""
    current_tokens = 0
    for sentence, sentence_tokens in sentence_tokens_list:
        if current_tokens + sentence_tokens <= target_chunk_tokens:
            current_chunk += sentence
            current_tokens += sentence_tokens
        else:
            # Per-sentence counts are only an estimate of the joint
            # tokenisation; re-encode the candidate chunk before giving up.
            temp_tokens = len(encoding.encode(current_chunk + sentence))
            if temp_tokens <= target_chunk_tokens:
                current_chunk += sentence
                current_tokens = temp_tokens
            else:
                if current_chunk:
                    chunks.append(current_chunk)
                current_chunk = sentence
                current_tokens = sentence_tokens
    if current_chunk:
        chunks.append(current_chunk)
# Greedy packing on sentence boundaries can produce more chunks than the
# estimate, so split the length budget over the actual number of chunks.
n_chunks = len(chunks)
base_chunk_minlen = minlen // n_chunks
remaining = minlen - (base_chunk_minlen * n_chunks)
all_contents = []
for idx, chunk in enumerate(chunks):
    # The first `remaining` chunks absorb the division remainder.
    chunk_minlen = base_chunk_minlen + (1 if idx < remaining else 0)
    chunk_maxlen = int(chunk_minlen * 2)

    prompt_vars = {
        "text": chunk,
        "minlen": chunk_minlen,
        "maxlen": chunk_maxlen,
        "definition": definition[search_query],
        "search_query": search_query,
        "search_prompt": search_prompt[search_query],
    }
    system = system_prompt.format(**prompt_vars)
    user = user_prompt.format(**prompt_vars)
    total_tokens = len(encoding.encode(system)) + len(encoding.encode(user))
    print(f"第{idx+1}段 - 文本詞元數: {total_tokens}")
    content, res = GPT(system, user)
    all_contents.append(content)
    print(res)
content = "\n".join(all_contents)
save_to_file(content, output_file)
print(f"摘要已保存到: {output_file}")

IT capability
----------
文本詞元數: 13750
本次請求使用了 15288 tokens
本次請求費用: $0.002670
摘要已保存到: AAPL_summary.txt
----------



In [None]:
# Full corpus: summarise every report for each root / search query / year.
# Reports whose summary file already exists are skipped, so the cell can be
# re-run after an interruption.
# NOTE(review): `normalize`, `definition` and `search_prompt` are not defined
# in any cell shown in this notebook; they must already exist in the kernel.
for root in roots:
    print(root)
    for search_query in search_queries:
        print(search_query)
        result = []
        prompt_vars_base = {
            "text": "",
            "minlen": minlen,
            "maxlen": minlen * 2,
            "definition": definition[search_query],
            "search_query": search_query,
            "search_prompt": search_prompt[search_query],
        }
        # Token cost of the prompts with an empty text; the remainder of the
        # input budget is what the report text may occupy.
        system_base = system_prompt.format(**prompt_vars_base)
        user_base = user_prompt.format(**prompt_vars_base)
        max_text_tokens = max_input_tokens - len(
            encoding.encode(system_base + user_base)
        )
        loop = tqdm(range(report_year[0], report_year[1] + 1))
        for year in loop:
            report_files = os.listdir(f"{report_folder[root]}/{str(year)}")
            files_sum = len(report_files)
            for i, file in enumerate(report_files):
                ticker = file.split(".")[0]
                # `i` is 0-based; show a 1-based position so the last file
                # reads N/N instead of N-1/N.
                loop.set_description(
                    f"篩選 {year} 年文本：正在處理 {i + 1}/{files_sum} 檔案"
                )
                input_file = f"{report_folder[root]}/{str(year)}/{file}"
                output_file = (
                    f"{summary_report_folder[root]}{search_query}/{str(year)}/{file}"
                )
                if not os.path.exists(output_file):
                    with open(input_file, "r", encoding="utf-8") as f:
                        # Collapse every whitespace run to a single space.
                        text = re.sub(r"[\s\n]+", " ", normalize(f.read()))
                    # Split right after each sentence terminator without
                    # consuming the following space, so concatenating the
                    # pieces reproduces `text` exactly. (Consuming the
                    # whitespace glued neighbouring sentences together.)
                    sentences = list(filter(None, re.split(r"(?<=[。．\.])", text)))
                    text_tokens = len(encoding.encode(text))
                    if text_tokens <= max_text_tokens:
                        chunks = [text]
                    else:
                        # Aim for roughly equal chunks, each below the
                        # per-request text budget.
                        n_chunks = math.ceil(text_tokens / max_text_tokens)
                        target_chunk_tokens = text_tokens // n_chunks
                        sentence_tokens_list = [
                            (sentence, len(encoding.encode(sentence)))
                            for sentence in sentences
                        ]
                        chunks = []
                        current_chunk = ""
                        current_tokens = 0
                        for sentence, sentence_tokens in sentence_tokens_list:
                            if current_tokens + sentence_tokens <= target_chunk_tokens:
                                current_chunk += sentence
                                current_tokens += sentence_tokens
                            else:
                                # Per-sentence counts only estimate the joint
                                # tokenisation; re-encode before giving up.
                                temp_tokens = len(
                                    encoding.encode(current_chunk + sentence)
                                )
                                if temp_tokens <= target_chunk_tokens:
                                    current_chunk += sentence
                                    current_tokens = temp_tokens
                                else:
                                    if current_chunk:
                                        chunks.append(current_chunk)
                                    current_chunk = sentence
                                    current_tokens = sentence_tokens
                        if current_chunk:
                            chunks.append(current_chunk)
                    # Greedy packing can produce more chunks than estimated,
                    # so split the length budget over the actual count.
                    n_chunks = len(chunks)
                    base_chunk_minlen = minlen // n_chunks
                    remaining = minlen - (base_chunk_minlen * n_chunks)
                    all_contents = []
                    for idx, chunk in enumerate(chunks):
                        # The first `remaining` chunks absorb the remainder.
                        chunk_minlen = base_chunk_minlen + (1 if idx < remaining else 0)
                        chunk_maxlen = int(chunk_minlen * 2)
                        prompt_vars = {
                            "text": chunk,
                            "minlen": chunk_minlen,
                            "maxlen": chunk_maxlen,
                            "definition": definition[search_query],
                            "search_query": search_query,
                            "search_prompt": search_prompt[search_query],
                        }
                        system = system_prompt.format(**prompt_vars)
                        user = user_prompt.format(**prompt_vars)
                        total_tokens = len(encoding.encode(system)) + len(
                            encoding.encode(user)
                        )
                        content, res = GPT(system, user)
                        all_contents.append(content)
                        result.append(
                            f"{ticker}_{year} - 第{idx+1}段 文本詞元數: {total_tokens}\n{res}"
                        )
                    content = "\n".join(all_contents)
                    save_to_file(content, output_file)
        print(f"摘要已保存到: {summary_report_folder[root]}{search_query}")
        if result:
            print(f'{"-"*20}\n{search_query}\n{"-"*20}')
            for res in result:
                print(res)

IT capability


篩選 2023 年文本：正在處理 417/418 檔案: 100%|██████████| 10/10 [00:00<00:00, 15.71it/s]


organizational resilience


篩選 2019 年文本：正在處理 346/422 檔案:  50%|█████     | 5/10 [1:56:09<1:56:09, 1393.90s/it]


KeyboardInterrupt: 

In [None]:
# Filter: score each filtered summary against the query definition and write
# one ticker-by-year CSV per root and search query.
# NOTE(review): `compare_texts`, `similarity_model` and `definition` are not
# defined in any cell shown in this notebook; they must already exist in the
# kernel.
for root in roots:
    print(root)
    log = "----------\n"
    for search_query in search_queries:
        print(search_query)
        log += search_query + "\n----------\n"
        # Start from an empty table for every query. Sharing one dict across
        # queries let scores from the previous query leak into this query's
        # CSV for any (company, year) that was not overwritten.
        similarity = {}
        loop = tqdm(range(report_year[0], report_year[1] + 1))
        for year in loop:
            folder = f"{summary_filter_folder[root]}{search_query}/{str(year)}/"
            summary_files = os.listdir(
                f"{summary_filter_folder[root]}{search_query}/{str(year)}"
            )
            loop.set_description(f"計算 {year} 年 語義相似度")
            for summary_file in summary_files:
                company = summary_file.split(".")[0]
                with open(folder + summary_file, "r", encoding="utf-8") as f:
                    summary_text = f.read()
                sim = compare_texts(
                    definition[search_query], summary_text, similarity_model
                )
                log += f"{summary_file} 相似度: {sim}\n"
                if company not in similarity:
                    similarity[company] = {}
                similarity[company][year] = sim
        # Rows are tickers, columns are years.
        df = pd.DataFrame(similarity).T
        df.index.name = "ticker"
        df.to_csv(f"{root}/{search_query}_filter_similarity.csv")
    print("✅ 任務完成")

IT capability


篩選 2021 年文本：正在處理 350/439 檔案:  70%|███████   | 7/10 [4:30:27<1:55:54, 2318.25s/it]


RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-AmyiXpVraKvq3vaFQJvIyqEV on requests per day (RPD): Limit 10000, Used 10000, Requested 1. Please try again in 8.64s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'requests', 'param': None, 'code': 'rate_limit_exceeded'}}