In [5]:
import os
import glob
import json
import numpy as np
import pandas as pd
import polars as pl

from tqdm import tqdm
from dotenv import load_dotenv

In [6]:
# Load environment variables from the project's .env file; the next cell reads
# NEWS_PARQUET_MONTH_DIR from os.environ.
# NOTE(review): the .env location is a hard-coded absolute path — confirm it is valid on every machine that runs this.
load_dotenv(r"/rdrive/workspace/perfectdays/.env")

True

In [7]:
# Directory that the cells below search for monthly embedding archives (*.npz) and news parquet files.
# Indexing os.environ directly raises KeyError if the variable is not set.
NEWS_PARQUET_MONTH_DIR = os.environ["NEWS_PARQUET_MONTH_DIR"]

In [8]:
# Collect every archive whose name ends in "3pty_ko.npz" and order the paths by filename.
mmfiles = glob.glob(os.path.join(NEWS_PARQUET_MONTH_DIR, "*3pty_ko.npz"))
mmfiles.sort()
mmfiles[:5]

['/rdrive/rtrs_news/monthly/1999-03_embeddings_3pty_ko.npz',
 '/rdrive/rtrs_news/monthly/1999-04_embeddings_3pty_ko.npz',
 '/rdrive/rtrs_news/monthly/1999-05_embeddings_3pty_ko.npz',
 '/rdrive/rtrs_news/monthly/1999-06_embeddings_3pty_ko.npz',
 '/rdrive/rtrs_news/monthly/1999-07_embeddings_3pty_ko.npz']

In [26]:
# Work with the last entry of the filename-sorted list built above.
mmfile = mmfiles[-1]

In [27]:
# Open the selected .npz archive; its arrays are retrieved by key in the cells below.
mm_data = np.load(mmfile)

In [28]:
# Show which arrays the archive contains.
mm_data.keys()

KeysView(NpzFile '/rdrive/rtrs_news/monthly/2025-10_embeddings_3pty_ko.npz' with keys: ids, embeddings)

In [29]:
# Report the shape of every array stored in the archive, one line per key.
for name in mm_data:
    print(f"{name}: {mm_data[name].shape}")

ids: (173580,)


KeyboardInterrupt: 

In [None]:
# Pair each id with its embedding row in a single polars frame.
mm_df = pl.DataFrame(
    {
        "id": mm_data["ids"],
        "embedding": list(mm_data["embeddings"]),
    }
)

In [None]:
# Preview the merged id/embedding frame.
mm_df

id,embedding
i64,"array[f32, 768]"
0,"[0.00873, -0.014797, … -0.004965]"
1,"[0.014095, -0.009662, … 0.000396]"
2,"[0.021123, 0.010042, … 0.02011]"
3,"[0.01345, -0.01291, … 0.005197]"
4,"[0.005833, -0.001751, … -0.020308]"
…,…
173575,"[0.002113, -0.013208, … -0.000254]"
173576,"[0.031447, -0.00351, … 0.006743]"
173577,"[0.031447, -0.00351, … 0.006743]"
173578,"[0.031027, -0.003222, … 0.005619]"


In [None]:
# (rows, columns) of the merged frame.
mm_df.shape

(173580, 2)

In [9]:
# Parquet file with the news articles to analyse; the month is hard-coded to 2025-10.
newsfile = os.path.join(NEWS_PARQUET_MONTH_DIR, '2025-10.parquet')

In [10]:
# Load the month's articles and keep each row's index label as a string in an "ids" column.
dfnews = pd.read_parquet(newsfile)
dfnews['ids'] = dfnews.index.astype(str)
dfnews.head(2)

Unnamed: 0,guid,version_created,title,lang_code,subject_qcodes,content,src,ids
0,"tag:reuters.com,2025-10-01:newsml_BwHJtdMa:1",2025-10-01T00:00:00.078Z,VFC INVESTORS: Kirby McInerney LLP Reminds V.F...,en,"B:1221, B:1323, B:195, B:234, B:239, B:242, B:...",For best results when printing this announceme...,3PTY,0
1,"tag:reuters.com,2025-10-01:newsml_Bw2HLJs9a:1",2025-10-01T00:00:00.116Z,Thermo Fisher Scientific Prices Offering of US...,en,"A:2, B:148, B:149, B:150, B:151, B:156, B:1700...",For best results when printing this announceme...,3PTY,1


In [11]:
# Keep only rows with lang_code 'ko' and src '3PTY', then draw a seeded 1% sample.
# NOTE: this reassigns `dfnews`, so re-running the cell samples again from the already-sampled frame.
is_ko_3pty = dfnews['lang_code'].eq('ko') & dfnews['src'].eq('3PTY')
dfnews = dfnews.loc[is_ko_3pty].sample(frac=0.01, random_state=42)
dfnews.shape

(1736, 8)

In [13]:
# Build the model input column as "<title>\n\n<content>".
# Vectorised concatenation replaces the row-wise apply(axis=1): it avoids one Python call per
# row, and a missing title/content yields NaN for that row instead of raising TypeError.
dfnews['text'] = dfnews['title'] + '\n\n' + dfnews['content']
dfnews.head(2)

Unnamed: 0,guid,version_created,title,lang_code,subject_qcodes,content,src,ids,text
1785758,"tag:reuters.com,2025-10-16:newsml_EDYG00389:1",2025-10-16T22:48:44.801Z,"[AI시그널] 티에프이, 상승세 지속 중 거래량 증가로 긍정적 신호",ko,"B:163, B:164, B:1740, B:278, B:279, G:1, G:6, ...",\r\n\r\n\r\n\r\n\r\n [이데일리 이머니 기자] 기술적 ...,3PTY,1785758,"[AI시그널] 티에프이, 상승세 지속 중 거래량 증가로 긍정적 신호\n\n\r\n\..."
2013268,"tag:reuters.com,2025-10-18:newsml_EDYI00023:2",2025-10-18T23:23:55.309Z,고1 ‘성적 인플레’ 심화…내신 경쟁 ‘원점수’까지 확대,ko,"M:1QD, M:2CX",- 내신 9등급→ 5등급 개편되며 변별력 약화\r\n- 학생 간 내신 경쟁 ‘등급’...,3PTY,2013268,고1 ‘성적 인플레’ 심화…내신 경쟁 ‘원점수’까지 확대\n\n- 내신 9등급→ 5...


In [1]:
from transformers import pipeline
import torch

# Local snapshot paths of the candidate models; only one `model_id` assignment is active.
# unsloth gpt-oss-120b
# model_id = r'/rdrive_pvc/huggingface_cache/hub/models--unsloth--gpt-oss-120b/snapshots/212ea6d47b66c92f5bfd27956ef07bc160f5ea68'
# openai gpt-oss-20b
# model_id = r'/rdrive_pvc/huggingface_cache/hub/models--openai--gpt-oss-20b/snapshots/2e8f8052ee2aeee907f76e08c08b9fdde8677ca8'

# google gemma-3-27b-it (currently selected)
model_id = r'/rdrive_pvc/huggingface_cache/hub/models--google--gemma-3-27b-it/snapshots/005ad3404e59d6023443cb575daa05336842228a'

# Build the text-generation pipeline on cuda:0; `pipe` is reused by analyse_sentiment() below.
pipe = pipeline(
    "text-generation",
    model=model_id,
    dtype="auto",
    device_map="cuda:0",
)

# Smoke test: send a single user message in chat format.
messages = [
    {"role": "user", "content": "Explain quantum mechanics clearly and concisely."},
]

outputs = pipe(
    messages,
    max_new_tokens=256,
)
# The last element of generated_text is the assistant's reply message (a role/content dict).
print(outputs[0]["generated_text"][-1])


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 12/12 [00:28<00:00,  2.37s/it]
Device set to use cuda:0


{'role': 'assistant', 'content': "## Quantum Mechanics: Explained Simply\n\nQuantum mechanics is the physics of the **very small** - atoms and the particles *within* them. It's weird, counterintuitive, but incredibly successful at explaining how the universe works at that level. Here's the gist:\n\n**1. Quantization:** Energy, like light and electricity, isn't continuous. It comes in tiny, discrete packets called **quanta**. Think of it like stairs vs. a ramp - you can only stand on specific steps (quanta) not anywhere in between.\n\n**2. Wave-Particle Duality:**  Particles (like electrons) can behave like waves, and waves (like light) can behave like particles. It's not *either/or*, it's *both*.  Imagine something that spreads out like a wave, but also hits a target like a particle.\n\n**3. Uncertainty:** There's a fundamental limit to how precisely we can know certain pairs of properties simultaneously.  The most famous example is **Heisenberg's Uncertainty Principle**: the more accu

## Financial Sentiment Prompt

In [2]:
# Prompt template for the financial-sentiment classifier.
# - `{text}` is the placeholder for the article text.
# - The `{{` / `}}` around the example response are escaped literal braces (str.format
#   convention). Whatever code fills the template has to un-escape them; otherwise the
#   model is shown doubled braces, which is not valid JSON.
sentiment_prompt = (
    "You are a bilingual (Korean/English) financial market analyst. Read the Korean news excerpt and assess the expected market impact on affected financial assets. "
    "Follow the instructions precisely and return only strict JSON.\n"
    "\nTask\n"
    "1) Classify sentiment using ONLY one of: very positive, positive, neutral, negative, very negative.\n"
    "2) Base your decision on expected near-term market impact (equity prices, credit spreads/default risk, FX, interest rates, macro risk).\n"
    "3) Provide a single-sentence justification under 50 words that cites concrete drivers (e.g., earnings/guidance, regulation, policy, demand/supply, prices, FX, rates, geopolitics, disruptions).\n"
    "4) Identify the most relevant NAICS industry code(s) mentioned or clearly implied. Use the most specific 6-digit codes when possible. Include the code and its label in the form \"CODE - Label\". If unclear, return an empty array []. Limit to up to 3 entries.\n"
    "5) List directly mentioned or clearly implicated company names (Korean or English official names). Do not guess. If none, return an empty array [].\n"
    "\nLabeling guidance (be conservative)\n"
    "- very positive: strong broad upside (e.g., beat + raised guidance, major policy support, material cost relief) likely to lift assets.\n"
    "- positive: modest upside or favorable development.\n"
    "- neutral: limited, mixed, or insufficient information.\n"
    "- negative: modest downside (e.g., miss, mild regulatory risk, soft demand).\n"
    "- very negative: severe downside (e.g., default/bankruptcy risk, sanctions, major accidents, sharp demand collapse).\n"
    "\nFormatting rules\n"
    "- Output must be STRICT JSON with keys: sentiment, justification, related_industry_naics, related_company_names.\n"
    "- Values: sentiment = string; justification = string (<50 words); related_industry_naics = array of strings; related_company_names = array of strings.\n"
    "- No extra text, explanations, comments, or markdown. JSON only.\n"
    "- If the article is in Korean, you may write the justification in Korean or English; keep it concise and factual.\n"
    "\nExample response\n"
    "{{\n"
    "  \"sentiment\": \"very negative\",\n"
    "  \"justification\": \"수출 감소와 가격 하락이 예고되며 마진 압박이 커져 단기 주가와 신용 스프레드에 부정적.\",\n"
    "  \"related_industry_naics\": [\"334413 - Semiconductor and Related Device Manufacturing\"],\n"
    "  \"related_company_names\": [\"Samsung Electronics\", \"SK hynix\"]\n"
    "}}\n"
    "\nText: {text}\n"
)

In [3]:
# The five labels the prompt allows.
_ALLOWED_SENTIMENTS = frozenset(
    {"very positive", "positive", "neutral", "negative", "very negative"}
)

# Informal labels accepted in addition to the five allowed ones.
_SENTIMENT_ALIASES = {
    "pos": "positive",
    "neg": "negative",
    "bullish": "positive",
    "bearish": "negative",
    "unknown": "neutral",
    "error": "neutral",
}


def _normalize_sentiment(value: object) -> str:
    """Map a raw label onto one of the allowed labels; anything unrecognised becomes "neutral"."""
    if not isinstance(value, str):
        # JSON can carry null / numbers / arrays here; string methods must not be called on them.
        return "neutral"
    label = value.strip().lower()
    if label in _ALLOWED_SENTIMENTS:
        return label
    return _SENTIMENT_ALIASES.get(label, "neutral")


def _trim_words(s: object, max_words: int = 50) -> str:
    """Return ``s`` stripped and cut to at most ``max_words`` whitespace-separated words ("" for non-strings)."""
    if not isinstance(s, str):
        return ""
    stripped = s.strip()
    words = stripped.split()
    return stripped if len(words) <= max_words else " ".join(words[:max_words])


def _to_string_list(val: object) -> list[str]:
    """Coerce a scalar or a list of scalars (str/int/float) to a list of strings; anything else gives []."""
    if val is None:
        return []
    if isinstance(val, list):
        return [str(x) for x in val if isinstance(x, (str, int, float))]
    if isinstance(val, (str, int, float)):
        return [str(val)]
    return []


def _make_fallback(justification: str) -> dict[str, object]:
    """Build the neutral result returned whenever the model cannot be used or understood."""
    return {
        "sentiment": "neutral",
        "justification": _trim_words(justification, 50),
        "related_industry_naics": [],
        "related_company_names": [],
    }


def _extract_content(outputs) -> str:
    """Pull the generated text out of the pipeline result.

    Handles ``generated_text`` being a list of chat messages (the last one is used),
    a single message dict, a plain string, or any other object (stringified).
    """
    gen = outputs[0].get("generated_text", "")
    if isinstance(gen, list):
        last = gen[-1]
        if isinstance(last, dict) and "content" in last:
            return (last["content"] or "").strip()
        return (str(last) or "").strip()
    if isinstance(gen, dict) and "content" in gen:
        return (gen["content"] or "").strip()
    if isinstance(gen, str):
        return gen.strip()
    return (str(gen) or "").strip()


def _parse_json_payload(s: str):
    """Parse ``s`` as JSON; on failure retry on the span from the first "{" to the last "}". Returns None if both fail."""
    try:
        return json.loads(s)
    except json.JSONDecodeError:
        start = s.find("{")
        end = s.rfind("}")
        if start != -1 and end > start:
            try:
                return json.loads(s[start : end + 1])
            except json.JSONDecodeError:
                return None
        return None


def analyse_sentiment(text: str, max_new_tokens: int = 256) -> dict[str, object]:
    """Classify the market sentiment of one news text with the global ``pipe`` and ``sentiment_prompt``.

    Args:
        text: Article text to analyse.
        max_new_tokens: Generation budget passed to the pipeline (default 256, as before).

    Returns:
        A dict with keys ``sentiment`` (one of the five allowed labels), ``justification``
        (at most 50 words), ``related_industry_naics`` and ``related_company_names``
        (lists of strings). Empty input, a pipeline exception, malformed pipeline output
        or unparseable model output all yield a neutral fallback whose justification
        describes the problem; the function itself does not raise for these cases.
    """
    if not isinstance(text, str) or not text.strip():
        return _make_fallback("Input text missing or empty.")

    # The template escapes its literal braces as "{{" / "}}". Un-escape them so the model is
    # shown a valid JSON example, then drop the article in verbatim: the article's own braces
    # must NOT be doubled, because str.format is never applied to it.
    template = sentiment_prompt.replace("{{", "{").replace("}}", "}")
    prompt_content = template.replace("{text}", text.strip())

    messages = [
        {"role": "user", "content": prompt_content},
    ]
    try:
        outputs = pipe(messages, max_new_tokens=max_new_tokens)
    except Exception as exc:
        return _make_fallback(f"Pipeline failure: {exc}")

    try:
        content = _extract_content(outputs)
    except Exception as exc:
        return _make_fallback(f"Malformed pipeline output: {exc}")

    parsed = _parse_json_payload(content)
    if not isinstance(parsed, dict):
        return _make_fallback(content or "Model returned non-JSON output.")

    return {
        "sentiment": _normalize_sentiment(parsed.get("sentiment", "neutral")),
        "justification": _trim_words(parsed.get("justification", "")),
        "related_industry_naics": _to_string_list(parsed.get("related_industry_naics")),
        "related_company_names": _to_string_list(parsed.get("related_company_names")),
    }

In [14]:
# Run analyse_sentiment() over a small seeded sample of dfnews and collect one result
# record per article in `sentiment_samples`.

required_columns = {"text", "title"}
missing = required_columns - set(dfnews.columns)
if missing:
    raise ValueError(f"Missing required columns: {missing}")

sample_size = min(len(dfnews), 5)
max_chars = 5000  # prevent overly long inputs to the model
sentiment_samples: list[dict] = []


def _to_str(x) -> str:
    """Return ``str(x)``, mapping ``None`` and float NaN to an empty string."""
    if x is None or (isinstance(x, float) and pd.isna(x)):
        return ""
    return str(x)


def _build_model_input(title: str, body: str, limit: int) -> str:
    """Combine headline and body into one model input of at most ``limit`` characters.

    The ``text`` column is built elsewhere in this notebook as title + blank line + content,
    so prepending the title again would repeat the headline and waste part of the character
    budget. The title is therefore only prepended when the body does not already start with it.
    """
    if title and body.startswith(title):
        combined = body
    elif title or body:
        combined = (title + "\n\n" + body).strip()
    else:
        combined = ""
    return combined[:limit]


if sample_size > 0:
    # Sample rows and iterate directly to avoid Series/DataFrame ambiguity on duplicated index
    sample_df = dfnews.sample(n=sample_size, random_state=0)

    for row in tqdm(sample_df.itertuples(index=True), total=sample_size):
        try:
            title = _to_str(getattr(row, "title", "")).strip()
            body = _to_str(getattr(row, "text", "")).strip()
            combined_text = _build_model_input(title, body, max_chars)

            if combined_text:
                result = analyse_sentiment(combined_text)
            else:
                result = {
                    "sentiment": "neutral",
                    "justification": "No content provided.",
                    "related_industry_naics": [],
                    "related_company_names": [],
                }

            sentiment_samples.append({
                "index": row.Index,
                "headline": title,
                "sentiment": result.get("sentiment", "neutral"),
                "justification": result.get("justification", ""),
                "related_industry_naics": result.get("related_industry_naics", []),
                "related_company_names": result.get("related_company_names", []),
            })
        except Exception as exc:
            # Per-row fail-safe so one bad row doesn't stop the loop
            sentiment_samples.append({
                "index": row.Index,
                "headline": _to_str(getattr(row, "title", "")).strip(),
                "sentiment": "neutral",
                "justification": f"Processing error: {exc}",
                "related_industry_naics": [],
                "related_company_names": [],
            })

100%|██████████| 5/5 [00:45<00:00,  9.10s/it]


In [15]:
# Inspect one complete result record (the second sampled article).
sentiment_samples[1]

{'index': 1462049,
 'headline': "AIBIZ, 세미콘웨스트 3년 연속 참가.. '글로벌 반도체 협력 모색'",
 'sentiment': 'positive',
 'justification': "AIBIZ's continued participation in SEMICON WEST and positive reception of its AI solutions suggest increased collaboration potential and standardization efforts, likely supporting modest growth.",
 'related_industry_naics': ['541715 - Research and Development in the Physical, Engineering, and Life Sciences',
  '334413 - Semiconductor and Related Device Manufacturing',
  '511210 - Software Publishers'],
 'related_company_names': ['AIBIZ']}

In [None]:
# Persist the sentiment results as "<month>.sentiment.parquet".
# The month stem is taken from `newsfile` (e.g. "2025-10.parquet" -> "2025-10"); the name
# `parquet_file` used here before is not defined anywhere in this notebook and raised NameError.
month_stem = os.path.splitext(os.path.basename(newsfile))[0]
outfile = os.path.join('/rdrive/rtrs_news', f'{month_stem}.sentiment.parquet')
pd.DataFrame(sentiment_samples).to_parquet(outfile)

In [16]:
# Tabulate the per-article result records for inspection.
dfres = pd.DataFrame(sentiment_samples)
dfres

Unnamed: 0,index,headline,sentiment,justification,related_industry_naics,related_company_names
0,1218843,"'도시문제' 해결책 찾아…충남혁신센터-천안시, 스타트업 발굴",neutral,The announcement details a startup support pro...,[541511 - Custom Computer Programming Services...,"[Chungnam Creative Economy Innovation Center, ..."
1,1462049,"AIBIZ, 세미콘웨스트 3년 연속 참가.. '글로벌 반도체 협력 모색'",positive,AIBIZ's continued participation in SEMICON WES...,[541715 - Research and Development in the Phys...,[AIBIZ]
2,1276109,"'어린이 위협 시구' 논란 최현욱, 자필 편지로 사과 전할까",negative,The incident involving a potentially dangerous...,"[711110 - Performing Arts Companies, 511110 - ...","[SSG 랜더스, 삼성 라이온즈]"
3,78750,추석 車보험 사고 피해자 1.6배 증가…“차간 거리 유지해야”,neutral,The article reports an increase in car insuran...,"[524113 - Direct Life, Health, and Medical Ins...","[Insurance Development Institute, General Insu..."
4,1296545,"삼성도 찜한 혈액 진단…""한국선 '통행세' 내야 해"" 기업들 해외로",negative,규제 미비로 인해 국내 체외진단 기업들이 해외 투자 유치 및 사업 확장에 어려움을 ...,"[334516 - Analytical Laboratory Services, 6215...","[삼성물산, 삼성바이오로직스, 삼성바이오에피스, C2N 다이그노스틱스]"


In [None]:
# Look up the source article behind one result row by its dfnews index label.
dfnews.loc[78750]

guid                   tag:reuters.com,2025-10-01:newsml_EDY000694:1
version_created                             2025-10-01T03:00:49.035Z
title                             추석 車보험 사고 피해자 1.6배 증가…“차간 거리 유지해야”
lang_code                                                         ko
subject_qcodes                                          M:1QD, M:2CX
content            \r\n[이데일리 김형일 기자] 추석 당일 자동차보험 사고 피해자가 1.6배 증가한...
src                                                             3PTY
ids                                                            78750
text               추석 車보험 사고 피해자 1.6배 증가…“차간 거리 유지해야”\n\n\r\n[이데일...
Name: 78750, dtype: object

In [None]:
# Distribution of predicted sentiment labels across the analysed sample.
dfres["sentiment"].value_counts()

sentiment
neutral    5
Name: count, dtype: int64

In [None]:
# Tabulate the results. When nothing was analysed, fall back to an empty frame that carries
# every column the sampling loop produces (the previous fallback omitted the two
# related_* columns, so the empty frame had a different schema from a populated one).
if sentiment_samples:
    dfres = pd.DataFrame(sentiment_samples)
else:
    dfres = pd.DataFrame(
        columns=[
            "index",
            "headline",
            "sentiment",
            "justification",
            "related_industry_naics",
            "related_company_names",
        ]
    )
dfres