In [1]:
import os
import glob
import polars as pl
from dotenv import load_dotenv

In [2]:
load_dotenv(r"C:\Users\by003457\workspace\perfectdays\.env")

True

In [3]:
NEWS_PARQUET_MONTH_DIR = os.environ["NEWS_PARQUET_MONTH_DIR"]

In [4]:
mmfiles = sorted(glob.glob(os.path.join(NEWS_PARQUET_MONTH_DIR, "*.parquet")))

In [5]:
mmfile = mmfiles[50]

In [6]:
# load parquet file with polars
df = pl.read_parquet(mmfile, n_rows=100)

In [8]:
df.head()

guid,version_created,title,lang_code,subject_qcodes,content,src
str,str,str,str,str,str,str
"""tag:reuters.com,2000-03-04:new…","""2000-03-04T00:00:50.000Z""","""City Holding Company Announces…","""en""","""L:en""",""" City Holding Company Announc…","""3PTY"""
"""tag:reuters.com,2000-03-04:new…","""2000-03-04T00:01:01.000Z""","""Flooring America, Inc. Announc…","""en""","""L:en""","""Flooring America, Inc. Announc…","""3PTY"""
"""tag:reuters.com,2000-03-04:new…","""2000-03-04T00:04:09.000Z""","""ON24 Video Investor Alert: ON2…","""en""","""L:en""","""ON24 Video Investor Alert: ON2…","""3PTY"""
"""tag:reuters.com,2000-03-04:new…","""2000-03-04T00:08:01.000Z""","""Sacramento Commercial Bank Pur…","""en""","""L:en""",""" Sacramento Commercial …","""3PTY"""
"""tag:reuters.com,2000-03-04:new…","""2000-03-04T00:08:15.000Z""","""Ezenet Corp. Equity Financing …","""en""","""L:en""",""" (Full text of press release f…","""3PTY"""


In [12]:
df.shape

(100, 7)

In [11]:
total_rows = pl.scan_parquet(mmfile).count().collect()
print("Total rows in file:", total_rows)

Total rows in file: shape: (1, 7)
┌─────────┬─────────────────┬─────────┬───────────┬────────────────┬─────────┬─────────┐
│ guid    ┆ version_created ┆ title   ┆ lang_code ┆ subject_qcodes ┆ content ┆ src     │
│ ---     ┆ ---             ┆ ---     ┆ ---       ┆ ---            ┆ ---     ┆ ---     │
│ u32     ┆ u32             ┆ u32     ┆ u32       ┆ u32            ┆ u32     ┆ u32     │
╞═════════╪═════════════════╪═════════╪═══════════╪════════════════╪═════════╪═════════╡
│ 2063090 ┆ 2063090         ┆ 2063090 ┆ 2063090   ┆ 2063090        ┆ 2063090 ┆ 2063090 │
└─────────┴─────────────────┴─────────┴───────────┴────────────────┴─────────┴─────────┘


#### News embedding analysis

In [7]:
import os
import glob
import pandas as pd
import numpy as np
from pathlib import Path
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
model_path = r"C:\Temp\models--google--embeddinggemma-300m\snapshots\c5cfa06e5e282a820e85d57f7fb053207494f41d"


In [None]:
model = SentenceTransformer(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=True)

In [5]:
npzfile = r"L:\MED\TRAN\2025_BankRegDataCollection\rtrs_news\monthly\2024-04_embeddings_3pty_ko.npz"
pqfile = r"L:\MED\TRAN\2025_BankRegDataCollection\rtrs_news\monthly\2024-04.parquet"

In [8]:
with np.load(npzfile) as data:
    ids = data['ids']
    embeddings = data['embeddings']
print(f'Loaded {embeddings.shape[0]} embeddings of dimension {embeddings.shape[1]}')

Loaded 304350 embeddings of dimension 768


In [9]:
lstnpz = []
for id in ids:
    lstnpz.append({'id': int(id), 'embedding': embeddings[id]})
len(lstnpz)

304350

In [10]:
lstnpz[0]

{'id': 0,
 'embedding': array([ 1.73827901e-03, -2.86501776e-02, -1.04929414e-03,  3.66673851e-03,
        -1.47453975e-02,  3.59708024e-03, -7.45207537e-04, -3.71312932e-03,
         1.30137792e-02, -8.62511806e-03,  4.26352490e-03, -1.91094112e-02,
         4.87805298e-03, -5.14652114e-03,  6.35775388e-04,  6.89417263e-03,
         1.22776879e-02,  9.78220720e-03, -3.71215283e-03,  4.81559522e-02,
         1.42198252e-02,  1.15909777e-03,  1.12511069e-02, -1.11234039e-02,
         3.56185585e-02, -2.17604488e-02, -1.48913572e-02,  1.84146836e-02,
        -8.38238187e-03, -2.66954023e-03, -6.40428439e-03,  5.01478836e-03,
         4.43601422e-02,  1.53239071e-03,  1.06064312e-03,  3.14831897e-03,
        -1.83265598e-03,  5.57996146e-03, -2.77008377e-02, -1.57998390e-02,
        -1.05011733e-02,  1.07738981e-02, -2.46155038e-02,  6.32466655e-03,
        -5.73626906e-03, -1.72179900e-02, -1.53014879e-03, -1.60140004e-02,
        -9.03452933e-03, -3.33913714e-02,  1.98513884e-02,  3.629

In [None]:
df = pd.read_parquet(pqfile)
# drop na values of title and content
df2 = df[(df.lang_code == 'ko') & (df.src == '3PTY')].dropna(subset=['title', 'content']).reset_index(drop=True).copy()


rec = df2.iloc[0]
text = f"{rec['title'].encode('utf-8')}\n\n{rec['content'].encode('utf-8')}"

inputs = tokenizer(
                text,
                padding=True,
                truncation=True,
                max_length=9048,
                return_tensors="pt",
            )

inputs = {k: v for k, v in inputs.items()}

text_embedding = model(inputs)
npz_embedding = lstnpz[0]['embedding']

# for rec2 in lstnpz:
#     npz_embedding = rec2['embedding']

#     #compare the two embeddings
#     if np.allclose(text_embedding, npz_embedding):
#         print(f"Match found for id {rec2['id']}")
#         break

In [11]:

# Cell 1: Setup
import os
import pandas as pd
import numpy as np
import torch
from pathlib import Path
import pyarrow.parquet as pq
from transformers import AutoTokenizer, AutoModel, AutoConfig

In [12]:


# Optional: reduce tokenizers' CPU threading like the script
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

FILTER_SRC = "3PTY"
FILTER_LANG = "ko"
normalize_embeddings = True   # same as script's normalize_embeddings
torch_dtype = "auto"          # "auto", "float16", "bfloat16", "float32"
user_max_length = None        # None to mirror script's logic
batch_size = 1024             # adjust to your VRAM (script uses per-GPU batch)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

def parse_torch_dtype(dtype_str):
    if dtype_str is None or dtype_str == "auto": return None
    s = str(dtype_str).lower()
    if s in ("fp16", "float16", "half"): return torch.float16
    if s in ("bf16", "bfloat16"): return torch.bfloat16
    if s in ("fp32", "float32"): return torch.float32
    return None



In [13]:
device

device(type='cpu')

In [14]:

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=True)

target_dtype = parse_torch_dtype(torch_dtype)

# Try the straightforward load (single device)
model = AutoModel.from_pretrained(
    model_path,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    torch_dtype=target_dtype if target_dtype is not None else None,
    # device_map={"": device.type if device.type == "cpu" else device},  # works for single device
)
model.eval()

# Match script's max_length logic
def get_effective_max_length(tokenizer, user_max_length=None):
    if user_max_length is None or (isinstance(user_max_length, int) and user_max_length <= 0):
        tok_max = getattr(tokenizer, "model_max_length", 2048)
        if tok_max is None or tok_max > 8192 or tok_max == int(1e30):
            return 4096
        return int(tok_max)
    return int(user_max_length)

max_length = get_effective_max_length(tokenizer, user_max_length)
print("Effective max_length:", max_length)


Effective max_length: 2048


In [15]:

# Cell 3: Embedding helper that mirrors the script
@torch.no_grad()
def encode_texts(texts, batch_size=32, normalize=True):
    outs = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        inputs = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}
        outputs = model(**inputs)

        if hasattr(outputs, "pooler_output") and outputs.pooler_output is not None:
            emb = outputs.pooler_output  # [B, H]
        else:
            last_hidden = outputs.last_hidden_state  # [B, T, H]
            mask = inputs["attention_mask"].unsqueeze(-1).to(last_hidden.dtype)  # [B, T, 1]
            summed = (last_hidden * mask).sum(dim=1)  # [B, H]
            counts = mask.sum(dim=1).clamp(min=1e-6)  # [B, 1]
            emb = summed / counts

        if normalize:
            emb = torch.nn.functional.normalize(emb, p=2, dim=1)

        outs.append(emb.detach().float().cpu().numpy())

    return np.concatenate(outs, axis=0) if outs else np.empty((0, 0), dtype=np.float32)

pf = pq.ParquetFile(pqfile)
texts = []

for rg_idx in range(pf.num_row_groups):
    table = pf.read_row_group(rg_idx, columns=["title", "content", "src", "lang_code"])
    if table.num_rows == 0:
        continue

    df = table.to_pandas()  # preserves row order within the row group
    mask = (
        df["title"].notna() &
        df["content"].notna() &
        (df["src"] == FILTER_SRC) &
        (df["lang_code"] == FILTER_LANG)
    )
    df = df[mask]
    if len(df) == 0:
        continue

    # Mirror polars' cast to Utf8 by forcing string
    batch_texts = (df["title"].astype(str) + "\n\n" + df["content"].astype(str)).tolist()
    texts.extend(batch_texts)

N = len(texts)
print(f"Total filtered rows: {N}")


Total filtered rows: 304350


In [16]:
def rowwise_cosine(a, b, eps=1e-12):
    # If not normalized, normalize here
    an = a / (np.linalg.norm(a, axis=1, keepdims=True) + eps)
    bn = b / (np.linalg.norm(b, axis=1, keepdims=True) + eps)
    return np.sum(an * bn, axis=1)  # cosine similarity in [-1, 1]

In [17]:
texts[0:2]

['대한항공, 뉴욕 취항 45주년 기념 행사…왕복 항공권 주인공 누구?\n\nFor best results when printing this announcement, please click on link below:\nhttp://newsfile.refinitiv.com/getnewsfile/v1/story?guid=urn:newsml:reuters.com:20240401:nMtd4JbG8V&default-theme=true\n\n[머니투데이 임찬영 기자]\n                                                                                                                                                                                                                                                                                                                                                                                                                                        \n 대한항공이 지난달 29일(현지시간) 오전 뉴욕 존F.케네디(JFK) 국제공항에서 탑승객 대상 기념 행사를 가지고 인천행 대한항공 KE082편 45번째 탑승수속 승객에게 인천~뉴욕 왕복 프레스티지 항공권 1매를 증정했다. 사진은 항공권 당첨자 김지현씨(가운데)와 대한항공 관계자들이 기념사진을 촬영하는 모습/사진= 대한항공                                                                                                                                       

In [31]:
# Cell 5: Compute embeddings and save NPZ (to compare with the script's output)text
texts_embeddings = encode_texts(texts[0:100], batch_size=batch_size, normalize=normalize_embeddings)
print("Embeddings shape:", texts_embeddings.shape)

Embeddings shape: (100, 768)


In [24]:
texts_embeddings2 = encode_texts(texts[0:10], batch_size=batch_size, normalize=normalize_embeddings)
print("Embeddings shape:", texts_embeddings2.shape)

Embeddings shape: (10, 768)


In [32]:
results = []
for i in range(100):
    for j in range(100):
        cosine_sim = rowwise_cosine(texts_embeddings[i].reshape(1,-1) , lstnpz[j]['embedding'].reshape(1, -1)) 
        results.append({'text_idx': i, 'npz_idx': j, 'cosine_sim': float(cosine_sim[0])})

dfres = pd.DataFrame(results)
dfres2 = dfres.pivot(index='text_idx', columns='npz_idx', values='cosine_sim')

In [29]:
results = []
for i in range(10):
    for j in range(10):
        cosine_sim = rowwise_cosine(texts_embeddings[i].reshape(1,-1) ,texts_embeddings2[j].reshape(1,-1)) 
        results.append({'text_idx': i, 'npz_idx': j, 'cosine_sim': float(cosine_sim[0])})

dfres3 = pd.DataFrame(results)
dfres4 = dfres3.pivot(index='text_idx', columns='npz_idx', values='cosine_sim')

In [30]:
dfres4

npz_idx,0,1,2,3,4,5,6,7,8,9
text_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,1.0,0.841253,0.863596,0.786862,0.793491,0.856968,0.82379,0.791291,0.858125,0.785469
1,0.841253,1.0,0.872949,0.809531,0.831309,0.844081,0.851263,0.82789,0.874289,0.802238
2,0.863596,0.872949,1.0,0.814741,0.813392,0.92698,0.871163,0.825828,0.935007,0.807648
3,0.786862,0.809531,0.814741,1.0,0.791624,0.786825,0.792726,0.818383,0.814762,0.773511
4,0.793491,0.831309,0.813392,0.791624,1.0,0.800758,0.8109,0.779426,0.828192,0.813762
5,0.856968,0.844081,0.92698,0.786825,0.800758,1.0,0.8384,0.803842,0.917606,0.794691
6,0.82379,0.851263,0.871163,0.792726,0.8109,0.8384,1.0,0.816136,0.876911,0.809703
7,0.791291,0.82789,0.825828,0.818383,0.779426,0.803842,0.816136,1.0,0.841029,0.790941
8,0.858125,0.874289,0.935007,0.814762,0.828192,0.917606,0.876911,0.841029,1.0,0.813819
9,0.785469,0.802238,0.807648,0.773511,0.813762,0.794691,0.809703,0.790941,0.813819,1.0
