In [1]:
import os
import glob
import polars as pl
from dotenv import load_dotenv

In [2]:
load_dotenv(r"C:\Users\by003457\workspace\perfectdays\.env")

True

In [3]:
NEWS_PARQUET_MONTH_DIR = os.environ["NEWS_PARQUET_MONTH_DIR"]

In [4]:
mmfiles = sorted(glob.glob(os.path.join(NEWS_PARQUET_MONTH_DIR, "*.parquet")))

In [5]:
mmfile = mmfiles[50]

In [6]:
# load parquet file with polars
df = pl.read_parquet(mmfile, n_rows=100)

In [8]:
df.head()

guid,version_created,title,lang_code,subject_qcodes,content,src
str,str,str,str,str,str,str
"""tag:reuters.com,2000-03-04:new…","""2000-03-04T00:00:50.000Z""","""City Holding Company Announces…","""en""","""L:en""",""" City Holding Company Announc…","""3PTY"""
"""tag:reuters.com,2000-03-04:new…","""2000-03-04T00:01:01.000Z""","""Flooring America, Inc. Announc…","""en""","""L:en""","""Flooring America, Inc. Announc…","""3PTY"""
"""tag:reuters.com,2000-03-04:new…","""2000-03-04T00:04:09.000Z""","""ON24 Video Investor Alert: ON2…","""en""","""L:en""","""ON24 Video Investor Alert: ON2…","""3PTY"""
"""tag:reuters.com,2000-03-04:new…","""2000-03-04T00:08:01.000Z""","""Sacramento Commercial Bank Pur…","""en""","""L:en""",""" Sacramento Commercial …","""3PTY"""
"""tag:reuters.com,2000-03-04:new…","""2000-03-04T00:08:15.000Z""","""Ezenet Corp. Equity Financing …","""en""","""L:en""",""" (Full text of press release f…","""3PTY"""


In [12]:
df.shape

(100, 7)

In [11]:
total_rows = pl.scan_parquet(mmfile).count().collect()
print("Total rows in file:", total_rows)

Total rows in file: shape: (1, 7)
┌─────────┬─────────────────┬─────────┬───────────┬────────────────┬─────────┬─────────┐
│ guid    ┆ version_created ┆ title   ┆ lang_code ┆ subject_qcodes ┆ content ┆ src     │
│ ---     ┆ ---             ┆ ---     ┆ ---       ┆ ---            ┆ ---     ┆ ---     │
│ u32     ┆ u32             ┆ u32     ┆ u32       ┆ u32            ┆ u32     ┆ u32     │
╞═════════╪═════════════════╪═════════╪═══════════╪════════════════╪═════════╪═════════╡
│ 2063090 ┆ 2063090         ┆ 2063090 ┆ 2063090   ┆ 2063090        ┆ 2063090 ┆ 2063090 │
└─────────┴─────────────────┴─────────┴───────────┴────────────────┴─────────┴─────────┘


#### News embedding analysis

In [54]:
import os
import glob
import pandas as pd
import numpy as np
from pathlib import Path
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer

In [None]:
model_path = r"C:\Temp\models--google--embeddinggemma-300m\snapshots\c5cfa06e5e282a820e85d57f7fb053207494f41d"
model = SentenceTransformer(model_path)

In [56]:
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=True)

In [15]:
npzfile = r"L:\MED\TRAN\2025_BankRegDataCollection\rtrs_news\monthly\2024-04_embeddings_3pty_ko.npz"
pqfile = r"L:\MED\TRAN\2025_BankRegDataCollection\rtrs_news\monthly\2024-04.parquet"

In [17]:
with np.load(npzfile) as data:
    ids = data['ids']
    embeddings = data['embeddings']
print(f'Loaded {embeddings.shape[0]} embeddings of dimension {embeddings.shape[1]}')

Loaded 304350 embeddings of dimension 768


In [49]:
lstnpz = []
for id in ids:
    lstnpz.append({'id': int(id), 'embedding': embeddings[id]})
len(lstnpz)

304350

In [19]:
df = pd.read_parquet(pqfile)

In [48]:
# drop na values of title and content
df2 = df[(df.lang_code == 'ko') & (df.src == '3PTY')].dropna(subset=['title', 'content']).reset_index(drop=True).copy()
df2.shape

(304350, 7)

In [35]:
df2.head(2)

Unnamed: 0,guid,version_created,title,lang_code,subject_qcodes,content,src
0,"tag:reuters.com,2024-04-01:newsml_Mtd4JbG8V:1",2024-04-01T00:00:08.387Z,"대한항공, 뉴욕 취항 45주년 기념 행사…왕복 항공권 주인공 누구?",ko,"M:1QD, M:1WM, M:1WN, M:2CQ, M:2CX",For best results when printing this announceme...,3PTY
1,"tag:reuters.com,2024-03-31:newsml_Mtd2YHt2w:1",2024-03-31T23:59:47.328Z,"기아, 중고생 대상 판교테크노밸리 진로투어 모집",ko,"B:255, B:69, B:71, M:1QD, M:1WJ, M:1WK, M:2CQ,...",For best results when printing this announceme...,3PTY


In [45]:
'aaa'.encode('utf-8')

b'aaa'

In [69]:
rec = df2.iloc[0]
text = f"{rec['title'].encode('utf-8')}\n\n{rec['content'].encode('utf-8')}"

inputs = tokenizer(
                text,
                padding=True,
                truncation=True,
                max_length=9048,
                return_tensors="pt",
            )
print(f'Input token ids shape: {inputs["input_ids"].shape}')

inputs = {k: v for k, v in inputs.items()}

text_embedding = model(inputs)
npz_embedding = lstnpz[0]['embedding']

# for rec2 in lstnpz:
#     npz_embedding = rec2['embedding']

#     #compare the two embeddings
#     if np.allclose(text_embedding, npz_embedding):
#         print(f"Match found for id {rec2['id']}")
#         break

Input token ids shape: torch.Size([1, 9048])


: 

In [None]:
inputs

{'input_ids': tensor([[     2, 236763,  39957,  ..., 236781,    524,      1]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]]),
 'token_embeddings': tensor([[[ -7.9859,   5.3992,  -3.6860,  ...,  -1.1690,   2.1762,   7.2932],
          [  1.2271,   2.0435,   0.4570,  ...,   1.3394,  -4.2090,  -0.0736],
          [ -0.9080,  -3.7236,   0.8823,  ..., -10.0703, -10.4421,  -2.3253],
          ...,
          [ -0.0823,  -0.0133,  -0.1674,  ...,   0.8413,  -0.1150,  -0.2957],
          [ -0.2782,   0.0791,  -0.0332,  ...,   0.6877,  -0.0609,   0.0284],
          [ -5.0438,   6.6790,   1.1993,  ..., -10.8118,  -2.5702,  -1.4849]]],
        grad_fn=<MulBackward0>),
 'sentence_embedding': tensor([[-6.7403e-02,  3.1520e-02,  3.3129e-02,  4.0342e-02,  2.4704e-02,
           1.6457e-02, -2.7526e-02,  4.8007e-02, -1.1651e-02, -6.4333e-02,
           1.0257e-02, -8.6296e-04,  2.9670e-03,  3.0102e-02, -1.0822e-02,
           6.7822e-02,  5.8182e-02, -1.2248e-02,  1.5915e-02, -3.8249e-03,
     

In [64]:
text_embedding

{'input_ids': tensor([[     2, 236763,  39957,  ..., 236753, 236785,      1]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]]),
 'token_embeddings': tensor([[[ -6.7609,   5.7027,  -3.7237,  ...,  -1.8259,   1.5205,   7.2419],
          [  2.7635,   1.2319,  -0.1766,  ...,   2.9555,  -2.5173,  -0.2432],
          [ -0.7547,  -4.1170,   2.3600,  ..., -11.8293, -10.4418,  -0.1025],
          ...,
          [ -0.1216,   0.1983,   0.1279,  ...,   1.7577,  -0.2822,  -0.0674],
          [ -0.8828,   0.3088,  -0.4140,  ...,   0.6207,  -1.0165,  -0.5033],
          [ -6.2652,   5.1891,   0.6985,  ..., -11.0701,  -2.8429,  -2.4989]]],
        grad_fn=<MulBackward0>),
 'sentence_embedding': tensor([[-6.8975e-02,  2.5567e-02,  3.5194e-02,  3.4291e-02,  4.6115e-02,
           2.5357e-03, -2.1679e-02,  3.8793e-02, -1.4278e-02, -6.7422e-02,
           2.0154e-02, -7.9040e-03,  2.1011e-02,  2.9596e-02, -2.4708e-02,
           6.5889e-02,  5.4314e-02,  3.5090e-03,  1.0542e-02,  6.7104e-03,
     

In [53]:
npz_embedding[:5]

array([ 0.00173828, -0.02865018, -0.00104929,  0.00366674, -0.0147454 ],
      dtype=float32)