In [1]:
# ! pip install openpyxl pandas numpy

In [2]:
import pandas as pd
import numpy as np

# Channel metadata table; later cells read its "id", "name" and "subscribers" columns.
channels = pd.read_excel("../data/tg_channels.xlsx")
# Alternative CSV input (also carries a "topic" column), currently disabled.
#df = pd.read_csv("../data/cleaned_news_exp.csv")[["message_id", "id_channel", "message", "date", "topic"]]
# News messages, narrowed to the four columns the rest of the notebook uses.
df = pd.read_parquet("../data/tg_news_full.parquet")[["message_id", "id_channel", "message", "date"]]

  warn("Workbook contains no default style, apply openpyxl's default")


In [3]:
def attach_channel_w(
    df: pd.DataFrame,
    channels_df: pd.DataFrame,
    news_id_col: str = "id_channel",
    chan_id_col: str = "id",
    subs_col: str = "subscribers",
    weight_col: str = "channel_w",) -> pd.DataFrame:
    """Attach a per-row channel weight in [0, 1] derived from subscriber counts.

    The weight is ``log1p(subscribers)`` min-max scaled over the rows of ``df``.
    Rows whose channel id is missing, unparseable or unknown to ``channels_df``
    are treated as having 0 subscribers.

    Args:
        df: News frame; must contain ``news_id_col``.
        channels_df: Channel table; must contain ``chan_id_col`` and ``subs_col``.
        news_id_col: Column of ``df`` holding the channel id.
        chan_id_col: Column of ``channels_df`` holding the channel id.
        subs_col: Column of ``channels_df`` holding the subscriber count.
        weight_col: Name of the float32 column written to the result.

    Returns:
        A copy of ``df`` with ``news_id_col`` cast to nullable ``Int64`` and
        ``weight_col`` added. The input frame is not modified. An empty ``df``
        yields an empty result that still has ``weight_col``.
    """
    ch = channels_df[[chan_id_col, subs_col]].copy()
    ch[chan_id_col] = pd.to_numeric(ch[chan_id_col], errors="coerce").astype("Int64")
    ch[subs_col] = pd.to_numeric(ch[subs_col], errors="coerce").fillna(0).astype(float)
    # Channel rows without a usable id would otherwise become an NA key in the
    # lookup and hand their subscriber count to news rows with an NA id.
    ch = ch.dropna(subset=[chan_id_col])
    id2subs = dict(zip(ch[chan_id_col], ch[subs_col]))

    out = df.copy()
    out[news_id_col] = pd.to_numeric(out[news_id_col], errors="coerce").astype("Int64")

    # min()/max() below raise on a zero-size array, so handle "no rows" up front.
    if len(out) == 0:
        out[weight_col] = np.zeros(0, dtype=np.float32)
        return out

    subs = out[news_id_col].map(id2subs).fillna(0.0).to_numpy(dtype=np.float32)
    # log1p compresses the heavy tail of subscriber counts before scaling.
    log_subs = np.log1p(subs)

    mn, mx = float(log_subs.min()), float(log_subs.max())
    if mx > mn:
        w = (log_subs - mn) / (mx - mn)
    else:
        # All rows share one subscriber level: no channel is preferred.
        w = np.zeros_like(log_subs, dtype=np.float32)

    out[weight_col] = w.astype(np.float32)
    return out


In [4]:
# Build an {int channel id -> channel name} lookup from the channel table.
ch_map = (channels[["id", "name"]]
          # Drop rows where either the id or the name is missing.
          .dropna()
          .assign(id=lambda x: pd.to_numeric(x["id"], errors="coerce"))
          # Drop rows whose id could not be parsed as a number.
          .dropna(subset=["id"])
          .assign(id=lambda x: x["id"].astype(int))
          .set_index("id")["name"]
          .to_dict())

# Work on a copy so the frame loaded above is not mutated in place.
df = df.copy()
df["id_channel"] = pd.to_numeric(df["id_channel"], errors="coerce")
# Channels without a known name fall back to their id rendered as a string.
df["channel_name"] = df["id_channel"].map(ch_map).fillna(df["id_channel"].astype("Int64").astype(str))
# Adds the "channel_w" column (see attach_channel_w).
df = attach_channel_w(df, channels)

In [5]:
import re
import numpy as np
import pandas as pd

def clean_news_text(t: str) -> str:
    """Remove hashtags and a fixed set of symbols from ``t`` and collapse whitespace.

    Falsy input (``None`` or ``""``) yields ``""``.
    """
    t = t or ""
    # Hashtags: "#" followed by word characters.
    t = re.sub(r"#\w+", " ", t)
    # Runs of the listed symbols (and spaces) become a single space.
    # NOTE(review): this character class looks encoding-damaged in the reviewed
    # copy (presumably emoji originally) - confirm the notebook is saved as UTF-8.
    t = re.sub(r"[‚ö°Ô∏èüìàüìâüá∑üá∫‚úÖ‚ùóÔ∏èüî•‚¨õ ‚¨ú ‚ö´ ‚ö™üîπ]+", " ", t)
    # Collapse every whitespace run to one space and trim the ends.
    t = re.sub(r"\s+", " ", t).strip()
    return t

def ensure_datetime(df: pd.DataFrame, col: str = "date") -> pd.DataFrame:
    """Return a copy of ``df`` with ``col`` parsed as UTC timestamps.

    Rows whose value cannot be parsed are dropped (the original index labels
    of the surviving rows are kept), and a ``date_day`` column holding the
    timestamp floored to the day is added. The input frame is not modified.
    """
    parsed = pd.to_datetime(df[col], utc=True, errors="coerce")
    result = df.assign(**{col: parsed}).dropna(subset=[col])
    result["date_day"] = result[col].dt.floor("D")
    return result

# Parse dates (rows with unparseable dates are dropped) and add "date_day".
df = ensure_datetime(df, "date")
# Message ids are handled as strings from here on.
df["message_id"] = df["message_id"].astype(str)
# Missing messages become "" before cleaning, so every row holds a str.
df["message"] = df["message"].fillna("").astype(str).map(clean_news_text)

In [6]:
# Preview the prepared frame; the cells above added channel_name, channel_w and date_day.
df.head()

Unnamed: 0,message_id,id_channel,message,date,channel_name,channel_w,date_day
0,275548,3,"–ü—Ä–∞–≤–∏—Ç–µ–ª—å—Å—Ç–≤–æ –°–ª–æ–≤–∞–∫–∏–∏ –æ–±—Å—É–¥–∏—Ç –º–µ—Ä—ã, –∫–æ—Ç–æ—Ä—ã–µ –º...",2025-01-02 17:00:02+00:00,–†–∏–∞ –ù–æ–≤–æ—Å—Ç–∏,1.0,2025-01-02 00:00:00+00:00
1,275547,3,–í –î–¢–ü —Å —Ç—É—Ä–∏—Å—Ç–∏—á–µ—Å–∫–∏–º –∞–≤—Ç–æ–±—É—Å–æ–º –≤ –¢–∞–∏–ª–∞–Ω–¥–µ –ø–æ—Å...,2025-01-02 16:40:53+00:00,–†–∏–∞ –ù–æ–≤–æ—Å—Ç–∏,1.0,2025-01-02 00:00:00+00:00
2,275546,3,–ü—Ä–µ–º—å–µ—Ä –ò–∑—Ä–∞–∏–ª—è –ù–µ—Ç–∞–Ω—å—è—Ö—É –≤—ã–ø–∏—Å–∞–Ω –∏–∑ –±–æ–ª—å–Ω–∏—Ü—ã ...,2025-01-02 16:20:12+00:00,–†–∏–∞ –ù–æ–≤–æ—Å—Ç–∏,1.0,2025-01-02 00:00:00+00:00
3,275545,3,–ü–æ–¥–æ–∑—Ä–µ–≤–∞–µ–º—ã–π –≤ –ø–æ–¥—Ä—ã–≤–µ –∞–≤—Ç–æ–º–æ–±–∏–ª—è Tesla Cyber...,2025-01-02 15:54:29+00:00,–†–∏–∞ –ù–æ–≤–æ—Å—Ç–∏,1.0,2025-01-02 00:00:00+00:00
4,275543,3,–°–ø–µ—Ü–æ–ø–µ—Ä–∞—Ü–∏—è. –û–±—Å—Ç–∞–Ω–æ–≤–∫–∞ –∏ –≥–ª–∞–≤–Ω—ã–µ —Å–æ–±—ã—Ç–∏—è –Ω–∞ ...,2025-01-02 15:32:55+00:00,–†–∏–∞ –ù–æ–≤–æ—Å—Ç–∏,1.0,2025-01-02 00:00:00+00:00


In [7]:
#! pip install rank_bm25

In [8]:
from rank_bm25 import BM25Okapi

def tokenize_ru(text: str) -> list[str]:
    """Lowercase ``text``, blank out disallowed characters and split on whitespace.

    Every run of characters outside the allowed class (digits, ``a-z``, the
    non-ASCII range in the pattern, whitespace) is replaced by one space.
    """
    text = text.lower()
    # NOTE(review): the non-ASCII part of this character class looks
    # encoding-damaged in the reviewed copy (presumably a Cyrillic letter
    # range originally) - confirm the notebook is saved as UTF-8.
    text = re.sub(r"[^0-9a-z–∞-—è—ë\s]+", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text.split()

# Tokenize every message once, in df row order, and build the BM25 index over the tokens.
corpus_tok = [tokenize_ru(t) for t in df["message"].tolist()]
bm25 = BM25Okapi(corpus_tok)

In [9]:
#! pip install faiss-cpu faiss-gpu-cu12 faiss-gpu-cu11

In [10]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used for dense retrieval; loaded onto the GPU.
MODEL_NAME = "intfloat/multilingual-e5-large" 
encoder = SentenceTransformer(MODEL_NAME, device="cuda")  
texts = df["message"].tolist()
# Documents are prefixed with "passage: "; queries get "query: " in dense_candidates_faiss.
doc_inputs = ["passage: " + t for t in texts]

In [11]:
# E_docs = encoder.encode(
#     doc_inputs,
#     batch_size=64,
#     show_progress_bar=True,
#     normalize_embeddings=True,
# ).astype(np.float32)

In [12]:
# import faiss

# dim = E_docs.shape[1]
# index = faiss.IndexFlatIP(dim)      
# index.add(E_docs)

In [13]:
# # save
# from pathlib import Path
# import numpy as np
# import pandas as pd
# import faiss
# import pickle

# OUT = Path("indexes")
# OUT.mkdir(parents=True, exist_ok=True)

# #rowmap
# rowmap = df[["message_id","date","date_day","id_channel","channel_name"]].copy()
# rowmap.to_parquet(OUT / "rowmap.parquet", index=False)

# np.save(OUT / "E_docs_e5_large.npy", E_docs)

# faiss.write_index(index, str(OUT / "faiss_e5_large.index"))

# with open(OUT / "bm25_corpus_tok.pkl", "wb") as f:
#     pickle.dump(corpus_tok, f)


In [14]:
# load
from pathlib import Path
import numpy as np
import pandas as pd
import faiss
import pickle

# Directory holding the persisted artifacts (relative to the working directory).
INP = Path("indexes")

# Per-row metadata saved alongside the embeddings.
rowmap = pd.read_parquet(INP / "rowmap.parquet")

E_docs = np.load(INP / "E_docs_e5_large.npy")
index = faiss.read_index(str(INP / "faiss_e5_large.index"))

# SECURITY: pickle.load can execute arbitrary code contained in the file;
# only load artifacts that were written by a trusted run of this notebook.
with open(INP / "bm25_corpus_tok.pkl", "rb") as f:
    corpus_tok = pickle.load(f)

# Rebuild BM25 from the persisted tokens; this replaces the `corpus_tok` / `bm25`
# objects built from `df` earlier in the notebook.
from rank_bm25 import BM25Okapi
bm25 = BM25Okapi(corpus_tok)


In [15]:
# The persisted artifacts must describe the same rows.
assert len(rowmap) == E_docs.shape[0]
assert len(rowmap) == index.ntotal
# Retrieval addresses `df` and the BM25 corpus by FAISS row position, so the
# in-memory frame and the reloaded token corpus must line up with the index
# too; otherwise hits would silently resolve to the wrong messages.
assert len(corpus_tok) == len(rowmap), "BM25 corpus and FAISS index differ in size"
assert len(df) == len(rowmap), "df and the persisted index differ in size; rebuild the index"
assert (
    df["message_id"].astype(str).to_numpy() == rowmap["message_id"].astype(str).to_numpy()
).all(), "df row order differs from the persisted rowmap; rebuild the index"

In [16]:
import re
import hashlib
import numpy as np
import pandas as pd

# FAISS is optional for the dedup step: when the import fails for any reason,
# `faiss` is set to None and dedup_cluster_candidates_time falls back to a
# dense similarity matrix.
try:
    import faiss
except Exception:
    faiss = None

_URL_RE = re.compile(r"https?://\S+|www\.\S+")
_HANDLE_RE = re.compile(r"@\w+")
_WS_RE = re.compile(r"\s+")

def _normalize_for_dedup(text: str, mask_numbers: bool = True) -> str:
    if not isinstance(text, str):
        return ""
    t = text.lower()
    t = _URL_RE.sub(" ", t)
    t = _HANDLE_RE.sub(" ", t)
    t = re.sub(r"[^\w\s%.,\-]+", " ", t, flags=re.UNICODE)
    if mask_numbers:
        t = re.sub(r"\d+(?:[.,]\d+)?", "<num>", t)
    t = _WS_RE.sub(" ", t).strip()
    return t

def _stable_hash(s: str) -> str:
    return hashlib.md5(s.encode("utf-8", errors="ignore")).hexdigest()

def _union_find(n: int):
    parent = np.arange(n, dtype=np.int32)
    rank = np.zeros(n, dtype=np.int8)
    def find(x: int) -> int:
        while parent[x] != x:
            parent[x] = parent[parent[x]]
            x = parent[x]
        return int(x)
    def union(a: int, b: int) -> None:
        ra, rb = find(a), find(b)
        if ra == rb:
            return
        if rank[ra] < rank[rb]:
            parent[ra] = rb
        elif rank[ra] > rank[rb]:
            parent[rb] = ra
        else:
            parent[rb] = ra
            rank[ra] += 1
    return find, union

def dedup_cluster_candidates_time(
    cand: pd.DataFrame,
    encoder,
    text_col: str = "message",
    date_col: str = "date_day",
    channel_col: str = "channel_name",
    score_col: str = "score_rrf",
    sim_threshold: float = 0.95,
    knn: int = 20,
    keep_per_cluster: int = 1,
    mask_numbers: bool = True,
    max_day_diff: int = 1,
    overwrite_channel: bool = True,
    channel_join: str = "; ",
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Cluster near-duplicate candidates that were published close in time.

    Two stages:
      1. Exact stage - rows sharing the hash of their normalised text AND the
         same calendar day share a key; one representative per key is kept
         (highest ``score_col`` when that column exists, otherwise the first).
      2. Semantic stage - representatives are embedded with ``encoder.encode``
         (normalised embeddings); two representatives are merged when their
         inner product is >= ``sim_threshold`` and their days differ by at
         most ``max_day_diff``. Merging uses union-find, so it is transitive.
         Representatives without a parseable date are never merged here.

    Args:
        cand: Candidate rows; must contain ``text_col`` and ``date_col``.
        encoder: Object exposing ``encode(texts, normalize_embeddings=...,
            show_progress_bar=...)`` whose result supports ``.astype``.
        text_col: Column with the message text.
        date_col: Column with the (day-level) date.
        channel_col: Column with the channel name; optional.
        score_col: Column used to pick representatives and kept rows; optional.
        sim_threshold: Minimum inner product for a semantic merge.
        knn: Neighbours requested per representative in the FAISS path
            (not used by the dense-matrix fallback).
        keep_per_cluster: Rows kept per cluster in the deduplicated frame.
        mask_numbers: Forwarded to ``_normalize_for_dedup``.
        max_day_diff: Maximum day distance for a semantic merge.
        overwrite_channel: If true, ``channel_col`` of the kept rows is
            replaced by the joined list of all channels in the cluster.
        channel_join: Separator used when joining channel names.

    Returns:
        ``(cand_dedup, clusters, members)``:
          * ``cand_dedup`` - kept rows with ``_cluster_id``, ``cluster_size``
            and, when ``channel_col`` exists, ``channel_all`` / ``channel_primary``.
          * ``clusters`` - one row per input row: ``_cluster_id``, ``cluster_size``.
          * ``members`` - one row per input row: ``_cluster_id``, a fixed set
            of metadata columns when present, and a 250-character ``text_snip``.
        For ``None`` or empty input: ``(cand, empty frame, empty frame)``.
    """

    if cand is None or len(cand) == 0:
        return cand, pd.DataFrame(), pd.DataFrame()

    # Positional index 0..n-1 so labels taken below can be used with .loc.
    cand = cand.copy().reset_index(drop=True)

    dts = pd.to_datetime(cand[date_col], errors="coerce", utc=True).dt.normalize()
    cand["_dt"] = dts
    # Unparseable dates become "" so they still form a valid key component.
    cand["_dt_str"] = cand["_dt"].dt.strftime("%Y-%m-%d").fillna("")

    norm = cand[text_col].fillna("").map(lambda s: _normalize_for_dedup(s, mask_numbers=mask_numbers))
    cand["_h"] = norm.map(_stable_hash)

    # Exact-duplicate key: normalised-text hash + calendar day.
    cand["_hk"] = cand["_h"].astype(str) + "|" + cand["_dt_str"].astype(str)

    # One representative per exact-duplicate key.
    if score_col in cand.columns:
        rep_idx = (
            cand.sort_values(score_col, ascending=False)
                .groupby("_hk", as_index=False)
                .head(1)
                .index.to_numpy()
        )
    else:
        rep_idx = cand.groupby("_hk", as_index=False).head(1).index.to_numpy()

    rep = cand.loc[rep_idx].copy().reset_index(drop=True)

    # Only representatives are embedded.
    texts = rep[text_col].fillna("").tolist()
    X = encoder.encode(texts, normalize_embeddings=True, show_progress_bar=False).astype(np.float32)

    # Day-resolution numpy dates for cheap day-difference arithmetic.
    rep_dt = pd.to_datetime(rep["_dt"], errors="coerce", utc=True)
    rep_dt = rep_dt.dt.tz_convert(None).dt.normalize().to_numpy(dtype="datetime64[D]")

    m = len(rep)
    find, union = _union_find(m)

    if m > 1:
        if faiss is None:
            # Fallback: full m x m similarity matrix, upper triangle only.
            S = X @ X.T
            for i in range(m):
                js = np.where(S[i, i+1:] >= sim_threshold)[0] + (i + 1)
                for j in js:
                    if np.isnat(rep_dt[i]) or np.isnat(rep_dt[j]):
                        continue
                    day_diff = abs(int((rep_dt[i] - rep_dt[j]).astype("timedelta64[D]").astype(int)))
                    if day_diff <= max_day_diff:
                        union(i, int(j))
        else:
            # Inner-product search of every representative against all of them.
            idx = faiss.IndexFlatIP(X.shape[1])
            idx.add(X)
            D, I = idx.search(X, min(knn, m))
            for i in range(m):
                for score, j in zip(D[i], I[i]):
                    # Skip missing-neighbour labels (< 0) and the self-match.
                    if j < 0 or j == i:
                        continue
                    if float(score) < sim_threshold:
                        continue
                    if np.isnat(rep_dt[i]) or np.isnat(rep_dt[j]):
                        continue
                    day_diff = abs(int((rep_dt[i] - rep_dt[j]).astype("timedelta64[D]").astype(int)))
                    if day_diff <= max_day_diff:
                        union(i, int(j))

    # Relabel union-find roots as consecutive cluster ids 0..c-1.
    rep_cluster = np.array([find(i) for i in range(m)], dtype=np.int32)
    _, rep_cluster = np.unique(rep_cluster, return_inverse=True)
    rep["_rep_cluster"] = rep_cluster

    # Propagate the representative's cluster to every row sharing its key.
    hk_to_cluster = dict(zip(rep["_hk"].tolist(), rep["_rep_cluster"].tolist()))
    cand["_cluster_id"] = cand["_hk"].map(hk_to_cluster)

    # Rows whose key is missing from the mapping get cluster id -1.
    cand["_cluster_id"] = cand["_cluster_id"].fillna(-1).astype(np.int32)

    cluster_sizes = cand.groupby("_cluster_id").size()

    # Sorted, de-duplicated channel names per cluster, joined into one string.
    if channel_col in cand.columns:
        ch_joined = (
            cand.groupby("_cluster_id")[channel_col]
                .apply(lambda s: channel_join.join(sorted({str(x) for x in s.dropna().tolist()})))
        )
    else:
        ch_joined = pd.Series(dtype=str)

    # Keep the top `keep_per_cluster` rows of every cluster.
    if score_col in cand.columns:
        cand_dedup = (
            cand.sort_values(score_col, ascending=False)
                .groupby("_cluster_id", group_keys=False)
                .head(keep_per_cluster)
                .reset_index(drop=True)
        )
    else:
        cand_dedup = (
            cand.groupby("_cluster_id", group_keys=False)
                .head(keep_per_cluster)
                .reset_index(drop=True)
        )

    cand_dedup["cluster_size"] = cand_dedup["_cluster_id"].map(cluster_sizes).astype(int)

    if channel_col in cand.columns:
        cand_dedup["channel_all"] = cand_dedup["_cluster_id"].map(ch_joined).fillna("")
        # Preserve the kept row's own channel before it is possibly overwritten.
        cand_dedup["channel_primary"] = cand_dedup[channel_col].astype(str)
        if overwrite_channel and channel_col in cand_dedup.columns:
            cand_dedup[channel_col] = cand_dedup["channel_all"]

    # Helper columns are dropped; `_cluster_id` is intentionally kept.
    cand_dedup = cand_dedup.drop(columns=["_h", "_hk", "_dt", "_dt_str"], errors="ignore")

    # Per-input-row cluster assignment.
    clusters = cand[["_cluster_id"]].copy()
    clusters["cluster_size"] = clusters["_cluster_id"].map(cluster_sizes).astype(int)

    # Per-input-row membership table with a short text preview.
    # NOTE(review): the column names below are fixed and do not follow the
    # date_col / channel_col / score_col arguments - confirm this is intended.
    members = cand[["_cluster_id"]].copy()
    for c in ["date_day", "date", "channel_name", "channel", "message_id", "score_rrf"]:
        if c in cand.columns:
            members[c] = cand[c]
    members["text_snip"] = cand[text_col].fillna("").map(lambda s: s[:250])

    return cand_dedup, clusters, members

In [17]:
import numpy as np
import pandas as pd

def snippet(t: str, n: int = 1000) -> str:
    """Return ``t[:n]`` - the first ``n`` characters of ``t`` for non-negative ``n``."""
    head = slice(None, n)
    return t[head]

def _topk_indices_from_scores(scores: np.ndarray, k: int) -> np.ndarray:
    k = min(k, len(scores))
    if k <= 0:
        return np.array([], dtype=int)
    if k == len(scores):
        idx = np.argsort(-scores)
    else:
        idx = np.argpartition(-scores, k - 1)[:k]
        idx = idx[np.argsort(-scores[idx])]
    return idx.astype(int)

def dense_candidates_faiss(index, encoder, query: str, topN: int = 500):
    """Run a dense nearest-neighbour search for ``query``.

    Args:
        index: Object exposing ``search(vectors, k) -> (scores, labels)``
            (a FAISS index in this notebook).
        encoder: Object exposing ``encode(texts, normalize_embeddings=...,
            show_progress_bar=...)``.
        query: Raw query text; it is prefixed with ``"query: "`` before encoding.
        topN: Number of neighbours requested from the index.

    Returns:
        ``(row_positions, scores)`` for the hits, best first, as an integer
        array and a float32 array. Fewer than ``topN`` entries are returned
        when the index cannot supply that many neighbours.
    """
    qv = encoder.encode(
        ["query: " + query],
        normalize_embeddings=True,
        show_progress_bar=False
    ).astype(np.float32)
    scores, idx = index.search(qv, topN)
    hit_idx = idx[0].astype(int)
    hit_scores = scores[0].astype(np.float32)
    # FAISS fills missing neighbours with label -1. Passed on, -1 would be
    # read as "last row" by positional indexing, so drop those slots here.
    valid = hit_idx >= 0
    return hit_idx[valid], hit_scores[valid]

def _compute_time_arrays(df: pd.DataFrame, rowpos: np.ndarray, anchor_date, date_col: str):
    ad = pd.to_datetime(anchor_date, utc=True).normalize()
    dts = pd.to_datetime(df.loc[rowpos, date_col], errors="coerce", utc=True).dt.normalize()
    age = (ad - dts).dt.days.to_numpy(dtype=np.float32)
    age = np.where(np.isfinite(age), age, 1e9).astype(np.float32)
    age = np.where(age < 0, 1e9, age).astype(np.float32)
    return dts, age

def _time_rank_from_age(age_days: np.ndarray) -> np.ndarray:
    order = np.argsort(age_days, kind="stable")
    rank = np.empty_like(order, dtype=np.int32)
    rank[order] = np.arange(1, len(order) + 1, dtype=np.int32)
    return rank

def _finalize_rrf_frame(
    df: pd.DataFrame,
    union: np.ndarray,
    rrf: np.ndarray,
    dense_rank: dict,
    bm_rank: dict | None,
    *,
    k: int,
    k_rrf: int,
    anchor_date,
    date_col: str,
    w_time: float,
    w_channel: float | None,
    channel_w_col: str,
) -> pd.DataFrame:
    """Add the recency term, order by fused score and build the result frame.

    ``union`` holds row positions into ``df`` and ``rrf`` the fused score per
    position. ``bm_rank`` is ``None`` when no lexical ranker was used.
    """
    rank_time = None
    if anchor_date is not None and w_time and len(union) > 0:
        # Recency enters the fusion as one more ranked list (1 = newest).
        _, age = _compute_time_arrays(df, union, anchor_date, date_col)
        rank_time = _time_rank_from_age(age)
        rrf = rrf + (w_time / (k_rrf + rank_time.astype(np.float32)))

    order = np.argsort(-rrf)
    union = union[order]
    rrf = rrf[order]
    if rank_time is not None:
        rank_time = rank_time[order]

    out = df.iloc[union].copy()
    out["_rowpos"] = union
    out["score_rrf"] = rrf
    out["rank_dense"] = out["_rowpos"].map(lambda rp: dense_rank.get(int(rp), np.nan))
    if bm_rank is None:
        out["rank_bm25"] = np.nan
    else:
        out["rank_bm25"] = out["_rowpos"].map(lambda rp: bm_rank.get(int(rp), np.nan))

    if anchor_date is not None:
        doc_day, age = _compute_time_arrays(df, union, anchor_date, date_col)
        out["doc_day"] = doc_day.dt.tz_localize(None)
        out["age_days"] = age
        if rank_time is not None:
            out["rank_time"] = rank_time

    if channel_w_col in out.columns:
        if w_channel is None:
            # Default boost: 10% of the score spread (or of 1.0 when there is none).
            w_channel = 0.10 * float(np.std(out["score_rrf"].to_numpy(dtype=np.float32)) or 1.0)
        out["score_rrf"] = out["score_rrf"] + float(w_channel) * out[channel_w_col].astype(np.float32)
        out = out.sort_values("score_rrf", ascending=False)

    return out.head(k).reset_index(drop=True)


def hybrid_retrieve_rrf(
    df: pd.DataFrame,
    index,
    encoder,
    bm25,
    tokenize_fn,
    query: str,
    k: int = 50,
    topN_each: int = 500,
    k_rrf: int = 60,
    w_dense: float = 1.0,
    w_bm25: float = 1.0,
    anchor_date: str | pd.Timestamp | None = None,
    date_col: str = "date_day",
    max_window_days: int | None = 365,
    w_time: float = 0.5,
    w_channel: float | None = None,
    channel_w_col: str = "channel_w",
) -> pd.DataFrame:
    """Retrieve candidates by fusing dense and BM25 rankings (reciprocal rank fusion).

    Each ranked list contributes ``weight / (k_rrf + rank)`` to a document's
    score. With an ``anchor_date``, documents dated after it (or older than
    ``max_window_days``) are excluded from both lists and recency is fused in
    as an additional list weighted by ``w_time``. Finally a channel boost
    ``w_channel * df[channel_w_col]`` is added when that column exists.

    Args:
        df: Corpus frame; row positions must match the dense index and BM25 corpus.
        index: Dense index passed to ``dense_candidates_faiss``.
        encoder: Query encoder passed to ``dense_candidates_faiss``.
        bm25: Object exposing ``get_scores(tokens)``, or ``None`` for dense-only.
        tokenize_fn: Callable turning the query into BM25 tokens.
        query: Query text.
        k: Number of rows returned.
        topN_each: Candidates taken from each ranker before fusion.
        k_rrf: RRF smoothing constant.
        w_dense, w_bm25, w_time: Weights of the dense, BM25 and recency lists.
        anchor_date: "Today" for the query; ``None`` disables date filtering
            and the recency term.
        date_col: Date column of ``df``.
        max_window_days: Maximum document age in days; ``None`` for no limit.
        w_channel: Channel boost weight; ``None`` derives it from the score spread.
        channel_w_col: Column holding the channel weight.

    Returns:
        Up to ``k`` rows of ``df`` with ``_rowpos``, ``score_rrf``,
        ``rank_dense``, ``rank_bm25`` and, with an anchor date, ``doc_day``,
        ``age_days`` and (when ``w_time`` is non-zero) ``rank_time``.

    Raises:
        KeyError: ``anchor_date`` is given but ``date_col`` is not in ``df``.
    """
    if anchor_date is not None:
        ad = pd.to_datetime(anchor_date, utc=True).normalize()
        if date_col not in df.columns:
            raise KeyError(f"date_col='{date_col}' not found in df.columns")
        dts_all = pd.to_datetime(df[date_col], errors="coerce", utc=True).dt.normalize()
        allowed = (dts_all <= ad)
        if max_window_days is not None:
            age_all = (ad - dts_all).dt.days
            allowed &= (age_all >= 0) & (age_all <= int(max_window_days))
        allowed_np = allowed.to_numpy(dtype=bool)
    else:
        allowed_np = None

    d_idx, _ = dense_candidates_faiss(index, encoder, query, topN=topN_each)
    # FAISS pads missing neighbours with -1; as a position that would mean
    # "last row", so such slots must not reach the indexing below.
    d_idx = d_idx[d_idx >= 0]
    if allowed_np is not None and len(d_idx) > 0:
        d_idx = d_idx[allowed_np[d_idx]]
    dense_rank = {int(rowpos): r for r, rowpos in enumerate(d_idx, start=1)}

    if bm25 is None:
        # Dense-only: candidates stay in dense-rank order.
        union = d_idx.astype(int)
        if len(union) == 0:
            return df.iloc[[]].copy().reset_index(drop=True)
        rrf = w_dense / (k_rrf + np.arange(1, len(union) + 1, dtype=np.float32))
        bm_rank = None
    else:
        bm_scores = bm25.get_scores(tokenize_fn(query)).astype(np.float32)
        if allowed_np is not None:
            bm_scores[~allowed_np] = -np.inf
        b_idx = _topk_indices_from_scores(bm_scores, topN_each)
        if allowed_np is not None:
            # With fewer than topN_each allowed documents the top-N still
            # contains rows masked to -inf; drop them so documents outside the
            # date window cannot leak into the result. Masked rows sort last,
            # so the ranks of the allowed rows are unchanged.
            b_idx = b_idx[allowed_np[b_idx]]
        bm_rank = {int(rowpos): r for r, rowpos in enumerate(b_idx, start=1)}

        union = np.array(sorted(set(dense_rank) | set(bm_rank)), dtype=int)
        if len(union) == 0:
            return df.iloc[[]].copy().reset_index(drop=True)

        rrf = np.zeros(len(union), dtype=np.float32)
        for j, rowpos in enumerate(union):
            if rowpos in dense_rank:
                rrf[j] += w_dense / (k_rrf + dense_rank[rowpos])
            if rowpos in bm_rank:
                rrf[j] += w_bm25 / (k_rrf + bm_rank[rowpos])

    return _finalize_rrf_frame(
        df,
        union,
        rrf,
        dense_rank,
        bm_rank,
        k=k,
        k_rrf=k_rrf,
        anchor_date=anchor_date,
        date_col=date_col,
        w_time=w_time,
        w_channel=w_channel,
        channel_w_col=channel_w_col,
    )

In [18]:
# Date range covered by the corpus; useful when choosing an anchor_date for retrieval.
df["date_day"].min(), df["date_day"].max()

(Timestamp('2023-09-23 00:00:00+0000', tz='UTC'),
 Timestamp('2025-09-08 00:00:00+0000', tz='UTC'))

In [19]:
# ! pip install transformers torch sentence-transformers accelerate vllm

In [21]:
from transformers import AutoTokenizer, AutoConfig
from vllm import LLM
import torch

# Instruction-tuned LLM served through vLLM.
MODEL = "Qwen/Qwen2.5-32B-Instruct"

# NOTE(review): trust_remote_code=True allows the hub repository to run its own
# Python code on load - confirm it is actually required for this model.
tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)

cfg = AutoConfig.from_pretrained(MODEL, trust_remote_code=True)
# Context length declared by the model config (None when the attribute is absent).
native_ctx = getattr(cfg, "max_position_embeddings", None)
print("GPU:", torch.cuda.get_device_name(0))
print("VRAM(GB):", round(torch.cuda.get_device_properties(0).total_memory / (1024**3), 1))
print("Model max_position_embeddings:", native_ctx)

# Context window requested from vLLM (the recorded run reported a native limit of 32768).
MAX_MODEL_LEN = 19200

model = LLM(
    model=MODEL,
    dtype="bfloat16",
    max_model_len=MAX_MODEL_LEN,
    # Fraction of GPU memory vLLM may use.
    gpu_memory_utilization=0.88
)


GPU: NVIDIA A100-SXM4-80GB
VRAM(GB): 79.3
Model max_position_embeddings: 32768
INFO 01-10 05:16:14 [utils.py:253] non-default args: {'dtype': 'bfloat16', 'max_model_len': 19200, 'gpu_memory_utilization': 0.88, 'disable_log_stats': True, 'model': 'Qwen/Qwen2.5-32B-Instruct'}
INFO 01-10 05:16:15 [model.py:514] Resolved architecture: Qwen2ForCausalLM
INFO 01-10 05:16:15 [model.py:1661] Using max model len 19200
INFO 01-10 05:16:15 [scheduler.py:230] Chunked prefill is enabled with max_num_batched_tokens=8192.
[0;36m(EngineCore_DP0 pid=11355)[0;0m INFO 01-10 05:16:24 [core.py:93] Initializing a V1 LLM engine (v0.13.0) with config: model='Qwen/Qwen2.5-32B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-32B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=19200, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, d

[0;36m(EngineCore_DP0 pid=11355)[0;0m We recommend installing via `pip install torch-c-dlpack-ext`


[0;36m(EngineCore_DP0 pid=11355)[0;0m INFO 01-10 05:16:29 [cuda.py:351] Using FLASH_ATTN attention backend out of potential backends: ('FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION')


Loading safetensors checkpoint shards:   0% Completed | 0/17 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:   6% Completed | 1/17 [00:00<00:09,  1.67it/s]
Loading safetensors checkpoint shards:  12% Completed | 2/17 [00:01<00:10,  1.48it/s]
Loading safetensors checkpoint shards:  18% Completed | 3/17 [00:02<00:09,  1.44it/s]
Loading safetensors checkpoint shards:  24% Completed | 4/17 [00:02<00:09,  1.40it/s]
Loading safetensors checkpoint shards:  29% Completed | 5/17 [00:03<00:08,  1.37it/s]
Loading safetensors checkpoint shards:  35% Completed | 6/17 [00:04<00:08,  1.37it/s]
Loading safetensors checkpoint shards:  41% Completed | 7/17 [00:05<00:07,  1.35it/s]
Loading safetensors checkpoint shards:  47% Completed | 8/17 [00:05<00:06,  1.33it/s]
Loading safetensors checkpoint shards:  53% Completed | 9/17 [00:06<00:06,  1.33it/s]
Loading safetensors checkpoint shards:  59% Completed | 10/17 [00:07<00:05,  1.31it/s]
Loading safetensors checkpoint shards:  65% Completed | 11/17

[0;36m(EngineCore_DP0 pid=11355)[0;0m INFO 01-10 05:16:43 [default_loader.py:308] Loading weights took 12.74 seconds
[0;36m(EngineCore_DP0 pid=11355)[0;0m INFO 01-10 05:16:44 [gpu_model_runner.py:3659] Model loading took 61.0375 GiB memory and 17.320794 seconds
[0;36m(EngineCore_DP0 pid=11355)[0;0m INFO 01-10 05:16:56 [backends.py:643] Using cache directory: /home/mlcore/.cache/vllm/torch_compile_cache/acdcfb9698/rank_0_0/backbone for vLLM's torch.compile
[0;36m(EngineCore_DP0 pid=11355)[0;0m INFO 01-10 05:16:56 [backends.py:703] Dynamo bytecode transform time: 12.13 s
[0;36m(EngineCore_DP0 pid=11355)[0;0m INFO 01-10 05:17:17 [backends.py:226] Directly load the compiled graph(s) for compile range (1, 8192) from the cache, took 14.028 s
[0;36m(EngineCore_DP0 pid=11355)[0;0m INFO 01-10 05:17:17 [monitor.py:34] torch.compile takes 26.16 s in total
[0;36m(EngineCore_DP0 pid=11355)[0;0m INFO 01-10 05:17:18 [gpu_worker.py:375] Available KV cache memory: 7.22 GiB
[0;36m(EngineC

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 51/51 [00:06<00:00,  7.81it/s]
Capturing CUDA graphs (decode, FULL): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 35/35 [00:03<00:00, 10.30it/s]


[0;36m(EngineCore_DP0 pid=11355)[0;0m INFO 01-10 05:17:29 [gpu_model_runner.py:4587] Graph capturing finished in 11 secs, took 4.11 GiB
[0;36m(EngineCore_DP0 pid=11355)[0;0m INFO 01-10 05:17:29 [core.py:259] init engine (profile, create kv cache, warmup model) took 44.93 seconds
INFO 01-10 05:17:31 [llm.py:360] Supported tasks: ['generate']


In [22]:
SYSTEM_PROMPT = """–¢—ã ‚Äî –æ—á–µ–Ω—å –≤–Ω–∏–º–∞—Ç–µ–ª—å–Ω—ã–π –Ω–æ–≤–æ—Å—Ç–Ω–æ–π –∞–Ω–∞–ª–∏—Ç–∏–∫. –¢—ã –ø–∏—à–µ—à—å –∞–∫–∫—É—Ä–∞—Ç–Ω—ã–π –¥–∞–π–¥–∂–µ—Å—Ç –ø–æ –Ω–æ–≤–æ—Å—Ç—è–º –≤ —Å—Ç–∏–ª–µ –∫–æ—Ä—Ä–µ—Å–ø–æ–Ω–¥–µ–Ω—Ç–∞.

–í—Ö–æ–¥: –∑–∞–ø—Ä–æ—Å, –∞–∫—Ç—É–∞–ª—å–Ω–∞—è –¥–∞—Ç–∞ (YYYY-MM-DD) –∏ –¥–æ–∫—É–º–µ–Ω—Ç—ã –≤–∏–¥–∞:
[id] date=YYYY-MM-DD channel(s)=<–∫–∞–Ω–∞–ª1; –∫–∞–Ω–∞–ª2; ...>
<—Ç–µ–∫—Å—Ç>
–í—Å–µ –¥–æ–∫—É–º–µ–Ω—Ç—ã –¥–∞—Ç–∏—Ä–æ–≤–∞–Ω—ã –ù–ï –ü–û–ó–ñ–ï –∞–∫—Ç—É–∞–ª—å–Ω–æ–π –¥–∞—Ç—ã.

–û–ë–©–ò–ï –ü–†–ê–í–ò–õ–ê:
1) –ü–∏—à–∏ –Ω–∞ —Ä—É—Å—Å–∫–æ–º. –ò—Å–ø–æ–ª—å–∑—É–π –¢–û–õ–¨–ö–û –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏—é –∏–∑ –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤.
2) –ù–µ —É–ø–æ–º–∏–Ω–∞–π –Ω–æ–º–µ—Ä–∞ –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤ ([id]).
3) –ù–µ –¥–æ–±–∞–≤–ª—è–π –¥–∞—Ç, –∫–æ—Ç–æ—Ä—ã—Ö –Ω–µ—Ç –≤ –¥–æ–∫—É–º–µ–Ω—Ç–∞—Ö. –ù–µ –∏—Å–ø–æ–ª—å–∑—É–π ‚Äú—Å–µ–≥–æ–¥–Ω—è/–≤—á–µ—Ä–∞/–Ω–µ–¥–∞–≤–Ω–æ‚Äù ‚Äî —Ç–æ–ª—å–∫–æ YYYY-MM-DD.
4) –õ—é–±—ã–µ —á–∏—Å–ª–∞/—Ç–æ—á–Ω—ã–µ –∑–Ω–∞—á–µ–Ω–∏—è/–ø–æ—Ä–æ–≥–∏/—Ü–∏—Ç–∞—Ç—ã/—Ñ–æ—Ä–º—É–ª–∏—Ä–æ–≤–∫–∏ ‚Äî —Ç–æ–ª—å–∫–æ –µ—Å–ª–∏ –æ–Ω–∏ –µ—Å—Ç—å –≤ –¥–æ–∫—É–º–µ–Ω—Ç–∞—Ö.
5) –ö–∞–Ω–∞–ª—ã —É–∫–∞–∑—ã–≤–∞–π –¢–û–õ–¨–ö–û –≤ –¥–∞–π–¥–∂–µ—Å—Ç–µ (–≤ —Ç–∞–π–º–ª–∞–π–Ω–µ –∫–∞–Ω–∞–ª–æ–≤ –Ω–µ –±—É–¥–µ—Ç).
6) –ï—Å–ª–∏ —Ä—è–¥–æ–º —Å –∞–∫—Ç—É–∞–ª—å–Ω–æ–π –¥–∞—Ç–æ–π –º–∞–ª–æ –¥–∞–Ω–Ω—ã—Ö ‚Äî —á–µ—Å—Ç–Ω–æ —É–∫–∞–∂–∏ –ø–æ—Å–ª–µ–¥–Ω—é—é –¥–∞—Ç—É –≤ –ø–æ–¥–±–æ—Ä–∫–µ –∏ —Ä–∞–∑—Ä—ã–≤, –±–µ–∑ –¥–æ–º—ã—Å–ª–æ–≤ ‚Äú—á—Ç–æ —Å–µ–π—á–∞—Å‚Äù.
7) –ù–µ–ª—å–∑—è –≤—Å—Ç–∞–≤–ª—è—Ç—å ‚Äú–ø—É—Å—Ç—ã–µ‚Äù –¥–∞—Ç—ã –∏ —Å—Ç—Ä–æ–∫–∏ –≤–∏–¥–∞ ‚Äú–Ω–µ—Ç –¥–∞–Ω–Ω—ã—Ö‚Äù.

–ö–ê–ö –ü–ò–°–ê–¢–¨ –î–ê–ô–î–ñ–ï–°–¢:
- –ù–∞—á–Ω–∏ —Å 2‚Äì4 —Å–∞–º—ã—Ö —Å–≤–µ–∂–∏—Ö –£–ù–ò–ö–ê–õ–¨–ù–´–• –¥–∞—Ç –≤ –ø–æ–¥–±–æ—Ä–∫–µ (—ç—Ç–æ ‚Äú–ø–æ—Å–ª–µ–¥–Ω–µ–µ –≤—Ä–µ–º—è‚Äù).
- –î–ª—è –∫–∞–∂–¥–æ–≥–æ –∫–ª—é—á–µ–≤–æ–≥–æ —Ñ–∞–∫—Ç–∞ —É–∫–∞–∂–∏ –∏—Å—Ç–æ—á–Ω–∏–∫:
  ‚Äú–ö–∞–Ω–∞–ª(—ã) (YYYY-MM-DD): ‚Ä¶‚Äù
- –ù–µ —Å–º–µ—à–∏–≤–∞–π —Ä–∞–∑–Ω—ã–µ —Ç–∏–ø—ã —Å–∏–≥–Ω–∞–ª–æ–≤ –∫–∞–∫ –æ–¥–Ω–æ –∏ —Ç–æ –∂–µ: —è–≤–Ω–æ —Ä–∞–∑–ª–∏—á–∞–π
  ‚Äú–æ—Ñ–∏—Ü–∏–∞–ª—å–Ω—ã–π –∫—É—Ä—Å / –≤–Ω–µ–±–∏—Ä–∂–µ–≤–æ–π –∫—É—Ä—Å / —Ä–∞—Å—á–µ—Ç–Ω—ã–π –∫—É—Ä—Å / –∏–Ω–¥–µ–∫—Å –¥–æ–ª–ª–∞—Ä–∞ / –ø—Ä–æ–≥–Ω–æ–∑‚Äù.
- –ï—Å–ª–∏ –≤—Å—Ç—Ä–µ—á–∞—é—Ç—Å—è –ø–æ–≤—Ç–æ—Ä—ã ‚Äî –æ–±—ä–µ–¥–∏–Ω—è–π.
- –ü–∏—à–∏ –æ—Ñ–∏—Ü–∏–∞–ª—å–Ω–æ –∏ –Ω–µ–π—Ç—Ä–∞–ª—å–Ω–æ, –∫–∞–∫ –≤ –Ω–æ–≤–æ—Å—Ç–Ω–æ–º –¥–∞–π–¥–∂–µ—Å—Ç–µ (–±–µ–∑ —ç–º–æ—Ü–∏–æ–Ω–∞–ª—å–Ω—ã—Ö –æ—Ü–µ–Ω–æ–∫ –∏ –±–µ–∑ –¥–æ–º—ã—Å–ª–æ–≤).

–°–¢–†–£–ö–¢–£–†–ê –û–¢–í–ï–¢–ê (3 –±–ª–æ–∫–∞):

### 1) –ó–∞–ø—Ä–æ—Å –∏ –∞–∫—Ç—É–∞–ª—å–Ω–∞—è –¥–∞—Ç–∞
* –ó–∞–ø—Ä–æ—Å: ...
* –ê–∫—Ç—É–∞–ª—å–Ω–∞—è –¥–∞—Ç–∞: ...

### 2) –î–∞–π–¥–∂–µ—Å—Ç
–°–≤—è–∑–Ω—ã–π —Ç–µ–∫—Å—Ç (8‚Äì20 –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏–π, –Ω–µ –±–æ–ª–µ–µ!), –∫–∞–∫ –∫–æ—Ä—Ä–µ—Å–ø–æ–Ω–¥–µ–Ω—Ç:
- ‚Äú–ü–æ—Å–ª–µ–¥–Ω–µ–µ –≤—Ä–µ–º—è‚Äù: –Ω–µ—Å–∫–æ–ª—å–∫–æ –∫–ª—é—á–µ–≤—ã—Ö —Ñ–∞–∫—Ç–æ–≤ –ø–æ —Å–∞–º—ã–º —Å–≤–µ–∂–∏–º –¥–∞—Ç–∞–º + –∫—Ç–æ —Å–æ–æ–±—â–∏–ª.
- –ó–∞—Ç–µ–º –∫–æ—Ä–æ—Ç–∫–æ ‚Äú—Ä–∞–Ω—å—à–µ/–ø—Ä–µ–¥—ã—Å—Ç–æ—Ä–∏—è‚Äù: –Ω–µ—Å–∫–æ–ª—å–∫–æ –∫–ª—é—á–µ–≤—ã—Ö —Ñ–∞–∫—Ç–æ–≤ –ø–æ –±–æ–ª–µ–µ —Ä–∞–Ω–Ω–∏–º –¥–∞—Ç–∞–º + –∫—Ç–æ —Å–æ–æ–±—â–∏–ª.
- –í –∫–æ–Ω—Ü–µ 1‚Äì2 –ø—Ä–µ–¥–ª–æ–∂–µ–Ω–∏—è: –≤—ã–≤–æ–¥—ã, –∏—Å—Ö–æ–¥—è –∏–∑ –∞–∫—Ç—É–∞–ª—å–Ω–æ—Å—Ç–∏ –Ω–æ–≤–æ—Å—Ç–µ–π. –ú–æ–∂–Ω–æ –ª–∏ –¥–∞—Ç—å –∞–∫—Ç—É–∞–ª—å–Ω—É—é –æ—Ü–µ–Ω–∫—É –∏–ª–∏ –Ω–µ—Ç. –ù–∞—Å–∫–æ–ª—å–∫–æ —Ç–µ–º–∞ –≤ —Ü–µ–ª–æ–º –∞–∫—Ç—É–∞–ª—å–Ω–∞—è, —Å–≤–µ–∂–∞—è –∏ –≤–∏—Ä—É—Å–Ω–∞—è.

### 3) –¢–∞–π–º–ª–∞–π–Ω (–ø–æ–ª–Ω—ã–π, 1 –¥–∞—Ç–∞ = 1 —Å—Ç—Ä–æ–∫–∞ = 1 —Ñ–∞–∫—Ç, –±–µ–∑ –∫–∞–Ω–∞–ª–æ–≤)
–ü–æ–ª–Ω—ã–π —Å–ø–∏—Å–æ–∫ –í–°–ï–• –£–ù–ò–ö–ê–õ–¨–ù–´–• –¥–∞—Ç –∏–∑ –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤, –≤ –ø–æ—Ä—è–¥–∫–µ –æ—Ç —Å–∞–º–æ–π —Å—Ç–∞—Ä–æ–π –∫ —Å–∞–º–æ–π –Ω–æ–≤–æ–π (ascending).
–û–¥–Ω–∞ —Å—Ç—Ä–æ–∫–∞ = –æ–¥–Ω–∞ (1) –¥–∞—Ç–∞.
–û–¥–Ω–∞ —Å—Ç—Ä–æ–∫–∞ = –æ–¥–∏–Ω (1) —Å–∞–º—ã–π —Ä–µ–ª–µ–≤–∞–Ω—Ç–Ω—ã–π –∑–∞–ø—Ä–æ—Å—É —Ñ–∞–∫—Ç/–∏–∑–º–µ–Ω–µ–Ω–∏–µ, –∫–æ—Ç–æ—Ä—ã–π —è–≤–Ω–æ –µ—Å—Ç—å –≤ –¥–æ–∫—É–º–µ–Ω—Ç–∞—Ö —ç—Ç–æ–π –¥–∞—Ç—ã.

–§–æ—Ä–º–∞—Ç —Å—Ç—Ä–æ–∫–∏:
* YYYY-MM-DD ‚Äî —á—Ç–æ —Å–æ–æ–±—â–∏–ª–∏ / —á—Ç–æ –∏–∑–º–µ–Ω–∏–ª–æ—Å—å (1 —Ñ–∞–∫—Ç)

–ü–†–ê–í–ò–õ–ê –¢–ê–ô–ú–õ–ê–ô–ù–ê (–∫—Ä–∏—Ç–∏—á–Ω–æ):
- –í —Ç–∞–π–º–ª–∞–π–Ω–µ –ù–ï –£–ö–ê–ó–´–í–ê–ô –∫–∞–Ω–∞–ª—ã –≤–æ–æ–±—â–µ.
- –ù–µ –¥–æ–±–∞–≤–ª—è–π –¥–∞—Ç, –∫–æ—Ç–æ—Ä—ã—Ö –Ω–µ—Ç –≤ –¥–æ–∫—É–º–µ–Ω—Ç–∞—Ö!
- –ù–µ –ø–µ—Ä–µ–Ω–æ—Å–∏ —Ñ–∞–∫—Ç—ã –º–µ–∂–¥—É –¥–∞—Ç–∞–º–∏ ‚Äú–ø–æ —Å–º—ã—Å–ª—É‚Äù: –≤ —Å—Ç—Ä–æ–∫–µ –¥–∞—Ç—ã –º–æ–≥—É—Ç –±—ã—Ç—å —Ç–æ–ª—å–∫–æ —Ñ–∞–∫—Ç—ã –∏–∑ –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤ —ç—Ç–æ–π –¥–∞—Ç—ã!
- –ù–∏–∫–∞–∫–∏—Ö –ø—Ä–∏—á–∏–Ω –∏ –≤—ã–≤–æ–¥–æ–≤ ‚Äî —Ç–æ–ª—å–∫–æ ‚Äú—á—Ç–æ —Å–æ–æ–±—â–∏–ª–∏‚Äù!
- –ï—Å–ª–∏ –Ω–∞ –æ–¥–Ω—É –¥–∞—Ç—É –ø—Ä–∏—Ö–æ–¥–∏—Ç—Å—è –Ω–µ—Å–∫–æ–ª—å–∫–æ —Ñ–∞–∫—Ç–æ–≤, –≤—ã–±–µ—Ä–∏ –æ–¥–∏–Ω —Å–∞–º—ã–π –≤–∞–∂–Ω—ã–π –¥–ª—è –∑–∞–ø—Ä–æ—Å–∞ (–æ—Å—Ç–∞–ª—å–Ω–æ–µ –æ—Å—Ç–∞–≤—å –≤ –¥–∞–π–¥–∂–µ—Å—Ç–µ)!
! –ï–°–õ–ò –î–ê–¢–´ –ù–ï–¢ –í–ù–£–¢–†–ò –ö–û–ù–¢–ï–ö–°–¢–ê ‚Äî –ï–Å –ù–ï–õ–¨–ó–Ø –£–ö–ê–ó–´–í–ê–¢–¨. –ï–°–õ–ò –§–ê–ö–¢–ê –ù–ï–¢ –ù–ê –≠–¢–£ –î–ê–¢–£ –í –î–û–ö–£–ú–ï–ù–¢–ê–• ‚Äî –ï–ì–û –ù–ï–õ–¨–ó–Ø –ü–ò–°–ê–¢–¨. !
"""

import numpy as np
import pandas as pd

def build_rag_context(
    query: str,
    cand: pd.DataFrame,
    anchor_date: str,
    k_docs: int = 30,
    snip_chars: int = 850,
    hot_window_days: int = 30,
    hot_ratio: float = 0.8,
) -> str:
    """Render the user-side prompt: anchor date, query and the selected documents.

    Up to ``k_docs`` candidates are selected: first the best-scoring "hot"
    documents (age between 0 and ``hot_window_days`` days), at most
    ``round(k_docs * hot_ratio)`` of them, then the best-scoring remaining
    documents to fill the quota. The selection is listed newest first.

    Args:
        query: User query, echoed into the prompt.
        cand: Candidate frame. Uses ``date_day`` (or ``date``), ``message``,
            ``channel_name`` (optional, rendered empty when absent), a score
            column (``score_temporal`` if present, else ``score_rrf``) and
            ``age_days`` (computed from the date when absent).
        anchor_date: Reference date of the overview.
        k_docs: Maximum number of documents rendered.
        snip_chars: Maximum characters of each message that are included.
        hot_window_days: Upper age bound, in days, of a "hot" document.
        hot_ratio: Share of ``k_docs`` reserved for hot documents.

    Returns:
        The prompt text. For ``None`` or empty ``cand`` it contains a
        "no documents" placeholder instead of document blocks.
    """
    header = (
        f"–ê–ö–¢–£–ê–õ–¨–ù–ê–Ø –î–ê–¢–ê –û–ë–ó–û–†–ê: {anchor_date}\n"
        f"–í–û–ü–†–û–°/–ó–ê–ü–†–û–°:\n{query}\n\n"
    )

    if cand is None or len(cand) == 0:
        return header + f"–ò–°–¢–û–ß–ù–ò–ö–ò:\n(–Ω–µ—Ç –¥–æ–∫—É–º–µ–Ω—Ç–æ–≤)\n"

    c = cand.copy()

    date_col = "date_day" if "date_day" in c.columns else "date"
    score_col = "score_temporal" if "score_temporal" in c.columns else "score_rrf"

    if "age_days" not in c.columns:
        ad = pd.to_datetime(anchor_date, utc=True).normalize()
        dts = pd.to_datetime(c[date_col], errors="coerce", utc=True).dt.normalize()
        c["age_days"] = (ad - dts).dt.days.astype("float32")

    c = c.sort_values(score_col, ascending=False)

    # The mask is a plain boolean array and is applied positionally, so it has
    # to be computed AFTER sorting; computed before, it selects the wrong rows.
    age = c["age_days"].to_numpy(dtype=np.float32)
    hot_mask = (age >= 0) & (age <= float(hot_window_days))

    n_hot = int(round(k_docs * float(hot_ratio)))
    n_hot = max(0, min(n_hot, k_docs))

    hot_part = c[hot_mask].head(n_hot)
    # Unused hot slots are handed to the remaining documents.
    rest_part = c[~hot_mask].head(k_docs - len(hot_part))
    picked = pd.concat([hot_part, rest_part], axis=0)

    # Present newest first; ties within a day are ordered by score.
    dd = pd.to_datetime(picked[date_col], errors="coerce", utc=True).dt.normalize()
    picked = picked.assign(_doc_day=dd).sort_values(["_doc_day", score_col], ascending=[False, False]).head(k_docs)

    blocks = []
    for i, row in enumerate(picked.itertuples(index=False), start=1):
        date_day = getattr(row, "date_day", getattr(row, "date", ""))
        if isinstance(date_day, pd.Timestamp):
            date_day = date_day.strftime("%Y-%m-%d")
        date_day = str(date_day)[:10]

        # A missing channel column renders as empty instead of raising.
        channel = getattr(row, "channel_name", "")
        text = getattr(row, "message", "")

        blocks.append(f"[{i}] date={date_day} channel(s)={channel}\n document=" + str(text)[:snip_chars])

    return header + f"–ò–°–¢–û–ß–ù–ò–ö–ò:\n" + "\n\n".join(blocks)


In [23]:
import re, json
import pandas as pd
from vllm import SamplingParams

JUDGE_SYSTEM = """–¢—ã ‚Äî —Å—Ç—Ä–æ–≥–∏–π —ç–∫—Å–ø–µ—Ä—Ç –ø–æ –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏–æ–Ω–Ω–æ–º—É –ø–æ–∏—Å–∫—É –ø–æ –Ω–æ–≤–æ—Å—Ç—è–º (–≤ —Ç.—á. —ç–∫–æ–Ω–æ–º–∏—á–µ—Å–∫–∏–º).

–¢–≤–æ—è –∑–∞–¥–∞—á–∞: –æ—Ü–µ–Ω–∏—Ç—å —Ä–µ–ª–µ–≤–∞–Ω—Ç–Ω–æ—Å—Ç—å –∫–∞–Ω–¥–∏–¥–∞—Ç–Ω–æ–π –Ω–æ–≤–æ—Å—Ç–∏ –∑–∞–ø—Ä–æ—Å—É. –ó–∞–ø—Ä–æ—Å –º–æ–∂–µ—Ç –±—ã—Ç—å:
- –∫–æ—Ä–æ—Ç–∫–∏–º —Ç–æ–ø–∏–∫–æ–º (–Ω–∞–ø—Ä–∏–º–µ—Ä "–∫—É—Ä—Å —Ä—É–±–ª—è –∫ –¥–æ–ª–ª–∞—Ä—É"),
- –∏–ª–∏ —Ç–µ–∫—Å—Ç–æ–º –¥—Ä—É–≥–æ–π –Ω–æ–≤–æ—Å—Ç–∏ (—Ç–æ–≥–¥–∞ –∑–∞–ø—Ä–æ—Å –æ–ø–∏—Å—ã–≤–∞–µ—Ç –∫–æ–Ω–∫—Ä–µ—Ç–Ω—ã–π –∏–Ω—Ñ–æ–ø–æ–≤–æ–¥).

–ò—Å–ø–æ–ª—å–∑—É–π –¢–û–õ–¨–ö–û —Ç–µ–∫—Å—Ç –∫–∞–Ω–¥–∏–¥–∞—Ç–Ω–æ–≥–æ –¥–æ–∫—É–º–µ–Ω—Ç–∞. –ù–∏—á–µ–≥–æ –Ω–µ –¥–æ–¥—É–º—ã–≤–∞–π.

–®–∫–∞–ª–∞ —Ä–µ–ª–µ–≤–∞–Ω—Ç–Ω–æ—Å—Ç–∏:
2 ‚Äî –¥–æ–∫—É–º–µ–Ω—Ç —è–≤–Ω–æ –ø—Ä–æ —Ç–æ –∂–µ —Å–∞–º–æ–µ: –æ—Ç–≤–µ—á–∞–µ—Ç —Ç–æ–ø–∏–∫—É –ò–õ–ò –æ–ø–∏—Å—ã–≤–∞–µ—Ç —Ç–æ—Ç –∂–µ –∏–Ω—Ñ–æ–ø–æ–≤–æ–¥/—Ñ–∞–∫—Ç/—Å–æ–±—ã—Ç–∏–µ, —á—Ç–æ –∏ –∑–∞–ø—Ä–æ—Å.
1 ‚Äî –¥–æ–∫—É–º–µ–Ω—Ç —Å–≤—è–∑–∞–Ω –ø–æ —Ç–µ–º–µ/–∫–æ–Ω—Ç–µ–∫—Å—Ç—É, –Ω–æ —ç—Ç–æ –Ω–µ–º–Ω–æ–≥–æ –¥—Ä—É–≥–æ–π –∏–Ω—Ñ–æ–ø–æ–≤–æ–¥, –∏–ª–∏ –ø—Ä–æ —Ç–æ –∂–µ, –Ω–æ –±–µ–∑ –ø—Ä—è–º–æ–≥–æ —Å–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤–∏—è.
0 ‚Äî –Ω–µ—Ä–µ–ª–µ–≤–∞–Ω—Ç–Ω–æ —Å–æ–≤—Å–µ–º.

–ü—Ä–∞–≤–∏–ª–æ —Å—Ç—Ä–æ–≥–æ—Å—Ç–∏:
—Å—Ç–∞–≤—å 2 —Ç–æ–ª—å–∫–æ –µ—Å–ª–∏ —Å–≤—è–∑—å –æ—á–µ–≤–∏–¥–Ω–∞ –ø–æ —Ç–µ–∫—Å—Ç—É –¥–æ–∫—É–º–µ–Ω—Ç–∞; –µ—Å–ª–∏ –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏–∏ –Ω–µ–¥–æ—Å—Ç–∞—Ç–æ—á–Ω–æ ‚Äî —Å—Ç–∞–≤—å 0 –∏–ª–∏ 1.

–í–µ—Ä–Ω–∏ —Å—Ç—Ä–æ–≥–æ –≤–∞–ª–∏–¥–Ω—ã–π JSON –∏ –Ω–∏—á–µ–≥–æ –±–æ–ª—å—à–µ:
{"relevance": 0|1|2}
"""


def _parse_relevance(text: str) -> int:
    text = text.strip()
    m = re.search(r"\{.*\}", text, flags=re.DOTALL)
    if m:
        blob = m.group(0)
        try:
            obj = json.loads(blob)
            val = int(obj.get("relevance", 0))
            return val if val in (0, 1, 2) else 0
        except Exception:
            pass
    m2 = re.search(r"relevance\"\s*:\s*([012])", text)
    if m2:
        return int(m2.group(1))
    return 0

def judge_filter_candidates(
    cand: pd.DataFrame,
    query: str,
    judge_llm,
    judge_tokenizer,
    *,
    keep_threshold: int = 1,
    doc_max_chars: int = 1200,
    batch_size: int = 32,
    max_out_tokens: int = 40,
) -> pd.DataFrame:
    """Grade every candidate with an LLM judge and keep the relevant ones.

    For each row a chat prompt is built from ``JUDGE_SYSTEM`` plus a user
    message holding the query, the row's channel and date (when those columns
    are present) and the first ``doc_max_chars`` characters of its
    ``message``. Prompts are passed to ``judge_llm.generate`` in slices of
    ``batch_size`` with temperature 0.0, and each completion is turned into a
    grade by ``_parse_relevance``.

    Args:
        cand: Candidate rows. Must contain a ``message`` column;
            ``channel_name`` and ``date_day`` are optional and rendered as
            empty strings when absent.
        query: Topic or news text the candidates are judged against.
        judge_llm: Object exposing ``generate(prompts, sampling_params)``
            whose results carry ``.outputs[0].text`` (presumably a
            ``vllm.LLM`` — confirm against the caller).
        judge_tokenizer: Tokenizer exposing ``apply_chat_template``.
        keep_threshold: Minimum grade a row needs to be kept.
        doc_max_chars: Maximum number of message characters put in the prompt.
        batch_size: Number of prompts per ``generate`` call; values below 1
            are treated as 1.
        max_out_tokens: Generation budget per judgement.

    Returns:
        A new DataFrame with an added ``judge_relevance`` column, restricted
        to rows with ``judge_relevance >= keep_threshold`` and re-indexed from
        0. When ``cand`` is ``None`` or empty it is returned unchanged
        (without the extra column).
    """
    if cand is None or len(cand) == 0:
        return cand

    text_col = "message"
    channel_col = "channel_name"
    date_col = "date_day"

    # Channel/date are prompt metadata only; tolerate frames that lack them
    # instead of failing with KeyError halfway through prompt construction.
    has_channel = channel_col in cand.columns
    has_date = date_col in cand.columns

    prompts = []
    for _, row in cand.iterrows():
        doc = str(row[text_col])[:doc_max_chars]
        ch = str(row[channel_col]) if has_channel else ""
        dt = str(row[date_col]) if has_date else ""

        user_msg = (
            f"–ó–ê–ü–†–û–°:\n{query}\n\n"
            f"–ö–ê–ù–î–ò–î–ê–¢:\n"
            f"channel={ch}\n"
            f"date={dt}\n"
            f"text:\n{doc}\n"
        )

        messages = [
            {"role": "system", "content": JUDGE_SYSTEM},
            {"role": "user", "content": user_msg},
        ]
        prompt = judge_tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        prompts.append(prompt)

    sampling = SamplingParams(
        temperature=0.0,
        top_p=1.0,
        max_tokens=max_out_tokens,
    )

    # A non-positive step would make range() raise (0) or yield nothing (<0).
    step = max(1, int(batch_size))

    # Assumes generate() returns one result per prompt, in prompt order, so
    # grades line up positionally with the rows of `cand`.
    relevances = []
    for i in range(0, len(prompts), step):
        outs = judge_llm.generate(prompts[i:i + step], sampling)
        relevances.extend(_parse_relevance(o.outputs[0].text) for o in outs)

    out_df = cand.copy()
    out_df["judge_relevance"] = relevances

    return out_df[out_df["judge_relevance"] >= keep_threshold].reset_index(drop=True)


In [24]:
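# NOTE: earlier torch / `generate(**enc)`-based draft of rag_summarize, kept commented out for reference only;
# the active vLLM-based definition is in the next cell.
# import torch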

# @torch.inference_mode()
# def rag_summarize(sum_model, sum_tokenizer, query: str, cand: pd.DataFrame, anchor_date, 
#                   k_docs: int = 25, snip_chars: int = 900, max_new_tokens: int = 2000) -> str:
    
#     user = build_rag_context(query, cand, anchor_date=anchor_date, k_docs=k_docs, snip_chars=snip_chars)
#     print("built context...")
    
#     messages = [
#         {"role": "system", "content": SYSTEM_PROMPT},
#         {"role": "user", "content": user},
#     ]
#     prompt = sum_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
#     enc = sum_tokenizer(prompt, return_tensors="pt", truncation=True).to(sum_model.device)

#     out_ids = sum_model.generate(
#         **enc,
#         max_new_tokens=max_new_tokens,
#         do_sample=False,
#         eos_token_id=sum_tokenizer.eos_token_id,
#         pad_token_id=sum_tokenizer.eos_token_id,
#     )
#     prompt_len = int(enc["attention_mask"][0].sum().item())
    
#     return sum_tokenizer.decode(out_ids[0][prompt_len:], skip_special_tokens=True).strip(), user


In [52]:
from vllm import SamplingParams

def rag_summarize(
    sum_model,
    sum_tokenizer,
    query: str,
    cand: pd.DataFrame,
    anchor_date,
    k_docs: int = 25,
    snip_chars: int = 900,
    max_new_tokens: int = 2000,
    hot_window_days: int = 30,
    hot_ratio: float = 0.8,
):
    """Build the RAG context for ``query`` and generate a summary from it.

    The context comes from ``build_rag_context`` (``anchor_date`` is passed
    as a string), is wrapped into a system + user chat via
    ``sum_tokenizer.apply_chat_template`` and sent to ``sum_model.generate``
    with temperature 0.0 and at most ``max_new_tokens`` new tokens.

    Returns:
        ``(summary_text, context)``: the stripped text of the first
        completion and the context string used as the user message.
    """
    context_args = {
        "query": query,
        "cand": cand,
        "anchor_date": str(anchor_date),
        "k_docs": k_docs,
        "snip_chars": snip_chars,
        "hot_window_days": hot_window_days,
        "hot_ratio": hot_ratio,
    }
    rag_context = build_rag_context(**context_args)
    print("built context...")

    chat_prompt = sum_tokenizer.apply_chat_template(
        [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": rag_context},
        ],
        tokenize=False,
        add_generation_prompt=True,
    )

    decoding = SamplingParams(temperature=0.0, top_p=1.0, max_tokens=max_new_tokens)

    # Only one prompt is submitted, so the first result is the summary.
    generations = sum_model.generate([chat_prompt], decoding)
    summary_text = generations[0].outputs[0].text.strip()
    return summary_text, rag_context


In [53]:
def run_rag_hybrid(
    df: pd.DataFrame,
    index,
    encoder,
    bm25,
    tokenize_fn,
    query: str,
    k_retrieve: int = 50,
    topN_each: int = 500,
    k_docs: int = 25,
    snip_chars: int = 1500,
    max_new_tokens: int = 2000,
    anchor_date: str = "2025-09-04",
    max_window_days: int | None = 365,
    w_time: float = 0.5,
    w_channel: float | None = None,
    hot_window_days: int = 30,
    hot_ratio: float = 0.8,
    sum_model=None,
    sum_tokenizer=None,
    judge_llm=None,
    judge_tokenizer=None,
    judge_keep_threshold: int = 1,
    judge_batch_size: int = 32,
):
    cand = hybrid_retrieve_rrf(
        df=df,
        index=index,
        encoder=encoder,
        bm25=bm25,
        tokenize_fn=tokenize_fn,
        query=query,
        k=k_retrieve,
        topN_each=topN_each,
        k_rrf=60,
        w_dense=1.0,
        w_bm25=1.0,
        anchor_date=anchor_date,
        max_window_days=max_window_days,
        w_time=w_time,
        w_channel=w_channel,
    )
    print("retrieval done...")
    cand_before = cand

    cand_clusters = None
    members = None
    if cand is not None and len(cand) > 0 and encoder is not None:
        text_col = "message"
        cand, cand_clusters, members = dedup_cluster_candidates_time(
            cand=cand,
            encoder=encoder,
            text_col=text_col,
            score_col="score_rrf",
            sim_threshold=0.95,
            knn=30,
            keep_per_cluster=1,
            mask_numbers=False,
            max_day_diff=1,
            overwrite_channel=True,
        )
    print("clustering done...")

    cand_after_dedup = cand
    if (
        judge_llm is not None
        and judge_tokenizer is not None
        and cand is not None
        and len(cand) > 0
    ):
        cand = judge_filter_candidates(
            cand=cand,
            query=query,
            judge_llm=judge_llm,
            judge_tokenizer=judge_tokenizer,
            keep_threshold=judge_keep_threshold,
            doc_max_chars=snip_chars,
            batch_size=judge_batch_size,
        )
    print("filtering done...")

    if sum_model is None or sum_tokenizer is None:
        ctx = build_rag_context(
            query,
            cand,
            anchor_date=anchor_date,
            k_docs=min(k_docs, len(cand)) if cand is not None else 0,
            snip_chars=snip_chars,
            hot_window_days=hot_window_days,
            hot_ratio=hot_ratio,
        )
        return {
            "context": ctx,
            "candidates": cand_before,
            "candidates_dedup": cand_after_dedup,
            "candidates_filtered": cand,
            "members": members,
            "clusters": cand_clusters,
            "summary": "No LLM",
        }

    summary, ctx = rag_summarize(
        sum_model,
        sum_tokenizer,
        query,
        cand,
        k_docs=min(k_docs, len(cand)) if cand is not None else 0,
        snip_chars=snip_chars,
        max_new_tokens=max_new_tokens,
        anchor_date=anchor_date,
        hot_window_days=hot_window_days,
        hot_ratio=hot_ratio,
    )
    print("summary done...")

    return {
        "context": ctx,
        "summary": summary,
        "candidates": cand_before,
        "candidates_dedup": cand_after_dedup,
        "candidates_filtered": cand,
        "members": members,
        "clusters": cand_clusters,
    }


In [54]:
# End-to-end run of the hybrid RAG pipeline for a single query.
# NOTE(review): the query literal looks mis-encoded (mojibake) in this copy of
# the notebook — confirm the file encoding.
q = "–ê–∫—Ç—É–∞–ª—å–Ω—ã–π –∫—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞"

# `index`, `encoder`, `bm25`, `tokenize_ru`, `model` and `tokenizer` are not
# defined in this cell — presumably created by earlier cells; verify that a
# fresh Restart & Run All reaches this point with all of them in scope.
# The same `model`/`tokenizer` pair is passed as both the summarizer and the
# relevance judge, so one LLM serves both stages.
out = run_rag_hybrid(
    df=df,
    index=index,
    encoder=encoder,
    bm25=bm25,
    tokenize_fn=tokenize_ru,
    query=q,
    k_retrieve=150,
    topN_each=2000,
    k_docs=30,
    snip_chars=1000,
    max_new_tokens=3000,
    anchor_date="2025-09-04",
    max_window_days=365,
    w_time=0.6,
    w_channel=None,
    hot_window_days=30,
    hot_ratio=0.8,
    sum_model=model,
    sum_tokenizer=tokenizer,
    judge_llm=model,
    judge_tokenizer=tokenizer,
    judge_keep_threshold=1,
    judge_batch_size=32)


retrieval done...
clustering done...


Adding requests:   0%|          | 0/32 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/32 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/32 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/32 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/32 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/32 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Adding requests:   0%|          | 0/28 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/28 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

filtering done...
built context...


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

summary done...


In [55]:
from IPython.display import display, Markdown, HTML
import re

def show_summary(summary: str):
    """Render ``summary`` as Markdown in the notebook output.

    Literal backslash-n sequences are turned into real newlines, runs of
    three or more newlines are collapsed to a single blank line, and
    surrounding whitespace is stripped before display. A ``None`` summary is
    reported with a short bold HTML notice instead.
    """
    if summary is not None:
        unescaped = str(summary).replace("\\n", "\n")
        compact = re.sub(r"\n{3,}", "\n\n", unescaped).strip()
        display(Markdown(compact))
    else:
        display(HTML("<b>summary is None</b>"))

# Render the summary produced by the pipeline run as Markdown.
show_summary(out["summary"])


### 1) –ó–∞–ø—Ä–æ—Å –∏ –∞–∫—Ç—É–∞–ª—å–Ω–∞—è –¥–∞—Ç–∞
* –ó–∞–ø—Ä–æ—Å: –ê–∫—Ç—É–∞–ª—å–Ω—ã–π –∫—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞
* –ê–∫—Ç—É–∞–ª—å–Ω–∞—è –¥–∞—Ç–∞: 2025-09-04

### 2) –î–∞–π–¥–∂–µ—Å—Ç
–ü–æ—Å–ª–µ–¥–Ω–µ–µ –≤—Ä–µ–º—è, 2025-09-04, –ë–∞–Ω–∫–∏, –¥–µ–Ω—å–≥–∏, –¥–≤–∞ –æ—Ñ—à–æ—Ä–∞ –∏ –†–∏–∞ –ù–æ–≤–æ—Å—Ç–∏ —Å–æ–æ–±—â–∏–ª–∏, —á—Ç–æ –∫—É—Ä—Å —Ä—É–±–ª—è –±—É–¥–µ—Ç –∫—Ä–µ–ø—á–µ, —á–µ–º –æ–∂–∏–¥–∞–ª–æ—Å—å, –∏ –ú–∏–Ω—ç–∫–æ–Ω–æ–º—Ä–∞–∑–≤–∏—Ç–∏—è –ø—Ä–æ–≥–Ω–æ–∑–∏—Ä—É–µ—Ç –¥–æ–ª–ª–∞—Ä –ø–æ 94,3 —Ä—É–±–ª—è –≤ 2025 –≥–æ–¥—É, 100 —Ä—É–±–ª–µ–π –≤ 2026 –≥–æ–¥—É –∏ 103,5 —Ä—É–±–ª—è –≤ 2027 –≥–æ–¥—É. –¢–∞–∫–∂–µ –ë–∞–Ω–∫–∏, –¥–µ–Ω—å–≥–∏, –¥–≤–∞ –æ—Ñ—à–æ—Ä–∞ –∏ –†–∏–∞ –ù–æ–≤–æ—Å—Ç–∏ –æ—Ç–º–µ—Ç–∏–ª–∏, —á—Ç–æ –ì—Ä–µ—Ñ –æ–∂–∏–¥–∞–µ—Ç –æ—Å–ª–∞–±–ª–µ–Ω–∏—è –∫—É—Ä—Å–∞ —Ä—É–±–ª—è –¥–æ 85-90 —Ä—É–±–ª–µ–π –∑–∞ –¥–æ–ª–ª–∞—Ä –∫ –∫–æ–Ω—Ü—É 2025 –≥–æ–¥–∞. –û—Ñ–∏—Ü–∏–∞–ª—å–Ω—ã–π –∫—É—Ä—Å –¶–ë –Ω–∞ 2025-09-04 —Å–æ—Å—Ç–∞–≤–∏–ª 81 —Ä—É–±–ª—å –∑–∞ –¥–æ–ª–ª–∞—Ä, —á—Ç–æ —è–≤–ª—è–µ—Ç—Å—è –º–∞–∫—Å–∏–º—É–º–æ–º —Å 1 –∞–≤–≥—É—Å—Ç–∞.

–†–∞–Ω–µ–µ, 2025-09-02, –ë–ª—É–º–±–µ—Ä–≥ –∏ Forbes Russia —Å–æ–æ–±—â–∏–ª–∏, —á—Ç–æ –∫—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ –º–æ–∂–µ—Ç –¥–æ—Å—Ç–∏—á—å 100 —Ä—É–±–ª–µ–π —É–∂–µ –∫ —è–Ω–≤–∞—Ä—é, –∏ —á—Ç–æ –≤ —ç—Ç–æ–º –≥–æ–¥—É –¥–æ–ª–ª–∞—Ä –≤ —è–Ω–≤–∞—Ä–µ –Ω–∞ –ø–∏–∫–µ —Å—Ç–æ–∏–ª 113,7 —Ä—É–±–ª—è, –∞ –Ω–∞ –º–∏–Ω–∏–º—É–º–µ –≤ –∏—é–ª–µ ‚Äî 73,9 —Ä—É–±–ª—è. –í 2025-08-31, –ë–ª—É–º–±–µ—Ä–≥ –∏ –ë–∞–Ω–∫–∏, –¥–µ–Ω—å–≥–∏, –¥–≤–∞ –æ—Ñ—à–æ—Ä–∞ —Å–æ–æ–±—â–∏–ª–∏, —á—Ç–æ —Ä–µ–∞–ª—å–Ω—ã–π –∫—É—Ä—Å —Ä—É–±–ª—è –¥–æ–ª–∂–µ–Ω —Å–æ—Å—Ç–∞–≤–ª—è—Ç—å 24,5 –∑–∞ –¥–æ–ª–ª–∞—Ä, –∏ —á—Ç–æ —Ä—É–±–ª—å —Å–Ω–æ–≤–∞ –æ–∫–∞–∑–∞–ª—Å—è –æ–¥–Ω–æ–π –∏–∑ —Å–∞–º—ã—Ö –Ω–µ–¥–æ–æ—Ü–µ–Ω–µ–Ω–Ω—ã—Ö –≤–∞–ª—é—Ç –≤ –º–∏—Ä–µ.

–¢–µ–º–∞ –∫—É—Ä—Å–∞ –¥–æ–ª–ª–∞—Ä–∞ –æ—Å—Ç–∞–µ—Ç—Å—è –∞–∫—Ç—É–∞–ª—å–Ω–æ–π –∏ –≤–∏—Ä—É—Å–Ω–æ–π, —Å –º–Ω–æ–∂–µ—Å—Ç–≤–æ–º –ø—Ä–æ—Ç–∏–≤–æ—Ä–µ—á–∏–≤—ã—Ö –ø—Ä–æ–≥–Ω–æ–∑–æ–≤ –∏ –æ—Ü–µ–Ω–æ–∫. –û–¥–Ω–∞–∫–æ, –Ω–∞ –æ—Å–Ω–æ–≤–µ –ø–æ—Å–ª–µ–¥–Ω–∏—Ö –¥–∞–Ω–Ω—ã—Ö, –º–æ–∂–Ω–æ —Å–¥–µ–ª–∞—Ç—å –≤—ã–≤–æ–¥, —á—Ç–æ –∫—É—Ä—Å —Ä—É–±–ª—è –æ—Å—Ç–∞–µ—Ç—Å—è –∫—Ä–µ–ø–∫–∏–º, –Ω–æ —ç–∫—Å–ø–µ—Ä—Ç—ã –æ–∂–∏–¥–∞—é—Ç –µ–≥–æ –æ—Å–ª–∞–±–ª–µ–Ω–∏—è –≤ –±–ª–∏–∂–∞–π—à–µ–µ –≤—Ä–µ–º—è.

### 3) –¢–∞–π–º–ª–∞–π–Ω
* 2025-09-04 ‚Äî –û—Ñ–∏—Ü–∏–∞–ª—å–Ω—ã–π –∫—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ —Å–æ—Å—Ç–∞–≤–∏–ª 81 —Ä—É–±–ª—å
* 2025-09-04 ‚Äî –ú–∏–Ω—ç–∫–æ–Ω–æ–º—Ä–∞–∑–≤–∏—Ç–∏—è –ø—Ä–æ–≥–Ω–æ–∑–∏—Ä—É–µ—Ç –∫—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ –Ω–∞ 2025-2027 –≥–æ–¥—ã
* 2025-09-04 ‚Äî –ì—Ä–µ—Ñ –æ–∂–∏–¥–∞–µ—Ç –æ—Å–ª–∞–±–ª–µ–Ω–∏—è –∫—É—Ä—Å–∞ —Ä—É–±–ª—è –¥–æ 85-90 —Ä—É–±–ª–µ–π –∑–∞ –¥–æ–ª–ª–∞—Ä –∫ –∫–æ–Ω—Ü—É 2025 –≥–æ–¥–∞
* 2025-09-02 ‚Äî –ö—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ –º–æ–∂–µ—Ç –¥–æ—Å—Ç–∏—á—å 100 —Ä—É–±–ª–µ–π —É–∂–µ –∫ —è–Ω–≤–∞—Ä—é
* 2025-09-01 ‚Äî –ö—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ —Å –º–∞—è –ø–æ –∞–≤–≥—É—Å—Ç –¥–µ—Ä–∂–∏—Ç—Å—è –Ω–∞ —É—Ä–æ–≤–Ω–µ 78‚Äì80 —Ä—É–±–ª–µ–π
* 2025-08-31 ‚Äî –†–µ–∞–ª—å–Ω—ã–π –∫—É—Ä—Å —Ä—É–±–ª—è –¥–æ–ª–∂–µ–Ω —Å–æ—Å—Ç–∞–≤–ª—è—Ç—å 24,5 –∑–∞ –¥–æ–ª–ª–∞—Ä
* 2025-08-18 ‚Äî –û—Ñ–∏—Ü–∏–∞–ª—å–Ω—ã–π –∫—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ –Ω–∞ 19 –∞–≤–≥—É—Å—Ç–∞ —Å–æ—Å—Ç–∞–≤–∏–ª 80.4256 —Ä—É–±–ª—è
* 2025-08-14 ‚Äî –û—Ñ–∏—Ü–∏–∞–ª—å–Ω—ã–π –∫—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ –Ω–∞ 15 –∞–≤–≥—É—Å—Ç–∞ —Å–æ—Å—Ç–∞–≤–∏–ª 79.7653 —Ä—É–±–ª—è
* 2025-08-07 ‚Äî –û—Ñ–∏—Ü–∏–∞–ª—å–Ω—ã–π –∫—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ –Ω–∞ 8 –∞–≤–≥—É—Å—Ç–∞ —Å–æ—Å—Ç–∞–≤–∏–ª 79.3847 —Ä—É–±–ª—è
* 2025-07-28 ‚Äî –°–≤–µ–∂–∏–π –∫—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ –æ—Ç –¶–ë –æ–∫–∞–∑–∞–ª—Å—è –º–µ–Ω—å—à–µ –≤–Ω–µ–±–∏—Ä–∂–µ–≤–æ–≥–æ –Ω–∞ –ø–æ–ª—Ç–æ—Ä–∞ —Ä—É–±–ª—è
* 2025-06-30 ‚Äî –ò–Ω–¥–µ–∫—Å –¥–æ–ª–ª–∞—Ä–∞ –æ–ø—É—Å—Ç–∏–ª—Å—è –Ω–∏–∂–µ 97 –ø—É–Ω–∫—Ç–æ–≤ –≤–ø–µ—Ä–≤—ã–µ —Å –º–∞—Ä—Ç–∞ 2022 –≥–æ–¥–∞
* 2025-06-20 ‚Äî –†–∞–≤–Ω–æ–≤–µ—Å–Ω—ã–º –∫—É—Ä—Å–æ–º —è–≤–ª—è–µ—Ç—Å—è –≤—ã—à–µ 100 —Ä—É–±–ª–µ–π –∑–∞ –¥–æ–ª–ª–∞—Ä
* 2025-06-19 ‚Äî –û–ø—Ç–∏–º–∞–ª—å–Ω—ã–π –∫—É—Ä—Å –≤ —Ç–µ–∫—É—â–µ–π —Å–∏—Ç—É–∞—Ü–∏–∏ –≤ —Ä–∞–π–æ–Ω–µ 100 —Ä—É–±–ª–µ–π –∑–∞ –¥–æ–ª–ª–∞—Ä
* 2025-06-12 ‚Äî –ò–Ω–¥–µ–∫—Å –¥–æ–ª–ª–∞—Ä–∞ –æ–ø—É—Å—Ç–∏–ª—Å—è –¥–æ –º–∏–Ω–∏–º—É–º–∞ —Å –≤–µ—Å–Ω—ã 2022 –≥–æ–¥–∞
* 2025-05-29 ‚Äî –û—Ñ–∏—Ü–∏–∞–ª—å–Ω—ã–π –∫—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ –æ—Ç –¶–ë –æ–ø—É—Å—Ç–∏–ª—Å—è –Ω–∏–∂–µ 79 —Ä—É–±–ª–µ–π
* 2025-05-21 ‚Äî –û—Ñ–∏—Ü–∏–∞–ª—å–Ω—ã–π –∫—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ –Ω–∞ 22 –º–∞—è —Å–æ—Å—Ç–∞–≤–∏–ª 79,75 —Ä—É–±–ª—è
* 2025-04-21 ‚Äî –û—Ñ–∏—Ü–∏–∞–ª—å–Ω—ã–π –∫—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ –æ–ø—É—Å—Ç–∏–ª—Å—è –Ω–∏–∂–µ 81 —Ä—É–±–ª—è
* 2025-04-14 ‚Äî –¶–µ–Ω—Ç—Ä–æ–±–∞–Ω–∫ –ø–æ–Ω–∏–∑–∏–ª –∫—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ –±–æ–ª–µ–µ —á–µ–º –Ω–∞ 1 —Ä—É–±–ª—å
* 2025-03-18 ‚Äî –í–Ω–µ–±–∏—Ä–∂–µ–≤–æ–π –∫—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ –æ–ø—É—Å—Ç–∏–ª—Å—è –Ω–∏–∂–µ 81 —Ä—É–±–ª—è
* 2025-03-17 ‚Äî –†–∞—Å—á–µ—Ç–Ω—ã–π –∫—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ —É–ø–∞–ª –Ω–∏–∂–µ 83 —Ä—É–±–ª–µ–π
* 2025-03-11 ‚Äî –û—Ñ–∏—Ü–∏–∞–ª—å–Ω—ã–π –∫—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ –æ–ø—É—Å—Ç–∏–ª—Å—è –¥–æ 86 —Ä—É–±–ª–µ–π
* 2024-12-24 ‚Äî –û—Ñ–∏—Ü–∏–∞–ª—å–Ω—ã–π –∫—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ —É–ø–∞–ª –Ω–∏–∂–µ 100 —Ä—É–±–ª–µ–π
* 2024-12-06 ‚Äî –¶–ë –ø–æ–Ω–∏–∑–∏–ª –∫—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ –Ω–∏–∂–µ 100, –µ–≤—Ä–æ ‚Äî –Ω–∏–∂–µ 107 —Ä—É–±–ª–µ–π
* 2024-11-27 ‚Äî –ö—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ –ø–µ—Ä–µ–≤–∞–ª–∏–ª –∑–∞ 111 —Ä—É–±–ª–µ–π
* 2024-11-14 ‚Äî –ö—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ —Ç–µ–ø–µ—Ä—å —Å—Ç–æ–∏—Ç –≤—ã—à–µ 100 —Ä—É–±–ª–µ–π

In [56]:
# Size check: character count of the RAG context (printed) and of the
# system prompt (shown as the cell's result).
print("context chars:", len(out["context"]))
len(SYSTEM_PROMPT)

context chars: 8266


2745

In [57]:
# Reuse the Markdown renderer to inspect the context string returned by the pipeline.
show_summary(out["context"])

–ê–ö–¢–£–ê–õ–¨–ù–ê–Ø –î–ê–¢–ê –û–ë–ó–û–†–ê: 2025-09-04
–í–û–ü–†–û–°/–ó–ê–ü–†–û–°:
–ê–∫—Ç—É–∞–ª—å–Ω—ã–π –∫—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞

–ò–°–¢–û–ß–ù–ò–ö–ò:
[1] date=2025-09-04 channel(s)=–ë–∞–Ω–∫–∏, –¥–µ–Ω—å–≥–∏, –¥–≤–∞ –æ—Ñ—à–æ—Ä–∞
 document=–ö—É—Ä—Å —Ä—É–±–ª—è –±—É–¥–µ—Ç –∫—Ä–µ–ø—á–µ, —á–µ–º –æ–∂–∏–¥–∞–ª–æ—Å—å, –∏ ¬´–Ω–∞–º –≤—Å–µ–º, –∫ —Å–æ–∂–∞–ª–µ–Ω–∏—é, –ø—Ä–∏–¥–µ—Ç—Å—è –∞–¥–∞–ø—Ç–∏—Ä–æ–≤–∞—Ç—å—Å—è¬ª ‚Äî –ú–∏–Ω—ç–∫–æ–Ω–æ–º—Ä–∞–∑–≤–∏—Ç–∏—è. –ï—â—ë –≤ –∞–ø—Ä–µ–ª–µ –≤–µ–¥–æ–º—Å—Ç–≤–æ –ø—Ä–æ–≥–Ω–æ–∑–∏—Ä–æ–≤–∞–ª–æ –¥–æ–ª–ª–∞—Ä –ø–æ 94,3 —Ä—É–±–ª—è –≤ 2025 –≥–æ–¥—É, –≤ 2026-–º ‚Äî 100 —Ä—É–±–ª–µ–π, 103,5 —Ä—É–±–ª—è –≤ 2027-–º. @bankrollo

[2] date=2025-09-04 channel(s)=–†–∏–∞ –ù–æ–≤–æ—Å—Ç–∏
 document=–ö—É—Ä—Å —Ä—É–±–ª—è –≤ –ø—Ä–æ–≥–Ω–æ–∑–µ –ú–∏–Ω—ç–∫–æ–Ω–æ–º—Ä–∞–∑–≤–∏—Ç–∏—è –±—É–¥–µ—Ç –∫—Ä–µ–ø—á–µ –ø—Ä–µ–∂–Ω–∏—Ö –æ—Ü–µ–Ω–æ–∫, –≤—Å–µ–º –ø—Ä–∏–¥–µ—Ç—Å—è –∫ —ç—Ç–æ–º—É –∞–¥–∞–ø—Ç–∏—Ä–æ–≤–∞—Ç—å—Å—è, –∑–∞—è–≤–∏–ª –≤ —Ä–∞–º–∫–∞—Ö –í–≠–§ –†–µ—à–µ—Ç–Ω–∏–∫–æ–≤. –ü–æ–¥–ø–∏—Å–∞—Ç—å—Å—è –Ω–∞ –†–ò–ê –ù–æ–≤–æ—Å—Ç–∏

[3] date=2025-09-04 channel(s)=–ë–∞–Ω–∫–∏, –¥–µ–Ω—å–≥–∏, –¥–≤–∞ –æ—Ñ—à–æ—Ä–∞
 document=–î–æ–ª–ª–∞—Ä –ø–æ–¥–Ω—è–ª—Å—è –¥–æ 81 —Ä—É–±–ª—è –ø–æ –æ—Ñ–∏—Ü–∏–∞–ª—å–Ω–æ–º—É –∫—É—Ä—Å—É –¶–ë –≤–ø–µ—Ä–≤—ã–µ —Å 1 –∞–≤–≥—É—Å—Ç–∞. @bankrollo

[4] date=2025-09-04 channel(s)=–†–∏–∞ –ù–æ–≤–æ—Å—Ç–∏
 document=–ì—Ä–µ—Ñ –æ–∂–∏–¥–∞–µ—Ç –æ—Å–ª–∞–±–ª–µ–Ω–∏—è –∫—É—Ä—Å–∞ —Ä—É–±–ª—è –¥–æ 85-90 —Ä—É–±–ª–µ–π –∑–∞ –¥–æ–ª–ª–∞—Ä –∫ –∫–æ–Ω—Ü—É 2025 –≥–æ–¥–∞. "–ù–∞ –º–æ–π –≤–∑–≥–ª—è–¥, —Ä—É–±–ª—å –ø–µ—Ä–µ—É–∫—Ä–µ–ø–∏–ª—Å—è —Å–µ–π—á–∞—Å, –∏ –≤—Ä—è–¥ –ª–∏ —Å—Ç–æ–∏—Ç –æ–∂–∏–¥–∞—Ç—å —É–∫—Ä–µ–ø–ª–µ–Ω–∏—è —Ä—É–±–ª—è. –ü—Ä–µ–¥—Å–∫–∞–∑—ã–≤–∞—Ç—å —á—Ç–æ-–ª–∏–±–æ —Å–ª–æ–∂–Ω–æ, –ø–æ—Ç–æ–º—É —á—Ç–æ —ç—Ç–æ –º–Ω–æ–≥–æ—Ñ–∞–∫—Ç–æ—Ä–Ω–∞—è –∏—Å—Ç–æ—Ä–∏—è. –ù–æ —Ç–æ, —á—Ç–æ –º—ã –≤–∏–¥–∏–º —Å–µ–≥–æ–¥–Ω—è, –æ—Å–ª–∞–±–ª–µ–Ω–∏–µ –ø–æ–ª–∏—Ç–∏–∫–∏, –∫–æ—Ç–æ—Ä–∞—è —Å–µ–π—á–∞—Å –ø—Ä–æ–∏—Å—Ö–æ–¥–∏—Ç, –ø—Ä–∏–≤–µ–¥–µ—Ç –∫ –æ—Å–ª–∞–±–ª–µ–Ω–∏—é —Ä—É–±–ª—è", - —Å–∫–∞–∑–∞–ª –ì—Ä–µ—Ñ –≤ –∏–Ω—Ç–µ—Ä–≤—å—é –ù–∞–∏–ª–µ –ê—Å–∫–µ—Ä-–∑–∞–¥–µ –Ω–∞ —Ç–µ–ª–µ–∫–∞–Ω–∞–ª–µ "–†–æ—Å—Å–∏—è 1". –ü—Ä–∏ —ç—Ç–æ–º –æ–Ω –∑–∞—Ç—Ä—É–¥–Ω–∏–ª—Å—è –ø—Ä–æ–≥–Ω–æ–∑–∏—Ä–æ–≤–∞—Ç—å –∫—É—Ä—Å –Ω–∞ —Å—Ä–µ–¥–Ω–µ—Å—Ä–æ—á–Ω—ã–π –ø–µ—Ä–∏–æ–¥. "–í –±–ª–∏–∂–∞–π—à–∏–µ —Ç—Ä–∏ –≥–æ–¥–∞, –∫–æ–Ω–µ—á–Ω–æ, –ø—Ä–µ–¥—Å–∫–∞–∑—ã–≤–∞—Ç—å –æ—á–µ–Ω—å —Å–ª–æ–∂–Ω–æ, –Ω–æ, –æ–ø—è—Ç—å, –ø—Ä–∏ –Ω–æ—Ä–º–∞–ª–∏–∑–∞—Ü–∏–∏ –≥–µ–æ–ø–æ–ª–∏—Ç–∏–∫–∏, –ø–æ–∫–∞ —è –Ω–µ –≤–∏–∂—É —á–µ–≥–æ-—Ç–æ, —á—Ç–æ –º–æ–∂–µ—Ç –ø—Ä–∏–≤–µ—Å—Ç–∏ –∫ —Å–∏–ª—å–Ω–æ–º—É —É–∫—Ä–µ–ø–ª–µ–Ω–∏—é —Ä—É–±–ª—è", - –∑–∞—è–≤–∏–ª –±–∞–Ω–∫–∏—Ä. –ü–æ–¥–ø–∏—Å–∞—Ç—å—Å—è –Ω–∞ –†–ò–ê –ù–æ–≤–æ—Å—Ç–∏

[5] date=2025-09-02 channel(s)=–ë–ª—É–º–±–µ—Ä–≥
 document=–ö—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ –º–æ–∂–µ—Ç –¥–æ—Å—Ç–∏—á—å 100 —Ä—É–±–ª–µ–π —É–∂–µ –∫ —è–Ω–≤–∞—Ä—é, –∑–∞—è–≤–∏–ª –∏–Ω–≤–µ—Å—Ç–∏—Ü–∏–æ–Ω–Ω—ã–π —ç–∫—Å–ø–µ—Ä—Ç –ê–ª–µ–∫—Å–∞–Ω–¥—Ä –ë–∞—Ö—Ç–∏–Ω. @bIoombusiness

[6] date=2025-09-02 channel(s)=Forbes Russia
 document=–í —ç—Ç–æ–º –≥–æ–¥—É –¥–æ–ª–ª–∞—Ä –≤ —è–Ω–≤–∞—Ä–µ –Ω–∞ –ø–∏–∫–µ —Å—Ç–æ–∏–ª 113,7 —Ä—É–±–ª—è, –∞ –Ω–∞ –º–∏–Ω–∏–º—É–º–µ –≤ –∏—é–ª–µ ‚Äî 73,9 —Ä—É–±–ª—è. –î–æ—à–ª–æ –¥–æ —Ç–æ–≥–æ, —á—Ç–æ Bloomberg –Ω–∞–∑–≤–∞–ª —Ä–æ—Å—Å–∏–π—Å–∫—É—é –≤–∞–ª—é—Ç—É —Å–∞–º–æ–π –ª—É—á—à–µ–π –≤ 2025 –≥–æ–¥—É. –ï—â–µ –±—ã, —Ä—É–±–ª—å —É–∫—Ä–µ–ø–∏–ª—Å—è —Å –Ω–∞—á–∞–ª–∞ –≥–æ–¥–∞ –Ω–∞ 30%. –° —Ç–æ—á–∫–∏ –∑—Ä–µ–Ω–∏—è –æ–±—ã—á–Ω–æ–≥–æ —Ä–æ—Å—Å–∏—è–Ω–∏–Ω–∞, —Å –Ω–∞—Ü–∏–æ–Ω–∞–ª—å–Ω–æ–π –≤–∞–ª—é—Ç–æ–π –ø—Ä–æ–∏—Å—Ö–æ–¥–∏—Ç —á—Ç–æ-—Ç–æ –Ω–µ–æ–±—ã—á–Ω–æ–µ –∏ –Ω–µ—è—Å–Ω–æ, –∫–∞–∫–æ–≤ –µ–µ —Å–ø—Ä–∞–≤–µ–¥–ª–∏–≤—ã–π –∫—É—Ä—Å, —Å—á–∏—Ç–∞–µ—Ç –≥–µ–Ω–µ—Ä–∞–ª—å–Ω—ã–π –¥–∏—Ä–µ–∫—Ç–æ—Ä –£–ö ¬´–ê—Ä–∏–ö–∞–ø–∏—Ç–∞–ª¬ª –ê–ª–µ–∫—Å–µ–π –¢—Ä–µ—Ç—å—è–∫–æ–≤. –ö–∞–∫–∏–º –≤–æ–æ–±—â–µ –¥–æ–ª–∂–µ–Ω –±—ã—Ç—å —Å–ø—Ä–∞–≤–µ–¥–ª–∏–≤—ã–π –∫—É—Ä—Å —Ä—É–±–ª—è? –ß—Ç–æ–±—ã –æ—Ç–≤–µ—Ç–∏—Ç—å –Ω–∞ —ç—Ç–æ—Ç –≤–æ–ø—Ä–æ—Å, –Ω—É–∂–Ω–æ –ø–æ—Å–º–æ—Ç—Ä–µ—Ç—å, –∫–∞–∫–∏–º –±—ã–ª –∫—É—Ä—Å –≤ –ø–µ—Ä–∏–æ–¥ —Å—Ç–∞–±–∏–ª—å–Ω–æ—Å—Ç–∏, —Å–¥–µ–ª–∞—Ç—å –∫–æ—Ä—Ä–µ–∫—Ç–∏—Ä–æ–≤–∫—É —Å —É—á–µ—Ç–æ–º –∏–Ω—Ñ–ª—è—Ü–∏–∏ –∏ —Ç–µ—Ö —Ñ—É–Ω–¥–∞–º–µ–Ω—Ç–∞–ª—å–Ω—ã—Ö –∏–∑–º–µ–Ω–µ–Ω–∏–π, –∫–æ—Ç–æ—Ä—ã–µ –ø—Ä–æ–∏–∑–æ—à–ª–∏ –∑–∞ —ç—Ç–æ –≤—Ä–µ–º—è. –í –Ω–∞—á–∞–ª–µ 2015 –≥–æ–¥–∞ –∫—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ –≤ —Ä—É–±–ª—è—Ö –º–æ–∂–Ω–æ –Ω–∞–∑–≤–∞—Ç—å –¥–µ–π—Å—Ç–≤–∏—Ç–µ–ª—å–Ω–æ —Å–±–∞–ª–∞–Ω—Å–∏—Ä–æ–≤–∞–Ω–Ω—ã–º. –í —Ç–µ—á–µ–Ω–∏–µ –ø–æ—Å–ª–µ–¥—É—é—â–∏—Ö –ø—è—Ç–∏ –ª–µ—Ç (–≤–ø–ª–æ—Ç—å –¥–æ –Ω–∞—á–∞–ª–∞ –ø–∞–Ω–¥–µ–º–∏–∏ –∫–æ—Ä–æ–Ω–∞–≤–∏—Ä—É—Å–∞ –≤ 2020 –≥–æ–¥—É) –∫—É—Ä—Å –±–æ–ª—å—à—É—é —á–∞—Å—Ç—å –≤—Ä–µ–º–µ–Ω–∏ –æ—Å—Ç–∞–≤–∞–ª—Å—è –≤ —Ä–∞–º–∫–∞—Ö –¥–æ—Å—Ç–∞—Ç–æ—á–Ω–æ —É–∑–∫–æ–≥–æ –¥–∏–∞–ø–∞–∑–æ–Ω–∞ 60‚Äì70 —Ä—É–±–ª–µ–π –∑–∞ –¥–æ–ª–ª–∞—Ä –°–®–ê. –°–∫–æ—Ä—Ä–µ–∫—Ç–∏—Ä—É–µ–º —ç—Ç–æ—Ç –∫—É—Ä—Å –Ω–∞ –Ω–∞–∫–æ–ø–ª–µ–Ω–Ω—É—é —Ä—É–±–ª–µ–≤—É—é –∏–Ω—Ñ–ª—è—Ü–∏—é. –°–æ–≥–ª–∞—Å–Ω–æ –æ—Ñ–∏—Ü–∏–∞–ª—å–Ω—ã–º –¥

[7] date=2025-09-01 channel(s)=Forbes Russia
 document=–ö—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ —Å –º–∞—è –ø–æ –∞–≤–≥—É—Å—Ç –¥–µ—Ä–∂–∏—Ç—Å—è –Ω–∞ –∫–æ–º—Ñ–æ—Ä—Ç–Ω–æ–º –¥–ª—è –Ω–∞—Å–µ–ª–µ–Ω–∏—è —É—Ä–æ–≤–Ω–µ 78‚Äì80 —Ä—É–±–ª–µ–π. –°–∏—Ç—É–∞—Ü–∏—è —á–µ–º-—Ç–æ –ø–æ—Ö–æ–∂–∞ –Ω–∞ 2022 –≥–æ–¥, –∫–æ–≥–¥–∞ –ø–æ—Å–ª–µ –Ω–∞—á–∞–ª–∞ –†–æ—Å—Å–∏–µ–π ¬´—Å–ø–µ—Ü–æ–ø–µ—Ä–∞—Ü–∏–∏¬ª–∏ —à–æ–∫–∞ –Ω–∞ –≤–∞–ª—é—Ç–Ω–æ–º —Ä—ã–Ω–∫–µ –∫—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ —Å –∏—é–ª—è –ø–æ –¥–µ–∫–∞–±—Ä—å –Ω–∞—Ö–æ–¥–∏–ª—Å—è –Ω–∞ —É—Ä–æ–≤–Ω–µ 60 —Ä—É–±–ª–µ–π –∏–∑-–∑–∞ –¥–∏—Å–±–∞–ª–∞–Ω—Å–∞ –≤–æ –≤–Ω–µ—à–Ω–µ–π —Ç–æ—Ä–≥–æ–≤–ª–µ ‚Äî –∏–º–ø–æ—Ä—Ç –æ–±–≤–∞–ª–∏–ª—Å—è, —Å–ø—Ä–æ—Å–∞ –Ω–∞ –≤–∞–ª—é—Ç—É –Ω–µ –±—ã–ª–æ. –ù–∞ –ø—Ä–æ—Ç—è–∂–µ–Ω–∏–∏ –ø–æ—Å–ª–µ–¥–Ω–µ–≥–æ –≥–æ–¥–∞ —ç–∫—Å–ø–µ—Ä—Ç—ã –≥–æ–≤–æ—Ä–∏–ª–∏, —á—Ç–æ –∫—É—Ä—Å –æ–ø—Ä–µ–¥–µ–ª—è–µ—Ç—Å—è –ø–æ—Ç–æ–∫–∞–º–∏ —ç–∫—Å–ø–æ—Ä—Ç–∞ –∏ –∏–º–ø–æ—Ä—Ç–∞. –û–¥–Ω–∞–∫–æ –≤–æ II –∫–≤–∞—Ä—Ç–∞–ª–µ —ç—Ç–æ–≥–æ –≥–æ–¥–∞ —ç–∫—Å–ø–æ—Ä—Ç —Å–Ω–∏–∑–∏–ª—Å—è, –∏–º–ø–æ—Ä—Ç –ø–æ–¥—Ä–æ—Å, –Ω–æ —ç—Ç–æ –Ω–∏–∫–∞–∫ –Ω–µ –æ—Ç—Ä–∞–∑–∏–ª–æ—Å—å –Ω–∞ –≤–∞–ª—é—Ç–Ω–æ–º —Ä—ã–Ω–∫–µ. –í–æ–∑–º–æ–∂–Ω–∞—è –ø—Ä–∏—á–∏–Ω–∞ ‚Äî –±–æ–ª—å—à–µ –ø–æ–ª–æ–≤–∏–Ω—ã —Ä–∞—Å—á–µ—Ç–æ–≤ –ø–æ –≤–Ω–µ—à–Ω–µ–π —Ç–æ—Ä–≥–æ–≤–ª–µ –∏–¥–µ—Ç –≤ —Ä—É–±–ª—è—Ö, –∞ –∫—É—Ä—Å —Ç–µ—Ä—è–µ—Ç —Å–≤—è–∑—å —Å —Ä–µ–∞–ª—å–Ω–æ—Å—Ç—å—é. –ü–æ–¥—Ä–æ–±–Ω–µ–µ –æ —Ç–æ–º, –ø–æ—á–µ–º—É –∫—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ –Ω–µ –º–µ–Ω—è–µ—Ç—Å—è –≤–æ–ø—Ä–µ–∫–∏ —Ä–æ—Å—Ç—É –∏–º–ø–æ—Ä—Ç–∞ –∏ —á—Ç–æ –±—É–¥–µ—Ç —Å –Ω–∏–º –¥–∞–ª—å—à–µ ‚Äî —Ä–∞—Å—Å–∫–∞–∑—ã–≤–∞–µ–º –Ω–∞ —Å–∞–π—Ç–µ üì∏: –§–æ—Ç–æ Getty Images

[8] date=2025-08-31 channel(s)=–ë–ª—É–º–±–µ—Ä–≥
 document=–†–µ–∞–ª—å–Ω—ã–π –∫—É—Ä—Å —Ä—É–±–ª—è –¥–æ–ª–∂–µ–Ω —Å–æ—Å—Ç–∞–≤–ª—è—Ç—å 24,5 –∑–∞ –¥–æ–ª–ª–∞—Ä, —Å—á–∏—Ç–∞—é—Ç —ç–∫—Å–ø–µ—Ä—Ç—ã. –†–æ—Å—Å–∏–π—Å–∫–∞—è –≤–∞–ª—é—Ç–∞ –Ω–∞–∑–≤–∞–Ω–∞ –æ–¥–Ω–æ–π –∏–∑ —Å–∞–º—ã—Ö –Ω–µ–¥–æ–æ—Ü–µ–Ω—ë–Ω–Ω—ã—Ö –≤ –º–∏—Ä–µ. @bIoombusiness

[9] date=2025-08-31 channel(s)=–ë–∞–Ω–∫–∏, –¥–µ–Ω—å–≥–∏, –¥–≤–∞ –æ—Ñ—à–æ—Ä–∞
 document=–†—É–±–ª—å —Å–Ω–æ–≤–∞ –æ–∫–∞–∑–∞–ª—Å—è –æ–¥–Ω–æ–π –∏–∑ —Å–∞–º—ã—Ö –Ω–µ–¥–æ–æ—Ü–µ–Ω–µ–Ω–Ω—ã—Ö –≤–∞–ª—é—Ç –≤ –º–∏—Ä–µ. –ü–æ –∏–Ω–¥–µ–∫—Å—É –±–∏–≥–º–∞–∫–∞ –µ–≥–æ –∫—É—Ä—Å –¥–æ–ª–∂–µ–Ω —Å–æ—Å—Ç–∞–≤–ª—è—Ç—å 24,5 —Ä—É–±–ª—è –∑–∞ –¥–æ–ª–ª–∞—Ä ‚Äî –†–ò–ê –ù–æ–≤–æ—Å—Ç–∏. @bankollo

[10] date=2025-08-18 channel(s)=–°–∏–≥–Ω–∞–ª—ã –†–¶–ë
 document=üè¶ –¶–ë —É—Å—Ç–∞–Ω–æ–≤–∏–ª –æ—Ñ–∏—Ü–∏–∞–ª—å–Ω—ã–µ –∫—É—Ä—Å—ã –≤–∞–ª—é—Ç –Ω–∞ 19 –∞–≤–≥—É—Å—Ç–∞: USD: 80.4256 (+0.50%) EUR: 94.0884 (+0.40%) CNY: 11.1547 (+0.42%)

[11] date=2025-08-14 channel(s)=–°–∏–≥–Ω–∞–ª—ã –†–¶–ë
 document=üè¶ –¶–ë —É—Å—Ç–∞–Ω–æ–≤–∏–ª –æ—Ñ–∏—Ü–∏–∞–ª—å–Ω—ã–µ –∫—É—Ä—Å—ã –≤–∞–ª—é—Ç –Ω–∞ 15 –∞–≤–≥—É—Å—Ç–∞: USD: 79.7653 (+0.20%) EUR: 93.0588 (-0.35%) CNY: 11.0940 (+0.24%)

[12] date=2025-08-07 channel(s)=–°–∏–≥–Ω–∞–ª—ã –†–¶–ë
 document=üè¶ –¶–ë —É—Å—Ç–∞–Ω–æ–≤–∏–ª –æ—Ñ–∏—Ü–∏–∞–ª—å–Ω—ã–µ –∫—É—Ä—Å—ã –≤–∞–ª—é—Ç –Ω–∞ 8 –∞–≤–≥—É—Å—Ç–∞: USD: 79.3847 (-1.00%) EUR: 92.6636 (-0.36%) CNY: 11.0170 (-0.89%)

[13] date=2025-07-28 channel(s)=–ë–∞–Ω–∫–∏, –¥–µ–Ω—å–≥–∏, –¥–≤–∞ –æ—Ñ—à–æ—Ä–∞
 document=–°–≤–µ–∂–∏–π –∫—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ –æ—Ç –¶–ë –æ–∫–∞–∑–∞–ª—Å—è –º–µ–Ω—å—à–µ –≤–Ω–µ–±–∏—Ä–∂–µ–≤–æ–≥–æ –Ω–∞ –ø–æ–ª—Ç–æ—Ä–∞ —Ä—É–±–ª—è. @bankrollo

[14] date=2025-06-30 channel(s)=–†–∏–∞ –ù–æ–≤–æ—Å—Ç–∏
 document=–ò–Ω–¥–µ–∫—Å –¥–æ–ª–ª–∞—Ä–∞, –ø–æ–∫–∞–∑—ã–≤–∞—é—â–∏–π –∫—É—Ä—Å –∫ –≤–∞–ª—é—Ç–∞–º —à–µ—Å—Ç–∏ —Å—Ç—Ä–∞–Ω - —Ç–æ—Ä–≥–æ–≤—ã—Ö –ø–∞—Ä—Ç–Ω–µ—Ä–æ–≤ –°–®–ê, —Å–µ–≥–æ–¥–Ω—è –æ–ø—É—Å–∫–∞–ª—Å—è –Ω–∏–∂–µ 97 –ø—É–Ω–∫—Ç–æ–≤ –≤–ø–µ—Ä–≤—ã–µ —Å –º–∞—Ä—Ç–∞ 2022-–≥–æ –ü–æ–¥–ø–∏—Å–∞—Ç—å—Å—è –Ω–∞ –†–ò–ê –ù–æ–≤–æ—Å—Ç–∏ / –í—Å–µ –Ω–∞—à–∏ –∫–∞–Ω–∞–ª—ã

[15] date=2025-06-30 channel(s)=–ë–∞–Ω–∫–∏, –¥–µ–Ω—å–≥–∏, –¥–≤–∞ –æ—Ñ—à–æ—Ä–∞
 document=–ò–Ω–¥–µ–∫—Å –¥–æ–ª–ª–∞—Ä–∞ —Ä—É—Ö–Ω—É–ª –Ω–∏–∂–µ 97 –ø—É–Ω–∫—Ç–æ–≤ –≤–ø–µ—Ä–≤—ã–µ —Å –º–∞—Ä—Ç–∞ 2022 –≥–æ–¥–∞. –û–Ω –ø–æ–∫–∞–∑—ã–≤–∞–µ—Ç —Å—Ç–æ–∏–º–æ—Å—Ç—å –∞–º–µ—Ä–∏–∫–∞–Ω—Å–∫–æ–π –≤–∞–ª—é—Ç—ã –∫ –µ–≤—Ä–æ, –∏–µ–Ω–µ, —à–≤–µ–π—Ü–∞—Ä—Å–∫–æ–º—É —Ñ—Ä–∞–Ω–∫—É, –±—Ä–∏—Ç–∞–Ω—Å–∫–æ–º—É —Ñ—É–Ω—Ç—É, –∫–∞–Ω–∞–¥—Å–∫–æ–º—É –¥–æ–ª–ª–∞—Ä—É –∏ —à–≤–µ–¥—Å–∫–æ–π –∫—Ä–æ–Ω–µ. @bankrollo

[16] date=2025-06-20 channel(s)=–†–∏–∞ –ù–æ–≤–æ—Å—Ç–∏
 document=–†–∞–≤–Ω–æ–≤–µ—Å–Ω—ã–º —Å–µ–π—á–∞—Å —è–≤–ª—è–µ—Ç—Å—è –≤–∞–ª—é—Ç–Ω—ã–π –∫—É—Ä—Å –≤—ã—à–µ 100 —Ä—É–±–ª–µ–π –∑–∞ –¥–æ–ª–ª–∞—Ä, –∑–∞—è–≤–∏–ª –≥–ª–∞–≤–∞ –°–±–µ—Ä–±–∞–Ω–∫–∞ –ì–µ—Ä–º–∞–Ω –ì—Ä–µ—Ñ. –û–Ω –¥–æ–±–∞–≤–∏–ª, —á—Ç–æ –Ω—ã–Ω–µ—à–Ω–∏–π ‚Äì –≥–æ—Ä–∞–∑–¥–æ –º–µ–Ω—å—à–∏–π ‚Äì –∫—É—Ä—Å –æ—á–µ–Ω—å –Ω–µ–≥–∞—Ç–∏–≤–Ω–æ –≤–ª–∏—è–µ—Ç –Ω–∞ –≤—Å–µ —ç–∫—Å–ø–æ—Ä—Ç–Ω—ã–µ –æ—Ç—Ä–∞—Å–ª–∏ —Ä–æ—Å—Å–∏–π—Å–∫–æ–π —ç–∫–æ–Ω–æ–º–∏–∫–∏ –∏ –±—é–¥–∂–µ—Ç. –û—Ñ–∏—Ü–∏–∞–ª—å–Ω—ã–π –∫—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ –Ω–∞ –ø—è—Ç–Ω–∏—Ü—É —Å–æ—Å—Ç–∞–≤–ª—è–µ—Ç 78,5 —Ä—É–±–ª—è.

[17] date=2025-06-19 channel(s)=–ë–∞–Ω–∫–∏, –¥–µ–Ω—å–≥–∏, –¥–≤–∞ –æ—Ñ—à–æ—Ä–∞
 document=–û–ø—Ç–∏–º–∞–ª—å–Ω—ã–π –∫—É—Ä—Å –≤ —Ç–µ–∫—É—â–µ–π —Å–∏—Ç—É–∞—Ü–∏–∏ –≤ —Ä–∞–π–æ–Ω–µ 100 —Ä—É–±–ª–µ–π –∑–∞ –¥–æ–ª–ª–∞—Ä, –∑–∞—è–≤–∏–ª –ø–µ—Ä–≤—ã–π –≤–∏—Ü–µ-–ø—Ä–µ–º—å–µ—Ä –ú–∞–Ω—Ç—É—Ä–æ–≤. –°–µ–≥–æ–¥–Ω—è –ø—Ä–∏ –∫—Ä–µ–ø–∫–æ–º —Ä—É–±–ª–µ —ç–∫—Å–ø–æ—Ä—Ç –¥–ª—è –±–∞–∑–æ–≤—ã—Ö –æ—Ç—Ä–∞—Å–ª–µ–π –ø—Ä–æ–º—ã—à–ª–µ–Ω–Ω–æ—Å—Ç–∏ —Å—Ç–∞–Ω–æ–≤–∏—Ç—Å—è –º–∞–ª–æ—ç—Ñ—Ñ–µ–∫—Ç–∏–≤–Ω—ã–º, –æ—Ç–º–µ—Ç–∏–ª –æ–Ω. @bankrollo

[18] date=2025-06-12 channel(s)=–†–∏–∞ –ù–æ–≤–æ—Å—Ç–∏
 document=–ò–Ω–¥–µ–∫—Å –¥–æ–ª–ª–∞—Ä–∞ (–∫—É—Ä—Å –∫ –∫–æ—Ä–∑–∏–Ω–µ –≤–∞–ª—é—Ç —à–µ—Å—Ç–∏ —Å—Ç—Ä–∞–Ω - —Ç–æ—Ä–≥–æ–≤—ã—Ö –ø–∞—Ä—Ç–Ω–µ—Ä–æ–≤ –°–®–ê) –æ–ø—É—Å—Ç–∏–ª—Å—è –¥–æ –º–∏–Ω–∏–º—É–º–∞ —Å –≤–µ—Å–Ω—ã 2022-–≥–æ. –°–µ–π—á–∞—Å –æ–Ω –Ω–∞—Ö–æ–¥–∏—Ç—Å—è –Ω–∞ —É—Ä–æ–≤–Ω–µ 97,8 –ø—É–Ω–∫—Ç–∞.

[19] date=2025-05-29 channel(s)=–†–∏–∞ –ù–æ–≤–æ—Å—Ç–∏
 document=–û—Ñ–∏—Ü–∏–∞–ª—å–Ω—ã–π –∫—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ –æ—Ç –¶–ë –æ–ø—É—Å—Ç–∏–ª—Å—è –Ω–∏–∂–µ 79 —Ä—É–±–ª–µ–π. –ï–≥–æ –∑–Ω–∞—á–µ–Ω–∏–µ –Ω–∞ —á–µ—Ç–≤–µ—Ä–≥ - 78,5 —Ä—É–±–ª—è.

[20] date=2025-05-21 channel(s)=Forbes Russia
 document=–ë–∞–Ω–∫ –†–æ—Å—Å–∏–∏ —É—Å—Ç–∞–Ω–æ–≤–∏–ª –Ω–∞ 22 –º–∞—è –æ—Ñ–∏—Ü–∏–∞–ª—å–Ω—ã–π –∫—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ –Ω–∞ —É—Ä–æ–≤–Ω–µ 79,75 —Ä—É–±–ª—è. –ù–∏–∂–µ 80 —Ä—É–±–ª–µ–π –∫—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ –æ–ø—É—Å—Ç–∏–ª—Å—è –≤–ø–µ—Ä–≤—ã–µ —Å –º–∞—è 2023 –≥–æ–¥–∞. –ü—Ä–∏ —ç—Ç–æ–º –µ–≤—Ä–æ –ø–æ–¥–æ—Ä–æ–∂–∞–ª –¥–æ 91,29 —Ä—É–±–ª—è, –∞ —é–∞–Ω—å –ø–æ—á—Ç–∏ –Ω–µ –∏–∑–º–µ–Ω–∏–ª—Å—è

[21] date=2025-04-21 channel(s)=–°–∏–≥–Ω–∞–ª—ã –†–¶–ë
 document=–¶–ë —É—Å—Ç–∞–Ω–æ–≤–∏–ª –∫—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ –Ω–∏–∂–µ 81 —Ä—É–±–ª—è –≤–ø–µ—Ä–≤—ã–µ —Å –ª–µ—Ç–∞ 2023 –≥–æ–¥–∞ –¶–ë –†–§ —Å 22 –∞–ø—Ä–µ–ª—è —Å–Ω–∏–∑–∏–ª –æ—Ñ–∏—Ü–∏–∞–ª—å–Ω—ã–π –∫—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ –°–®–ê –Ω–∞ 37.74 –∫–æ–ø., –¥–æ 80.7597 —Ä—É–±.

[22] date=2025-04-21 channel(s)=–ë–ª—É–º–±–µ—Ä–≥
 document=–ö—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ –æ–ø—É—Å—Ç–∏–ª—Å—è –¥–æ 79 —Ä—É–±–ª–µ–π. üîµ Bloomberg

[23] date=2025-04-14 channel(s)=–ë–∞–Ω–∫–∏, –¥–µ–Ω—å–≥–∏, –¥–≤–∞ –æ—Ñ—à–æ—Ä–∞; –ë–ª—É–º–±–µ—Ä–≥
 document=–¶–µ–Ω—Ç—Ä–æ–±–∞–Ω–∫ –ø–æ–Ω–∏–∑–∏–ª –∫—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ –±–æ–ª–µ–µ —á–µ–º –Ω–∞ 1 —Ä—É–±–ª—å, –µ–≤—Ä–æ ‚Äî –±–æ–ª–µ–µ —á–µ–º –Ω–∞ 2. @bankrollo

[24] date=2025-03-18 channel(s)=–†–∞–Ω—å—à–µ –≤—Å–µ—Ö. –ù—É –ø–æ—á—Ç–∏
 document=–í–Ω–µ–±–∏—Ä–∂–µ–≤–æ–π –∫—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ –æ–ø—É—Å—Ç–∏–ª—Å—è –Ω–∏–∂–µ 81 —Ä—É–±–ª—è

[25] date=2025-03-17 channel(s)=–†–∞–Ω—å—à–µ –≤—Å–µ—Ö. –ù—É –ø–æ—á—Ç–∏
 document=–†–∞—Å—á–µ—Ç–Ω—ã–π –∫—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ - –æ—Ä–∏–µ–Ω—Ç–∏—Ä –¥–ª—è –≤–Ω–µ–±–∏—Ä–∂–µ–≤–æ–≥–æ —Ä—ã–Ω–∫–∞ - —É–ø–∞–ª –Ω–∏–∂–µ 83 —Ä—É–±–ª–µ–π, –≤–ø–µ—Ä–≤—ã–µ —Å –ø—Ä–æ—à–ª–æ–≥–æ –ª–µ—Ç–∞, —Å–ª–µ–¥—É–µ—Ç –∏–∑ –¥–∞–Ω–Ω—ã—Ö —Ç–æ—Ä–≥–æ–≤. –ö 17.02 –º—Å–∫ –∫—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ –ø–∞–¥–∞–µ—Ç –Ω–∞ 2,52 —Ä—É–±–ª—è –æ—Ç–Ω–æ—Å–∏—Ç–µ–ª—å–Ω–æ –ø—Ä–µ–¥—ã–¥—É—â–µ–≥–æ –∑–∞–∫—Ä—ã—Ç–∏—è - –¥–æ 82,87 —Ä—É–±–ª—è, –º–∏–Ω–∏–º—É–º–∞ —Å 27 –∞–≤–≥—É—Å—Ç–∞ –ø—Ä–æ—à–ª–æ–≥–æ –≥–æ–¥–∞.

[26] date=2025-03-11 channel(s)=–ë–∞–Ω–∫–∏, –¥–µ–Ω—å–≥–∏, –¥–≤–∞ –æ—Ñ—à–æ—Ä–∞
 document=–¶–ë –æ–ø—É—Å—Ç–∏–ª –∫—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ –¥–æ 86 —Ä—É–±–ª–µ–π. –ï–≤—Ä–æ —É–ø–∞–ª –¥–æ 93,6 —Ä—É–±–ª—è. @bankrollo

[27] date=2024-12-24 channel(s)=–ë–∞–Ω–∫–∏, –¥–µ–Ω—å–≥–∏, –¥–≤–∞ –æ—Ñ—à–æ—Ä–∞
 document=–û—Ñ–∏—Ü–∏–∞–ª—å–Ω—ã–π –∫—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ —É–ø–∞–ª –Ω–∏–∂–µ 100. @bankrollo

[28] date=2024-12-06 channel(s)=–ë–∞–Ω–∫–∏, –¥–µ–Ω—å–≥–∏, –¥–≤–∞ –æ—Ñ—à–æ—Ä–∞
 document=–¶–ë –ø–æ–Ω–∏–∑–∏–ª –∫—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ –Ω–∏–∂–µ 100, –µ–≤—Ä–æ ‚Äî –Ω–∏–∂–µ 107. @bankrollo

[29] date=2024-11-27 channel(s)=–ë–ª—É–º–±–µ—Ä–≥
 document=–ö—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ –ø–µ—Ä–µ–≤–∞–ª–∏–ª –∑–∞ 111 —Ä—É–±–ª–µ–π. UPD: –ö—É—Ä—Å —É–∂–µ 114. üîµ Bloomberg

[30] date=2024-11-14 channel(s)=–≠–∫–æ–Ω–æ–º–∏–∫–∞ 
 document=–ö—É—Ä—Å –¥–æ–ª–ª–∞—Ä–∞ —Ç–µ–ø–µ—Ä—å —Å—Ç–æ–∏—Ç –≤—ã—à–µ 100 —Ä—É–±–ª–µ–π. –≠—Ç–æ —Å–ª–µ–¥—É–µ—Ç –∏–∑ –¥–∞–Ω–Ω—ã—Ö –ú–æ—Å–±–∏—Ä–∂–∏. ¬´–°—Ç–æ–∏–º–æ—Å—Ç—å –¥–æ–ª–ª–∞—Ä–∞ ‚Äî 100,2428 —Ä—É–±–ª—è¬ª, ‚Äî —Å–æ–æ–±—â–∞–µ—Ç —Ä–µ–≥—É–ª—è—Ç–æ—Ä. –¢–∞–∫–æ–≥–æ —Å–∫–∞—á–∫–∞ –Ω–µ –±—ã–ª–æ –Ω–µ—Å–∫–æ–ª—å–∫–æ –º–µ—Å—è—Ü–µ–≤. –°–µ–π—á–∞—Å –¶–µ–Ω—Ç—Ä–æ–±–∞–Ω–∫ —Å–∞–º —Ä–µ–≥—É–ª–∏—Ä—É–µ—Ç —Ü–µ–Ω—É –¥–æ–ª–ª–∞—Ä–∞ –ø–æ –æ—Ç–Ω–æ—à–µ–Ω–∏—é –∫ —Ä—É–±–ª—é. –≠—Ç–æ —Å—Ç–∞–ª–æ —Å–ª–µ–¥—Å—Ç–≤–∏–µ–º —Å–∞–Ω–∫—Ü–∏–π –°–®–ê. ùîº‚ÑÇùïÜ‚ÑïùïÜùïÑùïÄùïÇùî∏

In [47]:
# Candidate counts after each stage: raw retrieval, de-duplication, judge filtering.
# NOTE(review): this cell's execution count (47) is lower than the cells above it,
# so the displayed numbers may come from an earlier run — re-run in order to confirm.
len(out["candidates"]), len(out["candidates_dedup"]), len(out["candidates_filtered"])

(150, 124, 121)