In [None]:
import os
import sys
import re
import time
import glob
import shutil
import inspect
import logging
import joblib
from hashlib import sha1
from difflib import SequenceMatcher
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats, sparse
from scipy.stats import chi2_contingency, pearsonr, entropy

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.feature_selection import mutual_info_classif, chi2
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    roc_auc_score, f1_score, top_k_accuracy_score
)

from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import hdbscan
import optuna

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, TensorDataset

from transformers import (
    AutoModel, AutoTokenizer, AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup, TrainingArguments, Trainer,
    EarlyStoppingCallback
)

from google.colab import drive
from tqdm.notebook import tqdm

# Apply seaborn's default theme to every figure drawn later in the notebook.
sns.set()
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

## 1. Data loading and preparation

Apply quotas and sampling

In [None]:
def make_quota_power_with_ratio(
    df, class_col='department_name', target_total=120_000, alpha=0.45, ratio_cap=8.0, k_folds=5, min_per_fold=250, keep_frac=1.0, seed=42):
    """Compute how many rows to keep per class using power-law weights with floor and cap.

    Each class gets a share of ``target_total`` proportional to
    ``count ** alpha``. Shares are then clamped per class between

    * a floor: ``min(count, k_folds * min_per_fold)``, and
    * a cap:   ``min(count, ratio_cap * m)``, where ``m`` is the median floor
      limited to the range ``[1, smallest class size]``.

    ``target_total`` itself is clamped to what floors and caps allow. Any
    difference between the clamped quotas and the target is handed out
    (or taken back) greedily, class by class, ordered by fractional remainder.

    Parameters
    ----------
    df : DataFrame containing ``class_col``.
    class_col : name of the label column.
    target_total : requested total number of rows.
    alpha : exponent applied to class counts (smaller flattens the distribution).
    ratio_cap : multiplier of ``m`` that bounds the per-class cap.
    k_folds, min_per_fold : together define the per-class floor.
    keep_frac, seed : accepted for backward compatibility; not used by the
        computation, which is fully deterministic.

    Returns
    -------
    (quota, info) : ``quota`` is an int Series indexed by class (in
        ``value_counts`` order); ``info`` is a dict with the achieved total,
        min/max quota, their ratio, ``m_ref``, ``max_sum`` and ``min_sum``.

    Raises
    ------
    ValueError : if ``class_col`` has no non-null values.
    """
    vc = df[class_col].value_counts()
    if vc.empty:
        raise ValueError(f"no non-null values in '{class_col}' to build quotas from")
    cls = vc.index.to_list()
    avail = vc.values.astype(int)

    # Floor per class: enough rows for every fold, but never more than exist.
    min_per_class = np.minimum(avail, k_folds * min_per_fold).astype(int)

    # Reference size for the cap; never below 1 even if a class count is 0
    # (possible with unused categorical levels).
    m = max(1, int(np.clip(np.median(min_per_class), 1, int(avail.min()))))

    max_per_class = np.minimum(avail, int(ratio_cap * m))
    max_sum = int(max_per_class.sum())

    # A floor can never exceed the cap of the same class.
    min_clip = np.minimum(min_per_class, max_per_class)
    min_sum = int(min_clip.sum())

    # Keep the target inside the feasible range [min_sum, max_sum].
    target_total = max(min(target_total, max_sum), min_sum)

    w = np.power(np.maximum(avail, 1), alpha)
    w = w / w.sum()

    raw = w * target_total
    frac = raw - np.floor(raw)
    quota = np.clip(np.floor(raw).astype(int), min_clip, max_per_class)

    deficit = int(target_total - quota.sum())
    if deficit > 0:
        # Too few rows allocated: top up, largest fractional remainder first.
        for i in np.argsort(-frac):
            if deficit <= 0:
                break
            add = min(deficit, int(max_per_class[i] - quota[i]))
            if add <= 0:
                continue
            quota[i] += add
            deficit -= add
    elif deficit < 0:
        # Too many rows allocated: trim, smallest fractional remainder first.
        surplus = -deficit
        for i in np.argsort(frac):
            if surplus <= 0:
                break
            drop = min(surplus, int(quota[i] - min_clip[i]))
            if drop <= 0:
                continue
            quota[i] -= drop
            surplus -= drop

    quota = pd.Series(quota, index=cls)

    final_min = int(quota.min())
    final_max = int(quota.max())

    info = dict(
        target_total=int(quota.sum()),
        min_class=final_min,
        max_class=final_max,
        ratio=float(final_max / max(1, final_min)),
        m_ref=int(m),
        max_sum=max_sum,
        min_sum=min_sum
    )
    return quota, info

# Build per-class quotas for a sample of about 55k rows.
# NOTE(review): `df` is not created in any cell visible above this one —
# presumably it is loaded in an earlier or omitted cell; confirm.
quota, info = make_quota_power_with_ratio(
    df, class_col='department_name',
    target_total=55000,
    alpha=0.5,
    ratio_cap=15,
    k_folds=5,
    min_per_fold=200
)

# Show the quotas from largest to smallest class, then the summary dict.
print(quota.sort_values(ascending=False))
print(info)

In [None]:
def strat_sample_by_company(df, class_col='department_name', group_col='company_id',
                            quota=None, seed=42):
    """Down-sample each class to its quota, spreading the rows across companies.

    For a class with more rows than its quota, the quota is split between the
    companies in proportion to their row counts (floor of the share, leftover
    rows handed one by one to the largest fractional remainders). Rows inside
    a company are then drawn with ``DataFrame.sample(random_state=seed)``.
    Classes at or below their quota are kept whole.

    Rows whose ``group_col`` is missing form one group of their own, so they
    take part in the sampling instead of being silently left out.

    Parameters
    ----------
    df : DataFrame with ``class_col`` and ``group_col``.
    class_col, group_col : label column and grouping column.
    quota : mapping or Series ``class -> number of rows`` (required).
    seed : random state passed to every per-company draw.

    Returns
    -------
    DataFrame with a fresh 0..n-1 index; classes absent from ``quota`` are dropped.

    Raises
    ------
    ValueError : if ``quota`` is None.
    """
    if quota is None:
        raise ValueError("quota is required: pass a mapping or Series of class -> row count")

    parts = []
    for cls, q in quota.items():
        sub = df[df[class_col] == cls]
        if len(sub) <= q:
            parts.append(sub)
            continue

        groups = sub[group_col]
        # dropna=False keeps rows with a missing company id as their own group.
        gvc = groups.value_counts(dropna=False)
        sizes = gvc.to_numpy()

        raw = (sizes / sizes.sum()) * q
        base = np.floor(raw).astype(int)
        rest = int(q - base.sum())
        if rest > 0:
            # Leftover rows go to the groups with the largest fractional share.
            top = np.argsort(-(raw - base), kind="stable")[:rest]
            base[top] += 1

        for comp_id, k in zip(gvc.index, base):
            k = int(k)
            if k <= 0:
                continue
            # NaN never compares equal to itself, so match missing ids via isna().
            in_group = groups.isna() if pd.isna(comp_id) else (groups == comp_id)
            block = sub[in_group]
            if len(block) <= k:
                parts.append(block)
            else:
                parts.append(block.sample(k, random_state=seed))

    if not parts:
        # Empty quota: return an empty frame with the input's columns.
        return df.iloc[0:0].reset_index(drop=True)
    return pd.concat(parts, axis=0).reset_index(drop=True)

# Draw the quota-limited sample, spreading each class over its companies.
# NOTE(review): the later visible cells call process_data(df), not
# process_data(df_quota) — confirm whether this sample is meant to feed
# the preprocessing step.
df_quota = strat_sample_by_company(df, class_col='department_name',
                                  group_col='company_id', quota=quota, seed=42)
print("Размер после квот:", len(df_quota))
# Ten largest classes after sampling.
print(df_quota['department_name'].value_counts().sort_values(ascending=False).head(10))

## 2. Preprocessing and Feature Engineering

### 2.1 Feature extraction and text preprocessing

In [None]:
def analyze_department_distribution(df, target_col='department_name'):
    """Print and plot how rows are distributed over the target classes.

    Prints the ten most frequent classes with their share of all rows, then
    draws one figure with two panels: a bar chart of the 15 largest classes
    and a pie chart of the 8 largest classes plus an 'Others' slice for the
    rest (only when the rest is non-empty).

    Returns the full ``value_counts`` Series of ``target_col``.
    """
    dept_counts = df[target_col].value_counts()

    total_rows = len(df)
    for name, n_rows in dept_counts.iloc[:10].items():
        share = n_rows / total_rows * 100
        print(f"  {name}: {n_rows} ({share:.1f}%)")

    # Pie input: the eight largest classes, remainder folded into 'Others'.
    largest = dept_counts.head(8)
    remainder = dept_counts.iloc[8:].sum()
    if remainder > 0:
        plot_data = pd.concat([largest, pd.Series({'Others': remainder})])
    else:
        plot_data = largest

    plt.figure(figsize=(12, 6))

    # Left panel: bar chart of the 15 largest classes.
    plt.subplot(1, 2, 1)
    dept_counts.head(15).plot(kind='bar')
    plt.title('Top 15 Departments by Count')
    plt.xticks(rotation=45)
    plt.ylabel('Count')

    # Right panel: share of each class.
    plt.subplot(1, 2, 2)
    plt.pie(plot_data.values, labels=plot_data.index, autopct='%1.1f%%')
    plt.title('Department Distribution')

    plt.tight_layout()
    plt.show()

    return dept_counts

dept_counts = analyze_department_distribution(df, target_col='department_name')

In [None]:
_whitespace = re.compile(r"\s+")
_boiler_patterns = [
    r"^с уважением.*",
    r"^best regards.*",
    r"^kind regards.*",
    r"^отправлено.*",
    r"^sent from my.*",
    r"^-{2,}.*",
    r"^_{2,}.*",
    r"^\*{2,}.*",
]

def _norm_ws(s: str) -> str:
    return _whitespace.sub(" ", s.strip())

def _strip_boilerplate(text: str) -> str:
    lines = [l.strip() for l in text.splitlines() if l.strip() != ""]
    cleaned = []
    for line in lines:
        bad = False
        for pat in _boiler_patterns:
            if re.match(pat, line, flags=re.I):
                bad = True
                break
        if not bad:
            cleaned.append(line)
    return " ".join(cleaned).strip()

def merge_subject_details(subject, details, sep="[SEP]", sim_threshold=0.92, max_chars=700):
    s = "" if pd.isna(subject) else str(subject)
    d = "" if pd.isna(details) else str(details)

    s = _norm_ws(s)
    d = _norm_ws(d)

    if not d:
        return s, 0, 0, len(s)

    sim = SequenceMatcher(None, s, d).ratio()
    if sim >= sim_threshold or d.startswith(s) or s.startswith(d):
        return s, len(s), len(d), len(s)

    d2 = d.replace(s, " ").strip()
    d2 = _norm_ws(d2)

    d2 = _strip_boilerplate(d2)

    if not d2:
        return s, len(s), len(d), len(s)

    merged = f"{s} {sep} {d2}".strip()

    if len(merged) > max_chars:
        merged = merged[:max_chars].rstrip()

    return merged, len(s), len(d), len(merged)


In [None]:
def process_data(df):
    """Clean the raw ticket frame and build the merged text plus basic meta-features.

    Steps:
    1. keep rows with a non-null ``subject`` and ``department_name``;
    2. replace missing ``company_id`` with ``'__unknown__'``;
    3. ``is_subtask`` = 1 when ``parent_uuid`` is set;
    4. merge ``subject`` and ``details`` via ``merge_subject_details`` into
       ``text`` / ``len_subject`` / ``len_details`` / ``len_text``;
    5. drop rows whose merged text is empty and drop the raw text columns;
    6. add ``has_details`` and ``added_details`` flags.

    The input frame is not modified. Prints the per-column null counts and
    returns the processed frame.
    """
    # Explicit copy after filtering so the assignments below never write
    # into a view of the caller's frame.
    keep_rows = df['subject'].notnull() & df['department_name'].notnull()
    df = df[keep_rows].copy()

    df['company_id'] = df['company_id'].fillna('__unknown__')
    df['is_subtask'] = df['parent_uuid'].notnull().astype(int)

    # Building the frame from a list keeps the four columns defined even
    # when no rows are left (row-wise apply with expand yields no columns then).
    merged = pd.DataFrame(
        [merge_subject_details(subj, det) for subj, det in zip(df['subject'], df['details'])],
        columns=['text', 'len_subject', 'len_details', 'len_text'],
        index=df.index,
    )

    df = pd.concat([df, merged], axis=1)
    df = df[df['text'].str.strip() != ""]

    df = df.drop(columns=['parent_uuid', 'subject', 'details'])

    df['has_details'] = (df['len_details'] > 0).astype(int)
    # Details count as "added" only when they made the text longer than the subject.
    df['added_details'] = ((df['len_text'] > df['len_subject']) & (df['has_details'] == 1)).astype(int)

    print("nulls:\n", df.isnull().sum())
    return df

df_processed = process_data(df)
df_processed.head()

Text cleaning

In [None]:
def normalize_text_preserve_info(s: str):
    """Lower-case and tidy a text while keeping URL / IP presence as placeholders.

    Non-string input becomes ``""`` when it is NaN/None, otherwise ``str(s)``.
    Then, in order: trim and lower-case, turn non-breaking spaces into plain
    spaces, delete zero-width characters, replace URLs with ``<url>`` and
    dotted IPv4-looking numbers with ``<ip>``, collapse whitespace, trim.
    """
    if isinstance(s, str):
        raw = s
    elif pd.isna(s):
        raw = ""
    else:
        raw = str(s)

    out = raw.strip().lower().replace("\u00a0", " ")

    # (pattern, replacement, flags) applied one after another.
    rewrites = (
        (r'[\u200b\u200c\u200d\u2060]', '', 0),
        (r'(?:https?://|ftp://|www\.)\S+', ' <url> ', re.I),
        (r'\b(?:\d{1,3}\.){3}\d{1,3}\b', ' <ip> ', 0),
        (r'\s+', ' ', 0),
    )
    for pattern, replacement, flags in rewrites:
        out = re.sub(pattern, replacement, out, flags=flags)

    return out.strip()

# Store the normalised text in a new column; the merged `text` column is kept as is.
df_processed['text_norm'] = df_processed['text'].apply(normalize_text_preserve_info)

### 2.2 Noise filtering (duplicates, HDBSCAN)

Remove exact duplicates

In [None]:
# Fingerprint every normalised text with SHA-1 so identical texts share one `uid`.
df_processed["uid"] = df_processed["text_norm"].map(lambda s: sha1(s.encode("utf-8")).hexdigest())
# Sort so equal uids sit next to each other (ties ordered by company_id) and
# rebuild a 0..n-1 index.
# NOTE(review): if company_id mixes numbers with the '__unknown__' string,
# sorting on it may raise a TypeError — confirm the column's dtype.
df_processed = df_processed.sort_values(["uid", "company_id"]).reset_index(drop=True)
# Every occurrence of a uid after the first one is an exact duplicate.
df_processed["is_exact_dup"] = df_processed.duplicated("uid", keep="first")

# `global_drop_reason` stays None for rows that are kept; dropped rows get a reason tag.
df_processed["global_drop_reason"] = None
df_processed.loc[df_processed["is_exact_dup"], "global_drop_reason"] = "exact_dup"

Remove near-duplicates

In [None]:
WS_RE = re.compile(r'\s+')
def _prep_for_shingles(s: str) -> str:
    s = s.lower().strip()
    s = s.replace('\u00a0',' ')
    s = WS_RE.sub(' ', s)
    return s

def _char_ngrams(s: str, lo=3, hi=5):
    for n in range(lo, hi+1):
        for i in range(0, max(0, len(s)-n+1)):
            yield s[i:i+n]

def _hash64(x: str, seed: int = 0) -> int:
    return (hash((seed, x)) & 0xffffffffffffffff)

def simhash64(text: str, lo=3, hi=5, n_feat=128):
    """Compute a 64-bit SimHash fingerprint of ``text`` from character n-grams.

    Only the first ``n_feat`` distinct n-grams, in generation order, take
    part; each carries weight 1. Returns 0 when the text yields no n-gram.

    NOTE(review): n-grams arrive shortest length first and left to right, so
    for long texts the ``n_feat`` limit is reached within the beginning of
    the text — confirm that fingerprinting the prefix only is intended.
    """
    prepared = _prep_for_shingles(text)

    grams = []
    known = set()
    for gram in _char_ngrams(prepared, lo, hi):
        if gram not in known:
            known.add(gram)
            grams.append(gram)
            if len(grams) >= n_feat:
                break

    if not grams:
        return 0

    # Per bit position: +1 for each n-gram hash with the bit set, -1 otherwise.
    tally = [0] * 64
    for gram in grams:
        bits = _hash64(gram)
        for pos in range(64):
            tally[pos] += 1 if (bits >> pos) & 1 else -1

    # A bit of the fingerprint is set when its tally is non-negative.
    return sum(1 << pos for pos in range(64) if tally[pos] >= 0)

def hamming64(a: int, b: int) -> int:
    """Number of bit positions in which ``a`` and ``b`` differ."""
    differing = a ^ b
    return bin(differing).count("1")

def drop_near_dups_simhash_keep_first(texts, bands=8, ham_thr=3, n_feat=256):
    """Flag near-duplicate texts, keeping the first member of each group.

    Every text gets a 64-bit SimHash. The hash is cut into ``bands`` slices
    of ``64 // bands`` bits; a text is compared only with earlier kept texts
    that share at least one slice value. It is dropped when any such
    candidate lies within Hamming distance ``ham_thr``.

    Returns a boolean numpy array, True for texts to keep.
    """
    n = len(texts)
    keep = np.ones(n, dtype=bool)

    sims = [simhash64(t, lo=3, hi=5, n_feat=n_feat) for t in texts]

    band_size = 64 // bands
    band_mask = (1 << band_size) - 1
    buckets = [defaultdict(list) for _ in range(bands)]

    for i, h in enumerate(sims):
        band_keys = [(h >> (b * band_size)) & band_mask for b in range(bands)]

        # Stops at the first close-enough kept candidate, band by band.
        is_dup = any(
            keep[j] and hamming64(h, sims[j]) <= ham_thr
            for b, key in enumerate(band_keys)
            for j in buckets[b][key]
        )

        if is_dup:
            keep[i] = False
        else:
            # Only kept texts become candidates for later ones.
            for b, key in enumerate(band_keys):
                buckets[b][key].append(i)

    return keep


# Near-duplicate search runs only on rows that are still alive (no drop reason
# yet and not an exact duplicate).
mask_candidates = df_processed["global_drop_reason"].isna() & (~df_processed["is_exact_dup"])
texts = df_processed.loc[mask_candidates, "text_norm"].tolist()

# keep_local[i] refers to texts[i], i.e. to the i-th candidate row.
keep_local = drop_near_dups_simhash_keep_first(texts, bands=8, ham_thr=3, n_feat=256)

# Map the positions among the candidates back to df_processed index labels.
ix = df_processed.index[mask_candidates]
df_processed.loc[ix[~keep_local], "global_drop_reason"] = "near_dup"

In [None]:
# Rows without a drop reason survive; df_clean keeps df_processed's index labels.
df_clean = df_processed[df_processed["global_drop_reason"].isna()].copy()
print("Размер до:", len(df_processed))
print("Размер после:", len(df_clean))
# How many rows were dropped for each reason.
print(df_processed["global_drop_reason"].value_counts())

HDBSCAN

In [None]:
# NOTE: `device` was already set in the imports cell; this repeats the same selection.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Используем: {device}")

# Checkpoint used to embed the texts for clustering.
model_name = "Zamza/XLM-roberta-large-ftit-emb-lr01"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Switch to evaluation mode and move the weights to the selected device.
model.eval()
model.to(device)

In [None]:
def get_embeddings_batched(texts: list[str], batch_size: int = 64):
    """Embed ``texts`` with the module-level ``model`` / ``tokenizer`` in batches.

    Each batch is tokenised (truncated to 512 tokens, padded to the longest
    item in the batch), run through the model without gradients, and the
    hidden state at sequence position 0 of every item is collected.

    Returns one 2-D numpy array with a row per text, in input order.
    """
    chunks = []

    for start in tqdm(range(0, len(texts), batch_size)):
        batch = tokenizer(
            texts[start:start + batch_size],
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512
        )
        # Inputs must live on the same device as the model.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        with torch.no_grad():
            hidden = model(**batch).last_hidden_state

        chunks.append(hidden[:, 0, :].cpu().numpy())

    return np.vstack(chunks)


texts_list = df_clean['text_norm'].tolist()
embeddings_1024d = get_embeddings_batched(texts_list)

In [None]:
# Number of principal components kept before clustering.
N_COMPONENTS = 64

# Fit PCA on the text embeddings and project them in one step.
# NOTE(review): the variable name suggests 1024-dimensional input — confirm
# against the encoder's hidden size.
pca = PCA(n_components=N_COMPONENTS, random_state=42)
embeddings_64d = pca.fit_transform(embeddings_1024d)

In [None]:
# Density-based clustering of the PCA-reduced embeddings.
# prediction_data=True asks hdbscan to keep what it needs to assign new
# points later.
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=100,
    min_samples=15,
    metric='euclidean',
    cluster_selection_method='eom',
    core_dist_n_jobs=-1,
    prediction_data=True
)

# One label per row of embeddings_64d (i.e. per row of df_clean); the cells
# below treat -1 as noise.
labels = clusterer.fit_predict(embeddings_64d)

In [None]:
# Bundle everything needed to reproduce the noise filter on new data.
# NOTE: 'params' repeats the literals passed to the HDBSCAN constructor in
# the previous cell — keep the two in sync when tuning.
artifacts = {
    'pca': pca,
    'clusterer': clusterer,
    'params': {
        'min_cluster_size': 100,
        'min_samples': 15,
        'metric': 'euclidean',
        'cluster_selection_method': 'eom'
    },
    'emb_model_name': model_name,
}

# Written to the current working directory.
joblib.dump(artifacts, "hdbscan_artifacts.joblib", compress=3)

In [None]:
# `labels` holds one cluster id per row of df_clean: the embeddings were built
# from df_clean['text_norm'], i.e. after duplicates were removed. Attaching
# them to df_processed directly would fail (or misalign) as soon as a single
# duplicate row was dropped, so they are aligned through df_clean's index.
cluster_labels = pd.Series(labels, index=df_clean.index)

df_clean['hdbscan_label'] = cluster_labels
df_clean['is_noise'] = cluster_labels.eq(-1)

# Mirror both columns onto df_processed; rows removed as duplicates were
# never clustered and therefore get NaN.
df_processed['hdbscan_label'] = cluster_labels.reindex(df_processed.index)
df_processed['is_noise'] = df_clean['is_noise'].reindex(df_processed.index)

noise_percentage = df_clean['is_noise'].mean() * 100
# Count real clusters only; subtracting 1 from the number of unique labels is
# wrong when no point was labelled as noise.
n_clusters = int(np.unique(cluster_labels[cluster_labels != -1]).size)

print(f"\nНайдено {n_clusters} кластеров.")
print(f"'{noise_percentage:.2f}%' данных помечено как 'шум' (label = -1).")

train_clean_hdbscan = df_clean[~df_clean['is_noise']].copy()

print(f"\nРазмер 'старого' train_clean: {len(df_clean)}")
print(f"Размер НОВОГО 'train_clean_hdbscan': {len(train_clean_hdbscan)}")
print(train_clean_hdbscan['department_name'].value_counts())

Create splits

In [None]:
# Encode department names as integer class ids.
le = LabelEncoder()
train_clean_hdbscan["label_id"] = le.fit_transform(train_clean_hdbscan["department_name"])

def create_folds(df, class_col="label_id", n_folds=5, random_state=42):
    """Return a copy of ``df`` with a ``fold`` column from a stratified K-fold split.

    Every row receives the number (0 .. n_folds-1) of the fold in which it is
    part of the validation side; stratification uses ``class_col``.
    """
    df_out = df.copy()
    splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state)

    df_out['fold'] = -1
    fold_col = df_out.columns.get_loc('fold')

    # split() yields positional indices, hence the positional .iloc write.
    for fold_id, (_, holdout_pos) in enumerate(splitter.split(df_out, df_out[class_col])):
        df_out.iloc[holdout_pos, fold_col] = fold_id

    return df_out

# Assign each row to one of five stratified folds.
train_clean_with_folds = create_folds(
    train_clean_hdbscan,
    class_col="label_id",
    n_folds=5
)

print(f"Финальный train_clean с фолдами: {len(train_clean_with_folds)}")
print(train_clean_with_folds['fold'].value_counts())

# Show which integer id the encoder gave to each department name.
mapping_y = dict(zip(le.classes_, le.transform(le.classes_)))
for k, v in mapping_y.items():
    print(f"{k} -> {v}")

In [None]:
TEXT_COL = "text_norm"

def token_len(s):
    """Number of whitespace-separated tokens in ``s``."""
    words = s.split()
    return len(words)

def char_len(s):
    """Number of characters in ``s``."""
    n_chars = len(s)
    return n_chars

def placeholder_ratios(s):
    """Share of whitespace tokens equal to the ``<url>`` placeholder.

    Returns ``{"ratio_URL": value}``; an empty text gives 0.0.
    """
    tokens = s.split()
    denom = len(tokens) or 1  # avoid dividing by zero on empty text
    url_hits = tokens.count("<url>")
    return {
        "ratio_URL":   url_hits / denom
    }

# Working copy with only the columns needed for the per-class summary.
tmp = train_clean_with_folds[[TEXT_COL, "label_id", "department_name", "company_id"]].copy()
tmp["len_char"] = tmp[TEXT_COL].map(char_len)
tmp["len_tok"]  = tmp[TEXT_COL].map(token_len)

# Expand the dict returned by placeholder_ratios into columns and attach them.
ratios = tmp[TEXT_COL].map(placeholder_ratios).apply(pd.Series)
tmp = pd.concat([tmp, ratios], axis=1)

# One row per class: size, text-length statistics, mean URL share and the
# number of distinct companies; largest classes first.
by_cls = (tmp
    .groupby(["label_id","department_name"])
    .agg(
        n=("label_id","size"),
        len_char_mean=("len_char","mean"),
        len_char_p10=("len_char",lambda s: np.percentile(s,10)),
        len_char_p90=("len_char",lambda s: np.percentile(s,90)),
        len_tok_mean=("len_tok","mean"),
        url_mean=("ratio_URL","mean"),
        n_companies=("company_id", lambda s: s.nunique())
    )
    .reset_index()
    .sort_values("n", ascending=False)
)

print(by_cls.head(20))

### 2.3 Meta-feature engineering

In [None]:
# `train_clean` is the frame every later cell works on, but no visible cell
# creates it, so a fresh kernel stops here with a NameError. It is derived
# from the fold-annotated frame with a fresh 0..n-1 index, which is what the
# positional fold splits built later rely on.
# NOTE(review): confirm this is the intended source of `train_clean`.
train_clean = train_clean_with_folds.reset_index(drop=True)

# Distribution of the merged text length, with tail percentiles.
desc = train_clean["len_text"].describe(percentiles=[0.01,0.05,0.1,0.25,0.5,0.75,0.9,0.95,0.99])
print(desc)

plt.hist(train_clean["len_text"], bins=80, range=(0,300))
plt.title("Распределение len_text (обрезано до 300)")
plt.xlabel("len_text")
plt.ylabel("count")
plt.show()

# Text-length summary for each class.
per_class = train_clean.groupby("label_id")["len_text"].describe()[["mean","min","max","25%","75%"]]
print(per_class)

Enrichment

In [None]:
def enrich_short_texts(row):
    """Prefix texts shorter than 10 tokens with tags built from meta-features.

    The prefix states whether the row is a subtask, a length bucket based on
    ``len_text`` (< 20, < 50, otherwise) and, when ``has_details`` is 1, that
    details exist. Texts with 10 or more tokens are returned unchanged.
    """
    text = row['text_norm']

    if len(text.split()) >= 10:
        return text

    tags = ["дочерняя задача" if row['is_subtask'] == 1 else "новая задача"]

    length = row['len_text']
    if length < 20:
        tags.append("короткая")
    elif length < 50:
        tags.append("средняя")
    else:
        tags.append("подробная")

    if row['has_details'] == 1:
        tags.append("с деталями")

    return " ".join(tags) + ": " + text

# Build the enriched text, then overwrite len_text with the enriched length.
# NOTE(review): enrich_short_texts reads len_text, so running this cell a
# second time picks length buckets from the already-overwritten values —
# the cell is not idempotent.
train_clean['text_enriched'] = train_clean.apply(enrich_short_texts, axis=1)
train_clean["len_text"] = train_clean["text_enriched"].astype(str).str.len()

In [None]:
# Positional (train, validation) index pairs, one per fold id in ascending order.
fold_vals = train_clean['fold'].values
splits_pos = []
for f in np.sort(np.unique(fold_vals)):
    in_fold = (fold_vals == f)
    va_pos = np.flatnonzero(in_fold)
    tr_pos = np.flatnonzero(~in_fold)
    splits_pos.append((tr_pos, va_pos))

Create domain keywords

In [None]:
def mine_domain_keywords(df, text_col, y_col,
                         ngram_range=(1,2), min_df=5, max_df=0.9,
                         top_k=40, min_docs=10, min_precision=0.55,
                         stop_re=None, seed=42):
    """Pick class-specific keywords by one-vs-rest chi-square on n-gram presence.

    For every class the vocabulary is ranked by the chi-square statistic of
    "term present" against "document belongs to the class". Walking down
    that ranking, a term is accepted when it

    * does not match ``stop_re``,
    * occurs in at least ``min_docs`` documents of the class,
    * has precision (class documents / all documents containing it) of at
      least ``min_precision``, and
    * is longer than two characters,

    until ``top_k`` terms are collected.

    Parameters
    ----------
    df, text_col, y_col : data frame, text column and label column.
    ngram_range, min_df, max_df : passed to ``TfidfVectorizer``.
    top_k, min_docs, min_precision : selection limits described above.
    stop_re : compiled regex of terms to skip; a default is used when None.
    seed : accepted for backward compatibility; the procedure is deterministic.

    Returns
    -------
    dict ``int(class) -> list of terms`` in ranking order.
    """
    vect = TfidfVectorizer(
        analyzer="word", ngram_range=ngram_range,
        min_df=min_df, max_df=max_df, lowercase=True, dtype=np.float32
    )
    # Only term presence matters below, so the weights are set to 1 in place.
    Xbin = vect.fit_transform(df[text_col].astype(str).values)
    Xbin.data[:] = 1.0

    y = df[y_col].to_numpy()
    vocab = np.array(vect.get_feature_names_out())

    if stop_re is None:
        # The Russian entries in the second group are word stems, so they need
        # a trailing \w* — anchored with a bare `$` they could only match the
        # stem itself, which never occurs as a token.
        stop_re = re.compile(
            r"^(?:\d+|<\w+>|ok|ок|спасибо|номер|дата|(?:заявк|просьб|сообщ|письм|звон)\w*)$"
        )

    # Document frequency of every term over the whole corpus.
    df_total = np.asarray(Xbin.sum(axis=0)).ravel()
    precision_denom = np.maximum(df_total, 1)

    result = {}
    for c in np.unique(y):
        mask_pos = (y == c)

        chi, _ = chi2(Xbin, mask_pos.astype(int))
        order = np.argsort(-chi)  # strongest association first

        df_pos = np.asarray(Xbin[mask_pos].sum(axis=0)).ravel()
        prec = df_pos / precision_denom

        picks = []
        for j in order:
            term = vocab[j]
            if stop_re.search(term):
                continue
            if int(df_pos[j]) < min_docs:
                continue
            if prec[j] < min_precision:
                continue
            if len(term) <= 2:
                continue
            picks.append(term)
            if len(picks) >= top_k:
                break
        result[int(c)] = picks

    return result

In [None]:
def disjoint_keywords_by_precision(df, kw_dict, text_col, y_col):
    """Make per-class keyword lists disjoint: each term stays with one class only.

    For every class that lists a term, the number of that class's documents
    containing the term (whole-word, case-insensitive) is counted. The term
    is assigned to the listing class with the largest count; terms never
    found are dropped. Lists are returned sorted alphabetically.
    """
    all_terms = sorted({t for lst in kw_dict.values() for t in lst})
    term_re = {t: re.compile(rf"(?<!\w){re.escape(t)}(?!\w)", flags=re.IGNORECASE) for t in all_terms}

    # hits[term][class] = documents of `class` containing `term`,
    # filled only for classes that proposed the term.
    hits = {term: {} for term in all_terms}
    for cls_id, terms in kw_dict.items():
        docs = df[df[y_col] == cls_id][text_col].astype(str)
        for term in terms:
            hits[term][cls_id] = int(docs.str.contains(term_re[term]).sum())

    cleaned = {cls_id: [] for cls_id in kw_dict.keys()}
    for term in all_terms:
        per_class = hits[term]
        total = sum(per_class.values())
        if total == 0:
            continue
        denom = max(1, total)
        winner = max(per_class.keys(), key=lambda cls_id: per_class[cls_id] / denom)
        cleaned[winner].append(term)

    return {cls_id: sorted(terms) for cls_id, terms in cleaned.items()}

In [None]:
def compile_domain_feature_rules(data, text_col, y_col, id2prefix):
    """Mine class keywords and compile one whole-word regex per class.

    Keywords come from ``mine_domain_keywords`` and are made disjoint with
    ``disjoint_keywords_by_precision``. Classes left without keywords get no
    regex. Regexes are keyed by ``id2prefix[class]`` (``"class<id>"`` when the
    id is missing from the mapping).

    Returns ``{'id2prefix': ..., 'compiled_regexes': ...}`` and prints the
    elapsed time.
    """
    t0 = time.time()

    mined = mine_domain_keywords(
        data,
        text_col=text_col, y_col=y_col,
        ngram_range=(1,2), min_df=5, max_df=0.95,
        top_k=50, min_docs=12, min_precision=0.6
    )
    exclusive = disjoint_keywords_by_precision(
        data, mined,
        text_col=text_col, y_col=y_col
    )

    # One alternation per class; lookarounds keep matches on word boundaries.
    compiled_regexes = {
        id2prefix.get(cls_id, f"class{cls_id}"): re.compile(
            r"|".join([rf"(?<!\w){re.escape(term)}(?!\w)" for term in terms]),
            flags=re.IGNORECASE
        )
        for cls_id, terms in exclusive.items()
        if terms
    }

    artifacts = {
        'id2prefix': id2prefix,
        'compiled_regexes': compiled_regexes
    }

    total_time = time.time() - t0
    print(f"Артефакты (regex-правила) скомпилированы в памяти за {total_time:.2f} сек.")

    return artifacts

def apply_domain_feature_rules(text_enriched: str, feature_artifacts: dict):
    """Turn one text into keyword features: two float32 values per class.

    Classes are visited in ascending id order. For each one, slot ``2*k``
    holds 1 when the class regex matches anywhere in the text, and slot
    ``2*k + 1`` holds the number of matches. Classes without a regex keep
    zeros in both slots.
    """
    id2prefix = feature_artifacts['id2prefix']
    compiled_regexes = feature_artifacts['compiled_regexes']

    vector = np.zeros(len(id2prefix) * 2, dtype=np.float32)

    for slot, class_id in enumerate(sorted(id2prefix.keys())):
        rule = compiled_regexes.get(id2prefix[class_id])
        if rule is None:
            continue
        n_hits = len(rule.findall(text_enriched))
        vector[2 * slot] = 1 if rule.search(text_enriched) else 0
        vector[2 * slot + 1] = n_hits

    return vector


# Short feature-name prefix for each integer class id.
# NOTE(review): the ids are hard-coded; they must match the order the
# LabelEncoder assigned to department names — confirm against the printed mapping.
id2prefix = {
  0: "outstaff", 1: "accounting", 2: "field", 3: "office_hw",
  4: "reminder", 5: "servers", 6: "sales",
  7: "pm", 8: "service_center", 9: "remote_support"
}

# Mine the keywords on the whole training frame and compile one regex per class.
domain_feature_rules = compile_domain_feature_rules(
  train_clean,
  text_col="text_enriched",
  y_col="label_id",
  id2prefix=id2prefix
)

# One feature row per text: (match flag, match count) for each class.
domain_features = np.array(
    [apply_domain_feature_rules(text, domain_feature_rules)
     for text in tqdm(train_clean['text_enriched'], desc="  Generating domain feats")]
)

n_classes = len(id2prefix)
# Column names follow the slot order used by apply_domain_feature_rules.
feat_names = [f"{id2prefix[i]}_{t}" for i in sorted(id2prefix.keys()) for t in ("keywords", "count")]

domain_features_df = pd.DataFrame(domain_features, columns=feat_names, index=train_clean.index)

print("\nГотово! Получены переменные 'domain_features' (numpy) и 'domain_features_df' (pandas).")
print(f"Форма domain_features_df: {domain_features_df.shape}")
print(domain_features_df.head())

Create company features

In [None]:
def advanced_company_features(df, splits_pos, n_classes=10):
    """Out-of-fold company features: class rates, dominance, entropy, new-company flag.

    For each ``(train_positions, valid_positions)`` pair the statistics are
    computed from the training rows only and written to the validation rows:

    * columns ``0 .. n_classes-1``: class rate of the row's company, blended
      with the global class prior with weights ``min(n, 10)`` and 10, where
      ``n`` is the company's number of training rows;
    * column ``n_classes``: share of the company's most frequent class
      (0.1 when the company has 5 or fewer training rows);
    * column ``n_classes + 1``: entropy of the company's class distribution
      (0.0 when the company has a single training row);
    * last column: 1.0 when the company does not occur in the training rows.

    Unseen companies get the global prior, 0.1 and ``log(n_classes)``. A
    missing company id is treated as unseen.

    Parameters
    ----------
    df : frame with ``company_id`` and integer ``label_id`` columns.
    splits_pos : iterable of ``(train_positions, valid_positions)`` arrays.
    n_classes : number of classes; labels must lie in ``0 .. n_classes-1``.

    Returns
    -------
    float32 array of shape ``(len(df), n_classes + 3)``; rows that appear in
    no validation split stay zero.

    Raises
    ------
    ValueError : if a label is negative or not smaller than ``n_classes``.
    """
    comp = df['company_id'].values
    y = df['label_id'].values
    n_samples = len(df)

    features = np.zeros((n_samples, n_classes + 3), dtype=np.float32)
    if n_samples == 0:
        return features
    if y.min() < 0 or y.max() >= n_classes:
        raise ValueError(f"label_id values must lie in [0, {n_classes - 1}]")

    global_class_prior = np.bincount(y, minlength=n_classes) / len(y)

    for fold, (tr_idx, va_idx) in enumerate(splits_pos, 1):
        tr_idx = np.asarray(tr_idx)
        va_idx = np.asarray(va_idx)
        print(f"Fold {fold}: processing {len(va_idx)} samples")

        # Company x class count table in one pass over the training rows,
        # instead of one full-array mask per company.
        codes, companies = pd.factorize(comp[tr_idx])
        valid = codes >= 0  # factorize codes missing ids as -1
        counts = np.zeros((len(companies), n_classes), dtype=np.int64)
        np.add.at(counts, (codes[valid], y[tr_idx][valid]), 1)

        totals = counts.sum(axis=1)
        rate = counts / totals[:, None]

        # Smoothed class rates (target encoding).
        alpha = np.minimum(totals, 10)[:, None]
        smoothed = (rate * alpha + global_class_prior * 10) / (alpha + 10)

        # Dominant-class share, only for companies with more than 5 rows.
        dominant = np.where(totals > 5, counts.max(axis=1) / totals, 0.1)

        # Class entropy, only for companies with more than 1 row.
        ent_terms = np.where(rate > 0, rate * np.log(rate + 1e-10), 0.0)
        diversity = np.where(totals > 1, -ent_terms.sum(axis=1), 0.0)

        # Look up each validation row's company; -1 marks an unseen company.
        code_of = {c: k for k, c in enumerate(companies)}
        va_codes = np.array([code_of.get(c, -1) for c in comp[va_idx]], dtype=np.int64)
        seen = va_codes >= 0
        known_rows, new_rows = va_idx[seen], va_idx[~seen]
        known_codes = va_codes[seen]

        features[known_rows, :n_classes] = smoothed[known_codes]
        features[known_rows, n_classes] = dominant[known_codes]
        features[known_rows, n_classes + 1] = diversity[known_codes]
        features[known_rows, -1] = 0.0

        features[new_rows, :n_classes] = global_class_prior
        features[new_rows, n_classes] = 0.1
        features[new_rows, n_classes + 1] = np.log(n_classes)
        features[new_rows, -1] = 1.0

    return features

# Out-of-fold company features for every training row, using the fold splits built above.
company_feats_advanced = advanced_company_features(train_clean, splits_pos)
print(f"Shape: {company_feats_advanced.shape}")

Create extra features

In [None]:
# Length features computed on the enriched text.
train_clean["len_char"] = train_clean["text_enriched"].str.len()
train_clean["len_tok"]  = train_clean["text_enriched"].str.split().map(len)
# log1p compresses the long tail of the length distributions.
train_clean["log_len_char"] = np.log1p(train_clean["len_char"])
train_clean["log_len_tok"]  = np.log1p(train_clean["len_tok"])
# 0 = no details, 1 = details present but not added to the text, 2 = details added.
train_clean["details_combo"] = train_clean["has_details"] + train_clean["added_details"]
# Share of distinct tokens; the +1 in the denominator avoids division by zero.
train_clean["uniq_ratio"] = train_clean["text_enriched"].map(lambda s: len(set(s.split())) / (len(s.split()) + 1))

In [None]:
# Meta-features whose relation to the label is inspected below.
features_to_check = [
    'is_subtask', 'len_subject', 'len_details', 'len_text',
    'has_details', 'added_details', 'len_char', 'len_tok',
    'log_len_char', 'log_len_tok', 'details_combo', 'uniq_ratio'
]

# Missing values are replaced by 0 for this check only; train_clean is not changed.
X_feat = train_clean[features_to_check].fillna(0)
y_feat = train_clean['label_id']

# Mutual information between each feature and the class label.
mi = mutual_info_classif(X_feat, y_feat, discrete_features='auto', random_state=42)

# Highest-scoring features first.
mi_series = pd.Series(mi, index=X_feat.columns).sort_values(ascending=False)
print(mi_series)

In [None]:
# Dense float32 matrix of the meta-features, one row per training example.
# NOTE: same column list as `features_to_check` in the previous cell, but
# without the fillna(0) applied there — keep the two lists in sync.
extra = train_clean[[
    'is_subtask', 'len_subject', 'len_details', 'len_text',
    'has_details', 'added_details', 'len_char', 'len_tok',
    'log_len_char', 'log_len_tok', 'details_combo', 'uniq_ratio'
]].to_numpy(dtype=np.float32)

## 3. L1 models

### 3.1 TF-IDF (char) + LogReg(OOF)

In [None]:
def fit_predict_char_lr_oof(data, splits, max_features):
    """Out-of-fold predictions of a char-level TF-IDF + logistic regression model.

    Parameters
    ----------
    data : frame with ``text_enriched`` and ``label_id`` columns.
    splits : iterable of ``(train_positions, valid_positions)`` pairs that
        index ``data`` by position.
    max_features : vocabulary size limit of the TF-IDF vectoriser.

    Returns
    -------
    dict with ``oof_proba`` (n x C float32, columns ordered by sorted label
    value), ``oof_pred`` (predicted labels), ``fold_acc`` and the ``mean`` /
    ``std`` of the fold accuracies.

    Notes
    -----
    Results are written by fold *position*, so the function does not depend
    on ``data`` having a 0..n-1 index. Probabilities are placed through
    ``clf.classes_``, which stays correct even when a class is absent from a
    training fold.
    """
    n = len(data)
    label_values = np.sort(data['label_id'].unique())
    C = len(label_values)
    col_of = {label: j for j, label in enumerate(label_values)}

    oof_proba = np.zeros((n, C), dtype=np.float32)
    oof_pred  = np.zeros(n, dtype=np.int64)
    fold_acc  = []

    for fold, (tr_idx, va_idx) in enumerate(splits, 1):
        tr_df = data.iloc[tr_idx]
        va_df = data.iloc[va_idx]

        vect = TfidfVectorizer(
            analyzer='char', ngram_range=(2, 5),
            min_df=2, max_features=max_features,
            lowercase=False, sublinear_tf=True, dtype=np.float32
        )
        X_tr = vect.fit_transform(tr_df['text_enriched'])
        X_va = vect.transform(va_df['text_enriched'])
        y_tr = tr_df['label_id'].values
        y_va = va_df['label_id'].values

        clf = LogisticRegression(
            penalty='l2', C=1, solver='saga',
            max_iter=3000, n_jobs=-1,
            random_state=42
        )
        clf.fit(X_tr, y_tr)
        proba = clf.predict_proba(X_va)

        # predict_proba columns follow clf.classes_, not necessarily 0..C-1.
        pred = clf.classes_[proba.argmax(axis=1)]
        cols = [col_of[label] for label in clf.classes_]

        va_pos = np.asarray(va_idx)
        oof_proba[np.ix_(va_pos, cols)] = proba
        oof_pred[va_pos] = pred

        acc = accuracy_score(y_va, pred)
        fold_acc.append(acc)

        print(f"[fold {fold}] acc={acc:.4f} train={X_tr.shape} val={X_va.shape}")

    mean, std = float(np.mean(fold_acc)), float(np.std(fold_acc))
    print(f"\nOOF char accuracy: {mean:.4f} ± {std:.4f}")
    return dict(oof_proba=oof_proba, oof_pred=oof_pred, fold_acc=fold_acc, mean=mean, std=std)

# Train the char-level model across the folds and collect out-of-fold predictions.
res_nat = fit_predict_char_lr_oof(train_clean, splits_pos, max_features=200000)
# Save the OOF probabilities to the current working directory.
np.save("char_lr_oof_proba_enriched_v1_25_hdb.npy", res_nat['oof_proba'])
print("\nreport:\n", classification_report(train_clean['label_id'], res_nat['oof_pred'], digits=3))
cm = confusion_matrix(train_clean['label_id'], res_nat['oof_pred'])
print(f"\nConfusion Matrix shape: {cm.shape}, Total: {cm.sum()}")
# Bare last expression: the notebook displays the confusion matrix.
cm

### 3.2 TF-IDF (word) + LogReg(OOF)

In [None]:
def train_word_tfidf(data, splits):
    """Out-of-fold predictions of a word-level TF-IDF + logistic regression model.

    ``splits`` holds ``(train_positions, valid_positions)`` pairs indexing
    ``data`` by position. Returns a dict with ``proba`` (n x C float32),
    ``pred`` (argmax column per row) and ``fold_acc`` (accuracy per fold).
    """
    n_rows = len(data)
    n_labels = data['label_id'].nunique()

    oof = {
        'proba': np.zeros((n_rows, n_labels), dtype=np.float32),
        'pred': np.zeros(n_rows, dtype=np.int64),
        'fold_acc': []
    }

    data = data.reset_index(drop=True)

    tfidf_params = dict(
        analyzer='word',
        ngram_range=(1, 2),
        min_df=5,
        max_features=150000,
        lowercase=True,
        sublinear_tf=True,
        dtype=np.float32
    )

    for fold, (tr_idx, va_idx) in enumerate(splits, 1):
        train_part, valid_part = data.iloc[tr_idx], data.iloc[va_idx]

        # The vocabulary is fitted on the training part of the fold only.
        vect = TfidfVectorizer(**tfidf_params)
        X_tr = vect.fit_transform(train_part['text_enriched'])
        X_va = vect.transform(valid_part['text_enriched'])

        clf = LogisticRegression(
            C=1.0,
            solver='saga',
            max_iter=3000,
            n_jobs=-1,
            random_state=42
        )
        clf.fit(X_tr, train_part['label_id'].values)

        proba = clf.predict_proba(X_va)
        pred = proba.argmax(axis=1)

        oof['proba'][va_idx] = proba
        oof['pred'][va_idx] = pred

        acc = accuracy_score(valid_part['label_id'].values, pred)
        oof['fold_acc'].append(acc)
        print(f"[fold {fold}] acc={acc:.4f} train={X_tr.shape} val={X_va.shape}")

    acc_mean = np.mean(oof['fold_acc'])
    acc_std = np.std(oof['fold_acc'])
    print(f"\nOOF word accuracy: {acc_mean:.4f} ± {acc_std:.4f}")
    return oof

# Train the word-level model across the folds and collect out-of-fold predictions.
oof_word = train_word_tfidf(train_clean, splits_pos)
# Save the OOF probabilities to the current working directory.
np.save("word_lr_oof_proba_enriched_v1_12_hdbscan.npy", oof_word['proba'])

cm = confusion_matrix(train_clean['label_id'], oof_word['pred'])
print(f"\nConfusion Matrix shape: {cm.shape}, Total: {cm.sum()}")
print(cm)

### 3.5 Finetune XLM-Roberta and mDeBERTa

In [None]:
def train_xlm_finetuned_oof(data, splits, num_epochs=3, batch_size=16, learning_rate=2e-5):
    """Fine-tune the XLM-RoBERTa checkpoint fold by fold and collect OOF probabilities.

    For each ``(train_idx, valid_idx)`` pair a fresh classification model is
    fine-tuned with the Hugging Face ``Trainer``. The validation rows are then
    predicted and their softmax probabilities written into the OOF matrix.
    The model of every fold is saved to ``./best_xlmr_fold_{fold}``.

    Args:
        data: DataFrame with ``text_enriched``, ``label_id`` and ``company_id``.
        splits: sequence of positional ``(train_idx, valid_idx)`` index arrays
            (``len(splits)`` is used for logging).
        num_epochs: maximum number of training epochs per fold.
        batch_size: per-device training batch size (evaluation uses twice that).
        learning_rate: learning rate passed to ``TrainingArguments``.

    Returns:
        ``(oof_proba, mean_acc)``: the ``(n, C)`` float32 probability matrix and
        the mean of the per-fold validation accuracies.
    """
    model_name = 'Zamza/XLM-roberta-large-ftit-emb-lr01'

    class TextDataset(Dataset):
        """Tokenizes one text per item, padded/truncated to ``max_length``."""

        def __init__(self, texts, labels, tokenizer, max_length=256):
            self.texts = texts
            self.labels = labels
            self.tokenizer = tokenizer
            self.max_length = max_length

        def __len__(self):
            return len(self.texts)

        def __getitem__(self, idx):
            text = str(self.texts[idx])
            label = self.labels[idx]

            encoding = self.tokenizer(
                text,
                truncation=True,
                padding='max_length',
                max_length=self.max_length,
                return_tensors='pt'
            )

            return {
                'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten(),
                'labels': torch.tensor(label, dtype=torch.long)
            }

    # The fold indices are positional (they also index oof_proba below), so the
    # frame is re-indexed 0..n-1 and sliced with .iloc. Label-based .loc would
    # pick the wrong rows, or raise, whenever the index is not a RangeIndex.
    data = data.reset_index(drop=True)

    n = len(data)
    C = data['label_id'].nunique()
    oof_proba = np.zeros((n, C), dtype=np.float32)
    fold_acc = []

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    sig = inspect.signature(TrainingArguments)

    # Mixed precision is only meaningful on CUDA. Querying the device
    # capability without a GPU raises, so guard it. On CUDA the choice is the
    # same as before: bf16 on compute capability >= 8, fp16 otherwise.
    has_cuda = torch.cuda.is_available()
    use_bf16 = has_cuda and torch.cuda.get_device_capability(0)[0] >= 8
    use_fp16 = has_cuda and not use_bf16

    for fold, (tr_idx, va_idx) in enumerate(splits, 1):
        print(f"\n{'='*70}")
        print(f"FOLD {fold}/{len(splits)} - Fine-tuning")
        print('='*70)

        tr_df = data.iloc[tr_idx].copy()
        va_df = data.iloc[va_idx].copy()

        # Keep at most 5 identical (company, label, text) rows in the training
        # part so that heavily duplicated tickets do not dominate fine-tuning.
        cnt = defaultdict(int)
        keep = []
        for idx, r in tr_df.iterrows():
            key = (r['company_id'], r['label_id'], r['text_enriched'])
            if cnt[key] < 5:
                keep.append(idx)
                cnt[key] += 1
        tr_df = tr_df.loc[keep]

        train_dataset = TextDataset(
            tr_df['text_enriched'].tolist(),
            tr_df['label_id'].tolist(),
            tokenizer
        )

        val_dataset = TextDataset(
            va_df['text_enriched'].tolist(),
            va_df['label_id'].tolist(),
            tokenizer
        )

        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=C,
            ignore_mismatched_sizes=True
        )

        grad_accum = 1

        args_kwargs = dict(
            output_dir=f"./mdeberta_fold_fix_{fold}",
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size * 2,
            gradient_accumulation_steps=grad_accum,
            num_train_epochs=num_epochs,
            weight_decay=0.01,
            logging_dir=f"./logs_fix_{fold}",
            load_best_model_at_end=True,
            metric_for_best_model="accuracy",
            report_to="none",
            fp16=use_fp16,
            bf16=use_bf16,
            save_total_limit=1,
            label_smoothing_factor=0.1,
            learning_rate=learning_rate,
            warmup_steps=100,
            dataloader_num_workers=2,
        )

        # The evaluation-strategy argument was renamed between transformers
        # releases; use whichever name this installation accepts.
        if "evaluation_strategy" in sig.parameters:
            args_kwargs["evaluation_strategy"] = "epoch"
        else:
            args_kwargs["eval_strategy"] = "epoch"

        if "save_strategy" in sig.parameters:
            args_kwargs["save_strategy"] = "epoch"

        if "logging_strategy" in sig.parameters:
            args_kwargs["logging_strategy"] = "steps"
        args_kwargs["logging_steps"] = 50

        if "lr_scheduler_type" in sig.parameters:
            args_kwargs["lr_scheduler_type"] = "linear"

        training_args = TrainingArguments(**args_kwargs)

        def compute_metrics(eval_pred):
            logits, labels = eval_pred
            predictions = np.argmax(logits, axis=-1)
            acc = accuracy_score(labels, predictions)
            return {'accuracy': acc}

        try:
            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=val_dataset,
                compute_metrics=compute_metrics,
                callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
            )
        except TypeError as e:
            # Fallback kept from the original code: 'eval_steps' is never set
            # above, so this path simply retries without the early-stopping
            # callback.
            print(f"Ошибка (возможно, 'eval_steps' не поддерживается): {e}")
            print("Пробуем запустить без 'eval_steps'...")

            del training_args.eval_steps

            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=val_dataset,
                compute_metrics=compute_metrics,
            )

        print(f"\nОбучаем fold {fold}...")
        trainer.train()

        print(f"Предсказываем fold {fold}...")
        predictions = trainer.predict(val_dataset)

        logits = predictions.predictions
        proba = torch.softmax(torch.tensor(logits), dim=1).numpy()

        oof_proba[va_idx] = proba

        acc = accuracy_score(va_df['label_id'].values, proba.argmax(axis=1))
        fold_acc.append(acc)
        print(f"Fold {fold} accuracy: {acc:.4f}")

        # The message now names the directory that is actually written.
        print(f"Сохраняем лучшую модель фолда {fold} в './best_xlmr_fold_{fold}'")
        trainer.save_model(f'./best_xlmr_fold_{fold}')

        # Release GPU memory before the next fold loads a fresh model.
        del model, trainer
        torch.cuda.empty_cache()

    mean = np.mean(fold_acc)
    print(f"\n xlm roberta (fine-tuned): {mean:.4f} ± {np.std(fold_acc):.4f}")

    return oof_proba, mean


# Out-of-fold fine-tuning run on the shared positional folds.
oof_xlm_ft, acc_xlm_ft = train_xlm_finetuned_oof(
    train_clean, splits_pos,
    num_epochs=10,
    batch_size=64,
    learning_rate=2e-5
)
# Save under the file name that the stacking cells load
# ("embed_oof_xlm_roberta_finetuned_hdbscan.npy"). The previous name had no
# "_hdbscan" suffix, so a clean Restart & Run All could not find the file.
np.save("embed_oof_xlm_roberta_finetuned_hdbscan.npy", oof_xlm_ft)
print(f"Результат: {acc_xlm_ft:.4f}")

## 4. L2 Model: Stacking


In [None]:
# Level-1 out-of-fold probabilities, one (N, C) matrix per base model.
oof_xlmr = np.load("embed_oof_xlm_roberta_finetuned_hdbscan.npy")  # (N, C)
oof_char_25 = np.load("char_lr_oof_proba_enriched_v1_25_hdbscan.npy")  # (N, C)
oof_word_12 = np.load("word_lr_oof_proba_enriched_v1_12_hdbscan.npy") # (N, C)

y     = train_clean['label_id'].values

eps = 1e-6
def to_logit(p):
    """Log-odds of probabilities clipped to [eps, 1 - eps]."""
    p = np.clip(p, eps, 1-eps)
    return np.log(p) - np.log1p(-p)
A = to_logit(oof_xlmr)
B = to_logit(oof_char_25)
D = to_logit(oof_word_12)

# NOTE(review): `extra`, `domain_features` and `company_feats_advanced` are not
# created in or above this cell in the visible part of the notebook (the first
# two are built in section 5) — confirm the intended execution order.
X_meta = np.hstack([A,B,D, extra, domain_features, company_feats_advanced]).astype(np.float32)

n_samples = len(y)
n_classes = int(y.max() + 1)
oof_pred = np.zeros(n_samples, dtype=np.int64)
oof_proba = np.zeros((n_samples, n_classes), dtype=np.float32)
fold_acc = []


for fold,(tr_pos,va_pos) in enumerate(splits_pos,1):
    X_tr, X_va = X_meta[tr_pos], X_meta[va_pos]
    y_tr, y_va = y[tr_pos], y[va_pos]

    # Probabilities are consumed below (predict_proba, top-k, ROC AUC), so use
    # the probability objective like the tuned and final stackers do, and take
    # the class count from the labels instead of hard-coding it.
    meta = XGBClassifier(
        n_estimators=1500,
        learning_rate=0.02,
        max_depth=7,
        reg_lambda=5.0,
        random_state=42,
        n_jobs=-1,
        tree_method='hist',
        device='cuda',
        objective='multi:softprob',
        num_class=n_classes,
        early_stopping_rounds=100
    )


    # Early stopping monitors the last eval set, i.e. the validation fold.
    meta.fit(X_tr, y_tr,
                      eval_set=[(X_tr, y_tr), (X_va, y_va)],
                      verbose=False)
    p = meta.predict_proba(X_va)
    oof_pred[va_pos] = np.argmax(p, axis=1)
    oof_proba[va_pos] = p

    acc = accuracy_score(y_va, oof_pred[va_pos])
    fold_acc.append(acc)
    print(f"[fold {fold}] STACK 1: acc={acc:.4f} tr={X_tr.shape} va={X_va.shape}")


mean,std = float(np.mean(fold_acc)), float(np.std(fold_acc))
l2_results["xgb"] = mean
roc_auc = roc_auc_score(y, oof_proba, multi_class='ovr', average='weighted')
print(f"\nOOF accuracy STACK: {mean:.4f} ± {std:.4f}")
acc_top3 = top_k_accuracy_score(y, oof_proba, k=3)
acc_top5 = top_k_accuracy_score(y, oof_proba, k=5)
print(f"OOF accuracy (Top-3): {acc_top3:.4f}")
print(f"OOF accuracy (Top-5): {acc_top5:.4f}")
print(f"OOF ROC AUC (weighted OVR): {roc_auc:.4f}")
print("\nreport:\n", classification_report(y, oof_pred, digits=3))
cm = confusion_matrix(y, oof_pred, labels=range(int(y.max()+1)))
print("CM shape:", cm.shape)
print(cm)

In [None]:
# Route Optuna's log records to stdout so trial progress shows up in the cell output.
# Each execution of this cell attaches one more StreamHandler to the logger.
optuna.logging.get_logger("optuna").addHandler(logging.StreamHandler(sys.stdout))
optuna.logging.set_verbosity(optuna.logging.INFO)

eps = 1e-6
def to_logit(p):
    """Log-odds of probabilities clipped to [eps, 1 - eps] (eps is the module-level value above)."""
    p = np.clip(p, eps, 1-eps)
    return np.log(p) - np.log1p(-p)

# Level-1 out-of-fold probabilities converted to logits.
A = to_logit(np.load("embed_oof_xlm_roberta_finetuned_hdbscan.npy"))
B = to_logit(np.load("char_lr_oof_proba_enriched_v1_25_hdbscan.npy"))
C = to_logit(np.load("word_lr_oof_proba_enriched_v1_12_hdbscan.npy"))


# Meta-feature matrix: L1 logits plus hand-crafted feature blocks.
# NOTE(review): `company_feats_advanced` is not defined in the visible part of
# the notebook, and `extra` / `domain_features` are built in section 5 — confirm
# the intended execution order.
X_meta_FULL = np.hstack([A, B, C, extra, domain_features, company_feats_advanced]).astype(np.float32)
y_FULL = train_clean['label_id'].values

print(f"  Форма X_meta_FULL: {X_meta_FULL.shape}")

def objective(trial):
    """Optuna objective for the L2 stacker.

    Samples one XGBoost configuration from ``trial``, fits it on every fold of
    ``splits_pos`` (early stopping on the validation fold) and returns the mean
    validation accuracy over the folds.
    """
    # Sampled hyper-parameters; the suggest_* calls keep their original order.
    sampled = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.05, log=True),
        'max_depth': trial.suggest_int('max_depth', 5, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 1.0, 15.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 5.0),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 5.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
    }
    # Settings shared by every trial.
    fixed = {
        'n_estimators': 2000,
        'random_state': 42,
        'n_jobs': -1,
        'tree_method': 'hist',
        'device': 'cuda',
        'objective': 'multi:softmax',
        'num_class': 10,
        'early_stopping_rounds': 100,
    }
    params = {**fixed, **sampled}

    def _validation_accuracy(tr_pos, va_pos):
        # Fit on the training positions, early-stop and score on the validation ones.
        X_va, y_va = X_meta_FULL[va_pos], y_FULL[va_pos]
        booster = XGBClassifier(**params)
        booster.fit(
            X_meta_FULL[tr_pos], y_FULL[tr_pos],
            eval_set=[(X_va, y_va)],
            verbose=False,
        )
        return accuracy_score(y_va, booster.predict(X_va))

    scores = [_validation_accuracy(tr_pos, va_pos) for tr_pos, va_pos in splits_pos]
    return float(np.mean(scores))

print("Запуск тюнинга L2-модели (Optuna)")

# Seed the sampler so that re-running the notebook proposes the same trials.
# The default sampler is unseeded, which made the reported best parameters
# impossible to reproduce.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

study.optimize(objective, n_trials=20)

print(f"  Лучший CV Score: {study.best_value:.5f}")
print("  Лучшие Параметры:")
print(study.best_params)

In [None]:
# Level-1 out-of-fold probabilities, one matrix per base model.
oof_xlmr = np.load("embed_oof_xlm_roberta_finetuned_hdbscan.npy")  # (N, C)
oof_char_25 = np.load("char_lr_oof_proba_enriched_v1_25_hdbscan.npy")  # (N, C)
oof_word_12 = np.load("word_lr_oof_proba_enriched_v1_12_hdbscan.npy")

y     = train_clean['label_id'].values

eps = 1e-6
def to_logit(p):
    """Log-odds of probabilities clipped to [eps, 1 - eps] (eps is the module-level value above)."""
    p = np.clip(p, eps, 1-eps)
    return np.log(p) - np.log1p(-p)
A = to_logit(oof_xlmr)
B = to_logit(oof_char_25)
C = to_logit(oof_word_12)

# Meta-feature matrix: L1 logits plus hand-crafted feature blocks.
# NOTE(review): `company_feats_advanced` is not defined in the visible part of
# the notebook, and `extra` / `domain_features` are built in section 5 — confirm
# the intended execution order.
X_meta = np.hstack([A,B,C, extra, domain_features, company_feats_advanced]).astype(np.float32)

n_samples = len(y)
n_classes = int(y.max() + 1)
oof_pred = np.zeros(n_samples, dtype=np.int64)
oof_proba = np.zeros((n_samples, n_classes), dtype=np.float32)
fold_acc = []
# Per-fold evals_result() dictionaries, used by the loss-curve plot in the next cell.
history_per_fold = {}

# Hyper-parameter values pasted from a tuning run (presumably the Optuna study above — confirm).
params = {
    'n_estimators': 2000,
    'learning_rate': 0.01253085083690128,
    'max_depth': 10,
    'reg_lambda': 3.507499070233551,
    'reg_alpha': 1.3946130633958758,
    'subsample': 0.7935545348903001,
    'colsample_bytree': 0.7853058760189121,
    'gamma': 0.8458201681412074,
    'min_child_weight': 5,
    'random_state': 42,
    'n_jobs': -1,
    'tree_method': 'hist',
    'device': 'cuda',
    'objective': 'multi:softprob',
    'num_class': 10,
    'early_stopping_rounds': 100}

for fold,(tr_pos,va_pos) in enumerate(splits_pos,1):
    X_tr, X_va = X_meta[tr_pos], X_meta[va_pos]
    y_tr, y_va = y[tr_pos], y[va_pos]

    meta = XGBClassifier(
        **params
    )


    # Two eval sets: validation_0 is the training part, validation_1 the validation fold.
    meta.fit(X_tr, y_tr,
                      eval_set=[(X_tr, y_tr), (X_va, y_va)],
                      verbose=False)
    p = meta.predict_proba(X_va)
    oof_pred[va_pos] = np.argmax(p, axis=1)
    oof_proba[va_pos] = p

    evals_res = meta.evals_result()
    history_per_fold[f'fold_{fold}'] = evals_res
    # Best boosting round; falls back to the wrapper attribute if the booster lookup fails.
    try:
        best_it = meta.get_booster().best_iteration
    except Exception:
        best_it = getattr(meta, "best_iteration", None)

    acc = accuracy_score(y_va, oof_pred[va_pos])
    print(f"[fold {fold}] best_it={best_it}, acc={acc:.4f}")
    fold_acc.append(acc)
    print(f"[fold {fold}] STACK : acc={acc:.4f} tr={X_tr.shape} va={X_va.shape}")


# Summary metrics over all out-of-fold predictions.
mean,std = float(np.mean(fold_acc)), float(np.std(fold_acc))
roc_auc = roc_auc_score(y, oof_proba, multi_class='ovr', average='weighted')
print(f"\nOOF accuracy STACK: {mean:.4f} ± {std:.4f}")
acc_top3 = top_k_accuracy_score(y, oof_proba, k=3)
acc_top5 = top_k_accuracy_score(y, oof_proba, k=5)
print(f"OOF accuracy (Top-3): {acc_top3:.4f}")
print(f"OOF accuracy (Top-5): {acc_top5:.4f}")
print(f"OOF ROC AUC (weighted OVR): {roc_auc:.4f}")
print("\nreport:\n", classification_report(y, oof_pred, digits=3))
cm = confusion_matrix(y, oof_pred, labels=range(int(y.max()+1)))
print("CM shape:", cm.shape)
print(cm)

In [None]:
# Validation mlogloss per boosting round for every fold of the tuned stacker.
# Only the validation curve ('validation_1') is drawn; the training curve was
# read into an unused variable before and is no longer extracted.
fig, ax = plt.subplots()
for fold_name, history in history_per_fold.items():
    ax.plot(history['validation_1']['mlogloss'], label=f"{fold_name}_val")
ax.legend()
ax.set_xlabel("boost round")
ax.set_ylabel("mlogloss")
plt.show()

## 5. Production Pipeline

### 5.1 Enriching text and creating 'extra' features

In [None]:
def enrich_short_texts(row):
    """Prefix short texts with descriptive tags built from the row's metadata.

    Texts with 10 or more whitespace-separated tokens are returned unchanged.
    Shorter texts get a prefix made of: the task kind (from ``is_subtask``),
    a length bucket (from ``len_text``) and, when ``has_details`` is 1, a
    "with details" tag; the prefix and the text are joined with ``": "``.

    Args:
        row: mapping with ``text_norm``, ``is_subtask``, ``len_text`` and
            ``has_details`` entries (e.g. a DataFrame row).

    Returns:
        The original or the prefixed text.
    """
    body = row['text_norm']
    if len(body.split()) >= 10:
        return body

    tags = ["дочерняя задача" if row['is_subtask'] == 1 else "новая задача"]

    # Length bucket: < 20 chars, < 50 chars, otherwise "detailed".
    n_chars = row['len_text']
    if n_chars < 20:
        tags.append("короткая")
    elif n_chars < 50:
        tags.append("средняя")
    else:
        tags.append("подробная")

    if row['has_details'] == 1:
        tags.append("с деталями")

    return " ".join(tags) + ": " + body

# Build the enriched text column. enrich_short_texts reads row['len_text'], so
# that column must already exist when this line runs.
train_clean['text_enriched'] = train_clean.apply(enrich_short_texts, axis=1)
# NOTE(review): 'len_text' is overwritten here with the length of the ENRICHED
# text, after it was used as an input above. Re-running this cell therefore
# feeds a different 'len_text' into enrich_short_texts — confirm this is intended.
train_clean["len_text"] = train_clean["text_enriched"].astype(str).str.len()
# Character and whitespace-token counts of the enriched text.
train_clean["len_char"] = train_clean["text_enriched"].str.len()
train_clean["len_tok"]  = train_clean["text_enriched"].str.split().map(len)
# log(1 + x) versions of the two counts.
train_clean["log_len_char"] = np.log1p(train_clean["len_char"])
train_clean["log_len_tok"]  = np.log1p(train_clean["len_tok"])
train_clean["details_combo"] = train_clean["has_details"] + train_clean["added_details"]
# Share of distinct tokens; the +1 in the denominator avoids division by zero for empty texts.
train_clean["uniq_ratio"] = train_clean["text_enriched"].map(lambda s: len(set(s.split())) / (len(s.split()) + 1))

In [None]:
# Hand-crafted numeric features as a float32 matrix, one row per row of
# train_clean; stacked next to the L1 logits in the meta-feature matrices.
extra = train_clean[[
    'is_subtask', 'len_subject', 'len_details', 'len_text',
    'has_details', 'added_details', 'len_char', 'len_tok',
    'log_len_char', 'log_len_tok', 'details_combo', 'uniq_ratio'
]].to_numpy(dtype=np.float32)

### 5.2 Creating and saving domain features

In [None]:
def mine_domain_keywords(df, text_col, y_col,
                         ngram_range=(1,2), min_df=5, max_df=0.9,
                         top_k=40, min_docs=10, min_precision=0.55,
                         stop_re=None, seed=42):
    """Mine class-specific keywords ranked by a one-vs-rest chi-squared test.

    Texts are vectorized into a binary document-term matrix. For every class
    the terms are ranked by chi-squared against the "is this class" indicator
    and scanned in that order. A term is kept when it is not matched by
    ``stop_re``, occurs in at least ``min_docs`` documents of the class, has
    precision ``docs_in_class / docs_total >= min_precision`` and is longer
    than two characters. Scanning stops after ``top_k`` kept terms.

    Args:
        df: DataFrame holding the texts and labels.
        text_col: name of the text column.
        y_col: name of the label column.
        ngram_range, min_df, max_df: forwarded to ``TfidfVectorizer``.
        top_k: maximum number of keywords per class.
        min_docs: minimum number of in-class documents containing the term.
        min_precision: minimum share of the term's documents that are in-class.
        stop_re: compiled regex of terms to skip; a default is built when None.
        seed: accepted for backward compatibility; the procedure is
            deterministic and does not use it.

    Returns:
        dict mapping ``int(class)`` to the list of selected terms.
    """
    vect = TfidfVectorizer(
        analyzer="word", ngram_range=ngram_range,
        min_df=min_df, max_df=max_df, lowercase=True, dtype=np.float32
    )
    X = vect.fit_transform(df[text_col].astype(str).values)
    y = df[y_col].to_numpy()
    vocab = np.array(vect.get_feature_names_out())

    # Binary incidence matrix: 1 where the term occurs in the document.
    Xbin = X.copy()
    Xbin.data[:] = 1.0

    if stop_re is None:
        stop_re = re.compile(r"^(?:\d+|<\w+>|ok|ок|спасибо|заявк|просьб|сообщ|письм|звон|номер|дата)$")

    # Document frequency of every term over the whole corpus (loop-invariant).
    df_total = np.asarray(Xbin.sum(axis=0)).ravel()
    df_total_safe = np.maximum(df_total, 1)

    result = {}
    for c in np.unique(y):
        mask_pos = (y == c)

        chi, _ = chi2(Xbin, mask_pos.astype(int))
        order = np.argsort(-chi)

        # In-class document frequency and the resulting precision per term.
        df_pos = np.asarray(Xbin[mask_pos].sum(axis=0)).ravel()
        prec = np.divide(df_pos, df_total_safe)

        picks = []
        for j in order:
            term = vocab[j]
            if stop_re.search(term):
                continue
            if int(df_pos[j]) < min_docs:
                continue
            if prec[j] < min_precision:
                continue
            if len(term) <= 2:
                continue
            picks.append(term)
            if len(picks) >= top_k:
                break
        result[int(c)] = picks

    return result

In [None]:
def disjoint_keywords_by_precision(df, kw_dict, text_col, y_col):
    """Assign every keyword to exactly one class.

    For each class that lists a term, the number of that class's texts
    containing the term as a whole word (case-insensitive) is counted. The
    term goes to the listing class with the largest share of those counts;
    terms with no match at all are dropped.

    Args:
        df: DataFrame holding the texts and labels.
        kw_dict: mapping ``class -> list of candidate terms``.
        text_col: name of the text column.
        y_col: name of the label column.

    Returns:
        dict with the same keys as ``kw_dict`` mapping to sorted term lists.
    """
    vocabulary = sorted(set().union(*kw_dict.values()))

    matchers = {}
    for term in vocabulary:
        matchers[term] = re.compile(rf"(?<!\w){re.escape(term)}(?!\w)", flags=re.IGNORECASE)

    # hits[term][label] = number of texts of `label` that contain `term`.
    hits = {term: {} for term in vocabulary}
    for label, terms in kw_dict.items():
        texts = df.loc[df[y_col] == label, text_col].astype(str)
        for term in terms:
            hits[term][label] = int(texts.str.contains(matchers[term]).sum())

    assigned = {label: [] for label in kw_dict}
    for term in vocabulary:
        per_label = hits[term]
        overall = sum(per_label.values())
        if overall == 0:
            continue
        winner = max(per_label, key=lambda label: per_label[label] / max(1, overall))
        assigned[winner].append(term)

    return {label: sorted(terms) for label, terms in assigned.items()}

In [None]:
def build_prod_domain_features(data, text_col, y_col, id2prefix, save_path="prod_domain_features.joblib"):
    """Mine per-class keywords, compile them to regexes and save the artifacts.

    Args:
        data: DataFrame with the text and label columns.
        text_col: name of the text column.
        y_col: name of the label column.
        id2prefix: mapping ``class id -> feature name``; classes missing from
            it are named ``class{id}``.
        save_path: where the artifact dictionary is dumped with joblib.

    Returns:
        dict with ``id2prefix`` and ``compiled_regexes`` (feature name ->
        compiled case-insensitive whole-word alternation of the class terms).
    """
    t0 = time.time()

    mined = mine_domain_keywords(
        data,
        text_col=text_col, y_col=y_col,
        ngram_range=(1,2), min_df=5, max_df=0.95,
        top_k=50, min_docs=12, min_precision=0.6
    )
    exclusive = disjoint_keywords_by_precision(
        data, mined,
        text_col=text_col, y_col=y_col
    )

    # One regex per class that ended up with at least one keyword.
    compiled_regexes = {
        id2prefix.get(label, f"class{label}"): re.compile(
            r"|".join(rf"(?<!\w){re.escape(term)}(?!\w)" for term in terms),
            flags=re.IGNORECASE,
        )
        for label, terms in exclusive.items()
        if terms
    }

    artifacts = {'id2prefix': id2prefix, 'compiled_regexes': compiled_regexes}
    joblib.dump(artifacts, save_path)

    elapsed = time.time() - t0
    print(f"Файл готов за {elapsed:.2f} сек.")

    return artifacts

def generate_domain_features_for_inference(text_enriched: str, feature_maps: dict):
    """Turn one text into keyword-match features.

    For every class id of ``feature_maps['id2prefix']`` (in sorted order) two
    consecutive slots are produced: a 0/1 flag telling whether the class regex
    matches, and the number of matches. Classes without a compiled regex keep
    zeros in both slots.

    Returns:
        float32 vector of length ``2 * len(id2prefix)``.
    """
    id2prefix = feature_maps['id2prefix']
    compiled_regexes = feature_maps['compiled_regexes']

    features = np.zeros(len(id2prefix) * 2, dtype=np.float32)

    for position, class_id in enumerate(sorted(id2prefix.keys())):
        name = id2prefix[class_id]
        if name not in compiled_regexes:
            continue

        matcher = compiled_regexes[name]
        slot = 2 * position
        features[slot] = 1 if matcher.search(text_enriched) else 0
        features[slot + 1] = len(matcher.findall(text_enriched))

    return features


# Short feature-name prefix for each of the ten label ids.
id2prefix = {
  0: "outstaff", 1: "accounting", 2: "field", 3: "office_hw",
  4: "reminder", 5: "servers", 6: "sales",
  7: "pm", 8: "service_center", 9: "remote_support"
}

# Mine the keyword regexes on the full training frame and save them for inference.
# NOTE(review): the keywords are selected using the labels of ALL rows, and the
# resulting `domain_features` are fed to the cross-validated stacker cells, so
# the OOF stacking scores may be optimistic — confirm this is acceptable.
prod_domain_feature_maps = build_prod_domain_features(
  train_clean,
  text_col="text_enriched",
  y_col="label_id",
  id2prefix=id2prefix,
  save_path="prod_domain_features.joblib"
)

# One feature vector per training text, stacked into a 2-D array.
domain_features = np.array(
    [generate_domain_features_for_inference(text, prod_domain_feature_maps)
     for text in tqdm(train_clean['text_enriched'], desc="  Generating domain feats")]
)

### 5.3 Creating and saving company features

In [None]:
def build_prod_company_features(data, save_path="prod_company_features.joblib"):
    """Build per-company label statistics and save them for inference.

    Three lookup tables keyed by company id are produced:

    * ``te_per_class[cls][company]`` – smoothed share of class ``cls`` among
      the company's rows: ``(rate * alpha + prior * 10) / (alpha + 10)`` with
      ``alpha = min(n_rows, 10)`` and ``prior`` the global class share.
    * ``dominant_class[company]`` – share of the most frequent class when the
      company has more than 5 rows, otherwise 0.1.
    * ``diversity[company]`` – entropy (natural log) of the company's class
      distribution when it has more than 1 row, otherwise 0.0.

    The artifact also stores scalar fallback values (``priors``) used for
    companies missing from the tables and the set of known company ids.

    Args:
        data: DataFrame with ``company_id`` and integer ``label_id`` columns.
        save_path: where the artifact dictionary is dumped with joblib.

    Returns:
        The artifact dictionary that was saved.
    """
    start_time = time.time()

    comp = data['company_id'].values
    y = data['label_id'].values

    n_classes = 10
    label_totals = np.bincount(y, minlength=n_classes)
    global_class_prior = label_totals / len(y)

    # Count table (company x class) built in one pass over the data instead of
    # re-scanning every row once per company for each of the three statistics.
    companies, comp_pos = np.unique(comp, return_inverse=True)
    width = len(label_totals)
    class_counts = np.bincount(
        comp_pos.reshape(-1) * width + y,
        minlength=len(companies) * width,
    ).reshape(len(companies), width)

    comp_te_per_class_map = {cls: {} for cls in range(n_classes)}
    comp_main_cls_map = {}
    comp_div_map = {}

    for c, cls_counts in zip(companies, class_counts):
        # Every company comes from np.unique(comp), so it has at least one row.
        n_tasks = cls_counts.sum()

        # Smoothed per-class target encoding.
        alpha = min(n_tasks, 10)
        for cls in range(n_classes):
            rate = cls_counts[cls] / n_tasks
            smoothed = (rate * alpha + global_class_prior[cls] * 10) / (alpha + 10)
            comp_te_per_class_map[cls][c] = float(smoothed)

        # Share of the dominant class (only trusted with more than 5 rows).
        if n_tasks > 5:
            comp_main_cls_map[c] = cls_counts.max() / n_tasks
        else:
            comp_main_cls_map[c] = 0.1

        # Entropy of the class distribution (0 for single-row companies).
        if n_tasks > 1:
            probs = cls_counts / n_tasks
            probs = probs[probs > 0]
            comp_div_map[c] = -np.sum(probs * np.log(probs))
        else:
            comp_div_map[c] = 0.0

    known_companies = set(companies.tolist())

    artifacts = {
        'te_per_class': comp_te_per_class_map,
        'dominant_class': comp_main_cls_map,
        'diversity': comp_div_map,
        'priors': {
            'te_per_class': 0.1,
            'dominant_class': 0.1,
            'diversity': 0.0
        },
        'known_companies': known_companies,
    }

    joblib.dump(artifacts, save_path)

    total_time = time.time() - start_time
    print(f"Файл готов за {total_time:.2f} сек.")

    return artifacts

def generate_company_features_for_inference(company_id: str, feature_maps: dict):
    """Look up the company feature vector for a single company id.

    The vector holds, in order: ten per-class target-encoding values, the
    dominant-class share, the diversity value and an ``is_new`` flag (1.0 when
    the company is not in ``known_companies``). Values missing from the lookup
    tables are replaced by the matching entry of ``feature_maps['priors']``.

    Returns:
        float32 vector of length 13.
    """
    priors = feature_maps['priors']
    te_tables = feature_maps['te_per_class']

    te_features = np.array(
        [te_tables[cls].get(company_id, priors['te_per_class']) for cls in range(10)],
        dtype=np.float32,
    )
    dominant_feature = feature_maps['dominant_class'].get(company_id, priors['dominant_class'])
    diversity_feature = feature_maps['diversity'].get(company_id, priors['diversity'])

    if company_id in feature_maps.get('known_companies', set()):
        is_new = np.float32(0.0)
    else:
        is_new = np.float32(1.0)

    stacked = np.hstack([te_features, dominant_feature, diversity_feature, is_new])
    return stacked.astype(np.float32)


# Build and save the company lookup tables from the full training frame.
# NOTE(review): the tables are computed from the labels of ALL rows; if
# `company_features` is used in cross-validated evaluation the scores may be
# optimistic — confirm.
feature_maps = build_prod_company_features(train_clean, "prod_company_features.joblib")

# One 13-value feature vector per training row, stacked into a 2-D array.
company_features = np.array(
    [generate_company_features_for_inference(cid, feature_maps)
    for cid in tqdm(train_clean['company_id'], desc="  Generating company feats")]
)

### 5.4 Training and saving L1 models

#### 5.4.1 TF-IDF + LogReg


In [None]:
def build_prod_tfidf_lr_model(data, save_prefix, analyzer, ngram_range, max_features, C,
                              min_df=2, lowercase=False):
  """Fit a TF-IDF + logistic-regression model on all rows and save both parts.

  Args:
      data: DataFrame with ``text_enriched`` and ``label_id`` columns.
      save_prefix: file prefix; ``{save_prefix}_vect.joblib`` and
          ``{save_prefix}_clf.joblib`` are written.
      analyzer: ``TfidfVectorizer`` analyzer (``'word'`` or ``'char'``).
      ngram_range: n-gram range of the vectorizer.
      max_features: vocabulary size cap of the vectorizer.
      C: inverse regularization strength of the logistic regression.
      min_df: minimum document frequency of a term (default keeps the
          previously hard-coded value 2).
      lowercase: whether the vectorizer lowercases the text (default keeps the
          previously hard-coded value False).

  Returns:
      ``(vectorizer, classifier)``, both fitted.

  NOTE(review): the out-of-fold word model in ``train_word_tfidf`` uses
  ``min_df=5`` and ``lowercase=True``. The stacker is trained on those OOF
  outputs, so consider passing matching values here — confirm with the owner.
  """
  X_text = data['text_enriched']
  y = data['label_id'].values

  vect = TfidfVectorizer(
      analyzer=analyzer,
      ngram_range=ngram_range,
      min_df=min_df,
      max_features=max_features,
      lowercase=lowercase,
      sublinear_tf=True,
      dtype=np.float32
  )

  start_time = time.time()
  X_tf = vect.fit_transform(X_text)
  print(f"TF-IDF обучена за {time.time() - start_time:.2f} сек. Форма: {X_tf.shape}")

  clf = LogisticRegression(
      penalty='l2',
      C=C,
      solver='saga',
      max_iter=3000,
      n_jobs=-1,
      random_state=42
  )

  start_time = time.time()
  clf.fit(X_tf, y)
  print(f"LogReg обучена за {time.time() - start_time:.2f} сек.")

  vect_path = f"{save_prefix}_vect.joblib"
  clf_path = f"{save_prefix}_clf.joblib"

  joblib.dump(vect, vect_path)
  joblib.dump(clf, clf_path)

  print(f"  Артефакты сохранены:\n    1. {vect_path}\n    2. {clf_path}")

  return vect, clf

# Make sure the training frame has a plain 0..n-1 index before fitting.
if not isinstance(train_clean.index, pd.RangeIndex):
  train_clean = train_clean.reset_index(drop=True)

# Production character n-gram model (2-5 grams).
vect_b, clf_b = build_prod_tfidf_lr_model(
        data=train_clean,
        save_prefix="prod_char_lr_25",
        analyzer='char',
        ngram_range=(2, 5),
        max_features=200000,
        C=1.0
    )

# Production word n-gram model (1-2 grams).
vect_e, clf_e = build_prod_tfidf_lr_model(
        data=train_clean,
        save_prefix="prod_word_lr_12",
        analyzer='word',
        ngram_range=(1, 2),
        max_features=150000,
        C=1.0
    )

#### 5.4.2 XLM-Roberta

In [None]:
class TextDataset(Dataset):
    """Dataset that tokenizes one text per item for sequence classification.

    Every item is a dict with flattened ``input_ids`` and ``attention_mask``
    tensors (padded/truncated to ``max_length``) and a ``labels`` tensor of
    dtype ``torch.long``.
    """

    def __init__(self, texts, labels, tokenizer, max_length=256):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        raw_text = str(self.texts[idx])
        target = self.labels[idx]

        encoded = self.tokenizer(
            raw_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # The tokenizer returns tensors with a leading batch axis; flatten it away.
        item = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(target, dtype=torch.long)
        return item

def build_prod_transformer_model(data, model_name, save_path, num_epochs=5, batch_size=64,
                                 learning_rate=2e-5, label_smoothing_factor=0.0):
    """Fine-tune a transformer classifier for production and save it.

    The data is split 90/10 (stratified, ``random_state=42``); the 10% part is
    used for per-epoch evaluation, early stopping (patience 2) and best-model
    selection. The model and the tokenizer are both written to ``save_path``.

    Args:
        data: DataFrame with ``text_enriched`` and ``label_id`` columns.
        model_name: Hugging Face checkpoint to start from.
        save_path: directory for the fine-tuned model and tokenizer.
        num_epochs: maximum number of training epochs.
        batch_size: per-device training batch size (evaluation uses twice that).
        learning_rate: learning rate passed to ``TrainingArguments``.
        label_smoothing_factor: label smoothing passed to ``TrainingArguments``;
            the default 0.0 keeps the previous behaviour.

    NOTE(review): the out-of-fold run in ``train_xlm_finetuned_oof`` trains
    with ``label_smoothing_factor=0.1`` and ``metric_for_best_model="accuracy"``.
    The stacker is fitted on those OOF probabilities, so confirm whether this
    production model should use the same settings.
    """
    start_time = time.time()

    X_text = data['text_enriched'].tolist()
    y_labels = data['label_id'].tolist()
    C = data['label_id'].nunique()

    X_train, X_val, y_train, y_val = train_test_split(
        X_text, y_labels, test_size=0.10, random_state=42, stratify=y_labels
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=C,
        ignore_mismatched_sizes=True
    )

    train_dataset = TextDataset(X_train, y_train, tokenizer)
    val_dataset = TextDataset(X_val, y_val, tokenizer)

    def compute_metrics(pred):
        preds = np.argmax(pred.predictions, axis=1)
        labels = pred.label_ids
        acc = accuracy_score(labels, preds)
        return {"accuracy": float(acc)}

    # Mixed precision is only meaningful on CUDA. Querying the device
    # capability without a GPU raises, so guard it. On CUDA the choice is the
    # same as before: bf16 on compute capability >= 8, fp16 otherwise.
    has_cuda = torch.cuda.is_available()
    use_bf16 = has_cuda and torch.cuda.get_device_capability(0)[0] >= 8
    use_fp16 = has_cuda and not use_bf16

    args_kwargs = dict(
        output_dir=f"./temp_logs_{save_path}",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size * 2,
        gradient_accumulation_steps=1,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        report_to="none",
        fp16=use_fp16,
        bf16=use_bf16,
        learning_rate=learning_rate,
        warmup_steps=100,
        dataloader_num_workers=2,
        logging_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        label_smoothing_factor=label_smoothing_factor,
    )

    # The evaluation-strategy argument was renamed between transformers
    # releases; use whichever name this installation accepts.
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters:
        args_kwargs["eval_strategy"] = "epoch"
    else:
        args_kwargs["evaluation_strategy"] = "epoch"

    training_args = TrainingArguments(**args_kwargs)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    trainer.train()

    print(f"  Обучение завершено. Сохраняем модель в {save_path}...")
    trainer.save_model(save_path)

    print(f"  Сохраняем ТОКЕНИЗАТОР в {save_path}...")
    tokenizer.save_pretrained(save_path)

    # Release GPU memory held by the model.
    del model, trainer
    torch.cuda.empty_cache()

    total_time = time.time() - start_time
    print(f" Завершено за {total_time:.2f} сек.")


# Fine-tune the production XLM-R classifier on the full training frame; the
# model and tokenizer are written to the 'prod_xlmr_finetuned_hdbscan' directory.
build_prod_transformer_model(
    data=train_clean,
    model_name='Zamza/XLM-roberta-large-ftit-emb-lr01',
    save_path='prod_xlmr_finetuned_hdbscan',
    num_epochs=8,
    batch_size=64,
    learning_rate=2e-5
)



### 5.5 Training and saving L2 model

In [None]:
y_FULL = train_clean['label_id'].values

def to_logit(p, eps=1e-6):
    """Log-odds of probabilities clipped to [eps, 1 - eps]."""
    p = np.clip(p, eps, 1-eps)
    return np.log(p) - np.log1p(-p)

# Level-1 out-of-fold probabilities, one (N, C) matrix per base model.
oof_xlmr = np.load("embed_oof_xlm_roberta_finetuned_hdbscan.npy")  # (N, C)
oof_char_25 = np.load("char_lr_oof_proba_enriched_v1_25_hdbscan.npy")  # (N, C)
oof_word_12 = np.load("word_lr_oof_proba_enriched_v1_12_hdbscan.npy") # (N, C)

# to_logit used to be redefined here without the `eps` parameter. That copy
# silently depended on a global `eps` left behind by an earlier cell, so it was
# removed; the definition above gives the same values.
A = to_logit(oof_xlmr)
B = to_logit(oof_char_25)
C = to_logit(oof_word_12)

# Meta-feature matrix: L1 logits plus the hand-crafted feature blocks.
X_meta_FULL = np.hstack([A,B,C, extra, domain_features, company_features]).astype(np.float32)

print(f"Финальная форма X_meta_FULL: {X_meta_FULL.shape}")
print(f"Финальная форма y_FULL: {y_FULL.shape}")

# Same hyper-parameters as the tuned CV stacker, with a fixed number of trees
# and no early stopping because no validation set is held out here.
params = {
    'n_estimators': 800,
    'learning_rate': 0.01253085083690128,
    'max_depth': 10,
    'reg_lambda': 3.507499070233551,
    'reg_alpha': 1.3946130633958758,
    'subsample': 0.7935545348903001,
    'colsample_bytree': 0.7853058760189121,
    'gamma': 0.8458201681412074,
    'min_child_weight': 5,
    'random_state': 42,
    'n_jobs': -1,
    'tree_method': 'hist',
    'device': 'cuda',
    'objective': 'multi:softprob',
    'num_class': 10}

final_stacker = XGBClassifier(
    **params
)

final_stacker.fit(X_meta_FULL, y_FULL, verbose=100)

save_path_l2 = "prod_stacker_L2.xgb"
final_stacker.save_model(save_path_l2)

## 6. Inference Pipeline

In [None]:
def load_prod_features(load_path):
    """Load a joblib feature-map artifact.

    Returns:
        The loaded object, or ``None`` (after printing an error message) when
        ``load_path`` does not exist.
    """
    if os.path.exists(load_path):
        print(f"Загрузка артефактов L2-фич из {load_path}...")
        # joblib uses pickle under the hood: only load files you created yourself.
        return joblib.load(load_path)

    print(f"ОШИБКА: Файл артефактов L2-фич не найден: {load_path}")
    return None

def load_all_prod_artifacts():
    """Load every artifact the inference pipeline needs into one dictionary.

    Keys: ``tok_A`` / ``clf_A`` (fine-tuned XLM-R tokenizer and model),
    ``vect_B`` / ``clf_B`` (char TF-IDF + LogReg), ``vect_C`` / ``clf_C``
    (word TF-IDF + LogReg), ``company_feature_maps``, ``domain_feature_maps``,
    ``stacker_L2`` (XGBoost stacker) and ``hdbscan``.

    Raises:
        FileNotFoundError: if one of the feature-map files is missing.
    """
    artifacts = {}

    # The joblib files below are pickles: only load artifacts produced by this notebook.

    # A (XLM-R). The tokenizer was saved next to the fine-tuned model, so both
    # are loaded from the same local directory; this keeps inference
    # independent of the hub.
    xlmr_dir = './prod_xlmr_finetuned_hdbscan'
    artifacts['tok_A'] = AutoTokenizer.from_pretrained(xlmr_dir)
    artifacts['clf_A'] = AutoModelForSequenceClassification.from_pretrained(xlmr_dir).to(device).eval()
    # B (char_lr_25)
    artifacts['vect_B'] = joblib.load("prod_char_lr_25_vect.joblib")
    artifacts['clf_B']  = joblib.load("prod_char_lr_25_clf.joblib")
    # C (word_lr_12)
    artifacts['vect_C'] = joblib.load("prod_word_lr_12_vect.joblib")
    artifacts['clf_C']  = joblib.load("prod_word_lr_12_clf.joblib")

    # L2 (Mappers)
    artifacts['company_feature_maps'] = load_prod_features("prod_company_features.joblib")
    artifacts['domain_feature_maps']  = load_prod_features("prod_domain_features.joblib")
    # load_prod_features returns None for a missing file. Fail here instead of
    # printing the success message and crashing later on a None lookup.
    for key in ('company_feature_maps', 'domain_feature_maps'):
        if artifacts[key] is None:
            raise FileNotFoundError(f"Missing L2 feature artifact for '{key}'")

    # L2 (XGBoost)
    artifacts['stacker_L2'] = XGBClassifier()
    artifacts['stacker_L2'].load_model("prod_stacker_L2.xgb")

    artifacts['hdbscan'] = joblib.load("hdbscan_artifacts.joblib")

    print("Все Артифакты загружены успешно!")

    return artifacts

In [None]:
# Torch device for inference: CUDA when available, otherwise CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

_whitespace = re.compile(r"\s+")
_boiler_patterns = [
    r"^с уважением.*",
    r"^best regards.*",
    r"^kind regards.*",
    r"^отправлено.*",
    r"^sent from my.*",
    r"^-{2,}.*",
    r"^_{2,}.*",
    r"^\*{2,}.*",
]

def _norm_ws(s: str) -> str:
    return _whitespace.sub(" ", s.strip())

def _strip_boilerplate(text: str) -> str:
    lines = [l.strip() for l in text.splitlines() if l.strip() != ""]
    cleaned = []
    for line in lines:
        bad = False
        for pat in _boiler_patterns:
            if re.match(pat, line, flags=re.I): bad = True; break
        if not bad: cleaned.append(line)
    return " ".join(cleaned).strip()

def merge_subject_details(subject, details, sep="[SEP]", sim_threshold=0.92, max_chars=700):
    """Merge a ticket subject and its details into one text.

    Returns a 4-tuple ``(text, n_subject, n_details, n_text)``.
    ``n_text`` is always the length of the returned text; ``n_subject`` and
    ``n_details`` are the lengths of the whitespace-normalized subject and
    details, except in the "no details" branch (see note below).

    Args:
        subject: subject text; NaN/None is treated as "".
        details: details text; NaN/None is treated as "".
        sep: separator token placed between subject and details.
        sim_threshold: SequenceMatcher ratio at or above which the details are
            considered a duplicate of the subject and dropped.
        max_chars: maximum length of the merged text (truncated beyond that).
    """
    s = "" if pd.isna(subject) else str(subject)
    d = "" if pd.isna(details) else str(details)
    s = _norm_ws(s)
    d = _norm_ws(d)

    # NOTE(review): this branch reports 0 as the subject length even when the
    # subject is non-empty, unlike the other branches — confirm intended.
    if not d:
        return s, 0, 0, len(s)

    # Details that duplicate the subject (near-identical, or one is a prefix of
    # the other) are dropped and only the subject is kept.
    # NOTE(review): str.startswith("") is always True, so an empty subject with
    # non-empty details also takes this branch and returns "" — confirm intended.
    sim = SequenceMatcher(None, s, d).ratio()
    if sim >= sim_threshold or d.startswith(s) or s.startswith(d):
        return s, len(s), len(d), len(s)

    # Remove literal repetitions of the subject from the details.
    d2 = d.replace(s, " ").strip()
    d2 = _norm_ws(d2)

    # NOTE(review): d2 was whitespace-normalized above, so it contains no line
    # breaks here and _strip_boilerplate sees a single line — confirm intended.
    d2 = _strip_boilerplate(d2)

    if not d2:
        return s, len(s), len(d), len(s)

    merged = f"{s} {sep} {d2}".strip()

    if len(merged) > max_chars:
        merged = merged[:max_chars].rstrip()

    return merged, len(s), len(d), len(merged)

def normalize_text_preserve_info(s: str):
    """Lower-case and lightly normalise text while keeping its information.

    Steps, in order: strip and lower-case; turn non-breaking spaces into plain
    spaces; delete zero-width characters (U+200B, U+200C, U+200D, U+2060);
    replace URLs with " <url> " and dotted-quad IPv4-looking tokens with
    " <ip> "; collapse whitespace runs to a single space.

    The patterns previously used doubled backslashes (e.g. r'\\s+',
    r'[\\u200b...]'). Inside a raw string that makes the regex match a literal
    backslash, so the zero-width character class instead deleted the
    characters `\\ u 2 0 b c d 6` from every text, and the URL / IP /
    whitespace rules never fired. The escapes are now single.

    Args:
        s: Input text; None or float NaN is treated as missing.

    Returns:
        The normalised string ("" for missing input).
    """
    if s is None or (isinstance(s, float) and np.isnan(s)):
        return ""
    # str() keeps non-string inputs (e.g. numeric subjects) from raising.
    x = str(s).strip().lower()
    x = x.replace("\u00a0", " ")
    x = re.sub(r'[\u200b\u200c\u200d\u2060]', '', x)
    x = re.sub(r'(?:https?://|ftp://|www\.)\S+', ' <url> ', x, flags=re.I)
    x = re.sub(r'\b(?:\d{1,3}\.){3}\d{1,3}\b', ' <ip> ', x)
    x = re.sub(r'\s+', ' ', x)
    return x.strip()

def enrich_short_texts(text_norm, is_subtask, len_text, has_details):
    """Prefix short texts with a few descriptive tags.

    Texts with 10 or more whitespace-separated tokens are returned unchanged.
    Shorter ones get "<tags>: " prepended, where the tags describe whether the
    ticket is a subtask (`is_subtask == 1`), a size bucket chosen from
    `len_text` (< 20, < 50, otherwise), and whether details are present
    (`has_details == 1`).
    """
    if len(text_norm.split()) >= 10:
        return text_norm

    task_kind = "дочерняя задача" if is_subtask == 1 else "новая задача"

    if len_text < 20:
        size_tag = "короткая"
    elif len_text < 50:
        size_tag = "средняя"
    else:
        size_tag = "подробная"

    tags = [task_kind, size_tag]
    if has_details == 1:
        tags.append("с деталями")

    return " ".join(tags) + ": " + text_norm

def to_logit(p, eps=1e-6):
    """Convert probabilities to log-odds.

    `p` is clipped to [eps, 1 - eps] first so that 0 and 1 stay finite; the
    result is log(p) - log1p(-p).
    """
    clipped = np.clip(p, eps, 1-eps)
    log_p = np.log(clipped)
    log_not_p = np.log1p(-clipped)
    return log_p - log_not_p

def generate_company_features_for_inference(company_id: str, feature_maps: dict):
    """Build the company feature vector for one ticket.

    Reads, for `company_id`, the ten per-class values in
    `feature_maps['te_per_class']`, plus `dominant_class` and `diversity`.
    Any value missing for this company falls back to the matching entry of
    `feature_maps['priors']`. The last element is 1.0 when the company is not
    in `feature_maps['known_companies']` (absent key = empty set), else 0.0.

    NOTE(review): the same `priors['te_per_class']` value is used as the
    fallback for every class — presumably a scalar; confirm.

    Returns:
        float32 array: te values, dominant class, diversity, is-new flag.
    """
    def _lookup(table_key):
        # Company-specific value, or the stored prior for unseen companies.
        return feature_maps[table_key].get(company_id, feature_maps['priors'][table_key])

    te_features = np.zeros(10, dtype=np.float32)
    for cls in range(10):
        te_table = feature_maps['te_per_class'][cls]
        te_features[cls] = te_table.get(company_id, feature_maps['priors']['te_per_class'])

    dominant_feature = _lookup('dominant_class')
    diversity_feature = _lookup('diversity')

    if company_id in feature_maps.get('known_companies', set()):
        is_new = np.float32(0.0)
    else:
        is_new = np.float32(1.0)

    stacked = np.hstack([te_features, dominant_feature, diversity_feature, is_new])
    return stacked.astype(np.float32)

def generate_domain_features_for_inference(text_enriched: str, feature_maps: dict):
    """Build regex-based domain features for one text.

    For every class id in `feature_maps['id2prefix']` (in sorted order) two
    slots are emitted: a 0/1 "pattern found" flag and the number of matches of
    that class's compiled regex in `text_enriched`. Classes whose name has no
    entry in `feature_maps['compiled_regexes']` keep both slots at 0.

    Returns:
        float32 array of length 2 * len(id2prefix).
    """
    id2prefix = feature_maps['id2prefix']
    compiled_regexes = feature_maps['compiled_regexes']

    domain_features = np.zeros(len(id2prefix) * 2, dtype=np.float32)
    for slot, class_id in enumerate(sorted(id2prefix.keys())):
        name = id2prefix[class_id]
        if name not in compiled_regexes:
            continue
        regex = compiled_regexes[name]
        base = 2 * slot
        domain_features[base] = 1 if regex.search(text_enriched) else 0
        domain_features[base + 1] = len(regex.findall(text_enriched))
    return domain_features.astype(np.float32)


def generate_l1_features(text_enriched: str, artifacts: dict):
    """Run the three level-1 models on one text and stack their logits.

    Model A (XLM-R) is a tokenizer + sequence classifier pair
    (`tok_A`, `clf_A`); models B (char_lr_25) and C (word_lr_12) are
    vectorizer + classifier pairs (`vect_*`, `clf_*`). Each model's class
    probabilities are converted with `to_logit` and concatenated in the order
    A, B, C.

    Returns:
        float32 1-D array with the logits of A, then B, then C.
    """
    # Model A (XLM-R): tokenize to a fixed 256-token window and run on `device`.
    encoded = artifacts['tok_A'](
        text_enriched,
        truncation=True,
        padding='max_length',
        max_length=256,
        return_tensors='pt',
    )
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
    with torch.no_grad():
        logits_A = artifacts['clf_A'](**encoded).logits
        proba_A = torch.softmax(logits_A, dim=1).cpu().numpy()[0]

    def _vectorized_proba(vect_key, clf_key):
        # Vectorize the single text, then take the first (only) probability row.
        features = artifacts[vect_key].transform([text_enriched])
        return artifacts[clf_key].predict_proba(features)[0]

    # Model B (char_lr_25) and model C (word_lr_12).
    proba_B = _vectorized_proba('vect_B', 'clf_B')
    proba_C = _vectorized_proba('vect_C', 'clf_C')

    # Level-1 feature block: logits of A, B and C side by side.
    stacked = np.hstack([to_logit(proba_A), to_logit(proba_B), to_logit(proba_C)])
    return stacked.astype(np.float32)

def generate_extra_features(text_enriched: str, has_details: int, added_details: int, is_subtask: int, len_subject: int, len_details: int):
    """Build the 12 hand-crafted numeric features for one ticket.

    Feature order (must stay stable, the array is fed to the stacker):
        is_subtask, len_subject, len_details, len_text, has_details,
        added_details, len_char, len_tok, log_len_char, log_len_tok,
        details_combo, uniq_ratio

    `len_text` and `len_char` are both the character length of
    `text_enriched`; `len_tok` counts whitespace-separated tokens;
    `uniq_ratio` is unique tokens / (len_tok + 1); `details_combo` is
    has_details + added_details.

    Missing text (None or float NaN) no longer raises: the earlier version
    recomputed the lengths directly on `text_enriched` after the missing-value
    guard, so the guard was dead code and None/NaN crashed with TypeError.
    For missing text `len_char` and `log_len_char` are NaN, and `len_text`,
    `len_tok`, `log_len_tok` and `uniq_ratio` are 0.

    Returns:
        float32 array of shape (12,).
    """
    is_missing = (text_enriched is None) or (isinstance(text_enriched, float) and np.isnan(text_enriched))

    if is_missing:
        tokens = []
        len_text = 0
        len_char = np.nan
        log_len_char = np.nan
    else:
        s_str = str(text_enriched)
        tokens = s_str.split()
        len_text = len(s_str)
        len_char = len_text
        log_len_char = np.log1p(len_char)

    len_tok = len(tokens)
    # +1 in the denominator keeps the ratio defined for empty texts.
    uniq_ratio = len(set(tokens)) / (len_tok + 1)
    log_len_tok = np.log1p(len_tok)

    details_combo = int(has_details) + int(added_details)

    basic_features = np.array([
        is_subtask,
        len_subject,
        len_details,
        len_text,
        has_details,
        added_details,
        len_char,
        len_tok,
        log_len_char,
        log_len_tok,
        details_combo,
        uniq_ratio
    ]).astype(np.float32)

    return basic_features

In [None]:
def predict_one_pipeline(
    raw_subject: str,
    raw_details: str,
    raw_parent_uuid: str,
    raw_company_id: str,
    artifacts_bundle: dict
):
    """Run the full inference pipeline for a single ticket.

    Steps: merge subject and details, normalise the text, derive the
    subtask / details flags, enrich short texts, build the level-1, extra,
    domain and company feature blocks, and feed their concatenation to the
    level-2 stacker (`artifacts_bundle['stacker_L2']`).

    Returns:
        The stacker's class-probability row for this ticket.
    """
    merged_text, len_subject, len_details, merged_len = merge_subject_details(
        raw_subject, raw_details
    )
    text_norm = normalize_text_preserve_info(merged_text)

    # A ticket counts as a subtask when it has a non-empty, non-NaN parent id.
    is_subtask = int(bool(raw_parent_uuid and pd.notna(raw_parent_uuid)))
    has_details = int(len_details > 0)
    # Details "added" something only if the merged text outgrew the subject.
    added_details = int(has_details == 1 and merged_len > len_subject)

    text_enriched = enrich_short_texts(text_norm, is_subtask, merged_len, has_details)

    l1_block = generate_l1_features(text_enriched, artifacts_bundle)
    extra_block = generate_extra_features(
        text_enriched, has_details, added_details, is_subtask, len_subject, len_details
    )
    company_block = generate_company_features_for_inference(
        raw_company_id, artifacts_bundle['company_feature_maps']
    )
    domain_block = generate_domain_features_for_inference(
        text_enriched, artifacts_bundle['domain_feature_maps']
    )

    # Column order expected by the stacker: L1, extra, domain, company.
    feature_row = np.hstack([l1_block, extra_block, domain_block, company_block])
    X_meta_single = feature_row.reshape(1, -1)

    return artifacts_bundle['stacker_L2'].predict_proba(X_meta_single)[0]

# 7. Test


In [None]:
def _embed_texts_batch_from_artifacts(texts, artifacts_bundle, batch_size=64):
    """Embed `texts` in batches and return the stacked first-token vectors.

    Loads the "Zamza/XLM-roberta-large-ftit-emb-lr01" encoder and tokenizer,
    runs them on the GPU when CUDA is available, and for every batch keeps
    the hidden state at position 0 of the last layer.

    NOTE(review): `artifacts_bundle` is accepted but never read, and the
    encoder is re-loaded with from_pretrained on every call — consider
    caching it if this is called more than once.

    Returns:
        2-D numpy array with one row per input text.
    """
    model_name = "Zamza/XLM-roberta-large-ftit-emb-lr01"
    device_local = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    emb_model = AutoModel.from_pretrained(model_name)
    emb_tokenizer = AutoTokenizer.from_pretrained(model_name)
    emb_model.to(device_local)
    emb_model.eval()

    def _encode(batch):
        # Tokenize one batch, run the encoder without gradients, take token 0.
        enc = emb_tokenizer(batch, return_tensors='pt', truncation=True, padding=True, max_length=512)
        enc = {name: tensor.to(device_local) for name, tensor in enc.items()}
        with torch.no_grad():
            out = emb_model(**enc)
        return out.last_hidden_state[:, 0, :].cpu().numpy()

    batch_starts = range(0, len(texts), batch_size)
    return np.vstack([_encode(texts[start:start + batch_size]) for start in batch_starts])

def hdbscan_assign_batch_using_artifacts(texts, artifacts_bundle, strength_thresh=0.25, batch_size=64):
    """Assign texts to the stored HDBSCAN clusters.

    Texts are embedded, projected with the PCA stored under
    `artifacts_bundle['hdbscan']['pca']`, and passed to
    `hdbscan.approximate_predict` with the stored clusterer. A text is
    "in cluster" when its label is not -1 and its strength is at least
    `strength_thresh`.

    Returns:
        dict with integer `labels`, float `strengths` and the boolean
        `in_cluster` mask.
    """
    hdb_art = artifacts_bundle['hdbscan']
    reducer, clusterer = hdb_art['pca'], hdb_art['clusterer']

    embeddings = _embed_texts_batch_from_artifacts(texts, artifacts_bundle, batch_size=batch_size)
    reduced = reducer.transform(embeddings)
    raw_labels, raw_strengths = hdbscan.approximate_predict(clusterer, reduced)

    labels = np.array(raw_labels, dtype=int)
    strengths = np.array(raw_strengths, dtype=float)

    is_noise = labels == -1
    strong_enough = strengths >= strength_thresh
    in_cluster = ~is_noise & strong_enough
    return {'labels': labels, 'strengths': strengths, 'in_cluster': in_cluster}

def experiment_skip_noise_using_artifacts(test_df, artifacts_bundle, strength_thresh=0.25, batch_size=64):
    """Evaluate the pipeline only on rows that HDBSCAN places inside a cluster.

    Every row's subject and details are merged and normalised, then assigned
    to the stored clusters. Rows flagged as noise (or below
    `strength_thresh`) are skipped; the remaining rows are predicted one by
    one with `predict_one_pipeline` and scored against `label_id`.

    Side effect: adds a `text_for_hdbscan` column to `test_df`.

    The top-k scores are computed with an explicit `labels` list. Without it
    `top_k_accuracy_score` infers the classes from `y_true` and raises when
    the in-cluster subset does not contain every class that has a column in
    the probability matrix.

    Args:
        test_df: Frame with `subject`, `details`, `parent_uuid`,
            `company_id` and integer `label_id` columns.
        artifacts_bundle: Loaded production artifacts.
        strength_thresh: Minimum membership strength to count as in-cluster.
        batch_size: Embedding batch size.

    Returns:
        `{"covered": 0.0}` when no row is in-cluster; otherwise a dict with
        coverage (`covered` is a percentage), `n_total`, `n_in`, top-1/3/5
        accuracy, the probability matrix, true labels, top-1 predictions and
        the confusion matrix.
    """
    merged = test_df.apply(
        lambda r: merge_subject_details(r['subject'], r['details']),
        axis=1, result_type='expand'
    )

    # Column 0 of the expanded result is the merged text.
    test_df['text_for_hdbscan'] = merged[0]

    texts = test_df['text_for_hdbscan'].apply(normalize_text_preserve_info).tolist()

    assign = hdbscan_assign_batch_using_artifacts(texts, artifacts_bundle, strength_thresh=strength_thresh, batch_size=batch_size)

    in_mask = assign['in_cluster']
    n_total = len(texts)
    n_in = int(in_mask.sum())
    pct_in = 100.0 * n_in / n_total
    print(f"Всего строк: {n_total}, in-cluster: {n_in} ({pct_in:.2f}%), noise/skipped: {n_total-n_in}")

    # Predict only in-cluster
    all_probas = []
    all_true = []
    idxs = np.where(in_mask)[0].tolist()
    for i in tqdm(idxs, desc="Predicting in-cluster samples"):
        row = test_df.iloc[i]
        proba = predict_one_pipeline(row['subject'], row['details'], row['parent_uuid'], row['company_id'], artifacts_bundle)
        all_probas.append(proba)
        all_true.append(int(row['label_id']))

    if len(all_true) == 0:
        print("Нет in-cluster примеров — ничего не считаем.")
        return {"covered": 0.0}

    all_probas = np.vstack(all_probas)
    all_true = np.array(all_true, dtype=int)

    # Probability columns are indexed by class id 0..n_classes-1; pass that
    # explicitly so classes missing from this subset do not break the scorers.
    class_labels = np.arange(all_probas.shape[1])

    preds_top1 = np.argmax(all_probas, axis=1)
    acc_top1 = accuracy_score(all_true, preds_top1)
    acc_top3 = top_k_accuracy_score(all_true, all_probas, k=3, labels=class_labels)
    acc_top5 = top_k_accuracy_score(all_true, all_probas, k=5, labels=class_labels)

    print("\n--- Results (only in-cluster) ---")
    print(f"Covered (in-cluster) = {n_in}/{n_total} = {pct_in:.2f}%")
    print(f"Top-1: {acc_top1:.5f}, Top-3: {acc_top3:.5f}, Top-5: {acc_top5:.5f}")

    cm = confusion_matrix(all_true, preds_top1, labels=class_labels)
    return {
        "covered": float(pct_in),
        "n_total": n_total,
        "n_in": n_in,
        "top1": float(acc_top1),
        "top3": float(acc_top3),
        "top5": float(acc_top5),
        "probas": all_probas,
        "y_true": all_true,
        "preds_top1": preds_top1,
        "confusion_matrix": cm
    }


# Load the production artifacts and the test set.
artifacts_bundle = load_all_prod_artifacts()
test_df_existing_only = pd.read_csv('test_df_existing_only.csv')

# Class id -> department name; list position is the class id.
# NOTE(review): "Продажи " carries a trailing space. Series.map leaves NaN for
# names that do not match a key exactly — confirm the CSV uses the same spelling.
id2name = dict(enumerate([
    "Аутстаффинг",
    "Бухгалтерия",
    "Выездные специалисты",
    "Заправки и ремонт офисной техники",
    "Напоминание",
    "Обслуживание серверов",
    "Продажи ",
    "Проектный менеджер",
    "Сервисный центр",
    "Удаленная поддержка",
]))

# Reverse lookup used to turn department names into integer labels.
name2id = {name: class_id for class_id, name in id2name.items()}

test_df_existing_only["label_id"] = test_df_existing_only["department_name"].map(name2id)

# Evaluate only on rows that HDBSCAN does not flag as noise.
results = experiment_skip_noise_using_artifacts(test_df_existing_only, artifacts_bundle)

Results

In [None]:
def _per_class_topk_table(y_true, preds_top1, probas, labels, id2name):
    """Per-class support, top-1 recall and top-3 / top-5 hit rates as a DataFrame."""
    # Class ids ranked by descending probability for every row.
    ranked = np.argsort(probas, axis=1)[:, ::-1]
    top3_preds, top5_preds = ranked[:, :3], ranked[:, :5]

    rows = []
    for lab in labels:
        idx = np.where(y_true == lab)[0]
        if len(idx) == 0:
            rows.append({
                "label": lab,
                "name": id2name[lab],
                "support": 0,
                "top1_recall": None,
                "in_top3_rate": None,
                "in_top5_rate": None
            })
            continue
        rows.append({
            "label": lab,
            "name": id2name[lab],
            "support": len(idx),
            "top1_recall": float((preds_top1[idx] == lab).mean()),
            "in_top3_rate": float((top3_preds[idx] == lab).any(axis=1).mean()),
            "in_top5_rate": float((top5_preds[idx] == lab).any(axis=1).mean())
        })
    return pd.DataFrame(rows).set_index("label")


def print_experiment_summary(results: dict, id2name: dict):
    """Print accuracy, ROC-AUC, classification report and per-class coverage.

    Args:
        results: Dict as returned by `experiment_skip_noise_using_artifacts`.
            Reads `y_true`, `probas` and/or `preds_top1`, and optionally
            `n_total`, `n_in`, `covered` (a percentage), `top3`, `top5`.
        id2name: Mapping class id -> display name; ids 0..max(id2name) are
            all reported.

    Returns:
        Dict with `acc_top1`, `acc_top3`, `acc_top5`,
        `classification_report_text`, plus `roc_auc` (None when it cannot be
        computed) and `per_class` (DataFrame or None).

    Raises:
        ValueError: if `results` has neither `preds_top1` nor `probas`.

    Fixes compared with the previous version:
      * ROC-AUC is only computed when usable probabilities exist, receives an
        explicit `labels` list, and degrades to None instead of crashing
        (e.g. when a class is absent from `y_true`).
      * top-k accuracy receives `labels`, so a subset missing some class no
        longer raises.
      * a coverage of exactly 0.0 no longer ends up as None and breaks the
        formatted print.
      * the "in_cluster fraction" line prints a fraction (covered / 100)
        rather than the percentage.
      * `preds_top1` is coerced to an array so list inputs work.
    """
    probas = results.get("probas", None)
    y_true = np.asarray(results.get("y_true", []), dtype=int)
    preds_top1 = results.get("preds_top1", None)
    if preds_top1 is None and probas is not None:
        preds_top1 = np.argmax(probas, axis=1)
    elif preds_top1 is None:
        raise ValueError("No preds_top1 or probas found in results.")
    preds_top1 = np.asarray(preds_top1)

    acc_top1 = accuracy_score(y_true, preds_top1)

    probas_usable = probas is not None and len(probas) == len(y_true)
    roc_auc = None
    if probas_usable:
        probas = np.asarray(probas)
        # Probability columns are indexed by class id 0..n_classes-1.
        score_labels = np.arange(probas.shape[1])
        acc_top3 = top_k_accuracy_score(y_true, probas, k=3, labels=score_labels)
        acc_top5 = top_k_accuracy_score(y_true, probas, k=5, labels=score_labels)
        try:
            roc_auc = roc_auc_score(
                y_true, probas,
                multi_class='ovr',
                average='weighted',
                labels=score_labels
            )
        except ValueError as exc:
            # Raised e.g. when a class has no positive sample in y_true.
            print(f"ROC-AUC skipped: {exc}")
            roc_auc = None
        if roc_auc is not None and np.isnan(roc_auc):
            roc_auc = None
    else:
        acc_top3 = results.get("top3", None)
        acc_top5 = results.get("top5", None)

    n_total = results.get("n_total", None)
    n_in = results.get("n_in", None)
    covered_pct = results.get("covered", None)
    if covered_pct is None and n_total and n_in is not None:
        covered_pct = 100.0 * n_in / n_total

    if n_total is not None and n_in is not None:
        if covered_pct is not None:
            print(f"Rows total: {n_total}, in-cluster (covered): {n_in} ({covered_pct:.2f}%)")
        else:
            print(f"Rows total: {n_total}, in-cluster (covered): {n_in}")
    elif n_total is not None:
        print(f"Rows total: {n_total}")
    if results.get("covered") is not None:
        # `covered` is stored as a percentage; show it as a 0..1 fraction here.
        print(f"in_cluster fraction: {results['covered'] / 100.0:.4f}")

    print(f"\nTop-1 Accuracy: {acc_top1:.5f}")
    if acc_top3 is not None:
        print(f"Top-3 Accuracy: {acc_top3:.5f}")
    if acc_top5 is not None:
        print(f"Top-5 Accuracy: {acc_top5:.5f}")
    if roc_auc is not None:
        print(f"ROC-AUC (weighted OVR): {roc_auc:.5f}")

    max_label = max(id2name.keys())
    labels = list(range(max_label+1))
    target_names = [id2name[i] for i in labels]

    print("\nClassification report:\n")
    report_txt = classification_report(y_true, preds_top1, labels=labels, target_names=target_names, digits=4)
    print(report_txt)

    cm = confusion_matrix(y_true, preds_top1, labels=labels)

    print("\nConfusion matrix:")
    print(cm)

    extra_df = None
    if probas_usable:
        extra_df = _per_class_topk_table(y_true, preds_top1, probas, labels, id2name)
        print("\nPer-class top-k coverage (support, top1 recall, in_top3, in_top5):")
        print(extra_df[["name","support","top1_recall","in_top3_rate","in_top5_rate"]])

    return {
        "acc_top1": acc_top1,
        "acc_top3": acc_top3,
        "acc_top5": acc_top5,
        "classification_report_text": report_txt,
        "roc_auc": roc_auc,
        "per_class": extra_df
    }

In [None]:
# Print the header, then the full metric summary for the `results` produced by
# experiment_skip_noise_using_artifacts; the returned metrics dict is kept in `summary`.
print("Результаты на известных компаниях без выбросов \n")
summary = print_experiment_summary(results, id2name)