In [1]:
import pandas as pd
# Only these two columns are used by the rest of the notebook.
comment_columns = ['comment_id', 'text_original']

unseen = pd.read_csv('unseen.csv')[comment_columns]
seen = pd.concat(
    [pd.read_csv('train.csv'), pd.read_csv('validation.csv')],
    ignore_index=True,
)[comment_columns]

# `unseen` is concatenated first so that, for a comment_id present in both sets,
# drop_duplicates (which keeps the first occurrence) retains the `unseen` row.
# ignore_index / reset_index give the result a unique 0..N-1 index: without them
# every CSV contributes its own 0-based labels, and duplicate index labels make
# later index-aligned operations (e.g. assigning a Series) ambiguous or failing.
df = (
    pd.concat([unseen, seen], ignore_index=True)
    .drop_duplicates(subset=['comment_id'])
    .reset_index(drop=True)
)

In [2]:
import joblib

# Load the serialized language-identification pipeline from disk. Its predictions
# are later compared against the label "kk" to flag Kazakh comments.
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code while
# loading — only load this file if it comes from a trusted source.
pipeline_lang = joblib.load("langid_ru_kk_model.pkl")

In [3]:
def predict_lang(texts):
    """
    Predict language labels with the module-level `pipeline_lang`.

    Args:
        texts: either a single string or an iterable of texts.

    Returns:
        For a single string, the first (only) prediction for that string.
        For an iterable, whatever `pipeline_lang.predict` returns for the
        list built from it.
    """
    is_single = isinstance(texts, str)
    # A lone string is wrapped so that predict() always receives a list.
    batch = [texts] if is_single else list(texts)
    predictions = pipeline_lang.predict(batch)
    return predictions[0] if is_single else predictions

In [4]:
# Missing comments are replaced with "" before stringification; a bare
# .astype(str) would turn NaN into the literal text "nan" and feed that to the
# language model as if it were a real comment.
df["lang_model"] = predict_lang(df["text_original"].fillna("").astype(str))
# True for rows whose predicted label is exactly "kk".
df["is_kazakh_model"] = df["lang_model"].eq("kk")

In [7]:
# Split the rows by the language prediction. The mask is taken once so both halves
# come from the same partition, and .copy() makes each half an independent frame:
# new columns are added to `df` later, which on a plain boolean-filtered slice can
# trigger pandas' SettingWithCopyWarning.
# NOTE: this cell rebinds `df`; re-running it after the first run leaves `kaz_df`
# empty, because `df` then no longer contains any rows flagged as Kazakh.
kazakh_mask = df["is_kazakh_model"]
kaz_df = df[kazakh_mask].copy()
df = df[~kazakh_mask].copy()

In [9]:
# Write the rows flagged as Kazakh to a CSV file, without the index column.
kaz_df.to_csv('kaz_df.csv', index=False)

In [10]:
# !pip install transformers sentencepiece --quiet
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm

# Checkpoint identifier handed to from_pretrained() for both tokenizer and model.
model_checkpoint = "cointegrated/rubert-tiny-toxicity"

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

tokenizer_tox = AutoTokenizer.from_pretrained(model_checkpoint)
model_tox = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
model_tox.to(device)
model_tox.eval()  # inference mode (disables dropout etc.)

# Aspect names, taken from the model description according to the original comment;
# they are used to name the per-aspect output columns (tox_<name>).
# NOTE(review): the aggregate formula used in this notebook,
# 1 - p[:, 0] * (1 - p[:, -1]), only yields a toxicity score if column 0 is the
# probability of the text being NON-toxic, which contradicts the name "toxic"
# below. The published model card appears to list the labels as
# non-toxic, insult, obscenity, threat, dangerous — confirm against
# model_tox.config.id2label before interpreting the tox_* columns.
tox_aspect_names = ["toxic", "obscene", "threat", "insult", "hate"]


In [11]:
def text2toxicity_batch(texts, aggregate=False):
    """
    Score a batch of texts with the toxicity classifier (`model_tox`).

    Args:
        texts: iterable of raw values (strings, None, NaN, numbers, ...).
            A single ``str`` is treated as a one-element batch instead of being
            iterated character by character.
        aggregate: if False, return the per-label probability matrix;
            if True, return one aggregated score per text.

    Returns:
        numpy array of shape [batch, n_labels] when ``aggregate`` is False, or of
        shape [batch] when it is True. An empty input yields an empty array
        without calling the tokenizer or the model.
    """
    if isinstance(texts, str):
        texts = [texts]

    # Normalise the input: missing values (None, NaN, pd.NA, NaT, numpy NaN
    # scalars) become "", everything else goes through str().
    # is_scalar() guards pd.isna(), which returns an array for list-like values.
    cleaned_texts = [
        "" if (pd.api.types.is_scalar(t) and pd.isna(t)) else str(t)
        for t in texts
    ]

    if not cleaned_texts:
        # Nothing to score; skip the tokenizer/model round-trip entirely.
        if aggregate:
            return np.empty(0, dtype=np.float32)
        return np.empty((0, model_tox.config.num_labels), dtype=np.float32)

    with torch.no_grad():
        inputs = tokenizer_tox(
            cleaned_texts,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512,
        ).to(device)

        logits = model_tox(**inputs).logits
        # Independent sigmoid per label (multilabel output), shape [batch, n_labels].
        proba = torch.sigmoid(logits).cpu().numpy()

    if aggregate:
        # 1 - p(first label) * (1 - p(last label)). The original comment says this
        # mirrors a `text2toxicity` helper, which is not visible in this file.
        return 1 - proba[:, 0] * (1 - proba[:, -1])  # shape: [batch]
    return proba  # shape: [batch, n_labels]


In [12]:
def add_toxicity_to_df(df, text_col="text_original", batch_size=64, add_aspects=True,
                       aspect_names=None):
    """
    Add toxicity scores to ``df`` (modified in place) and return it.

    Args:
        df: DataFrame holding the texts.
        text_col: name of the column with the text to score.
        batch_size: number of rows sent to the model per call; must be >= 1.
        add_aspects: if True, also add one ``tox_<name>`` column per model output.
        aspect_names: names for the per-aspect columns, in model output order.
            Defaults to the module-level ``tox_aspect_names``.

    Returns:
        The same ``df`` object with a ``toxicity_score`` column and, when
        ``add_aspects`` is True, the ``tox_<name>`` columns.

    Raises:
        ValueError: if ``batch_size`` is < 1, or if the number of aspect names
            does not match the number of model output columns.

    NOTE(review): the aggregate score multiplies column 0 by (1 - last column),
    i.e. it treats column 0 as a "safe" probability — confirm that the aspect
    names match the model's actual label order.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    if add_aspects and aspect_names is None:
        aspect_names = tox_aspect_names

    n_rows = len(df)
    if n_rows == 0:
        # No batches to run: np.vstack([]) would raise, so create the (empty)
        # output columns directly.
        df["toxicity_score"] = np.empty(0, dtype=np.float64)
        if add_aspects:
            for name in aspect_names:
                df[f"tox_{name}"] = np.empty(0, dtype=np.float32)
        return df

    batch_probas = []
    for start in tqdm(range(0, n_rows, batch_size)):
        batch_texts = df[text_col].iloc[start:start + batch_size].tolist()
        batch_probas.append(text2toxicity_batch(batch_texts, aggregate=False))

    aspects_arr = np.vstack(batch_probas)  # [N, n_labels]

    # Aggregated score, computed once over all rows with the same formula as
    # text2toxicity_batch(aggregate=True); .tolist() keeps the column as Python
    # floats (float64), as before.
    agg_scores = 1 - aspects_arr[:, 0] * (1 - aspects_arr[:, -1])
    df["toxicity_score"] = agg_scores.tolist()

    if add_aspects:
        if aspects_arr.shape[1] != len(aspect_names):
            raise ValueError(
                f"model returned {aspects_arr.shape[1]} columns, "
                f"but {len(aspect_names)} aspect names were given"
            )
        for j, name in enumerate(aspect_names):
            df[f"tox_{name}"] = aspects_arr[:, j]

    return df


In [13]:
# `df` is already loaded; the texts are in the "text_original" column.
# Adds the aggregated toxicity score and the per-aspect columns, then rebinds `df`
# to the returned frame.
df = add_toxicity_to_df(df, text_col="text_original", batch_size=64, add_aspects=True)

100%|██████████| 2739/2739 [01:37<00:00, 28.14it/s]
