In [1]:
import os
import re
import json
import nltk
import pandas as pd
import numpy as np
from tqdm import tqdm
from pymorphy3 import MorphAnalyzer

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

In [None]:
# ─── ⓵ Load the JSON dump and flatten it ────────────────────────────────────────
# `raw` maps an article id to a record; only the record's "data" payload is
# kept, with the id added as an explicit column.
with open('articles.json', encoding='utf-8') as f:
    raw = json.load(f)

df = pd.json_normalize([
    {"id": article_id, **record["data"]}
    for article_id, record in raw.items()
])

In [3]:
# ─── ⓶ Label cleaning and filtering ────────────────────────────────────────────
def clean_classification(x):
    """Normalise one raw `classification` cell to a list of labels.

    Returns `x` itself when it is a list. Every other value — including the
    "Неверный формат ответа" error string — is replaced by an empty list.
    """
    if not isinstance(x, list):
        return []
    return x

# Normalise every label cell, then keep only the rows that still carry at
# least one label (the index is renumbered from 0 afterwards).
df['classification'] = df['classification'].map(clean_classification)
df = df.loc[df['classification'].map(len).gt(0)].reset_index(drop=True)

In [4]:
# ─── ⓷ Preprocessing + lemmatisation (cached) ──────────────────────────────────
cache_path = 'text_lemmatized_cache.csv'

# The NLP resources are created unconditionally, not only on a cache miss:
# the manual-validation cell further down re-uses `stop_words`, `morph` and
# `progress_apply`, and would otherwise fail with NameError / AttributeError
# whenever the cache file already exists.
nltk.download('stopwords', quiet=True)
stop_words = set(nltk.corpus.stopwords.words('russian'))
morph = MorphAnalyzer()
tqdm.pandas(desc="Лемматизация")


def preprocess_text(text: str) -> str:
    """Clean one raw article text and return its space-joined lemmas.

    Steps: strip HTML tags and line-break/tab characters, fold Ё/ё into Е/е,
    replace every character outside Latin/Cyrillic letters, digits and
    ``- . , : / %`` with a space, collapse whitespace, drop Russian stop
    words and tokens of length <= 2, then take the first pymorphy parse's
    normal form of each remaining token.
    """
    text = str(text)
    text = re.sub(r'<[^>]+>', ' ', text)
    text = re.sub(r'[\r\n\t]', ' ', text)
    text = text.replace('Ё', 'Е').replace('ё', 'е')
    text = re.sub(r'[^A-Za-z0-9А-Яа-яЕе\-\.,:/%]', ' ', text)
    text = re.sub(r'\s{2,}', ' ', text).strip()
    tokens = [t for t in text.split() if t.lower() not in stop_words and len(t) > 2]
    return ' '.join(morph.parse(t)[0].normal_form for t in tokens)


def _load_cached_lemmas(path, expected_rows):
    """Return the cached `text_lemmatized` values, or None if the cache is unusable.

    The cache counts as unusable when the file is missing, lacks the
    `text_lemmatized` column, or has a different number of rows than the
    current frame (i.e. it was produced from other data).
    """
    if not os.path.exists(path):
        return None
    cached = pd.read_csv(path, index_col=0)
    if 'text_lemmatized' not in cached.columns or len(cached) != expected_rows:
        return None
    # Empty strings are read back from CSV as NaN, which the vectorizers reject.
    return cached['text_lemmatized'].fillna('').astype(str).to_numpy()


# Only the lemmatised column is taken from the cache. Replacing the whole
# frame with the CSV (as before) turned the list-valued `classification`
# column into plain strings such as "[1, 2]", which MultiLabelBinarizer would
# then binarise character by character.
cached_lemmas = _load_cached_lemmas(cache_path, len(df))
if cached_lemmas is not None:
    df['text_lemmatized'] = cached_lemmas
else:
    df['text_lemmatized'] = df['text'].progress_apply(preprocess_text)
    df.to_csv(cache_path)

Лемматизация: 100%|██████████| 4558/4558 [12:53<00:00,  5.90it/s]


In [5]:
# ─── ⓸ Label encoding ──────────────────────────────────────────────────────────
# Fit the binarizer on the cleaned label lists, then build the 0/1 indicator
# matrix with one column per entry of `mlb.classes_`.
mlb = MultiLabelBinarizer().fit(df['classification'])
Y = mlb.transform(df['classification'])

In [6]:
# ─── ⓹ Multi-label stratified train/test split ─────────────────────────────────
texts = df['text_lemmatized'].values
msss = MultilabelStratifiedShuffleSplit(
    n_splits=1,
    test_size=0.2,
    random_state=42,
)
# A single split is requested, so the first (train, test) index pair is taken.
train_idx, test_idx = next(msss.split(texts, Y))
X_train_texts, Y_train = texts[train_idx], Y[train_idx]
X_test_texts, Y_test = texts[test_idx], Y[test_idx]

In [7]:
# ─── ⓺ Pipeline: TF-IDF + One-vs-Rest LogisticRegression ───────────────────────
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('clf', OneVsRestClassifier(
        estimator=LogisticRegression(max_iter=1000, class_weight='balanced'),
    )),
])

In [8]:
# ─── ⓻ Grid search over the regularisation strength C ──────────────────────────
param_grid = {'clf__estimator__C': [0.1, 1, 10]}

grid_tfidf = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1_micro',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_tfidf.fit(X_train_texts, Y_train)

# The winning pipeline, used by the later cells.
best_tfidf = grid_tfidf.best_estimator_

Fitting 3 folds for each of 3 candidates, totalling 9 fits


In [9]:
# ─── ⓼ Test-set evaluation ─────────────────────────────────────────────────────
Y_pred = best_tfidf.predict(X_test_texts)
Y_proba = best_tfidf.predict_proba(X_test_texts)

print("\n=== Baseline: TF-IDF + One-vs-Rest LogisticRegression ===")
print("Лучший C:", grid_tfidf.best_params_['clf__estimator__C'])
print(classification_report(
    y_true=Y_test,
    y_pred=Y_pred,
    target_names=list(map(str, mlb.classes_)),
    zero_division=0,
))
# ROC-AUC is computed from the predicted probabilities, not the hard labels.
print("ROC-AUC (micro):", roc_auc_score(Y_test, Y_proba, average='micro'))
print("ROC-AUC (macro):", roc_auc_score(Y_test, Y_proba, average='macro'))


=== Baseline: TF-IDF + One-vs-Rest LogisticRegression ===
Лучший C: 10
              precision    recall  f1-score   support

           0       0.43      0.63      0.51       102
           1       0.84      0.92      0.88       327
           2       0.83      0.90      0.86       177
           3       0.78      0.83      0.80       162
           4       0.86      0.84      0.85        45
           5       0.67      0.82      0.74       133
           6       0.40      0.59      0.48        29
           7       0.59      0.62      0.60       154

   micro avg       0.72      0.81      0.76      1129
   macro avg       0.67      0.77      0.72      1129
weighted avg       0.73      0.81      0.77      1129
 samples avg       0.77      0.86      0.79      1129

ROC-AUC (micro): 0.9620281429685595
ROC-AUC (macro): 0.9475017048701386


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# ⓵ Re-use the TF-IDF vectorizer already fitted inside the tuned pipeline.
# GridSearchCV refits `best_estimator_` on the whole training split, so this
# is the feature space a fresh TfidfVectorizer(max_features=5000) would learn
# from X_train_texts. Sharing the object guarantees that matrices built later
# from best_tfidf.named_steps['tfidf'] line up column-for-column with this
# model, instead of matching only by coincidence.
vectorizer = best_tfidf.named_steps['tfidf']
X_train_vec = vectorizer.transform(X_train_texts)
X_test_vec  = vectorizer.transform(X_test_texts)

# ⓶ MultiOutput wrapper: one One-vs-One classifier per label column
ovo_multi = MultiOutputClassifier(
    OneVsOneClassifier(
        LogisticRegression(
            class_weight='balanced',
            C=grid_tfidf.best_params_['clf__estimator__C'],  # best C from the grid search
            max_iter=1000,
            solver='liblinear',
        ),
        # No inner n_jobs: each per-label problem is binary, so the OvO wrapper
        # holds a single estimator and nested worker pools would add nothing.
    ),
    n_jobs=-1,
)

ovo_multi.fit(X_train_vec, Y_train)
Y_pred_ovo = ovo_multi.predict(X_test_vec)

# ⓷ Collect P(label = 1) from the inner binary classifier of every label
proba_list = []
for ovo_clf in ovo_multi.estimators_:
    # Each label is a 0-vs-1 task, so exactly one pairwise classifier is expected.
    assert len(ovo_clf.estimators_) == 1, "expected a single 0-vs-1 classifier per label"
    base = ovo_clf.estimators_[0]
    proba = base.predict_proba(X_test_vec)[:, 1]
    proba_list.append(proba)

Y_proba_ovo = np.vstack(proba_list).T  # shape = (n_samples, n_labels)

# ⓸ Metrics
print("\n=== Multi-label One-vs-One (через MultiOutput) ===")
print(classification_report(
    Y_test,
    Y_pred_ovo,
    target_names=[str(c) for c in mlb.classes_],
    zero_division=0
))
print("ROC-AUC (micro):", roc_auc_score(Y_test, Y_proba_ovo, average='micro'))
print("ROC-AUC (macro):", roc_auc_score(Y_test, Y_proba_ovo, average='macro'))



=== Multi-label One-vs-One (через MultiOutput) ===
              precision    recall  f1-score   support

           0       0.43      0.63      0.51       102
           1       0.84      0.93      0.88       327
           2       0.82      0.90      0.86       177
           3       0.78      0.83      0.81       162
           4       0.87      0.87      0.87        45
           5       0.67      0.84      0.75       133
           6       0.39      0.59      0.47        29
           7       0.60      0.62      0.61       154

   micro avg       0.72      0.82      0.76      1129
   macro avg       0.67      0.78      0.72      1129
weighted avg       0.73      0.82      0.77      1129
 samples avg       0.77      0.86      0.79      1129

ROC-AUC (micro): 0.9619188114612541
ROC-AUC (macro): 0.9472226455809138


In [11]:
from sklearn.multioutput import ClassifierChain
from sklearn.linear_model import LogisticRegression
import numpy as np

# 1) Take the fitted TF-IDF vectorizer out of the tuned GridSearch pipeline
tfidf = best_tfidf.named_steps['tfidf']

# 2) Turn the texts into TF-IDF matrices
X_train_vec = tfidf.transform(X_train_texts)
X_test_vec  = tfidf.transform(X_test_texts)

# 3) ClassifierChain with the same LogisticRegression settings.
# The base estimator is passed positionally so the call does not depend on
# the name of the keyword (`base_estimator` vs. `estimator`), which differs
# between scikit-learn releases.
chain = ClassifierChain(
    LogisticRegression(
        C=grid_tfidf.best_params_['clf__estimator__C'],
        class_weight='balanced',
        max_iter=1000
    ),
    order='random',
    random_state=42
)

# 4) Fit the chain
chain.fit(X_train_vec, Y_train)

# 5) Predict the labels
Y_pred_chain = chain.predict(X_test_vec)

# 6) Probability of "1" for every label.
# ClassifierChain.predict_proba returns the columns in the original label
# order. The hand-rolled fallback that used to sit here stacked the outputs of
# `chain.estimators_` in *chain* order, which with order='random' does not
# match the columns of Y_test, so it has been removed.
Y_proba_chain = chain.predict_proba(X_test_vec)

# 7) Same metrics as for the other models
print("\n=== ClassifierChain LogisticRegression ===")
print(classification_report(
    Y_test, Y_pred_chain,
    target_names=[str(c) for c in mlb.classes_],
    zero_division=0
))
print("ROC-AUC (micro):", roc_auc_score(Y_test, Y_proba_chain, average='micro'))
print("ROC-AUC (macro):", roc_auc_score(Y_test, Y_proba_chain, average='macro'))



=== ClassifierChain LogisticRegression ===
              precision    recall  f1-score   support

           0       0.41      0.45      0.43       102
           1       0.84      0.92      0.88       327
           2       0.88      0.84      0.86       177
           3       0.79      0.78      0.79       162
           4       0.85      0.78      0.81        45
           5       0.67      0.82      0.74       133
           6       0.31      0.52      0.38        29
           7       0.57      0.62      0.59       154

   micro avg       0.72      0.78      0.75      1129
   macro avg       0.66      0.72      0.68      1129
weighted avg       0.73      0.78      0.75      1129
 samples avg       0.77      0.81      0.77      1129

ROC-AUC (micro): 0.9277832412342865
ROC-AUC (macro): 0.8952148897994732


In [12]:
from sklearn.pipeline        import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model   import LogisticRegression
from sklearn.multiclass     import OneVsRestClassifier
from sklearn.metrics        import classification_report, roc_auc_score

# 1) Combined TF-IDF: word 1-2-grams plus character 3-5-grams
tfidf_wc = FeatureUnion([
    ("word", TfidfVectorizer(
        analyzer="word",
        ngram_range=(1,2),
        max_features=10000,
        token_pattern=r"(?u)\b\w+\b"
    )),
    ("char", TfidfVectorizer(
        analyzer="char",
        ngram_range=(3,5),
        max_features=5000
    )),
])

# 2) Pipeline
wordchar_pipeline = Pipeline([
    ("tfidf", tfidf_wc),
    ("clf", OneVsRestClassifier(
        LogisticRegression(
            solver="saga",      # suited to large sparse feature matrices
            penalty="l2",
            C=1.0,              # not tuned here
            class_weight="balanced",
            max_iter=1000,
            # saga shuffles the data, so without a seed every run gives
            # slightly different coefficients and metrics.
            random_state=42
        ),
        n_jobs=-1
    )),
])

# 3) Fit
wordchar_pipeline.fit(X_train_texts, Y_train)
best_wordchar = wordchar_pipeline  # alias used by the validation cells

# 4) Predict
# NOTE: this rebinds the `Y_pred` / `Y_proba` names used by earlier cells.
Y_pred  = wordchar_pipeline.predict(X_test_texts)
Y_proba = wordchar_pipeline.predict_proba(X_test_texts)

# 5) Evaluate
print("=== Baseline: word+char TF–IDF + OneVsRest(LogReg) ===\n")
print(classification_report(
    Y_test, Y_pred, 
    target_names=[str(c) for c in mlb.classes_],
    zero_division=0
))
print(f"ROC-AUC (micro): {roc_auc_score(Y_test, Y_proba, average='micro'):.4f}")
print(f"ROC-AUC (macro): {roc_auc_score(Y_test, Y_proba, average='macro'):.4f}")

=== Baseline: word+char TF–IDF + OneVsRest(LogReg) ===

              precision    recall  f1-score   support

           0       0.36      0.75      0.48       102
           1       0.83      0.92      0.87       327
           2       0.82      0.95      0.88       177
           3       0.77      0.88      0.82       162
           4       0.65      0.91      0.76        45
           5       0.63      0.87      0.73       133
           6       0.14      0.79      0.23        29
           7       0.56      0.74      0.64       154

   micro avg       0.62      0.87      0.72      1129
   macro avg       0.59      0.85      0.68      1129
weighted avg       0.69      0.87      0.76      1129
 samples avg       0.72      0.90      0.77      1129

ROC-AUC (micro): 0.9543
ROC-AUC (macro): 0.9489


In [13]:
# 5) Evaluation — repeats the report printed at the end of the previous cell.
# It reads `Y_pred` / `Y_proba` as left behind by that cell, so it only
# describes the word+char model when run directly after it.
print("=== Baseline: word+char TF–IDF + OneVsRest(LogReg) ===\n")
print(classification_report(
    y_true=Y_test,
    y_pred=Y_pred,
    target_names=list(map(str, mlb.classes_)),
    zero_division=0,
))
print(f"ROC-AUC (micro): {roc_auc_score(Y_test, Y_proba, average='micro'):.4f}")
print(f"ROC-AUC (macro): {roc_auc_score(Y_test, Y_proba, average='macro'):.4f}")


=== Baseline: word+char TF–IDF + OneVsRest(LogReg) ===

              precision    recall  f1-score   support

           0       0.36      0.75      0.48       102
           1       0.83      0.92      0.87       327
           2       0.82      0.95      0.88       177
           3       0.77      0.88      0.82       162
           4       0.65      0.91      0.76        45
           5       0.63      0.87      0.73       133
           6       0.14      0.79      0.23        29
           7       0.56      0.74      0.64       154

   micro avg       0.62      0.87      0.72      1129
   macro avg       0.59      0.85      0.68      1129
weighted avg       0.69      0.87      0.76      1129
 samples avg       0.72      0.90      0.77      1129

ROC-AUC (micro): 0.9543
ROC-AUC (macro): 0.9489


In [14]:
import os
import re
import json
import nltk
import pandas as pd
import numpy as np
from tqdm import tqdm
from pymorphy3 import MorphAnalyzer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer  # Changed from TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

# Loading, cleaning, lemmatisation, label encoding and the train/test split
# come from the cells above; only the vectorizer differs in this experiment.
# NOTE: `pipeline`, `param_grid`, `Y_pred` and `Y_proba` are rebound here and
# no longer refer to the TF-IDF / word+char objects afterwards.

# ─── ⓺ Pipeline: BoW + One-vs-Rest LogisticRegression ───────────────────────
pipeline = Pipeline(steps=[
    ('bow', CountVectorizer(max_features=5000)),  # raw counts instead of TF-IDF
    ('clf', OneVsRestClassifier(
        estimator=LogisticRegression(max_iter=1000, class_weight='balanced'),
    )),
])

# ─── ⓻ Grid search over the C parameter ─────────────────────────────────────
param_grid = {'clf__estimator__C': [0.1, 1, 10]}

grid_bow = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1_micro',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_bow.fit(X_train_texts, Y_train)

# Kept under its own name so the validation cells can reach the BoW model.
best_bow = grid_bow.best_estimator_

# ─── ⓼ Evaluate on the test set ─────────────────────────────────────────────
Y_pred = best_bow.predict(X_test_texts)
Y_proba = best_bow.predict_proba(X_test_texts)

print("\n=== Baseline: BoW + One-vs-Rest LogisticRegression ===")
print("Best C:", grid_bow.best_params_['clf__estimator__C'])
print(classification_report(
    y_true=Y_test,
    y_pred=Y_pred,
    target_names=list(map(str, mlb.classes_)),
    zero_division=0,
))
print("ROC-AUC (micro):", roc_auc_score(Y_test, Y_proba, average='micro'))
print("ROC-AUC (macro):", roc_auc_score(Y_test, Y_proba, average='macro'))

Fitting 3 folds for each of 3 candidates, totalling 9 fits

=== Baseline: BoW + One-vs-Rest LogisticRegression ===
Best C: 0.1
              precision    recall  f1-score   support

           0       0.43      0.49      0.46       102
           1       0.83      0.88      0.85       327
           2       0.83      0.87      0.85       177
           3       0.77      0.75      0.76       162
           4       0.85      0.76      0.80        45
           5       0.73      0.77      0.75       133
           6       0.43      0.41      0.42        29
           7       0.59      0.51      0.55       154

   micro avg       0.73      0.74      0.74      1129
   macro avg       0.68      0.68      0.68      1129
weighted avg       0.73      0.74      0.74      1129
 samples avg       0.74      0.79      0.75      1129

ROC-AUC (micro): 0.9321869571581892
ROC-AUC (macro): 0.9149329291942099


In [None]:
# -----------------------------------------------------------
# 0. paths & helper functions
# -----------------------------------------------------------
# JSON file holding the manually labelled validation articles.
MANUAL_PATH = r"400.json"

def to_int_list(lbls):
    """Convert a JSON label list to ``list[int]``.

    Anything that is not a list yields an empty list; every element of a
    list is passed through ``int()``.
    """
    if not isinstance(lbls, list):
        return []
    return list(map(int, lbls))

def preprocess_text(text: str) -> str:
    """Clean one text and return its lemmas joined by single spaces.

    Applies the same cleaning steps as the lemmatisation cell: Ё/ё folding,
    HTML-tag and line-break removal, character whitelist, whitespace
    collapsing, stop-word / short-token filtering and lemmatisation.
    Reads the module-level `stop_words` and `morph` objects.
    """
    cleaned = str(text).replace('Ё', 'Е').replace('ё', 'е')
    # Each of these patterns is replaced by a single space, in this order.
    for pattern in (r'<[^>]+>', r'[\r\n\t]', r'[^A-Za-z0-9А-Яа-яЕе\-\.,:/%]'):
        cleaned = re.sub(pattern, ' ', cleaned)
    cleaned = re.sub(r'\s{2,}', ' ', cleaned).strip()

    lemmas = []
    for token in cleaned.split():
        # Skip tokens of length <= 2 and stop words (case-insensitive match).
        if len(token) <= 2 or token.lower() in stop_words:
            continue
        lemmas.append(morph.parse(token)[0].normal_form)
    return ' '.join(lemmas)

# -----------------------------------------------------------
# 1. load & prepare the gold validation data
# -----------------------------------------------------------
with open(MANUAL_PATH, encoding='utf-8') as f:
    gold_raw = json.load(f)

gold_df = pd.json_normalize(
    [{"id": k, **v["data"]} for k, v in gold_raw.items()]
)

# Normalise the labels first, then keep only rows that really have some.
# Filtering with `map(bool)` before normalising (as was done previously) let
# NaN and non-list values through, because they are truthy; `to_int_list`
# then turned them into empty label lists, i.e. all-zero rows in Y_val.
gold_df["manual_labels"] = gold_df["manual_labels"].apply(to_int_list)
gold_df = gold_df[gold_df["manual_labels"].map(len) > 0].copy()

# Register `progress_apply` here so this cell does not depend on whether the
# lemmatisation cell took its cache-miss branch.
tqdm.pandas(desc="Лемматизация")
gold_df["text_lemmatized"] = gold_df["text"].progress_apply(preprocess_text)

# binarise with the SAME MultiLabelBinarizer that is already fitted
Y_val = mlb.transform(gold_df["manual_labels"])
X_val_texts = gold_df["text_lemmatized"].values



Лемматизация: 100%|██████████| 400/400 [01:13<00:00,  5.43it/s]


In [21]:
# -----------------------------------------------------------
# 2. vectorise once for the models that need raw TF‑IDF / BoW
# -----------------------------------------------------------
# Each matrix comes from the already-fitted vectorizer step of its pipeline
# (transform only, nothing is re-fitted on the validation texts).
X_val_tfidf = best_tfidf.named_steps['tfidf'].transform(X_val_texts)
X_val_bow = best_bow.named_steps['bow'].transform(X_val_texts)
X_val_tfidfwc = best_wordchar.named_steps['tfidf'].transform(X_val_texts)

In [22]:
# Sanity check of the two tuned pipelines: the first should show a
# ('tfidf', TfidfVectorizer(...)) step, the second ('bow', CountVectorizer(...)).
print(best_tfidf, best_bow, sep="\n")

Pipeline(steps=[('tfidf', TfidfVectorizer(max_features=5000)),
                ('clf',
                 OneVsRestClassifier(estimator=LogisticRegression(C=10,
                                                                  class_weight='balanced',
                                                                  max_iter=1000)))])
Pipeline(steps=[('bow', CountVectorizer(max_features=5000)),
                ('clf',
                 OneVsRestClassifier(estimator=LogisticRegression(C=0.1,
                                                                  class_weight='balanced',
                                                                  max_iter=1000)))])


In [23]:
# -----------------------------------------------------------
# 3. evaluation helper
# -----------------------------------------------------------
# OvO probabilities on the validation set: for every label take the first
# inner classifier of its One-vs-One wrapper and keep the column for class 1.
proba_list_val = [
    ovo_clf.estimators_[0].predict_proba(X_val_tfidf)[:, 1]
    for ovo_clf in ovo_multi.estimators_
]
Y_proba_ovo_val = np.vstack(proba_list_val).T  # shape = (n_samples, n_labels)

# Evaluation helper; accepts precomputed probabilities and, optionally, an
# explicit ground-truth matrix.
def evaluate(name, estimator, X_mat, Y_proba_precomputed=None, Y_true=None):
    """Print the classification report and ROC‑AUC of a fitted multi‑label estimator.

    Parameters
    ----------
    name : str
        Label inserted into the printed header.
    estimator : object
        Fitted estimator exposing ``predict`` (and optionally ``predict_proba``).
    X_mat : array-like
        Input passed unchanged to ``estimator.predict`` / ``predict_proba``.
    Y_proba_precomputed : array-like, optional
        Per-label scores to use for ROC‑AUC instead of ``predict_proba``.
    Y_true : array-like, optional
        Ground-truth indicator matrix. Defaults to the module-level ``Y_val``,
        which keeps existing calls working unchanged.

    Returns
    -------
    None
        Results are printed only.
    """
    if Y_true is None:
        Y_true = Y_val

    Y_pred = estimator.predict(X_mat)

    print(f"\n=== {name} on manual validation set ===")
    print(classification_report(
        Y_true, Y_pred,
        target_names=[str(c) for c in mlb.classes_],
        zero_division=0
    ))

    # Use precomputed probabilities if provided, otherwise ask the estimator.
    # The capability is checked up front rather than by catching
    # AttributeError around the call, so an AttributeError raised *inside*
    # predict_proba is not mistaken for "not supported".
    if Y_proba_precomputed is not None:
        Y_proba = Y_proba_precomputed
    else:
        predict_proba = getattr(estimator, "predict_proba", None)
        Y_proba = predict_proba(X_mat) if predict_proba is not None else None

    if Y_proba is not None:
        print("ROC‑AUC (micro):", roc_auc_score(Y_true, Y_proba, average='micro'))
        print("ROC‑AUC (macro):", roc_auc_score(Y_true, Y_proba, average='macro'))
    else:
        print("ROC-AUC: Not available (predict_proba not supported)")

In [24]:
# -----------------------------------------------------------
# 4. run every classical model
# -----------------------------------------------------------
# 4.1 TF‑IDF + OvR (best from GridSearch) — full pipeline, so it gets raw texts
evaluate(
    name="TF‑IDF  + OvR (C = {})".format(best_tfidf.named_steps['clf'].estimator.C),
    estimator=best_tfidf,
    X_mat=X_val_texts,
)

# 4.2 TF‑IDF + OvO — probabilities were computed by hand beforehand
evaluate(
    name="TF‑IDF + OvO",
    estimator=ovo_multi,
    X_mat=X_val_tfidf,
    Y_proba_precomputed=Y_proba_ovo_val,
)

# 4.3 TF‑IDF + Classifier Chain — works on the vectorised matrix
evaluate(name="TF‑IDF  + Classifier‑Chain", estimator=chain, X_mat=X_val_tfidf)

# 4.4 word+char TF‑IDF + OvR — full pipeline, raw texts
evaluate(name="word+char TF‑IDF + OvR", estimator=wordchar_pipeline, X_mat=X_val_texts)

# 4.5 Bag‑of‑Words + OvR — only the classifier step, fed the BoW matrix
bow_clf = best_bow.named_steps['clf']
evaluate(
    name="BoW + OvR (clf only, C = {})".format(bow_clf.estimator.C),
    estimator=bow_clf,
    X_mat=X_val_bow,
)



=== TF‑IDF  + OvR (C = 10) on manual validation set ===
              precision    recall  f1-score   support

           0       0.48      0.62      0.54        50
           1       0.72      0.72      0.72        80
           2       0.79      0.75      0.77        64
           3       0.72      0.71      0.72        83
           4       0.92      0.82      0.87        55
           5       0.75      0.81      0.78       108
           6       0.81      0.69      0.75        72
           7       0.68      0.70      0.69       131

   micro avg       0.72      0.73      0.73       643
   macro avg       0.73      0.73      0.73       643
weighted avg       0.73      0.73      0.73       643
 samples avg       0.74      0.79      0.73       643

ROC‑AUC (micro): 0.9251546846974518
ROC‑AUC (macro): 0.9185668463947259

=== TF‑IDF + OvO on manual validation set ===
              precision    recall  f1-score   support

           0       0.48      0.60      0.54        50
          