# Анализ датасетов, сгенерированных моделью Mistral 7B, с помощью baseline-классификаторов

In [1]:
FILE_NAME_PREFIX = 'mistral_essays'

In [37]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
import xgboost as xgb
from xgboost import XGBClassifier
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
import torch
import random

In [38]:
SEED = 42

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x146d32b30>

In [3]:
df = pd.read_csv("../datasets/human_essays.csv", encoding="utf-8").assign(label=0)[["text", "label"]]

In [4]:
def get_dataset(orig_df: pd.DataFrame, filename: str) -> pd.DataFrame:
    """
    Загружает датасет из CSV, добавляет метку и объединяет с исходным DataFrame. Затем перемешивает строки.

    :param orig_df: исходный DataFrame
    :param filename: путь к CSV-файлу
    :return: объединённый и перемешанный DataFrame
    """
    new_data = pd.read_csv(filename, encoding="utf-8").assign(label=1)[["text", "label"]]

    result = pd.concat([orig_df, new_data], ignore_index=True).sample(frac=1, random_state=SEED).reset_index(drop=True)

    return result

In [5]:
datasets = [
    get_dataset(df, f"../datasets/{FILE_NAME_PREFIX}_{i}.csv")
    for i in range(1, 6)
]

In [6]:
datasets[0].head(10)

Unnamed: 0,text,label
0,Directed by the American film producer and dir...,0
1,Table of Contents\n 1. “The Jewelry” by Guy de...,0
2,The financial health of ABC Company is a subje...,1
3,The site acknowledges that COVID-19 is here to...,0
4,The Summary of Hitchen’s Main Arguments\n\nHit...,0
5,Table of Contents\n 1. Introduction\n 2. Histo...,0
6,"The Hero with a Thousand Faces, published in 1...",1
7,Introduction\n\nOperational Management is defi...,0
8,Introduction\n\nWater is a kind of chemical su...,0
9,Introduction\n\nJail overcrowding is a situati...,0


### LogisticRegression + TF-IDF

In [7]:
def get_log_classification_report(data):
    """
    Обучает Logistic Regression на TF-IDF признаках и возвращает classification report и F1-score.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data["text"],
        data["label"],
        test_size=0.2,
        random_state=SEED,
        stratify=data["label"],
    )

    vectorizer = TfidfVectorizer(
        max_features=5000,
        ngram_range=(1, 2),
    )

    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    model = LogisticRegression(max_iter=1000, random_state=SEED)
    model.fit(X_train_vec, y_train)

    y_pred = model.predict(X_test_vec)

    return classification_report(y_test, y_pred), f1_score(y_test, y_pred, average="binary")

In [8]:
log_results = []
for df in datasets:
    log_results.append(get_log_classification_report(df))

In [9]:
for i, (report, f1) in enumerate(log_results, start=1):
    print(f"Отчёт по датасету №{i}\n")
    print(report)
    print(f"F1-score: {f1:.4f}\n\n")

Отчёт по датасету №1

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     25573
           1       1.00      0.57      0.73       200

    accuracy                           1.00     25773
   macro avg       1.00      0.78      0.86     25773
weighted avg       1.00      1.00      1.00     25773

F1-score: 0.7261


Отчёт по датасету №2

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     25573
           1       1.00      0.68      0.81       200

    accuracy                           1.00     25773
   macro avg       1.00      0.84      0.90     25773
weighted avg       1.00      1.00      1.00     25773

F1-score: 0.8060


Отчёт по датасету №3

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     25573
           1       1.00      0.61      0.76       200

    accuracy                           1.00     25773
   macro avg       1.00 

### Boosting

In [10]:
def get_boost_classification_report(data):
    """
    Обучает Word2Vec + XGBoost и возвращает classification report и F1-score.
    """
    df = data.copy()

    df["tokens"] = df["text"].apply(lambda text: simple_preprocess(str(text)))

    w2v_model = Word2Vec(
        sentences=df["tokens"],
        vector_size=300,
        window=7,
        min_count=2,
        workers=12,
        sg=1,
    )

    def text_to_vector(tokens, model):
        vectors = [model.wv[token] for token in tokens if token in model.wv]
        return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

    X = np.vstack([text_to_vector(tokens, w2v_model) for tokens in df["tokens"]])
    y = df["label"].values

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=SEED,
        stratify=y,
    )

    model = xgb.XGBClassifier(
        n_estimators=500,
        max_depth=6,
        learning_rate=0.5,
        eval_metric="logloss",
        n_jobs=12,
    )

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    return classification_report(y_test, y_pred), f1_score(y_test, y_pred)

In [11]:
boost_results = []
for df in datasets:
    boost_results.append(get_boost_classification_report(df))

Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_fl

In [12]:
for i, (report, f1) in enumerate(boost_results, start=1):
    print(f"Отчёт по датасету №{i}\n")
    print(report)
    print(f"F1-score: {f1:.4f}\n\n")

Отчёт по датасету №1

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     25573
           1       0.84      0.47      0.60       200

    accuracy                           1.00     25773
   macro avg       0.92      0.73      0.80     25773
weighted avg       0.99      1.00      0.99     25773

F1-score: 0.6026


Отчёт по датасету №2

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     25573
           1       0.72      0.49      0.58       200

    accuracy                           0.99     25773
   macro avg       0.86      0.74      0.79     25773
weighted avg       0.99      0.99      0.99     25773

F1-score: 0.5816


Отчёт по датасету №3

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     25573
           1       0.75      0.45      0.56       200

    accuracy                           0.99     25773
   macro avg       0.87 

### BERT

In [39]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
device

device(type='mps')

In [40]:
class ChunkedTextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels.tolist() if hasattr(labels, "tolist") else list(labels)

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        essay_id = self.encodings["essay_ids"][idx]
        return {
            "input_ids": self.encodings["input_ids"][idx],
            "attention_mask": self.encodings["attention_mask"][idx],
            "labels": torch.tensor(self.labels[essay_id], dtype=torch.long),
            "essay_id": essay_id,
        }

In [41]:
def tokenize_with_sliding_window(texts, tokenizer, max_length=256, stride=128):
    """
    Токенизирует тексты с использованием скользящего окна. Длинные тексты разбиваются на перекрывающиеся чанки.

    texts: список текстов (эссе)
    tokenizer: BERT-токенизатор
    max_length: максимальная длина одного чанка
    stride: шаг перекрытия между чанками
    """
    input_ids, attention_masks, essay_ids = [], [], []

    for essay_id, text in enumerate(texts):
        encoded = tokenizer(
            text,
            truncation=True,
            max_length=max_length,
            stride=stride,
            return_overflowing_tokens=True,
            padding="max_length",
            return_tensors="pt",
        )

        n_chunks = encoded["input_ids"].shape[0]
        input_ids.append(encoded["input_ids"])
        attention_masks.append(encoded["attention_mask"])
        essay_ids.extend([essay_id] * n_chunks)

    return {
        "input_ids": torch.cat(input_ids),
        "attention_mask": torch.cat(attention_masks),
        "essay_ids": essay_ids,
    }

def get_bert_classification_report(data, model_name="bert-base-uncased", max_length=256, stride=128, batch_size=8, epochs=2):
    """
    Обучает BERT-классификатор с чанками и возвращает classification report и macro F1-score.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data["text"],
        data["label"],
        test_size=0.2,
        random_state=SEED,
        stratify=data["label"],
    )

    tokenizer = BertTokenizer.from_pretrained(model_name)

    train_enc = tokenize_with_sliding_window(X_train, tokenizer, max_length, stride)
    test_enc = tokenize_with_sliding_window(X_test, tokenizer, max_length, stride)

    train_dataset = ChunkedTextDataset(train_enc, y_train)
    test_dataset = ChunkedTextDataset(test_enc, y_test)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    model = BertForSequenceClassification.from_pretrained(
        model_name,
        num_labels=data["label"].nunique(),
    ).to(device)

    optimizer = AdamW(model.parameters(), lr=2e-5)

    model.train()
    for _ in range(epochs):
        for batch in train_loader:
            batch = {k: v.to(device) for k, v in batch.items() if k != "essay_id"}
            optimizer.zero_grad()
            outputs = model(**batch)
            outputs.loss.backward()
            optimizer.step()

    model.eval()
    essay_logits = defaultdict(list)
    essay_labels = {}

    with torch.no_grad():
        for batch in test_loader:
            essay_ids = batch["essay_id"]
            labels = batch["labels"]

            batch = {k: v.to(device) for k, v in batch.items() if k != "essay_id"}

            outputs = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
            )

            for i, essay_id in enumerate(essay_ids):
                essay_logits[essay_id].append(outputs.logits[i].cpu())
                essay_labels[essay_id] = labels[i].item()

    final_preds, final_labels = [], []

    for essay_id, logits_list in essay_logits.items():
        mean_logits = torch.stack(logits_list).mean(dim=0)
        final_preds.append(mean_logits.argmax().item())
        final_labels.append(essay_labels[essay_id])

    report = classification_report(final_labels, final_preds)
    f1 = f1_score(final_labels, final_preds, average="macro")

    return report, f1

In [42]:
bert_results = []
for df in datasets:
    bert_results.append(get_bert_classification_report(df))

KeyboardInterrupt: 

In [None]:
for i, (report, f1) in enumerate(bert_results, start=1):
    print(f"Отчёт по датасету №{i}\n")
    print(report)
    print(f"F1-score: {f1:.4f}\n\n")