# Анализ датасетов, сгенерированных моделью Mistral 7B, с помощью стандартных классификаторов

In [1]:
FILE_NAME_PREFIX = 'mistral_essays'
DF_SPLIT = 9000
DF_COUNT = 5
THRESHOLD=0.99
SEED = 42

In [2]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, precision_recall_curve, confusion_matrix
import xgboost as xgb
from xgboost import XGBClassifier
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
import torch
import random
from collections import defaultdict
import torch.nn.functional as F

In [3]:
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x136d36b70>

In [4]:
df = pd.read_csv("../datasets/human_essays.csv", encoding="utf-8").assign(label=0)[["text", "label"]]

In [5]:
def get_dataset(orig_df: pd.DataFrame, filename: str) -> pd.DataFrame:
    """
    Загружает датасет из CSV, добавляет метку и объединяет с исходным DataFrame. Затем перемешивает строки.

    :param orig_df: исходный DataFrame
    :param filename: путь к CSV-файлу
    :return: объединённый и перемешанный DataFrame
    """
    new_data = pd.read_csv(filename, encoding="utf-8").assign(label=1)[["text", "label"]]

    result = pd.concat([orig_df, new_data], ignore_index=True).sample(frac=1, random_state=SEED).reset_index(drop=True)

    return result

In [6]:
datasets = [
    get_dataset(df[0:DF_SPLIT], f"../datasets/{FILE_NAME_PREFIX}_{i}.csv")
    for i in range(1, DF_COUNT + 1)
]

In [7]:
sizes = [dataset.shape for dataset in datasets]
print(sizes)

[(10500, 2), (10500, 2), (10500, 2), (10500, 2), (10500, 2)]


In [8]:
datasets[0].head(20)

Unnamed: 0,text,label
0,Cash Flow Problems\n\nModern enterprises risk ...,0
1,Table of Contents\n 1. Introduction\n 2. Overv...,0
2,"Social networking sites, news portals, and sea...",0
3,Table of Contents\n 1. Introduction\n 2. The R...,0
4,The relationship between the Dominican Republi...,0
5,Table of Contents\n 1. Overview\n 2. Funding\n...,0
6,In “Tropicalizations: Transcultural Representa...,0
7,This paper investigates the current trends in ...,0
8,Introduction\n\nPolitical literature uses the ...,0
9,Commercial fishing is the process of taking fi...,0


### Метрика

In [9]:
def recall_at_precision(y_true, y_scores, min_precision=0.995):
    precision, recall, thresholds = precision_recall_curve(
        y_true, y_scores
    )

    valid = precision[:-1] >= min_precision

    if not np.any(valid):
        return {
            "recall": 0.0,
            "precision": float(precision.max()),
            "threshold": None,
            "threshold_found": False
        }

    idx = np.argmax(recall[:-1][valid])
    valid_indices = np.where(valid)[0]
    best_idx = valid_indices[idx]

    return {
        "recall": float(recall[best_idx]),
        "precision": float(precision[best_idx]),
        "threshold": float(thresholds[best_idx]),
        "threshold_found": True
    }

def fp_metrics(y_true, y_scores, threshold):
    y_pred = (y_scores >= threshold).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

    return {
        "fp": int(fp),
        "fpr": fp / (fp + tn + 1e-9),
        "precision": tp / (tp + fp + 1e-9),
        "recall": tp / (tp + fn + 1e-9),
    }

### LogisticRegression + TF-IDF

In [10]:
def get_log_classification_report(data):
    X_train, X_test, y_train, y_test = train_test_split(
        data["text"],
        data["label"],
        test_size=0.2,
        random_state=SEED,
        stratify=data["label"],
    )

    vectorizer = TfidfVectorizer(
        max_features=5000,
        ngram_range=(1, 2),
    )

    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    model = LogisticRegression(max_iter=1000, random_state=SEED)
    model.fit(X_train_vec, y_train)

    y_scores = model.predict_proba(X_test_vec)[:, 1]

    rap = recall_at_precision(
        y_test,
        y_scores,
        min_precision=THRESHOLD
    )

    if rap["threshold_found"]:
        y_pred = (y_scores >= rap["threshold"]).astype(int)
        fp = fp_metrics(y_test, y_scores, rap["threshold"])
    else:
        y_pred = np.zeros_like(y_scores, dtype=int)
        fp = None

    return classification_report(y_test, y_pred), rap, fp

In [11]:
log_results = []
for df in datasets:
    log_results.append(get_log_classification_report(df))

In [12]:
rows = []
for i, (report, rap, fp) in enumerate(log_results, start=1):
    print(f"Отчёт по датасету №{i}\n")
    print(report)

    rows.append({
        "dataset": i,
        "recall@precision": rap["recall"],
        "threshold_found": rap["threshold_found"],
        "fp": fp["fp"],
        "fpr": fp["fpr"],
        "precision": fp["precision"],
        "recall": fp["recall"],
    })

df_metrics = pd.DataFrame(rows)
print("Сводная таблица\n")
print(df_metrics.round(4).to_string(index=False))

Отчёт по датасету №1

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      1800
           1       0.99      0.96      0.97       300

    accuracy                           0.99      2100
   macro avg       0.99      0.98      0.99      2100
weighted avg       0.99      0.99      0.99      2100

Отчёт по датасету №2

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      1800
           1       0.99      0.96      0.98       300

    accuracy                           0.99      2100
   macro avg       0.99      0.98      0.99      2100
weighted avg       0.99      0.99      0.99      2100

Отчёт по датасету №3

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1800
           1       0.99      0.92      0.95       300

    accuracy                           0.99      2100
   macro avg       0.99      0.96      0.97      2100
weighted

### Boosting

In [13]:
def get_boost_classification_report(data):
    df = data.copy()
    df["tokens"] = df["text"].apply(lambda text: simple_preprocess(str(text)))

    w2v_model = Word2Vec(
        sentences=df["tokens"],
        vector_size=300,
        window=7,
        min_count=2,
        workers=12,
        sg=1,
    )

    def text_to_vector(tokens, model):
        vectors = [model.wv[token] for token in tokens if token in model.wv]
        return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

    X = np.vstack([text_to_vector(tokens, w2v_model) for tokens in df["tokens"]])
    y = df["label"].values

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=SEED,
        stratify=y,
    )

    model = xgb.XGBClassifier(
        n_estimators=500,
        max_depth=6,
        learning_rate=0.5,
        eval_metric="logloss",
        n_jobs=12,
    )

    model.fit(X_train, y_train)

    y_scores = model.predict_proba(X_test)[:, 1]

    rap = recall_at_precision(y_test, y_scores, min_precision=THRESHOLD)

    if rap["threshold_found"]:
        y_pred = (y_scores >= rap["threshold"]).astype(int)
        fp = fp_metrics(y_test, y_scores, rap["threshold"])
    else:
        y_pred = np.zeros_like(y_scores, dtype=int)
        fp = None

    return classification_report(y_test, y_pred), rap, fp

In [14]:
boost_results = []
for df in datasets:
    boost_results.append(get_boost_classification_report(df))

Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_fl

In [15]:
rows = []
for i, (report, rap, fp) in enumerate(boost_results, start=1):
    print(f"Отчёт по датасету №{i}\n")
    print(report)

    rows.append({
        "dataset": i,
        "recall@precision": rap["recall"],
        "threshold_found": rap["threshold_found"],
        "fp": fp["fp"],
        "fpr": fp["fpr"],
        "precision": fp["precision"],
        "recall": fp["recall"],
    })

df_metrics = pd.DataFrame(rows)
print("Сводная таблица\n")
print(df_metrics.round(4).to_string(index=False))

Отчёт по датасету №1

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1800
           1       0.99      0.92      0.95       300

    accuracy                           0.99      2100
   macro avg       0.99      0.96      0.97      2100
weighted avg       0.99      0.99      0.99      2100

Отчёт по датасету №2

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1800
           1       0.99      0.91      0.95       300

    accuracy                           0.99      2100
   macro avg       0.99      0.95      0.97      2100
weighted avg       0.99      0.99      0.99      2100

Отчёт по датасету №3

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1800
           1       0.99      0.75      0.85       300

    accuracy                           0.96      2100
   macro avg       0.98      0.87      0.92      2100
weighted

### BERT

In [16]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
device

device(type='mps')

In [17]:
class ChunkedTextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels.tolist() if hasattr(labels, "tolist") else list(labels)

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        essay_id = self.encodings["essay_ids"][idx]
        return {
            "input_ids": self.encodings["input_ids"][idx],
            "attention_mask": self.encodings["attention_mask"][idx],
            "labels": torch.tensor(self.labels[essay_id], dtype=torch.long),
            "essay_id": essay_id,
        }

In [18]:
def tokenize_with_sliding_window(texts, tokenizer, max_length=256, stride=128):
    input_ids, attention_masks, essay_ids = [], [], []

    for essay_id, text in enumerate(texts):
        encoded = tokenizer(
            text,
            truncation=True,
            max_length=max_length,
            stride=stride,
            return_overflowing_tokens=True,
            padding="max_length",
            return_tensors="pt",
        )

        n_chunks = encoded["input_ids"].shape[0]
        input_ids.append(encoded["input_ids"])
        attention_masks.append(encoded["attention_mask"])
        essay_ids.extend([essay_id] * n_chunks)

    return {
        "input_ids": torch.cat(input_ids),
        "attention_mask": torch.cat(attention_masks),
        "essay_ids": essay_ids,
    }

def get_bert_classification_report(
    data,
    model_name="bert-base-uncased",
    max_length=256,
    stride=128,
    batch_size=32,
    epochs=2,
):
    X_train, X_test, y_train, y_test = train_test_split(
        data["text"],
        data["label"],
        test_size=0.2,
        random_state=SEED,
        stratify=data["label"],
    )

    tokenizer = BertTokenizer.from_pretrained(model_name)

    train_enc = tokenize_with_sliding_window(X_train, tokenizer, max_length, stride)
    test_enc = tokenize_with_sliding_window(X_test, tokenizer, max_length, stride)

    train_dataset = ChunkedTextDataset(train_enc, y_train)
    test_dataset = ChunkedTextDataset(test_enc, y_test)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    model = BertForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2,
    ).to(device)

    optimizer = AdamW(model.parameters(), lr=2e-5)

    model.train()
    for epoch in range(epochs):
        for batch in train_loader:
            batch = {k: v.to(device) for k, v in batch.items() if k != "essay_id"}
            optimizer.zero_grad()
            outputs = model(**batch)
            outputs.loss.backward()
            optimizer.step()

    model.eval()
    essay_probs = defaultdict(list)
    essay_labels = {}

    with torch.no_grad():
        for batch in test_loader:
            essay_ids = batch["essay_id"]
            labels = batch["labels"]

            batch = {k: v.to(device) for k, v in batch.items() if k != "essay_id"}

            outputs = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
            )

            probs = F.softmax(outputs.logits, dim=1)[:, 1]

            for i, essay_id in enumerate(essay_ids):
                essay_probs[essay_id].append(probs[i].cpu())
                essay_labels[essay_id] = labels[i].item()

    final_scores, final_labels = [], []

    for essay_id, probs_list in essay_probs.items():
        mean_prob = torch.stack(probs_list).mean().item()
        final_scores.append(mean_prob)
        final_labels.append(essay_labels[essay_id])

    final_scores = np.array(final_scores)
    final_labels = np.array(final_labels)

    rap = recall_at_precision(final_labels, final_scores, min_precision=THRESHOLD)

    if rap["threshold_found"]:
        y_pred = (final_scores >= rap["threshold"]).astype(int)
        fp = fp_metrics(final_labels, final_scores, rap["threshold"])
    else:
        y_pred = np.zeros_like(final_scores, dtype=int)
        fp = None

    return classification_report(final_labels, y_pred), rap, fp

In [19]:
bert_results = []
for df in datasets:
    bert_results.append(get_bert_classification_report(df))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are

In [20]:
rows = []
for i, (report, rap, fp) in enumerate(bert_results, start=1):
    print(f"Отчёт по датасету №{i}\n")
    print(report)

    rows.append({
        "dataset": i,
        "recall@precision": rap["recall"],
        "threshold_found": rap["threshold_found"],
        "fp": fp["fp"],
        "fpr": fp["fpr"],
        "precision": fp["precision"],
        "recall": fp["recall"],
    })

df_metrics = pd.DataFrame(rows)
print("Сводная таблица\n")
print(df_metrics.round(4).to_string(index=False))

Отчёт по датасету №1

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1800
           1       0.99      1.00      1.00       300

    accuracy                           1.00      2100
   macro avg       1.00      1.00      1.00      2100
weighted avg       1.00      1.00      1.00      2100

Отчёт по датасету №2

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1800
           1       0.99      1.00      1.00       300

    accuracy                           1.00      2100
   macro avg       1.00      1.00      1.00      2100
weighted avg       1.00      1.00      1.00      2100

Отчёт по датасету №3

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1800
           1       0.99      0.74      0.85       300

    accuracy                           0.96      2100
   macro avg       0.97      0.87      0.91      2100
weighted