# Анализ датасетов, сгенерированных моделью Llama 13B, с помощью baseline-классификаторов

In [1]:
# Common stem of the generated-essay CSV files; it is interpolated into the
# "../datasets/{FILE_NAME_PREFIX}_{i}.csv" paths when the datasets are loaded.
FILE_NAME_PREFIX = "llama_essays"

In [2]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
import xgboost as xgb
from xgboost import XGBClassifier
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
import torch

In [3]:
# Human-written essays: every row gets label 0 and only the text/label columns are kept.
df = (
    pd.read_csv("../datasets/human_essays.csv", encoding="utf-8")
    .assign(label=0)
    .loc[:, ["text", "label"]]
)

In [4]:
def get_dataset(orig_df: pd.DataFrame, filename: str) -> pd.DataFrame:
    """
    Load a CSV file, label its rows with 1, append them to the original DataFrame and shuffle.

    :param orig_df: original DataFrame the loaded rows are appended to
    :param filename: path to the CSV file; it must contain a "text" column
    :return: combined DataFrame, shuffled with a fixed seed and re-indexed from 0
    """
    loaded = pd.read_csv(filename, encoding="utf-8")
    # Any "label" column already present in the file is overwritten with 1.
    loaded = loaded.assign(label=1)
    loaded = loaded.loc[:, ["text", "label"]]

    combined = pd.concat([orig_df, loaded], ignore_index=True)
    # frac=1 draws every row once, i.e. a full shuffle; the seed keeps it reproducible.
    shuffled = combined.sample(frac=1, random_state=42)

    return shuffled.reset_index(drop=True)

In [5]:
# One combined (human + generated) dataset per generated-essay file, numbered 1 through 5.
datasets = list(
    map(
        lambda part: get_dataset(df, f"../datasets/{FILE_NAME_PREFIX}_{part}.csv"),
        range(1, 6),
    )
)

In [6]:
# Preview the first ten rows of the first combined dataset.
datasets[0].iloc[:10]

Unnamed: 0,text,label
0,Directed by the American film producer and dir...,0
1,Table of Contents\n 1. “The Jewelry” by Guy de...,0
2,"ABC Company, a leading player in the retail in...",1
3,The site acknowledges that COVID-19 is here to...,0
4,The Summary of Hitchen’s Main Arguments\n\nHit...,0
5,Table of Contents\n 1. Introduction\n 2. Histo...,0
6,"In ""The Hero with a Thousand Faces,"" psychoana...",1
7,Introduction\n\nOperational Management is defi...,0
8,Introduction\n\nWater is a kind of chemical su...,0
9,Introduction\n\nJail overcrowding is a situati...,0


### LogisticRegression + TF-IDF

In [7]:
def get_log_classification_report(data):
    """
    Train Logistic Regression on TF-IDF features and evaluate it on a hold-out split.

    :param data: DataFrame with a "text" column and a "label" column
    :return: tuple of (classification report, F1-score computed with average="binary")
    """
    texts, targets = data["text"], data["label"]

    # Stratified 80/20 split with a fixed seed.
    train_texts, test_texts, train_targets, test_targets = train_test_split(
        texts,
        targets,
        test_size=0.2,
        random_state=42,
        stratify=targets,
    )

    # The vectorizer is fitted on the training texts only and then applied to the test texts.
    tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
    train_features = tfidf.fit_transform(train_texts)
    test_features = tfidf.transform(test_texts)

    classifier = LogisticRegression(max_iter=1000, random_state=42)
    classifier.fit(train_features, train_targets)
    predictions = classifier.predict(test_features)

    report = classification_report(test_targets, predictions)
    score = f1_score(test_targets, predictions, average="binary")
    return report, score

In [8]:
# A comprehension keeps the iteration variable local. The previous `for df in datasets`
# loop rebound the module-level `df` (the human essays) to the last combined dataset,
# so re-running the dataset-building cell afterwards would silently use the wrong frame.
log_results = [get_log_classification_report(dataset) for dataset in datasets]

In [9]:
# Print the report and F1-score of every dataset, numbering the datasets from 1.
for i, (report, f1) in enumerate(log_results, start=1):
    print(
        f"Отчёт по датасету №{i}\n",
        report,
        f"F1-score: {f1:.4f}\n\n",
        sep="\n",
    )

Отчёт по датасету №1

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     25573
           1       1.00      0.46      0.63       200

    accuracy                           1.00     25773
   macro avg       1.00      0.73      0.81     25773
weighted avg       1.00      1.00      1.00     25773

F1-score: 0.6301


Отчёт по датасету №2

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     25573
           1       1.00      0.62      0.77       200

    accuracy                           1.00     25773
   macro avg       1.00      0.81      0.88     25773
weighted avg       1.00      1.00      1.00     25773

F1-score: 0.7654




### Boosting

In [None]:
def get_boost_classification_report(data):
    """
    Train a Word2Vec + XGBoost baseline and evaluate it on a hold-out split.

    Each text is tokenized, embedded as the mean of its Word2Vec token vectors
    and classified with XGBoost. The data is split before the embedding model
    is trained, so Word2Vec only ever sees training texts.

    :param data: DataFrame with a "text" column and a "label" column
    :return: tuple of (classification report, F1-score computed with average="binary")
    """
    tokenized = [simple_preprocess(str(text)) for text in data["text"]]
    y = data["label"].to_numpy()

    # Split first: fitting Word2Vec on all texts would leak test data into the
    # features, unlike the TF-IDF baseline, which fits its vectorizer on train only.
    train_tokens, test_tokens, y_train, y_test = train_test_split(
        tokenized,
        y,
        test_size=0.2,
        random_state=42,
        stratify=y,
    )

    # NOTE(review): with workers > 1 gensim training is still not fully
    # deterministic even with a seed; use workers=1 if exact reproducibility matters.
    w2v_model = Word2Vec(
        sentences=train_tokens,
        vector_size=300,
        window=7,
        min_count=2,
        workers=12,
        sg=1,
        seed=42,
    )

    def text_to_vector(tokens, model):
        # Average the vectors of in-vocabulary tokens; a text with none maps to a zero vector.
        vectors = [model.wv[token] for token in tokens if token in model.wv]
        return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

    X_train = np.vstack([text_to_vector(tokens, w2v_model) for tokens in train_tokens])
    X_test = np.vstack([text_to_vector(tokens, w2v_model) for tokens in test_tokens])

    model = xgb.XGBClassifier(
        n_estimators=500,
        max_depth=6,
        learning_rate=0.5,
        eval_metric="logloss",
        n_jobs=12,
        random_state=42,
    )

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    return classification_report(y_test, y_pred), f1_score(y_test, y_pred, average="binary")

In [None]:
# A comprehension keeps the iteration variable local. The previous `for df in datasets`
# loop rebound the module-level `df` (the human essays) to the last combined dataset,
# so re-running the dataset-building cell afterwards would silently use the wrong frame.
boost_results = [get_boost_classification_report(dataset) for dataset in datasets]

In [None]:
# Print the report and F1-score of every dataset, numbering the datasets from 1.
for i, (report, f1) in enumerate(boost_results, start=1):
    print(
        f"Отчёт по датасету №{i}\n",
        report,
        f"F1-score: {f1:.4f}\n\n",
        sep="\n",
    )

### BERT

In [None]:
class TextDataset(Dataset):
    """
    Dataset that tokenizes a single text on every item access.

    `texts` and `labels` must provide `.tolist()`; they are stored as plain
    lists and therefore indexed by position.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.texts = texts.tolist()
        self.labels = labels.tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        # flatten() collapses each returned tensor to one dimension.
        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

In [None]:
# Pick the best available backend: a CUDA GPU first, then Apple's MPS, then the CPU.
# Previously CUDA was never considered, so a CUDA machine would fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
device

In [None]:
def tokenize_texts(texts, tokenizer, max_length):
    """
    Tokenize all texts in a single tokenizer call to speed up training.

    The tokenizer is asked to truncate to max_length, pad every sequence to
    max_length and return PyTorch tensors (return_tensors="pt").

    :param texts: iterable of texts; it is materialized into a list before the call
    :param tokenizer: callable tokenizer accepting the keyword arguments below
    :param max_length: length used for both truncation and padding
    :return: whatever the tokenizer returns for the batch
    """
    options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "pt",
    }
    batch = [*texts]
    return tokenizer(batch, **options)


class _TokenizedDataset(Dataset):
    """
    Map-style dataset over texts that were already tokenized in one batch.

    Each item is a dict holding the idx-th row of every tensor in the encoding
    plus the matching label under the "labels" key.
    """

    def __init__(self, encodings, labels):
        self.encodings = dict(encodings.items())
        # np.asarray drops any pandas index, so labels line up positionally
        # with the encoded rows even after the DataFrame was shuffled/split.
        self.labels = torch.as_tensor(np.asarray(labels), dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {name: tensor[idx] for name, tensor in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item


def get_bert_classification_report(data, model_name="bert-base-uncased", max_length=512, batch_size=8, epochs=1):
    """
    Fine-tune BERT for text classification and evaluate it on a hold-out split.

    :param data: DataFrame with a "text" column and a "label" column
    :param model_name: name of the pretrained checkpoint for tokenizer and model
    :param max_length: token length every text is truncated/padded to
    :param batch_size: batch size for both training and evaluation
    :param epochs: number of passes over the training split
    :return: tuple of (classification report, F1-score computed with average="binary")
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data["text"],
        data["label"],
        test_size=0.2,
        random_state=42,
        stratify=data["label"],
    )

    tokenizer = BertTokenizer.from_pretrained(model_name)

    train_enc = tokenize_texts(X_train, tokenizer, max_length)
    test_enc = tokenize_texts(X_test, tokenizer, max_length)

    # The previously referenced `TokenizedDataset` is not defined in this file;
    # the private helper above wraps the pre-tokenized tensors instead.
    train_dataset = _TokenizedDataset(train_enc, y_train)
    test_dataset = _TokenizedDataset(test_enc, y_test)

    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=0,
        pin_memory=False,
    )
    test_loader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        num_workers=0,
        pin_memory=False,
    )

    model = BertForSequenceClassification.from_pretrained(
        model_name,
        num_labels=data["label"].nunique(),
    )
    # NOTE(review): torch.compile support may be limited on some backends (e.g. MPS) — confirm.
    model = torch.compile(model)
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=2e-5)

    model.train()
    for _ in range(epochs):
        for batch in train_loader:
            batch = {k: v.to(device) for k, v in batch.items()}

            optimizer.zero_grad()
            # The batch contains "labels", so the model output carries the loss.
            outputs = model(**batch)
            outputs.loss.backward()
            optimizer.step()

    model.eval()
    preds, labels = [], []

    with torch.no_grad():
        for batch in test_loader:
            batch = {k: v.to(device) for k, v in batch.items()}

            # Labels are deliberately not passed here: only the logits are needed.
            outputs = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
            )
            preds.extend(outputs.logits.argmax(dim=1).cpu().numpy())
            labels.extend(batch["labels"].cpu().numpy())

    return classification_report(labels, preds), f1_score(labels, preds, average='binary')

In [None]:
# A comprehension keeps the iteration variable local. The previous `for df in datasets`
# loop rebound the module-level `df` (the human essays) to the last combined dataset,
# so re-running the dataset-building cell afterwards would silently use the wrong frame.
bert_results = [get_bert_classification_report(dataset) for dataset in datasets]

In [None]:
# Print the report and F1-score of every dataset, numbering the datasets from 1.
for i, (report, f1) in enumerate(bert_results, start=1):
    print(
        f"Отчёт по датасету №{i}\n",
        report,
        f"F1-score: {f1:.4f}\n\n",
        sep="\n",
    )