# Загрузим нужные модули

In [23]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from transformers import pipeline, Trainer, AutoTokenizer, TrainingArguments, AutoModelForSequenceClassification

from datasets.dataset_dict import DatasetDict
from datasets import Dataset

import evaluate

Загружаем данные с новостями, где label 0 — Real, а 1 — Fake

Подготавливаем датасет с train, eval и test частями

In [29]:
# Read the news CSV; the notebook text states that label 0 is Real and 1 is Fake.
data_set = pd.read_csv("fakenews.csv")

# Two successive splits with the same seed: the first holds out the test part,
# the second carves an eval part out of the remainder. No test_size is given,
# so scikit-learn's default split fraction applies both times.
X_, X_test, y_, y_test = train_test_split(
    data_set['text'],
    data_set['label'],
    random_state=2023,
)
X_train, X_eval, y_train, y_eval = train_test_split(X_, y_, random_state=2023)

# Wrap each (texts, labels) pair into a Dataset with the same two columns.
splits = {
    'train': (X_train, y_train),
    'eval': (X_eval, y_eval),
    'test': (X_test, y_test),
}
dataset = DatasetDict({
    split_name: Dataset.from_dict({'text': texts, 'labels': labels})
    for split_name, (texts, labels) in splits.items()
})

С помощью pipeline используем предобученную модель Eip/autotrain-real-vs-fake-news-2757281769

In [27]:
# Hub checkpoint used here as the baseline classifier; the same name is reused
# later in the notebook to load the model and tokenizer for fine-tuning.
model_name = "Eip/autotrain-real-vs-fake-news-2757281769"
pipe = pipeline("text-classification", model=model_name, truncation=True)

# Classify the test texts, then encode the string labels as integers:
# "Fake" -> 1, any other label -> 0.
pre_trained_outputs = pipe(X_test.to_list())
y_pre_trained_predict = [int(output['label'] == "Fake") for output in pre_trained_outputs]

Метрики до дообучения выбранной модели

In [28]:
# Per-class precision/recall/F1 of the pre-trained model on the test labels.
baseline_report = classification_report(
    y_test,
    y_pre_trained_predict,
    target_names=["Real", "Fake"],
)
print(baseline_report)

              precision    recall  f1-score   support

        Real       0.71      0.01      0.01       717
        Fake       0.43      1.00      0.60       530

    accuracy                           0.43      1247
   macro avg       0.57      0.50      0.31      1247
weighted avg       0.59      0.43      0.26      1247



Подготавливаем модель к обучению на выбранном датасете

In [5]:
# Class-name <-> class-index mappings stored in the model config;
# id2label is the inverse of label2id.
label2id = {"Real": 0, "Fake": 1}
id2label = {class_id: class_name for class_name, class_id in label2id.items()}

# Tokenizer and sequence-classification model are both loaded from the same
# checkpoint that was evaluated via the pipeline.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

In [6]:
def token_func(data):
    """Tokenize the ``text`` field of a dataset example or batch of examples.

    Args:
        data: A mapping with a ``"text"`` entry: a single string when called
            per example, or a list of strings when called on a batch.

    Returns:
        The tokenizer output for the given text(s). Inputs longer than the
        maximum length are truncated, and every sequence is padded to that
        maximum length, so all encoded rows have the same size.
    """
    return tokenizer(data["text"], truncation=True, padding="max_length")


# batched=True passes whole batches of rows to token_func instead of one row
# at a time, which is much faster. Because padding is "max_length" (not
# "longest"), the encoded rows do not depend on how the rows are batched.
tokenized_data = dataset.map(token_func, batched=True)

Map:   0%|          | 0/2804 [00:00<?, ? examples/s]

Map:   0%|          | 0/935 [00:00<?, ? examples/s]

Map:   0%|          | 0/1247 [00:00<?, ? examples/s]

In [8]:
# Training configuration: checkpoints go to "test_trainer" and the eval split
# is scored once per epoch; all other settings keep the library defaults.
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
)

# Accuracy metric used by compute_metrics below.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Turn raw model outputs into an accuracy score.

    Args:
        eval_pred: A pair ``(logits, labels)`` as unpacked below.

    Returns:
        The result of ``metric.compute`` for the arg-max class predictions
        against the reference labels.
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest score on the last axis.
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_ids, references=references)


# Fine-tune on the train split and score on the eval split; the test split is
# kept aside for the final comparison.
train_split = tokenized_data['train']
eval_split = tokenized_data['eval']

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    compute_metrics=compute_metrics,
)

Дообучение и сохранение модели

In [17]:
# Run fine-tuning, then write the resulting model to the "test_trainer" directory.
train_result = trainer.train()
trainer.save_model("test_trainer")

  0%|          | 0/1053 [00:00<?, ?it/s]

KeyboardInterrupt: 

Проверка тестовой выборки на дообученной модели.

In [14]:
# Score the held-out test split with the fine-tuned model.
test_split = tokenized_data['test']
trained_predictions = trainer.predict(test_split)

  0%|          | 0/156 [00:00<?, ?it/s]

Метрика после дообучения выбранной модели

In [26]:
# The first field of the prediction output holds the raw class scores;
# the arg-max over the last axis is the predicted class id.
test_scores = trained_predictions.predictions
y_trained_predict = np.argmax(test_scores, axis=-1)

fine_tuned_report = classification_report(
    y_test,
    y_trained_predict,
    target_names=["Real", "Fake"],
)
print(fine_tuned_report)

              precision    recall  f1-score   support

        Real       0.83      0.80      0.82       717
        Fake       0.75      0.78      0.76       530

    accuracy                           0.79      1247
   macro avg       0.79      0.79      0.79      1247
weighted avg       0.80      0.79      0.79      1247



Таким образом, дообучение модели позволило улучшить определение типа новости практически в 1.84 раза (accuracy на тестовой выборке выросла с 0.43 до 0.79).