# Загрузим нужные модули

In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

from transformers import pipeline, Trainer, AutoTokenizer, TrainingArguments, AutoModelForSequenceClassification

from datasets.dataset_dict import DatasetDict
from datasets import Dataset

import evaluate

Загружаем данные с новостями, где label  0 - Real, а 1 - Fake

Подготавливаем датасет с train и test частями

In [2]:
data_set = pd.read_csv("d:/Downloads/fakenews.csv")
X_train, X_test, y_train, y_test = train_test_split(data_set['text'], data_set['label'], random_state=2023)

dataset =  DatasetDict({'train':Dataset.from_dict({'text':X_train, 'labels':y_train}),
                         'test':Dataset.from_dict({'text':X_test, 'labels':y_test})})

In [37]:
X_test.shape

(1247,)

С помощью pipeline используем предобученную модель Eip/autotrain-real-vs-fake-news-2757281769

In [3]:
model_name = "Eip/autotrain-real-vs-fake-news-2757281769"
pipe = pipeline("text-classification", model=model_name, truncation=True)

y_pre_trained_predict = pipe(X_test.to_list())
y_pre_trained_predict = [ 1 if i['label'] == "Fake" else 0 for i in y_pre_trained_predict]

print(classification_report(y_test, y_pre_trained_predict,  target_names=["Real", "Fake"]) )

              precision    recall  f1-score   support

        Real       0.71      0.01      0.01       717
        Fake       0.43      1.00      0.60       530

    accuracy                           0.43      1247
   macro avg       0.57      0.50      0.31      1247
weighted avg       0.59      0.43      0.26      1247



In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [5]:
def token_func(data):
    return tokenizer(data["text"], truncation=True, padding="max_length")

tokenized_data = dataset.map(token_func)

Map:   0%|          | 0/3739 [00:00<?, ? examples/s]

Map:   0%|          | 0/1247 [00:00<?, ? examples/s]

In [21]:
training_args = TrainingArguments(
		  output_dir="test_trainer", 
		  evaluation_strategy="epoch"
)

model = AutoModelForSequenceClassification.from_pretrained(model_name)


trainer = Trainer(
    # model=model.to('cuda'),
    model=model,
    args=training_args,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['test']
)




In [22]:
trainer.train()

  0%|          | 0/1404 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

{'eval_loss': 0.48846304416656494, 'eval_runtime': 517.1352, 'eval_samples_per_second': 2.411, 'eval_steps_per_second': 0.302, 'epoch': 1.0}
{'loss': 0.5753, 'learning_rate': 3.2193732193732194e-05, 'epoch': 1.07}


  0%|          | 0/156 [00:00<?, ?it/s]

{'eval_loss': 0.6301013231277466, 'eval_runtime': 516.8076, 'eval_samples_per_second': 2.413, 'eval_steps_per_second': 0.302, 'epoch': 2.0}
{'loss': 0.3736, 'learning_rate': 1.4387464387464389e-05, 'epoch': 2.14}


  0%|          | 0/156 [00:00<?, ?it/s]

{'eval_loss': 0.7373771071434021, 'eval_runtime': 516.8058, 'eval_samples_per_second': 2.413, 'eval_steps_per_second': 0.302, 'epoch': 3.0}
{'train_runtime': 12978.2356, 'train_samples_per_second': 0.864, 'train_steps_per_second': 0.108, 'train_loss': 0.40377554934248966, 'epoch': 3.0}


TrainOutput(global_step=1404, training_loss=0.40377554934248966, metrics={'train_runtime': 12978.2356, 'train_samples_per_second': 0.864, 'train_steps_per_second': 0.108, 'train_loss': 0.40377554934248966, 'epoch': 3.0})

In [28]:
predictions = trainer.predict(tokenized_data["test"])

In [31]:
predictions[1]

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [32]:
print(classification_report(y_test, predictions[1],  target_names=["Real", "Fake"]) )

              precision    recall  f1-score   support

        Real       1.00      1.00      1.00       717
        Fake       1.00      1.00      1.00       530

    accuracy                           1.00      1247
   macro avg       1.00      1.00      1.00      1247
weighted avg       1.00      1.00      1.00      1247



In [34]:
confusion_matrix(y_test, predictions[1])

array([[717,   0],
       [  0, 530]], dtype=int64)

In [24]:
model_name = "Eip/autotrain-real-vs-fake-news-2757281769"
# pipe = pipeline("text-classification", model=model_name, truncation=True)



y_trained_predict = pipe(X_test.to_list())
y_trained_predict = [ 1 if i['label'] == "Fake" else 0 for i in y_trained_predict]

print(classification_report(y_test, y_trained_predict,  target_names=["Real", "Fake"]) )

              precision    recall  f1-score   support

        Real       0.71      0.01      0.01       717
        Fake       0.43      1.00      0.60       530

    accuracy                           0.43      1247
   macro avg       0.57      0.50      0.31      1247
weighted avg       0.59      0.43      0.26      1247



In [12]:
print(predictions.predictions.shape, predictions.label_ids.shape)
preds = np.argmax(predictions.predictions, axis=-1)
metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions[:, 0]
    return metric.compute(predictions=predictions, references=labels)

(1247, 2) (1247,)


Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

In [13]:
compute_metrics(predictions)

ValueError: too many values to unpack (expected 2)

In [14]:
metric.compute(predictions=preds, references=predictions.label_ids)


{'accuracy': 0.545308740978348, 'f1': 0.4878048780487805}