In [None]:
import pandas as pd
import numpy as np
import torch

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score

In [None]:
import evaluate

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

In [None]:
# Read the raw dataset from the local CSV file into a DataFrame and display it.
df = pd.read_csv(filepath_or_buffer='data/fake_news_dataset.csv')
df

Unnamed: 0,title,text,date,source,author,category,label
0,Foreign Democrat final.,more tax development both store agreement lawy...,2023-03-10,NY Times,Paula George,Politics,real
1,To offer down resource great point.,probably guess western behind likely next inve...,2022-05-25,Fox News,Joseph Hill,Politics,fake
2,Himself church myself carry.,them identify forward present success risk sev...,2022-09-01,CNN,Julia Robinson,Business,fake
3,You unit its should.,phone which item yard Republican safe where po...,2023-02-07,Reuters,Mr. David Foster DDS,Science,fake
4,Billion believe employee summer how.,wonder myself fact difficult course forget exa...,2023-04-03,CNN,Austin Walker,Technology,fake
...,...,...,...,...,...,...,...
19995,House party born.,hit and television I change very our happy doo...,2024-12-04,BBC,Gary Miles,Entertainment,fake
19996,Though nation people maybe price box.,fear most meet rock even sea value design stan...,2024-05-26,Daily News,Maria Mcbride,Entertainment,real
19997,Yet exist with experience unit.,activity loss very provide eye west create wha...,2023-04-17,BBC,Kristen Franklin,Entertainment,real
19998,School wide itself item.,term point general common training watch respo...,2024-06-30,Reuters,David Wise,Health,fake


In [None]:
# Replace missing values in the author and source columns with the
# placeholder string 'unknown'; all other columns are left untouched.
df[['author', 'source']] = df[['author', 'source']].fillna('unknown')

In [None]:
# Per-column count of missing values (isnull is the pandas alias of isna).
df.isnull().sum()

title       0
text        0
date        0
source      0
author      0
category    0
label       0
dtype: int64

In [None]:
# Model input text: whitespace-trimmed title and body joined by a single space.
df['full_text'] = df['title'].str.strip().str.cat(df['text'].str.strip(), sep=' ')

In [None]:
# Binary target: 1 where the label equals "fake", 0 for every other value.
df["y"] = df["label"].eq("fake").astype(int)
df

Unnamed: 0,title,text,date,source,author,category,label,full_text,y
0,Foreign Democrat final.,more tax development both store agreement lawy...,2023-03-10,NY Times,Paula George,Politics,real,Foreign Democrat final. more tax development b...,0
1,To offer down resource great point.,probably guess western behind likely next inve...,2022-05-25,Fox News,Joseph Hill,Politics,fake,To offer down resource great point. probably g...,1
2,Himself church myself carry.,them identify forward present success risk sev...,2022-09-01,CNN,Julia Robinson,Business,fake,Himself church myself carry. them identify for...,1
3,You unit its should.,phone which item yard Republican safe where po...,2023-02-07,Reuters,Mr. David Foster DDS,Science,fake,You unit its should. phone which item yard Rep...,1
4,Billion believe employee summer how.,wonder myself fact difficult course forget exa...,2023-04-03,CNN,Austin Walker,Technology,fake,Billion believe employee summer how. wonder my...,1
...,...,...,...,...,...,...,...,...,...
19995,House party born.,hit and television I change very our happy doo...,2024-12-04,BBC,Gary Miles,Entertainment,fake,House party born. hit and television I change ...,1
19996,Though nation people maybe price box.,fear most meet rock even sea value design stan...,2024-05-26,Daily News,Maria Mcbride,Entertainment,real,Though nation people maybe price box. fear mos...,0
19997,Yet exist with experience unit.,activity loss very provide eye west create wha...,2023-04-17,BBC,Kristen Franklin,Entertainment,real,Yet exist with experience unit. activity loss ...,0
19998,School wide itself item.,term point general common training watch respo...,2024-06-30,Reuters,David Wise,Health,fake,School wide itself item. term point general co...,1


In [None]:
# Class balance check: number of rows per label value.
df.loc[:, "label"].value_counts()

label
fake    10056
real     9944
Name: count, dtype: int64

In [None]:
# First split: hold out 20% of all rows as the test set, stratified on the target.
train_df, test_df = train_test_split(
    df,
    stratify=df["y"],
    test_size=0.2,
    random_state=42,
)

# Second split: carve 20% of the remaining rows out as the validation set,
# again stratified on the target. `train_df` is rebound to the reduced set.
train_df, val_df = train_test_split(
    train_df,
    stratify=train_df["y"],
    test_size=0.2,
    random_state=42,
)

In [None]:
def _to_hf_dataset(frame):
    """Convert one split DataFrame into a Hugging Face ``Dataset``.

    Only the ``full_text`` column and the ``y`` target are kept, and the
    target is renamed to ``labels``.

    ``preserve_index=False`` prevents the (shuffled, non-contiguous) pandas
    index of the split from being stored as an extra ``__index_level_0__``
    column in the resulting dataset.

    Args:
        frame: DataFrame containing at least ``full_text`` and ``y`` columns.

    Returns:
        A ``Dataset`` with the columns ``full_text`` and ``labels``.
    """
    model_frame = frame[["full_text", "y"]].rename(columns={"y": "labels"})
    return Dataset.from_pandas(model_frame, preserve_index=False)


train_ds = _to_hf_dataset(train_df)
val_ds = _to_hf_dataset(val_df)
test_ds = _to_hf_dataset(test_df)

In [None]:
# Checkpoint identifier; the tokenizer is loaded from it here.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=False,
    use_fast=True,
)

def tokenize_batch(batch):
    """Tokenize the ``full_text`` entries of a batch.

    Sequences are truncated and padded to a fixed length of 128 tokens.

    Args:
        batch: mapping that provides the texts under the ``"full_text"`` key.

    Returns:
        Whatever the module-level ``tokenizer`` returns for these texts.
    """
    texts = batch["full_text"]
    encoded = tokenizer(
        texts,
        max_length=128,
        padding="max_length",
        truncation=True,
    )
    return encoded

# Tokenize the three splits (train, validation, test) in batched mode.
train_tok, val_tok, test_tok = (
    split.map(tokenize_batch, batched=True)
    for split in (train_ds, val_ds, test_ds)
)

# Expose only the model inputs and the target, formatted as torch tensors.
cols_to_keep = ["input_ids", "attention_mask", "labels"]
for tokenized_split in (train_tok, val_tok, test_tok):
    tokenized_split.set_format(type="torch", columns=cols_to_keep)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/12800 [00:00<?, ? examples/s]

Map:   0%|          | 0/3200 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

In [None]:
# Load the checkpoint named by `model_name` with a two-label classification head.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# NOTE(review): `acc_metric` is not referenced by compute_metrics below, which
# relies on scikit-learn instead; kept so any other user of the name still works.
acc_metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy, F1 and ROC-AUC from two-column classification logits.

    Args:
        eval_pred: pair ``(logits, labels)``. ``logits`` is indexed as a 2-D
            array with one column per class (column 1 is the "fake" class);
            ``labels`` holds the true class ids.

    Returns:
        dict with the keys ``"accuracy"``, ``"f1"`` and ``"roc_auc"``.
        ``"roc_auc"`` is NaN when ``roc_auc_score`` raises ``ValueError``.
    """
    logits, labels = eval_pred
    logits = np.asarray(logits)

    # Softmax over the class axis; subtracting the row maximum first keeps
    # np.exp from overflowing on large logits.
    exp = np.exp(logits - logits.max(axis=1, keepdims=True))
    softmax = exp / exp.sum(axis=1, keepdims=True)
    p_fake = softmax[:, 1]

    # Hard prediction = class with the largest logit.
    preds = np.argmax(logits, axis=1)

    out = {}
    out["accuracy"] = accuracy_score(labels, preds)
    # zero_division=0 yields the same 0.0 score as the default when no positive
    # is predicted, but without emitting an undefined-metric warning.
    out["f1"] = f1_score(labels, preds, zero_division=0)

    # ROC-AUC needs the probability of class 1, not the hard prediction.
    try:
        out["roc_auc"] = roc_auc_score(labels, p_fake)
    except ValueError:
        out["roc_auc"] = float("nan")

    return out

In [None]:
training_args = TrainingArguments(
    output_dir="./bert_fake_news",
    # --- optimisation ---
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # Mixed precision is switched on only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
    # --- logging / evaluation / checkpointing ---
    logging_strategy="steps",
    logging_steps=50,
    eval_strategy="epoch",
    save_strategy="epoch",
    # --- model selection: keep the checkpoint with the highest ROC-AUC ---
    load_best_model_at_end=True,
    metric_for_best_model="roc_auc",
    greater_is_better=True,
)

# Wire the model, data splits and metric function into a Trainer.
# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword of
# Trainer.__init__ is deprecated and triggers a warning on this call.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Roc Auc
1,0.6939,0.698286,0.502812,0.669162,0.509547
2,0.6943,0.693271,0.497188,0.0,0.511674


TrainOutput(global_step=1600, training_loss=0.6953632760047913, metrics={'train_runtime': 5190.1572, 'train_samples_per_second': 4.932, 'train_steps_per_second': 0.308, 'total_flos': 847791351398400.0, 'train_loss': 0.6953632760047913, 'epoch': 2.0})

In [None]:
# Score the trained model on the held-out test split and show the metrics dict.
test_metrics = trainer.evaluate(eval_dataset=test_tok)
test_metrics

{'eval_loss': 0.6932790279388428,
 'eval_accuracy': 0.49725,
 'eval_f1': 0.0,
 'eval_roc_auc': 0.4840032660987995,
 'eval_runtime': 156.5006,
 'eval_samples_per_second': 25.559,
 'eval_steps_per_second': 0.799,
 'epoch': 2.0}

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Run inference on the test split and unpack the raw logits and true labels.
pred_out = trainer.predict(test_tok)
logits, labels = pred_out.predictions, pred_out.label_ids

# Predicted class for each row = index of its largest logit.
preds = np.argmax(logits, axis=1)

In [None]:
# Print the report so its line breaks render as a table instead of an escaped
# string repr. zero_division=0 reports 0.0 for a class that is never predicted
# without emitting a warning per metric.
print(
    classification_report(
        labels,
        preds,
        target_names=["real", "fake"],
        zero_division=0,
    )
)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


'              precision    recall  f1-score   support\n\n        real       0.50      1.00      0.66      1989\n        fake       0.00      0.00      0.00      2011\n\n    accuracy                           0.50      4000\n   macro avg       0.25      0.50      0.33      4000\nweighted avg       0.25      0.50      0.33      4000\n'

In [None]:
# Confusion matrix of true labels against predicted labels.
confusion_matrix(y_true=labels, y_pred=preds)

array([[1989,    0],
       [2011,    0]], dtype=int64)