Install necessary software

In [None]:
# Hugging Face packages: transformers and datasets are imported below; accelerate is installed alongside them.
%pip install -q transformers datasets accelerate
# scikit-learn supplies train_test_split and the metric functions imported below.
%pip install scikit-learn

Import necessary packages

In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

Import data from CSV files

In [None]:
# Load the two source files; every row of a file receives the same class label.
true_df = pd.read_csv("True.csv")
fake_df = pd.read_csv("Fake.csv")

# Label convention used by the rest of the notebook: 1 = true, 0 = fake.
true_df["label"] = 1
fake_df["label"] = 0

# Combine both classes, then shuffle with a fixed seed so that
# Restart & Run All always produces the same row order.
df = pd.concat([true_df, fake_df], axis=0)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
# Keep only the columns the following cells read.
df = df[["title", "text", "label"]]

Display sample data

In [None]:
# Preview: draw 1000 random rows, hide the long "text" column,
# then display 10 random rows of what is left.
data = df.sample(1000).drop(columns=["text"])
data.sample(10)

Load and label the training and test data
- Load from both real and fake news datasets
- Add a label column: 1 for true, 0 for fake
- Combine and shuffle the datasets
- Extract the article text and labels as pandas Series
- Conduct an 80-20 split for training and validation sets

In [None]:
# 80/20 train/validation split. The fixed seed makes the split reproducible,
# and stratifying on the label keeps the true/fake ratio the same in both parts.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["text"],
    df["label"],
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)
train_df = pd.DataFrame({"text": train_texts, "label": train_labels})
val_df = pd.DataFrame({"text": val_texts, "label": val_labels})

# The pandas index is deliberately kept here: the tokenization cell later
# removes the resulting "__index_level_0__" column by name.
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

Tokenization
- Load BERT tokenizer that converts raw text into input IDs and attention masks 
- Pad and truncate each input to 512 tokens max (BERT's limit)
- Wrap inputs and labels into datasets that Trainer can understand

In [None]:
# Tokenizer matching the "bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(example):
    """Tokenize the "text" field of ``example`` with the BERT tokenizer.

    Inputs are padded with ``padding="max_length"`` and truncated when
    they are too long.

    Args:
        example: Mapping with a "text" entry holding the raw text.

    Returns:
        The tokenizer output for ``example["text"]``.
    """
    raw_text = example["text"]
    encoded = tokenizer(raw_text, padding="max_length", truncation=True)
    return encoded

# Columns that must not reach the model: the raw text and the pandas index
# column that came along with the DataFrame.
_unused_columns = ["text", "__index_level_0__"]

train_dataset = train_dataset.map(tokenize_function, batched=True).remove_columns(_unused_columns)
val_dataset = val_dataset.map(tokenize_function, batched=True).remove_columns(_unused_columns)

# Make both splits return torch tensors.
for _split in (train_dataset, val_dataset):
    _split.set_format("torch")

Define the model

In [None]:
# Pretrained BERT with a sequence-classification head for the two labels (0 and 1).
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)

Define the arguments for training the model

In [None]:
# TrainingArguments is already imported in the imports cell at the top of the
# notebook, so it is not re-imported here.
training_args = TrainingArguments(
    output_dir="./results",           # output directory for the run
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",             # directory for training logs
)

Define function to compute the metrics for evaluating the model's accuracy

In [None]:
def compute_metrics(eval_pred):
    """Compute classification metrics from an evaluation result.

    Args:
        eval_pred: A pair ``(logits, labels)``.

    Returns:
        dict with the keys "accuracy", "f1", "precision" and "recall".
    """
    scores, gold = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted = np.argmax(scores, axis=-1)
    # The fourth returned element is not needed here.
    precision, recall, f1 = precision_recall_fscore_support(
        gold, predicted, average='binary'
    )[:3]
    return dict(
        accuracy=accuracy_score(gold, predicted),
        f1=f1,
        precision=precision,
        recall=recall,
    )

Train the model (requires a Hugging Face API key)

In [None]:
# Collator built from the same tokenizer that produced the inputs.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Gather everything the Trainer needs in one place, then build it.
_trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**_trainer_kwargs)

# Run training; the returned result is displayed as the cell output.
trainer.train()

Evaluate the model on the validation dataset

In [None]:
# The model was already trained in the training cell, so this cell only
# evaluates it; calling trainer.train() again here would train it a second time.
metrics = trainer.evaluate()

# The notebook builds only train and validation splits (there is no
# test_dataset), so predictions are produced for the validation split.
predictions = trainer.predict(val_dataset)

# Display only the metrics rather than the full prediction arrays.
predictions.metrics

Save trained model and tokenizer for implementation in web interface

In [None]:
# Write the model first, then the tokenizer, into the same directory.
for _artifact in (model, tokenizer):
    _artifact.save_pretrained("my_saved_model")