In [None]:
!pip install -q transformers datasets accelerate

In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [None]:
# Load the real and the fake news articles from their separate CSV files.
true_df = pd.read_csv("True.csv")
fake_df = pd.read_csv("Fake.csv")

# Binary target: 1 = real article, 0 = fake article.
true_df["label"] = 1
fake_df["label"] = 0

# Stack both sources, shuffle the rows, and keep only the columns used later.
# A fixed random_state makes the shuffle identical on every
# "Restart Kernel -> Run All", so downstream results are reproducible.
df = pd.concat([true_df, fake_df], axis=0)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df = df[["title", "text", "label"]]

In [None]:
# Quick look at the data: draw 1000 random rows and discard the long article
# body ("text") so that only the remaining columns are shown.
data = df.sample(1000).drop(columns=["text"])

# Display 10 random rows of that preview frame.
data.sample(10)

Load and Label Data:
- We are loading both real and fake news datasets
- Adds a label column: 1 for true, 0 for fake
- Combines and shuffles the dataset
- Extracts the news text and the labels (the `text` and `label` columns)
- Splits into training and validation sets; in this case we use an 80/20 training/validation split (`test_size=0.2`). Training on 80% ensures the model sees diverse examples, and holding out 20% lets us check for overfitting (when the model memorizes the training data but fails on real, unseen data). A 50/50 split leaves too little data for training, and a 90/10 split leaves a validation set that is not reliable for small datasets.

In [None]:
# Hold out 20% of the articles for validation.
# - random_state pins the split so metrics are comparable between runs.
# - stratify keeps the real/fake ratio the same in both splits.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["text"],
    df["label"],
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)
train_df = pd.DataFrame({"text": train_texts, "label": train_labels})
val_df = pd.DataFrame({"text": val_texts, "label": val_labels})

# Convert to Hugging Face datasets. The pandas index of each split is left
# untouched here; the tokenization cell removes the resulting
# "__index_level_0__" column.
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

Tokenization
- We load a pretrained BERT tokenizer, which converts raw text into the input IDs and attention masks that BERT can understand.
- Adds padding and truncates to 512 tokens max (BERT's limit).
- Wraps the inputs and labels into datasets that Trainer can understand. All fields must be lists of the same length.

In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(example):
    """Tokenize the "text" field of a batch of examples.

    Sequences are padded to the tokenizer's maximum length and anything
    longer than that is truncated.

    Args:
        example: a batch from the dataset (``map`` is called with
            ``batched=True``), indexable by the column name "text".

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(example["text"], padding="max_length", truncation=True)


def _prepare_split(split):
    """Tokenize one split, drop the columns BERT does not use, return torch tensors."""
    # batched=True runs tokenize_function on batches instead of one record at a time.
    split = split.map(tokenize_function, batched=True)
    # BERT works on token IDs, not raw text. "__index_level_0__" is only present
    # when the pandas index was carried into the dataset, so drop just the
    # columns that actually exist instead of failing on a missing one.
    unused = [name for name in ("text", "__index_level_0__") if name in split.column_names]
    split = split.remove_columns(unused)
    split.set_format("torch")
    return split


# Tokenized and ready for BERT.
train_dataset = _prepare_split(train_dataset)
val_dataset = _prepare_split(val_dataset)

In [None]:
# Pretrained BERT with a sequence-classification head sized for our two
# labels (0 = fake, 1 = real).
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)

In [None]:
# Newer transformers releases call this option `eval_strategy`; older ones call
# it `evaluation_strategy`. TrainingArguments is a dataclass, so its declared
# fields tell us which spelling the installed version accepts.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    # Save a checkpoint at the end of every epoch (evaluation uses the same
    # schedule, see the unpacked keyword at the bottom of this call).
    save_strategy="epoch",
    learning_rate=2e-5,
    # Train and evaluate with 8 samples per batch per device.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Run 3 full passes through the training data; weight decay reduces overfitting.
    num_train_epochs=3,
    weight_decay=0.01,
    # Folder that stores the training logs for TensorBoard.
    logging_dir="./logs",
    # After training, reload the checkpoint with the best validation accuracy.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Evaluate the model at the end of every epoch.
    **{_eval_strategy_key: "epoch"},
)

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy, F1, precision and recall for a binary classifier.

    Args:
        eval_pred: a pair ``(logits, labels)``; the predicted class of each
            example is the argmax of its logits over the last axis.

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # zero_division=0 keeps the usual 0.0 result when a metric is undefined
    # (e.g. the model never predicts class 1) but without raising a warning
    # on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, predictions)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [None]:
import inspect

# Collator that pads the examples of each batch to a common length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Recent transformers releases take the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword; older releases only know `tokenizer`.
# Use whichever keyword the installed Trainer declares.
_tokenizer_key = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)

# Fine-tune the model; this is the only training run the notebook needs.
trainer.train()

In [None]:
# Evaluate on the held-out validation split.
# Training already ran in the previous cell, so it is NOT repeated here:
# a second trainer.train() call would run another full round of training.
metrics = trainer.evaluate()

# This notebook creates no separate test split, so predictions are generated
# for the validation split (the only data the model was not trained on).
predictions = trainer.predict(val_dataset)

# Save model manually (optional)
trainer.save_model("my_saved_model")

# Show the validation metrics as the cell output.
metrics


WORD CLOUD VISUALIZATION

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Work on a copy of df (the combined real + fake news frame) in which the
# label column is exposed under the name "target".
dataset = df.copy().rename(columns={"label": "target"})

# One word cloud per class, real news (target = 1) first, then fake news (target = 0).
for target_value, heading in ((1, "WordCloud for Real News"), (0, "WordCloud for Fake News")):
    # Join every title of this class into one large string.
    titles = dataset['title'][dataset['target'] == target_value].astype(str)
    consolidated = ' '.join(titles)
    # WordCloud object with fixed dimensions and settings.
    wordCloud = WordCloud(width=1600, height=800, random_state=21, max_font_size=110, collocations=False)
    # Figure size for displaying the word cloud.
    plt.figure(figsize=(15, 10))
    # Generate and display the word cloud image.
    plt.imshow(wordCloud.generate(consolidated), interpolation='bilinear')
    # Hide the axes and add a title to the plot.
    plt.axis('off')
    plt.title(heading, fontsize=20)
    plt.show()
