In [None]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns


Transformers and NLP


In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader


Load the fake and real news datasets and label each article (0 = fake, 1 = real)

In [None]:
def load_labeled_news(csv_path, label):
    """Read one news CSV and tag every remaining row with a class label.

    Parameters
    ----------
    csv_path : str
        Path of a CSV file that contains at least ``title`` and ``text`` columns.
    label : int
        Value written to the ``label`` column of every returned row.

    Returns
    -------
    pandas.DataFrame
        A frame with the columns ``title``, ``text`` and ``label``. Rows with a
        missing title or text are dropped before the label is attached.
    """
    articles = pd.read_csv(csv_path)[['title', 'text']].dropna()
    # assign() returns a new frame, so the labelled result never aliases the
    # intermediate selection above.
    return articles.assign(label=label)


df = load_labeled_news("/content/Fake.csv", 0)  # 0 = fake news
true_df = load_labeled_news("/content/True.csv", 1)  # 1 = real news


Combine both datasets into a single shuffled DataFrame

In [None]:
# Stack the fake and real articles and shuffle the rows so the two classes are
# interleaved. The fixed random_state makes the shuffle repeatable after a kernel
# restart; reset_index discards the per-file row numbers.
news_df = (
    pd.concat([df, true_df])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

Preprocessing Function

In [None]:
def clean_text(text):
    """Normalize a raw article string for bag-of-words style modelling.

    Steps, in order: lowercase; remove ``[...]`` bracketed spans (non-greedy,
    within a single line); remove every ``string.punctuation`` character; remove
    any token that contains a digit; collapse whitespace runs to a single space
    and trim both ends.

    Parameters
    ----------
    text : str or None
        Raw text. ``None`` and float NaN (a missing cell in a pandas column) are
        treated as empty text instead of raising.

    Returns
    -------
    str
        The cleaned text; ``""`` for missing input.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    # Raw strings keep the regex escapes (\[, \w, \d, \s) away from Python's own
    # string-escape processing, which warns about unknown escapes.
    text = re.sub(r"\[.*?\]", "", text)
    text = re.sub(r"[%s]" % re.escape(string.punctuation), "", text)
    text = re.sub(r"\w*\d\w*", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Run clean_text over every article body and keep the result beside the raw text.
news_df['clean_text'] = news_df['text'].map(clean_text)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Hold out the test articles BEFORE fitting TF-IDF so the vocabulary and IDF
# weights are learned from training text only; fitting on the full corpus would
# leak test-set statistics into the features. random_state makes the split
# repeatable, stratify keeps the fake/real ratio equal in both parts.
y = news_df['label']
lr_train_text, lr_test_text, y_train, y_test = train_test_split(
    news_df['clean_text'], y, test_size=0.2, stratify=y, random_state=42
)

vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(lr_train_text)
X_test = vectorizer.transform(lr_test_text)
# Full-corpus feature matrix, kept under its original name for any cell that
# still reads X. It is produced by the train-fitted vectorizer.
X = vectorizer.transform(news_df['clean_text'])

# A higher iteration cap than the default gives the solver room to converge on
# the 5000-dimensional TF-IDF features.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Tabulate held-out predictions against the true labels and draw the table as a
# blue heat map for the logistic-regression baseline.
cm = confusion_matrix(y_test, y_pred)
display1 = ConfusionMatrixDisplay(cm, display_labels=['Fake', 'True'])
display1.plot(cmap=plt.cm.Blues)
display1.ax_.set_title("LOGISTIC REGRESSION CONFUSION MATRIX")
plt.show()


In [None]:
!pip install lime

For Explainability

In [None]:
import shap
from lime.lime_text import LimeTextExplainer

Fine Tuning

In [None]:
class NewsDataset(Dataset):
    """Torch dataset that tokenizes one news text per requested item.

    Parameters
    ----------
    texts : sequence
        The documents. Any iterable with a length is accepted (list, tuple,
        pandas Series, ...); it is copied into a list so items are always
        looked up by position.
    labels : sequence
        One class id per document, in the same order as ``texts``.
    tokenizer : callable
        Called as ``tokenizer(text, truncation=True, padding='max_length',
        max_length=max_len, return_tensors='pt')``; must return a mapping with
        ``'input_ids'`` and ``'attention_mask'`` tensors.
    max_len : int
        Passed to the tokenizer as ``max_length``.

    Raises
    ------
    ValueError
        If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Plain lists make self.texts[idx] positional. A pandas Series with a
        # shuffled index (e.g. straight out of train_test_split) would instead
        # be looked up by index label and raise or return the wrong row.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                "texts and labels must have the same length, got %d and %d"
                % (len(self.texts), len(self.labels))
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of documents."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize document ``idx`` and return its model inputs and label.

        Returns a dict with 1-D ``input_ids`` and ``attention_mask`` tensors and
        a scalar ``labels`` tensor of dtype ``torch.long``.
        """
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            # return_tensors='pt' yields a leading batch axis; flatten() removes
            # it so the per-item tensors are one-dimensional.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
MAX_LEN = 64  # every article is truncated/padded to this many tokens

# Seeded, stratified split: repeatable across runs and with the same fake/real
# ratio in both parts, matching how the logistic-regression baseline is split.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    news_df['clean_text'],
    news_df['label'],
    test_size=0.2,
    stratify=news_df['label'],
    random_state=42,
)

train_dataset = NewsDataset(train_texts.tolist(), train_labels.tolist(), tokenizer, MAX_LEN)
test_dataset = NewsDataset(test_texts.tolist(), test_labels.tolist(), tokenizer, MAX_LEN)

# Two output classes: 0 = fake, 1 = real.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)


def compute_metrics(eval_pred):
    """Return accuracy for a Trainer evaluation pass.

    ``eval_pred.predictions`` holds the per-class logits and
    ``eval_pred.label_ids`` the true class ids; the predicted class is the
    arg-max over the last axis of the logits.
    """
    predicted = np.argmax(eval_pred.predictions, axis=-1)
    return {"accuracy": accuracy_score(eval_pred.label_ids, predicted)}


training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # NOTE: eval_steps only matters when a step-based evaluation strategy is
    # enabled, and none is set here. The held-out set is therefore scored
    # explicitly after training (see trainer.evaluate() below).
    eval_steps=10000,
    logging_dir='./logs',
    logging_steps=1000,
    save_steps=10000,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

# Score the fine-tuned model on the held-out articles; without this call the
# eval_dataset passed to the Trainer is never used.
eval_metrics = trainer.evaluate()
print("BERT evaluation metrics:", eval_metrics)

In [None]:

# Order follows the label encoding used when the data was loaded (0 = Fake,
# 1 = Real), so position i in this list names classifier output column i.
class_names = ['Fake', 'Real']
# LIME explains a prediction from randomly perturbed copies of the text; a fixed
# random_state makes the explanation repeatable between runs.
explainer = LimeTextExplainer(class_names=class_names, random_state=42)

def predict_proba(texts):
    """Return class probabilities from the TF-IDF + logistic-regression model.

    Parameters
    ----------
    texts : iterable of str
        Documents to score; they are vectorized with the fitted ``vectorizer``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(texts), 2)``. Column 1 is column 1 of
        ``lr.predict_proba`` and column 0 is its complement.
        Assumes ``lr.classes_`` is ``[0, 1]`` (0 = fake, 1 = real) — confirm if
        the label encoding changes.
    """
    X_vec = vectorizer.transform(texts)
    # Score once and reuse: LIME calls this with thousands of perturbed texts,
    # so a second lr.predict_proba call per batch doubles the work.
    p_real = lr.predict_proba(X_vec)[:, 1]
    return np.column_stack((1 - p_real, p_real))

# Explain the baseline's prediction for the second article of the shuffled
# corpus, showing the five most influential words, and render it inline.
exp = explainer.explain_instance(
    text_instance=news_df['clean_text'].iloc[1],
    classifier_fn=predict_proba,
    num_features=5,
)
exp.show_in_notebook()


In [None]:
sample_text = "Breaking: The prime minister launches a new AI initiative."
# Apply the same clean_text normalization the training articles went through, so
# the classifier sees inference text in the form it was fitted on.
probs = predict_proba([clean_text(sample_text)])[0]
# probs[0] is the "Fake" probability, probs[1] the "Real" probability.
label = "Real" if probs[1] > 0.5 else "Fake"
print(f"Prediction: {label}")
print(f"Probability - Fake: {probs[0]:.2f}, Real: {probs[1]:.2f}")