In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Load dataset
df = pd.read_csv("C:/Users/aruns/Downloads/archive/Fake.csv")
df['label'] = 0
true_df = pd.read_csv("C:/Users/aruns/Downloads/archive/True.csv")
true_df['label'] = 1
data = pd.concat([df, true_df])

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    data['text'], data['label'], test_size=0.2, random_state=42
)

# TF-IDF vectorization
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Logistic Regression baseline
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.99      0.98      0.99      4733
           1       0.98      0.99      0.98      4247

    accuracy                           0.98      8980
   macro avg       0.98      0.98      0.98      8980
weighted avg       0.98      0.98      0.98      8980



In [4]:
pip install transformers datasets torch

Collecting transformers
  Downloading transformers-4.55.4-py3-none-any.whl.metadata (41 kB)
     ---------------------------------------- 0.0/42.0 kB ? eta -:--:--
     -------------------------------------- 42.0/42.0 kB 675.1 kB/s eta 0:00:00
Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.34.4-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2025.7.34-cp39-cp39-win_amd64.whl.metadata (41 kB)
     ---------------------------------------- 0.0/41.5 kB ? eta -:--:--
     ---------------------------------------- 41.5/41.5 kB 2.0 MB/s eta 0:00:00
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.4-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.6.2-cp38-abi3-win_amd64.whl.metadata (4.1 kB)
Collecting pyarrow

In [6]:
pip install ipywidgets

Collecting ipywidgets
  Downloading ipywidgets-8.1.7-py3-none-any.whl.metadata (2.4 kB)
Collecting widgetsnbextension~=4.0.14 (from ipywidgets)
  Downloading widgetsnbextension-4.0.14-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab_widgets~=3.0.15 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.15-py3-none-any.whl.metadata (20 kB)
Downloading ipywidgets-8.1.7-py3-none-any.whl (139 kB)
   ---------------------------------------- 0.0/139.8 kB ? eta -:--:--
   ----------- ---------------------------- 41.0/139.8 kB 2.0 MB/s eta 0:00:01
   ----------------------------------- ---- 122.9/139.8 kB 1.8 MB/s eta 0:00:01
   ------------------------------------ - 133.1/139.8 kB 877.7 kB/s eta 0:00:01
   -------------------------------------- 139.8/139.8 kB 925.5 kB/s eta 0:00:00
Downloading jupyterlab_widgets-3.0.15-py3-none-any.whl (216 kB)
   ---------------------------------------- 0.0/216.6 kB ? eta -:--:--
   ---------------------------------------  215.0/216.6 kB 6.6 MB/s eta

In [1]:
pip install--upgrade transformers

Note: you may need to restart the kernel to use updated packages.


ERROR: unknown command "install--upgrade" - maybe you meant "install"



In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch

# 1. Load Dataset
fake = pd.read_csv("C:/Users/aruns/Downloads/archive/Fake.csv")
true = pd.read_csv("C:/Users/aruns/Downloads/archive/True.csv")

fake['label'] = 0
true['label'] = 1

df = pd.concat([fake, true]).sample(frac=1).reset_index(drop=True)  # shuffle
df = df[['text', 'label']]   # keep only relevant columns

# Train/test split
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42
)

# 2. Tokenization
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=512)

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': train_labels
})

test_dataset = Dataset.from_dict({
    'input_ids': test_encodings['input_ids'],
    'attention_mask': test_encodings['attention_mask'],
    'labels': test_labels
})

# 3. Model Setup
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# 4. Training Arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluate_during_training=True,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# 5. Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# 6. Train Model
trainer.train()

# 7. Evaluate
results = trainer.evaluate()
print(results)

# Save Model
trainer.save_model("./distilbert-fake-news-model")
tokenizer.save_pretrained("./distilbert-fake-news-model")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: __init__() got an unexpected keyword argument 'evaluate_during_training'

In [2]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from torch.optim import AdamW
from tqdm import tqdm

# -----------------------------
# 1. Load Dataset
# -----------------------------
fake = pd.read_csv("C:/Users/aruns/Downloads/archive/Fake.csv")
true = pd.read_csv("C:/Users/aruns/Downloads/archive/True.csv")

fake['label'] = 0
true['label'] = 1
df = pd.concat([fake, true]).sample(frac=1).reset_index(drop=True)

texts = df['text'].tolist()
labels = df['label'].tolist()

# -----------------------------
# 2. Tokenizer
# -----------------------------
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

encodings = tokenizer(texts, truncation=True, padding=True, max_length=256)
labels = torch.tensor(labels)

class NewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

dataset = NewsDataset(encodings, labels)

# Train/test split
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)

# -----------------------------
# 3. Model
# -----------------------------
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
model.to(device)

optim = AdamW(model.parameters(), lr=5e-5)

# -----------------------------
# 4. Training Loop
# -----------------------------
epochs = 2

for epoch in range(epochs):
    model.train()
    total_loss = 0
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optim.step()

        loop.set_description(f"Epoch {epoch+1}")
        loop.set_postfix(loss=loss.item())

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}, Training Loss: {avg_train_loss:.4f}")

# -----------------------------
# 5. Evaluation
# -----------------------------
from sklearn.metrics import classification_report

model.eval()
preds, true_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        preds.extend(predictions.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

print(classification_report(true_labels, preds))

# -----------------------------
# 6. Save Model
# -----------------------------
model.save_pretrained("./distilbert-fake-news-model")
tokenizer.save_pretrained("./distilbert-fake-news-model")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 4490/4490 [4:40:09<00:00,  3.74s/it, loss=0.000243]     


Epoch 1, Training Loss: 0.0105


Epoch 2: 100%|██████████| 4490/4490 [3:39:38<00:00,  2.94s/it, loss=5.49e-5]   


Epoch 2, Training Loss: 0.0044
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4754
           1       1.00      1.00      1.00      4226

    accuracy                           1.00      8980
   macro avg       1.00      1.00      1.00      8980
weighted avg       1.00      1.00      1.00      8980



('./distilbert-fake-news-model\\tokenizer_config.json',
 './distilbert-fake-news-model\\special_tokens_map.json',
 './distilbert-fake-news-model\\vocab.txt',
 './distilbert-fake-news-model\\added_tokens.json',
 './distilbert-fake-news-model\\tokenizer.json')