In [None]:
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import DataLoader, TensorDataset

In [1]:

# Hub identifiers for the corpus and the pretrained checkpoint's tokenizer.
DATASET_ID = "SetFit/20_newsgroups"
BASE_CHECKPOINT = "bert-base-uncased"

# Download (or reuse the cached copy of) the 20 Newsgroups dataset.
dataset = load_dataset(DATASET_ID)

# Tokenizer that matches the uncased BERT base checkpoint.
tokenizer = BertTokenizer.from_pretrained(BASE_CHECKPOINT)


def preprocess_function(examples):
    """Tokenize a batch of raw posts for BERT.

    Intended to be passed to ``dataset.map(..., batched=True)``.

    Args:
        examples: A batch of dataset rows; only its ``"text"`` column is read.

    Returns:
        The tokenizer output for the batch, with every sequence truncated
        and padded to exactly 512 tokens.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        # Pad to a fixed length instead of the longest row in the current
        # map batch: `padding=True` pads each map batch independently, so
        # rows from different batches could end up with different lengths
        # and could not be stacked into a single tensor afterwards.
        padding="max_length",
        max_length=512
    )


# Seed PyTorch's global RNG so the DataLoader shuffle order and the randomly
# initialised classification head are the same on every Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

# Tokenize the splits in batches via `preprocess_function`.
tokenized_20NG = dataset.map(preprocess_function, batched=True)

# Only the training split is turned into tensors here.
train_encodings = tokenized_20NG['train']
train_labels = torch.tensor(dataset["train"]["label"])

# torch.tensor() needs rectangular input, i.e. every row padded to the same length.
train_input_ids = torch.tensor(train_encodings['input_ids'])
train_attention_mask = torch.tensor(train_encodings['attention_mask'])

# Each batch yields (input_ids, attention_mask, labels), in that order.
train_dataset = TensorDataset(train_input_ids, train_attention_mask, train_labels)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained BERT encoder with a 20-way classification head.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=20)
model.to(device)


2026-02-15 14:13:13.640120: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1771164793.825696      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1771164793.889665      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1771164794.374159      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1771164794.374206      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1771164794.374209      55 computation_placer.cc:177] computation placer alr

README.md:   0%|          | 0.00/734 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


train.jsonl:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

test.jsonl:   0%|          | 0.00/8.91M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11314 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7532 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/11314 [00:00<?, ? examples/s]

Map:   0%|          | 0/7532 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
import torch
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from transformers import get_linear_schedule_with_warmup, BertForSequenceClassification, BertTokenizer
from datasets import load_dataset
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import numpy as np
from sklearn.metrics import accuracy_score

In [8]:
# Number of full passes over the training set. Defined first so the scheduler
# length below is derived from it instead of repeating the literal.
epochs = 3

optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
loss_function = CrossEntropyLoss()

# The scheduler is stepped once per batch, so its horizon is batches * epochs.
total_steps = len(train_loader) * epochs
warmup_steps = int(0.1 * total_steps)  # first 10% of the steps are warmup
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)
print(f"Total training steps: {total_steps}")
print(f"Warmup steps: {warmup_steps}\n")


for epoch in range(epochs):
    model.train()
    total_loss = 0
    # Predictions and labels accumulated over the epoch for the running accuracy.
    all_predictions = []
    all_true_labels = []

    # Use tqdm for progress bar
    progress_bar = tqdm(
        enumerate(train_loader),
        total=len(train_loader),
        desc=f"Epoch {epoch+1}/{epochs}",
        leave=True
    )

    for batch_idx, (input_ids, attention_mask, labels) in progress_bar:
        # Move to device
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        logits = outputs.logits

        # Calculate loss
        loss = loss_function(logits, labels)

        # Backward pass: clear stale gradients, backpropagate, then update
        # the weights and advance the learning-rate schedule by one step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Track loss and predictions
        total_loss += loss.item()

        # Get predictions for accuracy calculation
        predictions = torch.argmax(logits, dim=1)
        all_predictions.extend(predictions.cpu().numpy())
        all_true_labels.extend(labels.cpu().numpy())

        progress_bar.set_postfix({'loss': loss.item()})

        # Every 100 batches, report the latest batch loss and the accuracy
        # over all batches seen so far in this epoch.
        if (batch_idx + 1) % 100 == 0:
            current_acc = accuracy_score(all_true_labels, all_predictions)
            print(f"\n  Batch {batch_idx+1}/{len(train_loader)}")
            print(f"  Loss: {loss.item():.4f}")
            print(f"  Accuracy so far: {current_acc:.4f}")

    # Epoch summary: mean per-batch loss and accuracy over the whole epoch.
    average_loss = total_loss / len(train_loader)
    epoch_accuracy = accuracy_score(all_true_labels, all_predictions)

    print(f"EPOCH {epoch+1}/{epochs} COMPLETED")
    print(f"Average Loss:     {average_loss:.4f}")
    print(f"Epoch Accuracy:   {epoch_accuracy:.4f} ({epoch_accuracy*100:.2f}%)")


# Persist the fine-tuned weights and the tokenizer files side by side.
model.save_pretrained("./bert-20newsgroups")
tokenizer.save_pretrained("./bert-20newsgroups")

Setting up optimizer and scheduler...
Total training steps: 2124
Warmup steps: 212



Epoch 1/3:  14%|█▍        | 100/708 [01:26<08:45,  1.16it/s, loss=2.28]


  Batch 100/708
  Loss: 2.2818
  Accuracy so far: 0.3906


Epoch 1/3:  28%|██▊       | 200/708 [02:52<07:17,  1.16it/s, loss=1.85]


  Batch 200/708
  Loss: 1.8490
  Accuracy so far: 0.4387


Epoch 1/3:  42%|████▏     | 300/708 [04:18<05:51,  1.16it/s, loss=1.74] 


  Batch 300/708
  Loss: 1.7421
  Accuracy so far: 0.4838


Epoch 1/3:  56%|█████▋    | 400/708 [05:44<04:25,  1.16it/s, loss=1.32] 


  Batch 400/708
  Loss: 1.3160
  Accuracy so far: 0.5230


Epoch 1/3:  71%|███████   | 500/708 [07:10<02:59,  1.16it/s, loss=1.12] 


  Batch 500/708
  Loss: 1.1189
  Accuracy so far: 0.5531


Epoch 1/3:  85%|████████▍ | 600/708 [08:36<01:33,  1.16it/s, loss=0.703]


  Batch 600/708
  Loss: 0.7027
  Accuracy so far: 0.5765


Epoch 1/3:  99%|█████████▉| 700/708 [10:02<00:06,  1.16it/s, loss=0.751]


  Batch 700/708
  Loss: 0.7512
  Accuracy so far: 0.5905


Epoch 1/3: 100%|██████████| 708/708 [10:09<00:00,  1.16it/s, loss=0.696]


EPOCH 1/3 COMPLETED
Average Loss:     1.5232
Epoch Accuracy:   0.5917 (59.17%)


Epoch 2/3:  14%|█▍        | 100/708 [01:26<08:43,  1.16it/s, loss=1.03]


  Batch 100/708
  Loss: 1.0349
  Accuracy so far: 0.7600


Epoch 2/3:  28%|██▊       | 200/708 [02:52<07:17,  1.16it/s, loss=1.39] 


  Batch 200/708
  Loss: 1.3864
  Accuracy so far: 0.7669


Epoch 2/3:  42%|████▏     | 300/708 [04:18<05:51,  1.16it/s, loss=0.644]


  Batch 300/708
  Loss: 0.6442
  Accuracy so far: 0.7725


Epoch 2/3:  56%|█████▋    | 400/708 [05:44<04:25,  1.16it/s, loss=0.762]


  Batch 400/708
  Loss: 0.7617
  Accuracy so far: 0.7681


Epoch 2/3:  71%|███████   | 500/708 [07:10<02:59,  1.16it/s, loss=0.263]


  Batch 500/708
  Loss: 0.2629
  Accuracy so far: 0.7706


Epoch 2/3:  85%|████████▍ | 600/708 [08:36<01:33,  1.16it/s, loss=1.12] 


  Batch 600/708
  Loss: 1.1166
  Accuracy so far: 0.7709


Epoch 2/3:  99%|█████████▉| 700/708 [10:03<00:06,  1.16it/s, loss=0.628]


  Batch 700/708
  Loss: 0.6277
  Accuracy so far: 0.7710


Epoch 2/3: 100%|██████████| 708/708 [10:09<00:00,  1.16it/s, loss=1.07] 


EPOCH 2/3 COMPLETED
Average Loss:     0.7764
Epoch Accuracy:   0.7716 (77.16%)


Epoch 3/3:  14%|█▍        | 100/708 [01:26<08:43,  1.16it/s, loss=0.556]


  Batch 100/708
  Loss: 0.5562
  Accuracy so far: 0.8213


Epoch 3/3:  28%|██▊       | 200/708 [02:52<07:18,  1.16it/s, loss=0.297]


  Batch 200/708
  Loss: 0.2970
  Accuracy so far: 0.8391


Epoch 3/3:  42%|████▏     | 300/708 [04:18<05:51,  1.16it/s, loss=0.344]


  Batch 300/708
  Loss: 0.3441
  Accuracy so far: 0.8450


Epoch 3/3:  56%|█████▋    | 400/708 [05:44<04:26,  1.16it/s, loss=0.398]


  Batch 400/708
  Loss: 0.3983
  Accuracy so far: 0.8453


Epoch 3/3:  71%|███████   | 500/708 [07:10<02:59,  1.16it/s, loss=0.294]


  Batch 500/708
  Loss: 0.2935
  Accuracy so far: 0.8433


Epoch 3/3:  85%|████████▍ | 600/708 [08:37<01:33,  1.15it/s, loss=0.506]


  Batch 600/708
  Loss: 0.5055
  Accuracy so far: 0.8451


Epoch 3/3:  99%|█████████▉| 700/708 [10:03<00:06,  1.16it/s, loss=0.508]


  Batch 700/708
  Loss: 0.5084
  Accuracy so far: 0.8453


Epoch 3/3: 100%|██████████| 708/708 [10:09<00:00,  1.16it/s, loss=0.178]


EPOCH 3/3 COMPLETED
Average Loss:     0.5403
Epoch Accuracy:   0.8452 (84.52%)
Saving model and tokenizer...


('./bert-20newsgroups/tokenizer_config.json',
 './bert-20newsgroups/special_tokens_map.json',
 './bert-20newsgroups/vocab.txt',
 './bert-20newsgroups/added_tokens.json')

In [9]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [14]:
# Reload the saved model and tokenizer from the local checkpoint directory.
CHECKPOINT_DIR = "./bert-20newsgroups"
model = BertForSequenceClassification.from_pretrained(CHECKPOINT_DIR)
tokenizer = BertTokenizer.from_pretrained(CHECKPOINT_DIR)
model.to(device)
model.eval()  # evaluation mode for inference
print("Model loaded!\n")

# Tokenize the whole test split in a single call, returning PyTorch tensors.
print("Tokenizing test data...")
test_split = dataset['test']
test_texts = list(test_split['text'])
test_encodings = tokenizer(
    test_texts,
    truncation=True,
    padding=True,
    max_length=512,
    return_tensors='pt'
)

# Wrap the tensors so each batch yields (input_ids, attention_mask, labels).
test_labels = torch.tensor(test_split['label'])
test_dataset = TensorDataset(
    test_encodings['input_ids'],
    test_encodings['attention_mask'],
    test_labels,
)
# Fixed order (no shuffling) so predictions line up with `test_labels`.
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
print(f"Test set size: {len(test_labels)} samples\n")

# Predicted and true class ids collected over the whole test loader.
all_predictions = []
all_true_labels = []

# Gradients are not needed for inference.
with torch.no_grad():
    for batch_idx, (input_ids, attention_mask, labels) in enumerate(test_loader):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        # Forward pass
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        logits = outputs.logits

        # Get predicted labels (index of the largest logit per row)
        predictions = torch.argmax(logits, dim=1)
        all_predictions.extend(predictions.cpu().numpy())
        # `labels` is never moved to the device, so it converts directly.
        all_true_labels.extend(labels.numpy())

        if (batch_idx + 1) % 50 == 0:
            # Report the number of samples actually processed rather than
            # assuming a batch size, so the message stays correct if the
            # loader's batch_size changes.
            print(f"Processed {len(all_predictions)} / {len(test_labels)} samples")

# Convert the collected lists to arrays.
all_predictions = np.array(all_predictions)
all_true_labels = np.array(all_true_labels)



# Shared options for the per-class metrics: weighted averaging, and 0 for
# any undefined ratio.
weighted_avg = {'average': 'weighted', 'zero_division': 0}

# NOTE: accuracy_score is called without any weighting arguments; the
# "(weighted)" wording in its printed label is kept unchanged.
accuracy = accuracy_score(all_true_labels, all_predictions)
precision = precision_score(all_true_labels, all_predictions, **weighted_avg)
recall = recall_score(all_true_labels, all_predictions, **weighted_avg)
f1 = f1_score(all_true_labels, all_predictions, **weighted_avg)

print(f"Accuracy (weighted):  {accuracy:.4f}")
print(f"Precision (weighted): {precision:.4f}")
print(f"Recall (weighted):    {recall:.4f}")
print(f"F1-Score (weighted):  {f1:.4f}")
print()


# True labels are passed first, predictions second.
cm = confusion_matrix(all_true_labels, all_predictions)

# Text styling shared by the two axis labels.
axis_label_font = {'fontsize': 12, 'fontweight': 'bold'}

# Draw the matrix as an annotated heatmap with integer cell counts.
fig, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    ax=ax,
    cbar_kws={'label': 'Count'},
)
ax.set_xlabel('Predicted Label', **axis_label_font)
ax.set_ylabel('True Label', **axis_label_font)
ax.set_title('Confusion Matrix - 20 Newsgroups Test Set', fontsize=14, fontweight='bold')
fig.tight_layout()

# Write the figure to disk before displaying it.
fig.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
print("✓ Confusion matrix saved as 'confusion_matrix.png'\n")
plt.show()



def predict_text(text: str):
    """Classify a single piece of text with the currently loaded model.

    Args:
        text: Raw text to classify.

    Returns:
        A ``(label_name, confidence)`` tuple. ``label_name`` is the
        ``label_text`` value that the training split pairs with the
        predicted class id; ``confidence`` is the largest softmax
        probability over the model's logits.

    Raises:
        KeyError: If the predicted class id never occurs in the training
            split's ``label`` column.
    """
    encoding = tokenizer(
        text,
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors='pt'
    )

    # Move to device
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Make prediction (no gradients needed for inference)
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

    # Get class & confidence
    predicted_class = torch.argmax(logits, dim=1).item()
    confidence = torch.softmax(logits, dim=1).max().item()

    # Build the id -> name mapping from the dataset's own (label, label_text)
    # pairs. Indexing into a set of names is not reliable: set iteration
    # order is arbitrary and has no relation to the integer class ids.
    train_split = dataset['train']
    id_to_name = dict(zip(train_split['label'], train_split['label_text']))
    label_name = id_to_name[predicted_class]

    return label_name, confidence

Repo card metadata block was not found. Setting CardData to empty.


Label names: ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']

Example 1:
  Text: 'I love playing basketball and football!'
  Predicted: rec.sport.hockey
  Confidence: 33.10%

Example 2:
  Text: 'This new graphics card is amazing for gaming'
  Predicted: comp.sys.ibm.pc.hardware
  Confidence: 31.77%

Example 3:
  Text: 'Jesus Christ is my savior and I believe in God'
  Predicted: soc.religion.christian
  Confidence: 74.78%

Example 4:
  Text: 'The stock market crashed today, all investors lost money'
  Predicted: talk.politics.misc
  Confidence: 62.55%

Example 5:
  Text: 'Python is the best programming language for machine learning'
  Predicted: 

In [17]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Keyword arguments shared by precision, recall and F1: weighted averaging,
# and 0 for any undefined ratio.
averaging_opts = dict(average='weighted', zero_division=0)

# Recompute the metrics from the current `all_true_labels` / `all_predictions`.
# NOTE: accuracy_score takes no averaging argument here; its printed
# "(weighted)" label is left exactly as it was.
accuracy = accuracy_score(all_true_labels, all_predictions)
precision = precision_score(all_true_labels, all_predictions, **averaging_opts)
recall = recall_score(all_true_labels, all_predictions, **averaging_opts)
f1 = f1_score(all_true_labels, all_predictions, **averaging_opts)

print(f"Accuracy (weighted):  {accuracy:.4f}")
print(f"Precision (weighted): {precision:.4f}")
print(f"Recall (weighted):    {recall:.4f}")
print(f"F1-Score (weighted):  {f1:.4f}")

Accuracy (weighted):  0.7144
Precision (weighted): 0.7153
Recall (weighted):    0.7144
F1-Score (weighted):  0.7115
