# Text classification approaches on news-like data.

This script demonstrates:
1) Bag-of-words (document-term matrix) + Logistic Regression
2) Sentence embeddings (MiniLM) + Logistic Regression
3) Sentence embeddings + PyTorch dense neural net

In [None]:
import os
import random
import numpy as np
import pandas as pd

# Classic ML approach
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# For embeddings
from sentence_transformers import SentenceTransformer

# For PyTorch neural net
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

In [None]:
# Reproducibility: a single shared seed for every random number generator
# this notebook touches.
RANDOM_SEED = 42

# Python's stdlib RNG and NumPy's global RNG.
for _seed_fn in (random.seed, np.random.seed):
    _seed_fn(RANDOM_SEED)
# PyTorch's default generator.
torch.manual_seed(RANDOM_SEED)

### Load and prepare data

* Load a subset of 20 Newsgroups as a stand-in for 'news' articles.
* We pick a few categories to make it multi-class.

In [None]:
# Four newsgroups from different top-level hierarchies (rec, sci, comp, talk).
# This list is passed to fetch_20newsgroups to restrict which posts are loaded.
categories = [
    "rec.autos",
    "sci.space",
    "comp.graphics",
    "talk.politics.misc",
]

In [None]:
# Fetch posts for the selected categories only. subset="all" requests the
# whole collection; our own train/test split is made later with
# train_test_split.
dataset = fetch_20newsgroups(
    subset="all",
    categories=categories,
    # Strip headers, footers and quoted replies from every post
    # (presumably so models learn from body text, not metadata -- confirm).
    remove=("headers", "footers", "quotes")
)

In [None]:
# Print the description text bundled with the fetched dataset.
print(dataset.DESCR)

In [None]:
# Pull the raw documents, their class ids, and the class-name lookup out of
# the fetched dataset object in one step.
texts, labels, target_names = (
    dataset.data,
    dataset.target,
    dataset.target_names,
)

In [None]:
# Peek at the first raw document.
print(texts[0])

In [None]:
# Show the first document's label id next to its class name. The id itself
# must be used to index target_names; target_names[0] would always print the
# first class name regardless of the document's actual label.
print(labels[0], ':', target_names[labels[0]])

Our first approach is Logistic Regression, built with the classic scikit-learn workflow.

In [None]:
# Train/test split: hold out 20% of the documents for evaluation.
# stratify=labels asks for the class mix to be preserved in both parts, and
# the fixed random_state makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    texts, 
    labels, 
    test_size=0.2, 
    random_state=RANDOM_SEED, 
    stratify=labels
)

In [None]:
# Dataset summary: a header with the document and class counts, followed by
# one "id: name" line per class, emitted with a single print call.
summary_header = f"Loaded {len(texts)} documents, {len(target_names)} classes:"
class_lines = [f"  {i}: {name}" for i, name in enumerate(target_names)]
print(summary_header, *class_lines, sep="\n")

### Bag-of-words (document-term matrix) + Logistic Regression

In [None]:
# Bag-of-words featurizer: keep at most 20,000 vocabulary terms and drop
# scikit-learn's built-in English stop words.
vectorizer = CountVectorizer(
    max_features=20000,
    stop_words="english"
)

In [None]:
# Learn the vocabulary from the training documents only, then reuse that same
# fitted vocabulary to encode the test documents.
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

In [None]:
# Preview the document-term matrix as a table (rows = training documents,
# columns = vocabulary terms). The frame is built straight from the sparse
# matrix: calling .toarray() first would allocate a dense
# n_documents x n_terms array that is almost entirely zeros.
pd.DataFrame.sparse.from_spmatrix(
    X_train_bow,
    columns=vectorizer.get_feature_names_out(),
)

In [None]:
# Classifier for the bag-of-words features. max_iter=1000 sets the solver's
# iteration cap and n_jobs=-1 requests all available CPU cores.
clf = LogisticRegression(
    max_iter=1000,
    n_jobs=-1
)

In [None]:
# Fit the classifier on the bag-of-words training matrix.
clf.fit(X_train_bow, y_train)

In [None]:
# Predict labels for the held-out test documents.
y_pred = clf.predict(X_test_bow)

In [None]:
# Accuracy on the test split for the bag-of-words model.
acc = accuracy_score(y_test, y_pred)
print(f"Test accuracy (BoW + LogReg): {acc:.4f}")

In [None]:
# Confusion matrix for the bag-of-words model (true labels passed first,
# predictions second).
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

In [None]:
# Recall for class 0: correctly predicted class-0 documents divided by all
# documents whose true class is 0 (row 0 of the confusion matrix).
# Computed from conf_matrix instead of hand-copied counts so the value stays
# correct whenever the cells above are re-run.
conf_matrix[0, 0] / conf_matrix[0, :].sum()

In [None]:
# Precision for class 0: correctly predicted class-0 documents divided by all
# documents predicted as class 0 (column 0 of the confusion matrix).
# Computed from conf_matrix instead of hand-copied counts so the value stays
# correct whenever the cells above are re-run.
conf_matrix[0, 0] / conf_matrix[:, 0].sum()

In [None]:
# Per-class metrics report for the bag-of-words model.
clf_report = classification_report(y_test, y_pred)
print(clf_report)

### Sentence Embeddings + Logistic Regression

In [None]:
# Small sentence-transformer model, loaded by name; used below to turn each
# document into a dense vector.
model_name = "all-MiniLM-L6-v2"
embedder = SentenceTransformer(model_name)

In [None]:
# Compute dense embeddings for both splits using identical encoder settings.
encode_options = {"batch_size": 32, "show_progress_bar": True}

print("Encoding training set...")
X_train_emb = embedder.encode(X_train, **encode_options)

print("Encoding test set...")
X_test_emb = embedder.encode(X_test, **encode_options)

In [None]:
# Compare feature-matrix shapes: bag-of-words vs. dense embeddings.
X_train_bow.shape, X_train_emb.shape

In [None]:
# Fresh LogisticRegression with the same settings as before; rebinding `clf`
# replaces the bag-of-words model.
clf = LogisticRegression(
    max_iter=1000,
    n_jobs=-1
)

In [None]:
# Fit the classifier on the training-set embeddings.
clf.fit(X_train_emb, y_train)

In [None]:
# Predict labels for the test-set embeddings (overwrites the earlier y_pred).
y_pred = clf.predict(X_test_emb)

In [None]:
# Accuracy on the test split for the embeddings + logistic regression model.
acc = accuracy_score(y_test, y_pred)
print(f"Test accuracy (Embeddings + LogReg): {acc:.4f}")

In [None]:
# Confusion matrix for the embeddings + logistic regression model
# (true labels passed first, predictions second).
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

In [None]:
# Per-class metrics report for the embeddings + logistic regression model.
clf_report = classification_report(y_test, y_pred)
print(clf_report)

### Embeddings + PyTorch Dense Neural Net

In [None]:
class EmbeddingDataset(Dataset):
    """In-memory dataset pairing precomputed embedding vectors with labels.

    Args:
        X_emb: Array-like of embeddings, one row per example. Converted to a
            ``torch.float32`` tensor.
        y: Array-like of integer class ids, one per example. Converted to a
            ``torch.long`` tensor.

    Raises:
        ValueError: If ``X_emb`` and ``y`` do not hold the same number of
            examples.
    """

    def __init__(self, X_emb, y):
        self.X = torch.tensor(X_emb, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)

        # __len__ is driven by y alone, so a length mismatch would otherwise
        # only surface as an IndexError mid-epoch (X shorter) or as silently
        # ignored rows (X longer). Fail fast instead.
        if self.X.shape[:1] != self.y.shape[:1]:
            raise ValueError(
                "X_emb and y must contain the same number of examples, "
                f"got shapes {tuple(self.X.shape)} and {tuple(self.y.shape)}"
            )

    def __len__(self):
        """Return the number of examples."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(embedding, label)`` pair at position ``idx``."""
        return self.X[idx], self.y[idx]

In [None]:
class MLPClassifier(nn.Module):
    """Feed-forward classifier with a single hidden layer.

    Layer stack: Linear(input_dim -> hidden_dim), ReLU, Dropout(dropout),
    Linear(hidden_dim -> num_classes). ``forward`` returns the output of the
    final linear layer as-is; no softmax is applied inside the module.
    """

    def __init__(self, input_dim, num_classes, hidden_dim=256, dropout=0.2):
        super().__init__()
        # The hidden layer is created before the output layer so parameter
        # initialisation draws from the RNG in the same order as the stack.
        hidden_layer = nn.Linear(input_dim, hidden_dim)
        output_layer = nn.Linear(hidden_dim, num_classes)
        layers = [hidden_layer, nn.ReLU(), nn.Dropout(dropout), output_layer]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the class scores."""
        scores = self.net(x)
        return scores

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# Inspect the container type returned by embedder.encode().
type(X_train_emb)

In [None]:
# Wrap each split's embeddings and labels in an EmbeddingDataset.
train_dataset = EmbeddingDataset(X_train_emb, y_train)
test_dataset = EmbeddingDataset(X_test_emb, y_test)

In [None]:
# Sanity-check the tensor shapes of the training features and labels.
train_dataset.X.shape, train_dataset.y.shape

In [None]:
# Peek at the first 10 embedding values of the first training example,
# together with its label.
train_dataset.X[0][0:10], train_dataset.y[0]

In [None]:
# Mini-batch loaders: training batches of 32 with shuffling enabled,
# test batches of 64 with shuffling disabled.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [None]:
# Size of the second axis of the training embeddings (used as input_dim below).
X_train_emb.shape[1]

In [None]:
# The network's input size is taken from the second axis of the embeddings.
input_dim = X_train_emb.shape[1]

In [None]:
# Number of class names in the dataset.
len(target_names)

In [None]:
# The network's output size: one score per class name.
num_classes = len(target_names)

In [None]:
# Build the MLP with its default hidden size and dropout, then move it to the
# selected device.
model = MLPClassifier(input_dim=input_dim, 
                      num_classes=num_classes).to(device)

In [None]:
# Cross-entropy loss applied to the model's outputs, and an Adam optimizer
# over all model parameters with learning rate 1e-3.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [None]:
# Number of passes over the training loader in the loop below.
num_epochs = 5

In [None]:
# Training loop -- the hand-written counterpart of scikit-learn's "fit".
for epoch in range(num_epochs):
    model.train()  # training mode, so the dropout layer is active
    running_loss = 0.0

    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        # One optimisation step: clear old gradients, forward pass,
        # backward pass, parameter update.
        optimizer.zero_grad()
        loss = criterion(model(X_batch), y_batch)
        loss.backward()
        optimizer.step()

        # Weight the batch loss by the batch size so the epoch figure below
        # is an average per example rather than per batch.
        batch_size = X_batch.size(0)
        running_loss += loss.item() * batch_size

    avg_loss = running_loss / len(train_dataset)
    print(f"Epoch {epoch + 1}/{num_epochs}, train loss: {avg_loss:.4f}")

In [None]:
# Evaluation: switch to eval mode and collect predictions and true labels
# for the whole test loader, with gradient tracking turned off.
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for features, targets in test_loader:
        batch_logits = model(features.to(device))
        # Predicted class = index of the largest score; moved back to the CPU
        # before converting to NumPy.
        batch_preds = batch_logits.argmax(dim=-1).cpu().numpy()
        all_preds.extend(batch_preds)
        all_labels.extend(targets.numpy())

In [None]:
# Accuracy on the test split for the embeddings + MLP model.
acc = accuracy_score(all_labels, all_preds)
print(f"Test accuracy (Embeddings + MLP): {acc:.4f}")

In [None]:
# Confusion matrix for the embeddings + MLP model (true labels passed first,
# predictions second).
conf_matrix = confusion_matrix(all_labels, all_preds)
print(conf_matrix)

In [None]:
# Per-class metrics report for the embeddings + MLP model.
clf_report = classification_report(all_labels, all_preds)
print(clf_report)