In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
import numpy as np
import gzip
import urllib.request
from tqdm import tqdm
import re

In [2]:
# Check CUDA availability
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")
if cuda_available:
    # Querying the device name is only valid when a CUDA device exists;
    # calling it unconditionally raises on CPU-only machines.
    print(f"Current device: {torch.cuda.get_device_name(0)}")
else:
    print("Current device: CPU")

# Device that the model and every batch are moved to in later cells.
device = torch.device("cuda" if cuda_available else "cpu")

CUDA available: True
Current device: NVIDIA GeForce RTX 3060


In [3]:
def load_word_vectors():
    """Download (once) and parse the GoogleNews word2vec binary archive.

    The gzip archive is stored in the current working directory and reused
    on later calls, so a kernel restart does not trigger a new download.
    The download goes to a temporary ``.part`` file that is renamed only on
    success, so an interrupted transfer is never mistaken for a complete
    archive.

    The parsed layout is: one text header line ``"<vocab_size> <vector_size>"``
    followed, for every entry, by the word bytes, a single space, and
    ``vector_size`` raw float32 values.

    Returns:
        dict: maps each word consisting only of the letters ``a``-``z`` to
        its float32 ``numpy.ndarray``; every other entry is skipped. The
        arrays are created with ``np.frombuffer`` over immutable bytes and
        are therefore read-only.

    Raises:
        EOFError: if the file ends before ``vocab_size`` entries were read.
    """
    import os  # local import: only this function needs filesystem helpers

    filename = "GoogleNews-vectors-negative300.bin.gz"
    url = f"https://github.com/aburkov/theLLMbook/releases/download/v1.0.0/{filename}"

    # Skip the download when a previous run already fetched the archive.
    if not os.path.exists(filename):
        partial_path = filename + ".part"
        with tqdm(unit='B', unit_scale=True, unit_divisor=1024, miniters=1, desc=filename) as progress_bar:
            def report_hook(count, block_size, total_size):
                if total_size != -1:
                    progress_bar.total = total_size
                # `count` is the number of blocks transferred so far (0 on the
                # first call), so advance the bar by the difference instead of
                # adding a full block on every call.
                progress_bar.update(count * block_size - progress_bar.n)

            urllib.request.urlretrieve(url, partial_path, reporthook=report_hook)
        os.replace(partial_path, filename)

    lowercase_word = re.compile(r"^[a-z]+$")
    vectors = {}

    with gzip.open(filename, 'rb') as f:
        header = f.readline()
        vocab_size, vector_size = map(int, header.split())
        binary_len = np.dtype('float32').itemsize * vector_size

        with tqdm(total=vocab_size, desc="Loading word vectors") as pbar:
            for _ in range(vocab_size):
                # Collect the word one byte at a time up to the separating space.
                word_bytes = []
                while True:
                    ch = f.read(1)
                    if not ch:
                        # Without this check a truncated file would loop forever.
                        raise EOFError(f"Unexpected end of file while reading a word from {filename}")
                    if ch == b' ':
                        break
                    # Newlines left over between entries are not part of the word.
                    if ch != b'\n':
                        word_bytes.append(ch)
                word = b''.join(word_bytes).decode('utf-8')

                raw_vector = f.read(binary_len)
                if len(raw_vector) != binary_len:
                    raise EOFError(f"Unexpected end of file while reading the vector for {word!r}")
                vector = np.frombuffer(raw_vector, dtype='float32')

                # Keep only purely lowercase alphabetic entries.
                if lowercase_word.search(word):
                    vectors[word] = vector
                pbar.update(1)

    return vectors

In [4]:
def embed_text(text, word_vectors, max_length=5000, embedding_dim=300):
    """Turn a text into a fixed-size matrix of word embeddings.

    The text is lowercased and split on whitespace; only the first
    ``max_length`` tokens are used. Tokens missing from ``word_vectors`` and
    the positions after the last token are left as zero rows.

    Args:
        text: input string.
        word_vectors: mapping from token to a 1-D vector of length
            ``embedding_dim``.
        max_length: number of rows in the result (truncate / zero-pad).
        embedding_dim: width of each embedding vector (default 300).

    Returns:
        numpy.ndarray of shape ``(max_length, embedding_dim)`` and dtype
        float32. The dtype is now always float32; previously it was float64
        whenever any zero row was mixed in.
    """
    tokens = text.lower().split()[:max_length]
    # Preallocate once instead of building one zero array per missing token.
    embedded = np.zeros((max_length, embedding_dim), dtype=np.float32)
    for position, token in enumerate(tokens):
        vector = word_vectors.get(token)
        if vector is not None:
            embedded[position] = vector
    return embedded

In [5]:
class NewsGroupDataset(Dataset):
    """Dataset that embeds raw texts lazily, one item at a time.

    Args:
        texts: indexable collection of strings.
        labels: indexable collection of integer class ids, aligned with
            ``texts``.
        word_vectors: mapping from token to embedding vector, forwarded to
            ``embed_text``.
        max_len: number of tokens each text is truncated / zero-padded to.
    """

    def __init__(self, texts, labels, word_vectors, max_len):
        self.texts = texts
        self.labels = labels
        self.word_vectors = word_vectors
        # Store the length on the instance so __getitem__ does not depend on
        # a notebook-level global of the same name.
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(embeddings, label)`` as float32 and long tensors."""
        embeddings = embed_text(self.texts[idx], self.word_vectors, self.max_len)
        return torch.tensor(embeddings, dtype=torch.float32), torch.tensor(self.labels[idx], dtype=torch.long)


In [6]:
class TextCNN(nn.Module):
    """Convolutional text classifier over sequences of word embeddings.

    Three Conv1d -> ReLU -> MaxPool1d -> Dropout stages are followed by one
    linear layer. Every pooling stage (kernel 2, stride 2) halves the
    sequence length, rounding down, which is why the linear layer expects
    ``64 * (max_len // 8)`` input features.

    Args:
        embedding_dim: number of input channels of the first convolution.
        num_classes: number of output logits.
        max_len: sequence length of the inputs the model is built for.
    """

    def __init__(self, embedding_dim, num_classes, max_len):
        super().__init__()
        conv_kwargs = dict(kernel_size=3, padding=1)  # keeps the sequence length
        pool_kwargs = dict(kernel_size=2, stride=2)   # halves the sequence length

        self.conv1 = nn.Conv1d(embedding_dim, 512, **conv_kwargs)
        self.pool1 = nn.MaxPool1d(**pool_kwargs)
        self.conv2 = nn.Conv1d(512, 512, **conv_kwargs)
        self.pool2 = nn.MaxPool1d(**pool_kwargs)
        self.conv3 = nn.Conv1d(512, 64, **conv_kwargs)
        self.pool3 = nn.MaxPool1d(**pool_kwargs)

        flattened_features = 64 * (max_len // 8)
        self.fc = nn.Linear(flattened_features, num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Map a 3-D batch (embedding axis last) to class logits."""
        # Conv1d wants channels on axis 1, so swap the last two axes.
        features = x.permute(0, 2, 1)
        stages = (
            (self.conv1, self.pool1),
            (self.conv2, self.pool2),
            (self.conv3, self.pool3),
        )
        for conv, pool in stages:
            features = self.dropout(pool(self.relu(conv(features))))
        # Collapse channels and positions into one feature axis per sample.
        return self.fc(features.flatten(start_dim=1))

In [7]:
# Pretrained word embeddings (downloaded and parsed by load_word_vectors).
word_vectors = load_word_vectors()

# 20 Newsgroups posts with headers, footers and quoted replies stripped.
newsgroups = fetch_20newsgroups(remove=("headers", "footers", "quotes"))
X, y = newsgroups.data, newsgroups.target

# Hold out 20% of the posts for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)


GoogleNews-vectors-negative300.bin.gz: 1.53GB [00:21, 75.3MB/s]                               
Loading word vectors: 100%|██████████| 3000000/3000000 [00:28<00:00, 104292.78it/s]


In [8]:
# Create datasets and dataloaders
max_len = 500  # tokens kept per document; also sizes the model's linear layer
batch_size = 32

train_dataset = NewsGroupDataset(
    texts=X_train, labels=y_train, word_vectors=word_vectors, max_len=max_len
)
test_dataset = NewsGroupDataset(
    texts=X_test, labels=y_test, word_vectors=word_vectors, max_len=max_len
)

# Only the training batches are shuffled; evaluation order stays fixed.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size)

In [9]:
# Initialize the model, loss function, and optimizer
# Seed PyTorch's random number generators first so weight initialisation,
# batch shuffling and dropout start from the same state on every
# Restart & Run All (same seed as the train/test split).
# NOTE(review): some GPU kernels can still be nondeterministic — results may
# not match bit-for-bit across runs or hardware.
torch.manual_seed(42)

# 300-dimensional embeddings in, 20 newsgroup classes out.
model = TextCNN(300, 20, max_len)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())  # Adam with default hyperparameters


In [10]:
# Training loop
num_epochs = 20
for epoch in range(num_epochs):
    # ---- optimisation pass over the training set ----
    model.train()
    train_correct, train_total = 0, 0
    for batch_embeddings, batch_labels in train_loader:
        batch_embeddings = batch_embeddings.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        outputs = model(batch_embeddings)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()

        # The predicted class is the index of the largest logit in each row.
        predicted = outputs.argmax(dim=1)
        train_total += batch_labels.size(0)
        train_correct += (predicted == batch_labels).sum().item()

    train_accuracy = 100 * train_correct / train_total

    # ---- evaluation pass over the held-out set (no gradients, dropout off) ----
    model.eval()
    test_correct, test_total = 0, 0
    with torch.no_grad():
        for batch_embeddings, batch_labels in test_loader:
            batch_embeddings = batch_embeddings.to(device)
            batch_labels = batch_labels.to(device)

            outputs = model(batch_embeddings)
            predicted = outputs.argmax(dim=1)
            test_total += batch_labels.size(0)
            test_correct += (predicted == batch_labels).sum().item()

    test_accuracy = 100 * test_correct / test_total
    print(f"Epoch [{epoch+1}/{num_epochs}], Train Accuracy: {train_accuracy:.2f}%, Test Accuracy: {test_accuracy:.2f}%")

print("Training completed!")

Epoch [1/20], Train Accuracy: 11.58%, Test Accuracy: 16.17%
Epoch [2/20], Train Accuracy: 25.94%, Test Accuracy: 32.52%
Epoch [3/20], Train Accuracy: 38.61%, Test Accuracy: 40.21%
Epoch [4/20], Train Accuracy: 44.93%, Test Accuracy: 46.00%
Epoch [5/20], Train Accuracy: 50.72%, Test Accuracy: 50.68%
Epoch [6/20], Train Accuracy: 57.28%, Test Accuracy: 52.45%
Epoch [7/20], Train Accuracy: 60.47%, Test Accuracy: 55.02%
Epoch [8/20], Train Accuracy: 63.57%, Test Accuracy: 56.61%
Epoch [9/20], Train Accuracy: 66.77%, Test Accuracy: 56.16%
Epoch [10/20], Train Accuracy: 69.02%, Test Accuracy: 56.56%
Epoch [11/20], Train Accuracy: 70.69%, Test Accuracy: 56.39%
Epoch [12/20], Train Accuracy: 73.34%, Test Accuracy: 57.71%
Epoch [13/20], Train Accuracy: 73.93%, Test Accuracy: 57.98%
Epoch [14/20], Train Accuracy: 75.86%, Test Accuracy: 58.37%
Epoch [15/20], Train Accuracy: 76.63%, Test Accuracy: 57.58%
Epoch [16/20], Train Accuracy: 77.72%, Test Accuracy: 57.80%
Epoch [17/20], Train Accuracy: 79