In [1]:
!pip -q install datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import numpy as np

# Load AG News dataset
dataset = load_dataset('ag_news')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/8.07k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

In [3]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})


In [4]:
train_df = dataset['train'].to_pandas()
test_df = dataset['test'].to_pandas()

In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    120000 non-null  object
 1   label   120000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.8+ MB


In [6]:
# Convert text data to TF-IDF vectors
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(train_df['text']).toarray()
X_test_tfidf = vectorizer.transform(test_df['text']).toarray()

In [7]:
# Convert numpy arrays to PyTorch tensors
X_train_tensor = torch.tensor(X_train_tfidf, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_tfidf, dtype=torch.float32)
y_train_tensor = torch.tensor(train_df['label'], dtype=torch.long)
y_test_tensor = torch.tensor(test_df['label'], dtype=torch.long)

In [8]:
# Define a simple neural network classifier
class TextClassifier(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(TextClassifier, self).__init__()
        self.fc = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        x = self.fc(x)
        return x

In [9]:
# Initialize model, loss function, and optimizer
input_dim = X_train_tensor.shape[1]

output_dim = len(np.unique(y_train_tensor))

model = TextClassifier(input_dim, output_dim)

criterion = nn.CrossEntropyLoss()

optimizer = optim.Adam(model.parameters(), lr=0.001)

In [10]:
# Train the model
epochs = 10

batch_size = 64

# Iterate over each epoch
for epoch in range(epochs):
    # Iterate over the training data in batches
    for i in range(0, len(X_train_tensor), batch_size):
        # Get the current batch of input data and labels
        inputs = X_train_tensor[i:i+batch_size]
        labels = y_train_tensor[i:i+batch_size]

        # Clear the gradients of all optimized variables
        optimizer.zero_grad()

        # Forward pass: compute predicted outputs by passing inputs to the model
        outputs = model(inputs)

        # Calculate the batch loss
        loss = criterion(outputs, labels)

        # Backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()

        # Perform a single optimization step (parameter update)
        optimizer.step()

    # Print training accuracy every epoch
    with torch.no_grad():
        # Compute predicted outputs on the entire training dataset
        outputs = model(X_train_tensor)

        # Get the predicted classes
        _, predicted = torch.max(outputs, 1)

        # Calculate training accuracy
        train_acc = accuracy_score(y_train_tensor, predicted)

        # Print the training accuracy for the current epoch
        print(f"Epoch {epoch+1}/{epochs}, Training Accuracy: {train_acc:.4f}")

# Evaluate the model on test data
with torch.no_grad():
    # Compute predicted outputs on the test dataset
    outputs = model(X_test_tensor)

    # Get the predicted classes
    _, predicted = torch.max(outputs, 1)

    # Calculate test accuracy
    test_acc = accuracy_score(y_test_tensor, predicted)

    # Print the test accuracy
    print(f"Test Accuracy: {test_acc:.4f}")

Epoch 1/10, Training Accuracy: 0.8906
Epoch 2/10, Training Accuracy: 0.8992
Epoch 3/10, Training Accuracy: 0.9051
Epoch 4/10, Training Accuracy: 0.9091
Epoch 5/10, Training Accuracy: 0.9120
Epoch 6/10, Training Accuracy: 0.9147
Epoch 7/10, Training Accuracy: 0.9169
Epoch 8/10, Training Accuracy: 0.9181
Epoch 9/10, Training Accuracy: 0.9192
Epoch 10/10, Training Accuracy: 0.9204
Test Accuracy: 0.9067
