## 20 newsgroups training
This notebook loads the vectorized 20 newsgroups dataset ([`sklearn.datasets.fetch_20newsgroups_vectorized`](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_20newsgroups_vectorized.html#sklearn.datasets.fetch_20newsgroups_vectorized)), preprocesses it, and trains a simple model.

In [1]:
import sys
import os
# Project root: the parent of the directory the notebook is run from
# (os.path.abspath('') resolves to the current working directory),
# with symbolic links resolved.
PROJ_DIR = os.path.realpath(os.path.dirname(os.path.abspath('')))

# Make the project's `src` directory importable. The membership check keeps
# the cell idempotent: re-running it does not add duplicate sys.path entries.
_SRC_DIR = os.path.join(PROJ_DIR, 'src')
if _SRC_DIR not in sys.path:
    sys.path.append(_SRC_DIR)
import xai_faithfulness_experiments_lib_edits as ff


In [2]:
import sklearn.datasets
# Fetch the train and test splits; return_X_y=True yields a
# (features, targets) pair for each split.
fetch_split = sklearn.datasets.fetch_20newsgroups_vectorized
x_train, y_train = fetch_split(subset='train', return_X_y=True)
x_test, y_test = fetch_split(subset='test', return_X_y=True)

# Turn both feature matrices into dense arrays via .toarray().
x_train, x_test = x_train.toarray(), x_test.toarray()

In [3]:
import numpy as np
# Save to assets: keep only the first 100 rows of every array so the stored
# copy stays small.
TRUNCATED_DATA_DIR = os.path.join(PROJ_DIR, 'assets', 'data')
# np.savez does not create missing parent directories; make sure the target
# directory exists so the cell also works on a fresh checkout.
os.makedirs(TRUNCATED_DATA_DIR, exist_ok=True)
# The file name is given without an extension; np.savez appends '.npz'.
np.savez(
    os.path.join(TRUNCATED_DATA_DIR, '20newsgroups-truncated'),
    x_train=x_train[:100],
    x_test=x_test[:100],
    y_train=y_train[:100],
    y_test=y_test[:100],
)

Train an MLP model

In [6]:
import torch
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
# Device type strings are lowercase: "CPU" is rejected by torch.device, so the
# fallback must be spelled "cpu".
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f'Using {device}')

import numpy as np

# --- Hyper-parameters -------------------------------------------------------
# Passed as the third argument to ff.MLPLarge; presumably the hidden-layer
# widths -- confirm against the library.
MODEL_NEURONS = [1000, 1000, 800, 500]
MODEL_EPOCHS = 100
MODEL_LR = 1.0e-2
# Number of distinct class labels present in the training targets.
MODEL_LABEL_NUM = np.unique(y_train).size
# Seed for PyTorch's random number generator.
MODEL_SEED = 0

# Seed before building the network so runs start from the same RNG state
# (GPU kernels may still introduce some run-to-run variation).
torch.manual_seed(MODEL_SEED)

# --- Model, loss and optimizer ----------------------------------------------
# The input size is the number of feature columns in x_train.
network = ff.MLPLarge(x_train.shape[1], MODEL_LABEL_NUM, MODEL_NEURONS).to(device)
loss = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(network.parameters(), lr=MODEL_LR)

# --- Tensors ----------------------------------------------------------------
# torch.from_numpy wraps the NumPy buffer without copying, so the only copy of
# the large dense feature matrix is the float32 conversion done by .float()
# (torch.tensor(...) would first duplicate the array in its original dtype).
x_train_tensor = torch.from_numpy(x_train).float().to(device)
x_test_tensor = torch.from_numpy(x_test).float().to(device)
# CrossEntropyLoss expects class-index targets as int64; .long() guarantees
# that regardless of the platform's default NumPy integer width.
y_train_tensor = torch.tensor(y_train).long().to(device)
y_test_tensor = torch.tensor(y_test).long().to(device)

from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class MyDataset(Dataset):
    """Pairs a collection of inputs with a collection of targets.

    Item ``idx`` of the dataset is the tuple ``(X_data[idx], y_data[idx])``;
    the dataset length is the length of ``X_data``.
    """

    def __init__(self, X_data, y_data):
        """Store the inputs as ``self.X`` and the targets as ``self.y``."""
        self.X, self.y = X_data, y_data

    def __len__(self):
        """Return the number of samples, taken from the inputs."""
        n_samples = len(self.X)
        return n_samples

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair stored at position ``idx``."""
        features = self.X[idx]
        target = self.y[idx]
        return features, target

# Optional mini-batch pipeline (currently disabled): wrap the training tensors
# in MyDataset and iterate over them with a shuffling DataLoader.
#train_dataset = MyDataset(x_train_tensor, y_train_tensor)

# Create a DataLoader
#batch_size = 32  # Set the batch size
#shuffle = True   # Shuffle the data every epoch
#data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle)

# Full-batch training: a one-element list whose single "batch" is the whole
# training set, so iterating over data_loader yields exactly one
# (inputs, targets) pair.
data_loader = [(x_train_tensor, y_train_tensor)]

# Training loop: one optimizer step per batch, followed by a progress report
# that includes the accuracy on the full test set.
for epoch in range(MODEL_EPOCHS):
    for x_batch, y_batch in data_loader:
        # The evaluation below switches the network to eval mode, so make sure
        # every optimization step runs in training mode again.
        network.train()
        optimizer.zero_grad()

        # Call the module itself rather than .forward() so registered hooks run.
        preds = network(x_batch)
        loss_value = loss(preds, y_batch)
        loss_value.backward()
        optimizer.step()

        # Training accuracy uses the predictions made before this step's update.
        train_accuracy = (preds.argmax(dim=-1) == y_batch).float().mean()

        # Evaluate in eval mode and without autograd: no graph is built for the
        # whole test set, and any train-only layers the model may contain
        # (e.g. dropout, batch norm) behave as they would at inference time.
        network.eval()
        with torch.no_grad():
            test_preds = network(x_test_tensor)
        test_accuracy = (test_preds.argmax(dim=-1) == y_test_tensor).float().mean()
        print(f'Epoch {epoch+1}/{MODEL_EPOCHS} - Loss: {loss_value.item():.4f} - Train accuracy: {train_accuracy:.4f} - Test accuracy: {test_accuracy:.4f}')

# Final evaluation on the test set. `test_accuracy` is left as a 0-dim tensor
# so later cells can still call `.item()` on it.
network.eval()
with torch.no_grad():
    test_preds = network(x_test_tensor)
test_accuracy = (test_preds.argmax(dim=1) == y_test_tensor).float().mean()
print(test_accuracy.item())

Using cuda:0
Epoch 1/100 - Loss: 2.9957 - Train accuracy: 0.0521 - Test accuracy: 0.0503
Epoch 2/100 - Loss: 2.9948 - Train accuracy: 0.0505 - Test accuracy: 0.0523
Epoch 3/100 - Loss: 2.9979 - Train accuracy: 0.0504 - Test accuracy: 0.0534
Epoch 4/100 - Loss: 2.9933 - Train accuracy: 0.0567 - Test accuracy: 0.0536
Epoch 5/100 - Loss: 2.9893 - Train accuracy: 0.0532 - Test accuracy: 0.0530
Epoch 6/100 - Loss: 3.0046 - Train accuracy: 0.0530 - Test accuracy: 0.0635
Epoch 7/100 - Loss: 2.9769 - Train accuracy: 0.0596 - Test accuracy: 0.0807
Epoch 8/100 - Loss: 2.9662 - Train accuracy: 0.0754 - Test accuracy: 0.0705
Epoch 9/100 - Loss: 2.9484 - Train accuracy: 0.0749 - Test accuracy: 0.0884
Epoch 10/100 - Loss: 2.9530 - Train accuracy: 0.0945 - Test accuracy: 0.1115
Epoch 11/100 - Loss: 2.9459 - Train accuracy: 0.1132 - Test accuracy: 0.1020
Epoch 12/100 - Loss: 2.9412 - Train accuracy: 0.1060 - Test accuracy: 0.0936
Epoch 13/100 - Loss: 2.9329 - Train accuracy: 0.1058 - Test accuracy: 0.

In [7]:
# Save model
import json

MODELS_PATH = os.path.join(PROJ_DIR,'assets','models')
# torch.save and open(..., 'w') do not create missing directories.
os.makedirs(MODELS_PATH, exist_ok=True)

# Persist the trained weights (state dict only).
torch.save(network.state_dict(), os.path.join(MODELS_PATH, '20newsgroups-mlp.pth'))

# Record the final test accuracy under the '20newsgroups' key, keeping any
# entries already stored in the JSON file. If the file does not exist yet,
# start from an empty mapping instead of failing.
ACCURACIES_PATH = os.path.join(MODELS_PATH, 'model-accuracies.json')
if os.path.exists(ACCURACIES_PATH):
    with open(ACCURACIES_PATH) as fIn:
        models = json.load(fIn)
else:
    models = {}
models['20newsgroups'] = test_accuracy.item()
with open(ACCURACIES_PATH, 'w') as fOut:
    json.dump(models, fOut)