## 20 newsgroups training
This notebook loads the [20 newsgroups dataset](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_20newsgroups_vectorized.html#sklearn.datasets.fetch_20newsgroups_vectorized), preprocesses it, and trains a simple model.

In [None]:
import sys
import os

# Resolve the project root: two directory levels above the directory the
# notebook is run from (os.path.abspath('') expands to the current working directory).
_notebook_dir = os.path.abspath('')
_parent_dir = os.path.dirname(_notebook_dir)
PROJ_DIR = os.path.realpath(os.path.dirname(_parent_dir))

# Put <project root>/src on the import path so that gce_lib
# (presumably located there) can be imported below.
_src_dir = os.path.join(PROJ_DIR, 'src')
sys.path.append(_src_dir)
import gce_lib as ff

In [None]:
import sklearn.datasets


def _fetch_dense_split(subset):
    """Fetch one split of the vectorized 20 newsgroups data as (dense features, labels)."""
    features, labels = sklearn.datasets.fetch_20newsgroups_vectorized(subset=subset, return_X_y=True)
    # The features come back as a sparse matrix; the rest of the notebook uses dense arrays.
    return features.toarray(), labels


x_train, y_train = _fetch_dense_split('train')
x_test, y_test = _fetch_dense_split('test')

In [None]:
import numpy as np

# Save to assets (there's no need to save the whole thing):
# only the first 100 rows of every array are written.
DATA_PATH = os.path.join(PROJ_DIR, 'assets', 'data')
# np.savez does not create missing parent directories, so make sure the target exists.
os.makedirs(DATA_PATH, exist_ok=True)
# np.savez appends the ".npz" extension to the given file name.
np.savez(
    os.path.join(DATA_PATH, '20newsgroups'),
    x_train=x_train[:100],
    x_test=x_test[:100],
    y_train=y_train[:100],
    y_test=y_test[:100],
)

Train an MLP model

In [None]:
import torch
# Use the first GPU when available, otherwise the CPU.
# torch.device only accepts lowercase device types: "CPU" raises a RuntimeError.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f'Using {device}')

import numpy as np

# Training configuration.
MODEL_NEURONS = [1000, 1000, 800, 500]  # passed to ff.MLPLarge (presumably the hidden-layer widths)
MODEL_EPOCHS = 100
MODEL_LR = 1.0e-2
MODEL_LABEL_NUM = np.unique(y_train).size  # number of distinct labels in the training set
MODEL_SEED = 0

# Seed PyTorch's random number generator before the network is built so that
# the weight initialisation is repeatable between runs.
torch.manual_seed(MODEL_SEED)

network = ff.MLPLarge(x_train.shape[1], MODEL_LABEL_NUM, MODEL_NEURONS).to(device)
loss = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(network.parameters(), lr=MODEL_LR)


# Move the full train/test splits to the selected device.
# Features are cast to float32; labels are cast to int64 because
# CrossEntropyLoss expects class indices of that type (a no-op if they already are).
x_train_tensor = torch.tensor(x_train).float().to(device)
y_train_tensor = torch.tensor(y_train).long().to(device)
x_test_tensor = torch.tensor(x_test).float().to(device)
y_test_tensor = torch.tensor(y_test).long().to(device)

from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class MyDataset(Dataset):
    """Pairs a feature container with a label container for index-based access."""

    def __init__(self, X_data, y_data):
        """Keep references to the features ``X_data`` and the labels ``y_data``."""
        self.X, self.y = X_data, y_data

    def __len__(self):
        """Return the number of samples, taken from the length of the features."""
        n_samples = len(self.X)
        return n_samples

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        sample = self.X[idx]
        target = self.y[idx]
        return sample, target

# Create an instance of your custom dataset
#train_dataset = MyDataset(x_train_tensor, y_train_tensor)

# Create a DataLoader
#batch_size = 32  # Set the batch size
#shuffle = True   # Shuffle the data every epoch
#data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle)

# Full-batch training: the "loader" yields the whole training set as a single batch.
data_loader = [(x_train_tensor, y_train_tensor)]

for epoch in range(MODEL_EPOCHS):
    for x_batch, y_batch in data_loader:
        # Evaluation below switches to eval mode, so re-enable training mode for each step.
        network.train()
        optimizer.zero_grad()

        preds = network(x_batch)
        loss_value = loss(preds, y_batch)
        loss_value.backward()
        optimizer.step()

        # Training accuracy is computed from the predictions made before the optimizer step.
        train_accuracy = (preds.argmax(dim=-1) == y_batch).float().mean()

        # Score the test set in eval mode and without autograd: no graph is built
        # for the whole test set, and layers that behave differently during
        # training (if the model has any) use their inference behaviour.
        network.eval()
        with torch.no_grad():
            test_preds = network(x_test_tensor)
        test_accuracy = (test_preds.argmax(dim=-1) == y_test_tensor).float().mean()
        print(f'Epoch {epoch+1}/{MODEL_EPOCHS} - Loss: {loss_value.item():.4f} - Train accuracy: {train_accuracy:.4f} - Test accuracy: {test_accuracy:.4f}')

# Final evaluation of the trained network; test_accuracy is kept as a 0-dim
# tensor because the saving cell reads it with .item().
network.eval()
with torch.no_grad():
    test_preds = network(x_test_tensor)
test_accuracy = (test_preds.argmax(dim=1) == y_test_tensor).float().mean()
print(test_accuracy.item())

In [None]:
import json

# Save model
MODELS_PATH = os.path.join(PROJ_DIR,'assets','models')
# Neither torch.save nor open() creates missing directories.
os.makedirs(MODELS_PATH, exist_ok=True)
torch.save(network.state_dict(), os.path.join(MODELS_PATH, '20newsgroups-mlp.pth'))

# Record this model's test accuracy in the shared JSON file, keeping the
# entries already stored there. Start from an empty mapping when the file
# does not exist yet instead of failing with FileNotFoundError.
ACCURACIES_PATH = os.path.join(MODELS_PATH, 'model-accuracies.json')
if os.path.exists(ACCURACIES_PATH):
    with open(ACCURACIES_PATH) as fIn:
        models = json.load(fIn)
else:
    models = {}
models['20newsgroups'] = test_accuracy.item()
with open(ACCURACIES_PATH, 'w') as fOut:
    json.dump(models, fOut)