## 20 newsgroups training
This notebook loads the [20 newsgroups dataset](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_20newsgroups_vectorized.html#sklearn.datasets.fetch_20newsgroups_vectorized), preprocesses it, and trains a simple model.

In [None]:
import sys
import os
# os.path.abspath('') evaluates to the current working directory; its parent
# is taken as the project root (assumes the kernel is started from a direct
# sub-directory of the project - TODO confirm). <root>/src is then appended to
# the import path so modules stored there can be imported.
PROJ_DIR = os.path.realpath(os.path.split(os.path.abspath(''))[0])
sys.path.append(os.path.join(PROJ_DIR, 'src'))



In [None]:
import sklearn.datasets
x_train, y_train = sklearn.datasets.fetch_20newsgroups_vectorized(return_X_y=True)
x_test, y_test = sklearn.datasets.fetch_20newsgroups_vectorized(return_X_y=True)

Train a MLP model

In [None]:
import torch
import numpy as np

MODEL_NEURONS = 1000
MODEL_EPOCHS= 2000
MODEL_LR = 1.0e-2
MODEL_LABEL_NUM = np.unique(y_train).size

class MLP(torch.nn.Module):
    def __init__(self, n_neurons):
        super(MLP, self).__init__()
        self.fc1 = torch.nn.Linear(x_train.shape[1], n_neurons)
        self.ac1 = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(n_neurons, 500)
        self.ac2 = torch.nn.ReLU()
        self.fc3 = torch.nn.Linear(500, 100)
        self.ac3 = torch.nn.ReLU()
        self.fc4 = torch.nn.Linear(100, MODEL_LABEL_NUM)
        #self.ac4 = torch.nn.Softmax()
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.ac1(x)
        x = self.fc2(x)
        x = self.ac2(x)
        x = self.fc3(x)
        x = self.ac3(x)
        logits = self.fc4(x)
        x = logits#self.ac4(logits)
        return x

network = MLP(MODEL_NEURONS)
loss = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(network.parameters(), lr=MODEL_LR)


x_train_tensor = torch.tensor(x_train.toarray()).float()
y_train_tensor = torch.tensor(y_train)
x_test_tensor = torch.tensor(x_test.toarray()).float()
y_test_tensor = torch.tensor(y_test)

for epoch in range(MODEL_EPOCHS):
    optimizer.zero_grad()
    
    preds = network(x_train_tensor)
    loss_value = loss(preds, y_train_tensor)
    loss_value.backward()        
    optimizer.step()

    train_accuracy = (preds.argmax(dim=-1) == y_train_tensor).float().mean()
    test_preds = network.forward(x_test_tensor)        
    test_accuracy = (test_preds.argmax(dim=-1) == y_train_tensor).float().mean()
    print(f'Epoch {epoch}/{MODEL_EPOCHS} - Loss: {loss_value.item():.4f} - Train accuracy: {train_accuracy:.4f} - Test accuracy: {test_accuracy:.4f}')
    
network.eval()
test_preds = network.forward(x_test_tensor)        
test_accuracy = (test_preds.argmax(dim=1) == y_test_tensor).float().mean()
print(test_accuracy.item())

In [None]:
# Save model
import json

MODELS_PATH = os.path.join(PROJ_DIR,'assets','models')
ACCURACIES_PATH = os.path.join(MODELS_PATH, 'model-accuracies.json')

# The output directory may not exist yet; create it instead of failing.
os.makedirs(MODELS_PATH, exist_ok=True)
torch.save(network.state_dict(), os.path.join(MODELS_PATH, '20newsgroups-mlp.pth'))

# Update only this model's entry and keep whatever the file already holds.
# Start from an empty mapping when the file has not been created yet.
if os.path.exists(ACCURACIES_PATH):
    with open(ACCURACIES_PATH) as fIn:
        models = json.load(fIn)
else:
    models = {}
models['20newsgroups'] = test_accuracy.item()
with open(ACCURACIES_PATH, 'w') as fOut:
    json.dump(models, fOut)