In [1]:
import numpy as np
import sklearn
import sklearn.linear_model
from sklearn import preprocessing
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
from torch import optim
import os
import gc
from datetime import datetime

# NOTE(review): presumably a workaround that stops the process from aborting
# when more than one OpenMP runtime is loaded (often seen with numpy + torch
# builds) — confirm it is still required. The variable is assigned after the
# imports above have already executed; verify this ordering takes effect.
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [2]:
class Module(nn.Module):
    """Three-layer fully connected classifier that returns raw logits.

    Args:
        final_dim: number of output classes (width of the logit vector).
        in_dim: width of the input representation (default 1024).
        hidden_dim1: width of the first hidden layer (default 512).
        hidden_dim2: width of the second hidden layer (default 256).

    The defaults reproduce the original fixed 1024 -> 512 -> 256 -> final_dim
    layout, and the attribute names are unchanged so existing state dicts
    still load.
    """

    def __init__(self, final_dim, in_dim=1024, hidden_dim1=512, hidden_dim2=256):
        super().__init__()
        self.inp = nn.Linear(in_dim, hidden_dim1)
        self.outp1 = nn.Linear(hidden_dim1, hidden_dim2)
        self.outp2 = nn.Linear(hidden_dim2, final_dim)

    def forward(self, x):
        """Map a batch of representations to unnormalised class scores."""
        x = F.relu(self.inp(x))
        x = F.relu(self.outp1(x))
        # NOTE(review): sigmoid is applied to an already non-negative ReLU
        # output, so the activations fed to the last layer lie in [0.5, 1].
        # Kept as-is to preserve trained behaviour; confirm it is intentional.
        x = torch.sigmoid(x)
        return self.outp2(x)

In [3]:
def load_ds(dataset, mode):
    """Load representations and tag labels, one-hot encoding the labels.

    Args:
        dataset: indexable holding four ``.npy`` paths, in order: training
            representations, training labels, validation representations,
            validation labels.
        mode: ``'binary'`` yields two target columns (PER tags, every other
            tag); any other value yields five columns (PER, O, MISC, LOC, ORG).

    Returns:
        Tuple ``(train_rep, train_tar, valid_rep, valid_tar)`` of torch
        tensors. Each target tensor has one row per label and one column per
        tag group, holding 1 where the label belongs to the group and 0
        elsewhere.
    """
    train_rep, train_lab, valid_rep, valid_lab = (np.load(dataset[i]) for i in range(4))

    person = {'I-PER', 'B-PER'}
    if mode == 'binary':
        groups = [person, {'O', 'I-ORG', 'B-ORG', 'I-LOC', 'B-LOC', 'I-MISC', 'B-MISC'}]
    else:
        groups = [person, {'O'}, {'I-MISC', 'B-MISC'}, {'I-LOC', 'B-LOC'}, {'I-ORG', 'B-ORG'}]

    def encode(tags):
        # One indicator row per group, transposed so that rows index labels.
        indicator_rows = [[int(tag in group) for tag in tags] for group in groups]
        return np.array(indicator_rows).T

    return (
        torch.tensor(train_rep),
        torch.tensor(encode(train_lab)),
        torch.tensor(valid_rep),
        torch.tensor(encode(valid_lab)),
    )

In [4]:
def train(model, train_dl, valid_dl, loss_func, opt, weights, epochs=20):
    """Train ``model`` with a class-weighted loss and plateau early stopping.

    Args:
        model: the ``nn.Module`` to optimise; moved to the GPU when one is
            available, otherwise left on the CPU.
        train_dl: iterable of ``(inputs, one_hot_targets)`` training batches.
        valid_dl: iterable of ``(inputs, one_hot_targets)`` validation
            batches; must support ``len()``.
        loss_func: callable ``loss_func(pred, target, weight=...)`` taking
            class-index targets (e.g. ``F.cross_entropy``).
        opt: optimiser already bound to ``model``'s parameters.
        weights: per-class weight tensor used for the training loss.
        epochs: maximum number of passes over ``train_dl``.

    Returns:
        None. The model is updated in place and the mean validation loss is
        printed after each epoch that does not trigger early stopping.
    """
    # Resolve the device once so the function also runs on CPU-only machines
    # instead of failing on unconditional 'cuda' transfers.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    weights = weights.to(device)

    prev_loss = -1
    for epoch in range(epochs):
        model.train()
        for xb, yb in train_dl:
            xb = xb.to(device)
            yb = yb.to(device)
            pred = model(xb)
            # Targets arrive one-hot encoded; the loss expects class indices.
            loss = loss_func(pred, torch.argmax(yb, dim=1), weight=weights)

            loss.backward()
            opt.step()
            opt.zero_grad()

        model.eval()
        with torch.no_grad():
            # The validation loss is computed without the class weights,
            # unlike the training loss above.
            valid_loss = sum(
                loss_func(model(xb.to(device)), torch.argmax(yb.to(device), dim=1))
                for xb, yb in valid_dl
            ) / len(valid_dl)
            # Stop once the validation loss has effectively stopped moving.
            if abs(valid_loss - prev_loss) < 1e-5:
                break
            prev_loss = valid_loss

        print(epoch, valid_loss)

In [5]:
def evaluate(dataset, mode='binary'):
    """Train a fresh classifier on ``dataset`` and print validation metrics.

    Args:
        dataset: four ``.npy`` paths, passed straight to ``load_ds``.
        mode: ``'binary'`` trains a 2-class model for 25 epochs; any other
            value trains a 5-class model for 50 epochs.

    Prints the training and inference wall-clock times, the confusion matrix
    over all classes, and accuracy / recall / F1 for class 0 versus the rest.
    Returns None.
    """
    # Imported explicitly: the visible import cell only pulls in ``sklearn``,
    # ``sklearn.linear_model`` and ``preprocessing``, so ``sklearn.metrics``
    # would otherwise be reachable only through a transitive import.
    import sklearn.metrics

    train_rep, train_tar, valid_rep, valid_tar = load_ds(dataset, mode)
    # Pinned memory only helps host-to-GPU copies; skip it when no GPU exists.
    use_pinned = torch.cuda.is_available()
    train_d = DataLoader(TensorDataset(train_rep, train_tar), batch_size=128, pin_memory=use_pinned)
    valid_d = DataLoader(TensorDataset(valid_rep, valid_tar), batch_size=128, pin_memory=use_pinned)

    is_binary = mode == 'binary'
    model = Module(2 if is_binary else 5)
    # Class 0 gets ten times the weight of every other class in the loss.
    if is_binary:
        weights = torch.tensor([10.0, 1.0])
    else:
        weights = torch.tensor([10.0, 1.0, 1.0, 1.0, 1.0])
    loss = F.cross_entropy
    opt = optim.SGD(model.parameters(), lr=0.01)
    n_epochs = 25 if is_binary else 50

    t = datetime.now()
    train(model, train_d, valid_d, loss, opt, weights, n_epochs)
    print('Time for the train step:', datetime.now() - t)

    t = datetime.now()
    model.eval()
    model.cpu()
    # Inference only: disable autograd so no graph is built for the whole
    # validation set in a single forward pass.
    with torch.no_grad():
        pred = torch.argmax(model(valid_rep), dim=1).numpy()
    valid_tar = torch.argmax(valid_tar, dim=1).numpy()
    print('Time for the analysis step:', datetime.now() - t)

    conf_matrix = sklearn.metrics.confusion_matrix(valid_tar, pred)

    # Collapse to "class 0 vs. rest" as flat 1-D indicator vectors, the shape
    # the scikit-learn metric functions expect for binary targets.
    valid_tar = (valid_tar == 0).astype(int)
    pred = (pred == 0).astype(int)

    accuracy = sklearn.metrics.accuracy_score(valid_tar, pred)
    recall = sklearn.metrics.recall_score(valid_tar, pred)
    f_score = sklearn.metrics.f1_score(valid_tar, pred)

    print('CONFUSION MATRIX:')
    print(conf_matrix)

    print('OVERALL ACCURACY:', accuracy)
    print('TRUE POSITIVE RATE:', recall)
    print('F-SCORE:', f_score)

In [6]:
gc.collect()
os.chdir('/kaggle/input')

# For each language: [train representations, train labels,
#                     validation representations, validation labels],
# all relative to the working directory set above.
dataset_paths = {
    'english_a': [
        'engdataset/data/representation.train.npy',
        'engdataset/data/true_labels.train.npy',
        'engdataset/data/representation.testa.npy',
        'engdataset/data/true_labels.testa.npy',
    ],
    'english_b': [
        'engdataset/data/representation.train.npy',
        'engdataset/data/true_labels.train.npy',
        'engdataset/data/representation.testb.npy',
        'engdataset/data/true_labels.testb.npy',
    ],
    'spanish': [
        'pi-inf442/representation.esp.train.npy',
        'pi-inf442/true_labels.esp.train.npy',
        'pi-inf442/representation.esp.testa.npy',
        'pi-inf442/true_labels.esp.testa.npy',
    ],
    'dutch': [
        'pi-inf442/representation.ned.train.npy',
        'pi-inf442/true_labels.ned.train.npy',
        'pi-inf442/representation.ned.testa.npy',
        'pi-inf442/true_labels.ned.testa.npy',
    ],
    'portuguese': [
        'pi-inf442/representation.portuguese.train.npy',
        'pi-inf442/true_labels.portuguese.train.npy',
        'pi-inf442/representation.portuguese.test.npy',
        'pi-inf442/true_labels.portuguese.test.npy',
    ],
}

# Every language gets the binary run; the two English splits additionally
# get the five-class run.
for language in dataset_paths:
    print(language)
    evaluate(dataset_paths[language])
    if language in ('english_a', 'english_b'):
        evaluate(dataset_paths[language], 'ner')

english_a
0 tensor(0.3217, device='cuda:0')
1 tensor(0.1033, device='cuda:0')
2 tensor(0.0744, device='cuda:0')
3 tensor(0.0631, device='cuda:0')
4 tensor(0.0574, device='cuda:0')
5 tensor(0.0538, device='cuda:0')
6 tensor(0.0511, device='cuda:0')
7 tensor(0.0491, device='cuda:0')
8 tensor(0.0475, device='cuda:0')
9 tensor(0.0461, device='cuda:0')
10 tensor(0.0448, device='cuda:0')
11 tensor(0.0436, device='cuda:0')
12 tensor(0.0427, device='cuda:0')
13 tensor(0.0419, device='cuda:0')
14 tensor(0.0411, device='cuda:0')
15 tensor(0.0404, device='cuda:0')
16 tensor(0.0398, device='cuda:0')
17 tensor(0.0393, device='cuda:0')
18 tensor(0.0387, device='cuda:0')
19 tensor(0.0382, device='cuda:0')
20 tensor(0.0377, device='cuda:0')
21 tensor(0.0373, device='cuda:0')
22 tensor(0.0369, device='cuda:0')
23 tensor(0.0364, device='cuda:0')
24 tensor(0.0361, device='cuda:0')
Time for the train step: 0:02:34.025763
Time for the analysis step: 0:00:02.767351
CONFUSION MATRIX:
[[  8913    495]
 [  163