In [1]:
!pip install torchmetrics


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
from pprint import pprint as print
from gensim.models.fasttext import FastText
from gensim.test.utils import datapath
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder
import numpy as np
from torchmetrics.classification import MulticlassAccuracy, F1Score

### Загружаем датасет

In [21]:
allowed_tokens = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']

data = []
labels = []

i_am_dummy = 0
with open("train.txt", "r", encoding="utf-8") as file:
    for line in file:
        tokens = list(line.strip().split('\t'))

        if tokens[-1] not in allowed_tokens:
          continue
            
        if tokens[-1] == 'O':
            i_am_dummy += 1
            if i_am_dummy != 27:
                continue
            else:
                i_am_dummy = 0

        data.append(tokens[0])
        labels.append(tokens[-1])

test_data, test_labels = [], []
with open('test.txt', "r", encoding="utf-8") as file:
    for line in file:
        tokens = list(line.strip().split('\t'))

        if tokens[-1] not in allowed_tokens:
          continue
            
        if tokens[-1] == 'O':
            i_am_dummy += 1
            if i_am_dummy != 27:
                continue
            else:
                i_am_dummy = 0

        test_data.append(tokens[0])
        test_labels.append(tokens[-1])

### Любуемся на дисбаланс классов

In [22]:
from collections import Counter

counts = Counter(labels)
print(dict(counts).values())

dict_values([7405, 7507, 5956, 4881, 5717, 5960, 1563])


### Получаем эмбеддинги

In [23]:
model = FastText(sentences=data, window=5, min_count=1, workers=4, sg=1)

embedded_input = [model.wv.get_vector(word) for word in data]
embedded_test = [model.wv.get_vector(word) for word in test_data]

### Загружаем всё в data loader

In [30]:
from sklearn.model_selection import train_test_split

t = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6}
new_labels = [t.get(key) for key in labels]
new_test_labels = [t.get(key) for key in test_labels]

targets = torch.Tensor(np.array(embedded_input))

# X_train, X_test, y_train, y_test = train_test_split(targets, new_labels, test_size=0.3, random_state=42)

train_data = list(zip(embedded_input, new_labels))
test_data = list(zip(embedded_test, new_test_labels))

np.random.shuffle(train_data)
np.random.shuffle(test_data)

### Попытка бороться с дисбалансом - добавляем веса для функции потерь

In [31]:
from collections import Counter

counts = Counter(new_labels)
class_weights = []

values = dict(counts).values()

for val in values:
    class_weights.append((1 - (val / sum(values))))

class_weights = torch.Tensor(class_weights).to('cuda:0')

print(class_weights)

tensor([0.8101, 0.8075, 0.8472, 0.8748, 0.8534, 0.8471, 0.9599],
       device='cuda:0')


In [32]:
number_of_unique_tags = len(set(labels))

print(number_of_unique_tags)

7


### Архитектура сети

In [33]:
class NERModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers, num_classes, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):
        #x = self.embedding(x)
        out, _ = self.lstm(x)
        out = self.fc(out)
        return out


### Параметры

In [36]:
num_epochs = 1000
bs = 512
lr = 1e-3
wd = 5e-4
device = 'cuda:0' 

input_size = 100  
hidden_size = 64  
output_size = 128 
num_layers = 2  
num_classes = number_of_unique_tags

nn_model = NERModel(input_size, hidden_size, output_size, num_layers, num_classes)
nn_model = nn_model.to(device)

train_dataloader = DataLoader(train_data, batch_size=bs, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=bs, shuffle=True)

criterion = nn.CrossEntropyLoss(weight=class_weights)

optimizer = torch.optim.Adam(nn_model.parameters(), lr=lr)

f1_metric = F1Score(num_classes=output_size, task='multiclass').to(device)
val_f1_metric = F1Score(num_classes=output_size, task='multiclass').to(device)

### Обучение + тест

In [37]:
nn_model.train()
for epoch in range(num_epochs):
    print('Epoch: {} / {}'.format(epoch + 1, num_epochs))
    for batch, gt in train_dataloader:
        batch, gt = batch.to(device), gt.to(device)
        optimizer.zero_grad()
        outputs = nn_model(batch)
        outputs = outputs.view(-1, num_classes)
        target_labels = gt.view(-1)
        loss = criterion(outputs, target_labels)
        loss.backward()
        optimizer.step()
        test_outputs = outputs.view(-1, num_classes)
        predicted_labels = torch.argmax(test_outputs, dim=1).to('cuda:0')
    
        f1_metric(predicted_labels, gt).item()
  
    print('Train f1-score: {}'.format(f1_metric.compute()))
    
    nn_model.eval()
    for batch, gt in test_dataloader:
        batch, gt = batch.to(device), gt.to(device)
        outputs = nn_model(batch)
        outputs = outputs.view(-1, num_classes)
        target_labels = gt.view(-1)
        test_outputs = outputs.view(-1, num_classes)
        predicted_labels = torch.argmax(test_outputs, dim=1).to('cuda:0')
        val_f1_metric(predicted_labels, gt).item()
        
    print('Test f1-score: {}'.format(val_f1_metric.compute()))
    nn_model.train()

'Epoch: 1 / 1000'
'Train f1-score: 0.20251865684986115'
'Test f1-score: 0.23503117263317108'
'Epoch: 2 / 1000'
'Train f1-score: 0.21356023848056793'
'Test f1-score: 0.244324192404747'
'Epoch: 3 / 1000'
'Train f1-score: 0.2190788835287094'
'Test f1-score: 0.24863740801811218'
'Epoch: 4 / 1000'
'Train f1-score: 0.2223062962293625'
'Test f1-score: 0.25088223814964294'
'Epoch: 5 / 1000'
'Train f1-score: 0.2242632508277893'
'Test f1-score: 0.25218209624290466'
'Epoch: 6 / 1000'
'Train f1-score: 0.22560209035873413'
'Test f1-score: 0.25287219882011414'
'Epoch: 7 / 1000'
'Train f1-score: 0.22656939923763275'
'Test f1-score: 0.2534995973110199'
'Epoch: 8 / 1000'
'Train f1-score: 0.22728204727172852'
'Test f1-score: 0.25383779406547546'
'Epoch: 9 / 1000'
'Train f1-score: 0.2278306484222412'
'Test f1-score: 0.2541792392730713'
'Epoch: 10 / 1000'
'Train f1-score: 0.22885429859161377'
'Test f1-score: 0.2554640769958496'
'Epoch: 11 / 1000'
'Train f1-score: 0.23007887601852417'
'Test f1-score: 0.256