In [81]:
import nltk
nltk.download('stopwords')

from transformers import AutoModel, AutoConfig, AutoTokenizer

model = AutoModel.from_pretrained('roberta-base')
tokenizer = AutoTokenizer.from_pretrained('roberta-base', do_lower_case=True)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/vierinova/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [39]:
import matplotlib.pyplot as plt
%matplotlib inline
import os
import json
import pickle
import torch
import numpy as np
import re
from tqdm.notebook import tqdm
from sklearn.utils import shuffle
from transformers import AutoTokenizer
import warnings
warnings.filterwarnings("ignore")

In [50]:
from datasets import load_dataset

dataset = load_dataset("wikitext", 'wikitext-103-v1', split='train')

Reusing dataset wikitext (/home/vierinova/.cache/huggingface/datasets/wikitext/wikitext-103-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)


In [36]:
def clean_text(text):
    text = text.replace(':', ',')
    text = text.replace('--', ',')
    text = text.replace('-', ',')
    text = text.replace(';', '.')
    text = text.replace(' ,', ',')
    text = text.replace('♫', '')
    text = text.replace('...', '')
    text = text.replace('@', '')

    text = re.sub(r'.\n', ' - ', text)
    text = re.sub(r'--\s?--', '', text)
    
    text = re.sub(r'\?+', '?', text)
    text = re.sub(r'\!+', '!', text)
    text = re.sub(r'\s+,', ',', text)
    
    text = re.sub(r',\s?,', ',', text)
    text = re.sub(r'\s+', ' ', text)


    
    return text.strip().lower()

In [42]:
clean_text('HIII: how are you ???   I-m!! Good...')

'hiii, how are you ? i,m! good'

In [None]:
dataset = dataset.map(lambda x: {'text': clean_text(x['text'])})
dataset = dataset.filter(lambda x: len(x['text']) > 0 and x['text'][0] != '=', )

In [84]:
target_ids = tokenizer.encode(".?,-!")[1:-1]
target_token2id = {t: tokenizer.encode(t)[-2] for t in ".?,-!"}
target_ids = list(target_token2id.values())

In [85]:
target_token2id

{'.': 4, '?': 116, ',': 6, '-': 12, '!': 328}

In [86]:
#processing texts
id2target = {
    0: 0,
    -1: -1,
}
for i, ti in enumerate(target_ids):
    id2target[ti] = i+1
target2id = {value: key for key, value in id2target.items()}

def create_target(text):
    encoded_words, targets = [], []
    
    words = text.split(' ')

    for word in words:
        target = 0
        for target_token, target_id in target_token2id.items():
            if word.endswith(target_token):
                target = id2target[target_id]
        
        encoded_word = tokenizer.encode(word, add_special_tokens=False)
        if len(encoded_word) == 0:
            continue
        for w in encoded_word:
            encoded_words.append(w)
        for _ in range(len(encoded_word)-1):
            targets.append(-1)
        targets.append(target)
        
        assert(len(encoded_word)>0)

    encoded_words = [tokenizer.cls_token_id or tokenizer.bos_token_id] +\
                    encoded_words +\
                    [tokenizer.sep_token_id or tokenizer.eos_token_id]
    targets = [-1] + targets + [-1]
    
    return encoded_words, targets


In [55]:
import torch

In [56]:
for k, v in target2id.items():
    if k != -1:
        print(k, tokenizer.decode(v))

0 <s>
1 .
2 ?
3 ,
4 -
5 !


In [57]:
# 3-запятая 0-пробел 1-точка 2-вопрос 4-параграф

In [58]:
def decode_symb(symb):
    if symb == -1:
        return ''
    if symb == 0:
        return ' '
    return tokenizer.decode(target2id[symb]) + ' '

In [59]:
def predict_model(text):
    encoded_input, targets = create_target(text)
    encoded_input_tens = torch.LongTensor([encoded_input])
    with torch.no_grad():
        output, _ = model(encoded_input_tens)
        
    output = output[0]
    output_idx = torch.argmax(output, dim=-1)
    array = []
    
    for token, symb, target in zip(encoded_input[1:-1], output_idx.numpy().tolist()[1:-1], targets[1:-1]):
        if target == -1:
            vale = tokenizer.decode(token)
        elif target == 0:
            vale = tokenizer.decode(token) + decode_symb(symb)
        else:
            vale = tokenizer.decode(token) + tokenizer.decode(target2id[target]) + ' '
        array.append(vale)
    return ''.join(array)

In [14]:
train_text = dataset['text']

In [48]:
encoded_texts, targets = [], []
for text in tqdm(train_text[:500_000]):
    enc, tag = create_target(text)
    encoded_texts.append(enc)
    targets.append(tag)

  0%|          | 0/500000 [00:00<?, ?it/s]

In [61]:
def merge(texts, targets):
    new_texts = []
    new_targets = []
    
    for t1, t2, y1, y2 in zip(texts, texts[1:], targets, targets[1:]):
        new_t = t1[:-1] + t2[1:]
        new_y = y1[:-1] + y2[1:]
        new_texts.append(new_t)
        new_targets.append(new_y)
    return new_texts, new_targets

In [50]:
train_texts, train_targets, test_texts, test_targets = encoded_texts[:-3000], targets[:-3000], \
                                                        encoded_texts[-3000:], targets[-3000:]

In [66]:
train_texts, train_targets = merge(train_texts, train_targets)
test_texts, test_targets = merge(test_texts, test_targets)

In [69]:
with open('processed_train_words_ro.txt', 'w') as words_file, \
    open('processed_train_targets_ro.txt', 'w') as targets_file:
    for words, targets in zip(train_texts, train_targets):
        words_file.write(' '.join(map(str, words)) + '\n')
        targets_file.write(' '.join(map(str, targets)) + '\n')

In [70]:
with open('processed_val_words_ro.txt', 'w') as words_file, \
    open('processed_val_targets_ro.txt', 'w') as targets_file:
    for words, targets in zip(test_texts, test_targets):
        words_file.write(' '.join(map(str, words)) + '\n')
        targets_file.write(' '.join(map(str, targets)) + '\n')

In [71]:
import os

import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from torch.nn.utils.rnn import pad_sequence

#creating datasets
class BertDataset(Dataset):
    def __init__(self, path, path_targets, is_train=False):

        self.is_train = is_train
        self.encoded_texts = []
        self.targets = []
        with open(path, 'r') as f:
            for text in f.readlines():
                self.encoded_texts.extend(list(map(int, text.split())))
        with open(path_targets, 'r') as ft:
            for text in ft.readlines():
                self.targets.extend(list(map(int, text.split())))
        self.encoded_texts = np.array(self.encoded_texts)
        self.targets = np.array(self.targets)
        idxs = []
        
        for i, (text, target) in enumerate(zip(self.encoded_texts, self.targets)):
            if target >= 1:
                idxs.append(i)
                self.targets[i - 1] = target

        self.encoded_texts = np.delete(self.encoded_texts, idxs)
        self.targets = np.delete(self.targets, idxs)
            

    def __getitem__(self, idx):
        start_idx = idx * 512
        start_idx = max(0, start_idx)
        end_idx = start_idx + 512
        return torch.LongTensor(self.encoded_texts[start_idx: end_idx]),\
               torch.LongTensor(self.targets[start_idx: end_idx])

    def __len__(self):
        return len(self.encoded_texts)//512 - 1


def collate(batch):
    texts, targets = zip(*batch)
    return torch.stack(texts), torch.stack(targets)

def get_datasets():
    train_dataset = BertDataset('processed_train_words_ro.txt', 'processed_train_targets_ro.txt', is_train=True)
    valid_dataset = BertDataset('processed_val_words_ro.txt', 'processed_val_targets_ro.txt')
    return train_dataset, valid_dataset


def get_data_loaders(train_dataset, valid_dataset):
    train_loader = DataLoader(train_dataset, batch_size=2, num_workers=0, collate_fn=collate, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=2, collate_fn=collate)
    return train_loader, valid_loader

In [72]:
train_data, test_data = get_datasets()

In [73]:
train_loader, test_loader = get_data_loaders(train_data, test_data)

In [74]:
from torch import nn

class Classifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.x = nn.Sequential(
            nn.Dropout(0,2),
            nn.Linear(768, 512),
            nn.ReLU(),
            nn.Dropout(0,2),
            nn.Linear(512, 1))
        self.linear = nn.Linear(768, 6)

    def forward(self, x):
        binary_output = torch.sigmoid(self.x(x))
        x = self.linear(x)
        return x, binary_output

In [75]:
import torch.nn.functional as F

In [76]:
from transformers import AutoModel


class BertPunctuator(nn.Module):
    def __init__(self):
        super().__init__()
        self.base = AutoModel.from_pretrained('roberta-base', return_dict=False)
        self.classifier = Classifier()

    def forward(self, x):
        base_x = self.base(x)
        if isinstance(base_x, tuple):
            embedding = base_x[0]
        else:
            embedding = base_x.last_hidden_state
        output, binary_output = self.classifier(embedding)
        output = F.log_softmax(output, dim=-1)
        return output, binary_output

    def train(self, mode=True):
        if mode:
            self.base.train()
            self.classifier.train()
        else:
            self.base.eval()
            self.classifier.eval()
        return self

    def eval(self):
        self.train(False)
        return self


In [77]:
from torch.optim.lr_scheduler import _LRScheduler


class LinearScheduler(_LRScheduler):

    def __init__(self, optimizer, max_steps=10000):
        self.max_steps = max_steps
        self.lr = 0
        super().__init__(optimizer, -1)

    def get_lr(self):
        self.lr = self.base_lrs[-1] * min(1, self._step_count/self.max_steps)
        return [base_lr * min(1, self._step_count/self.max_steps)
                for base_lr in self.base_lrs]

вводим-на самом деле

0-2

2-0

4-4


5-5

In [78]:
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = 'cuda:0'

In [79]:
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch.optim import Adam
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

train_dataset, valid_dataset = get_datasets()
train_loader, valid_loader = get_data_loaders(train_dataset, valid_dataset)
model = BertPunctuator().to(device)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [80]:
np.unique(train_dataset.targets)

array([-1,  0,  1,  2,  3,  4,  5])

In [81]:
def get_target_weights(targets, output_dim, reduce_empty=True):
    import warnings
    warnings.filterwarnings("ignore")
    weights = compute_class_weight(class_weight='balanced', classes=range(-1, 6), y=targets)[1:] # exclude -1
    return weights


target_weights = torch.Tensor(get_target_weights(train_dataset.targets, 1)).clamp_max(1).to(device)

criterion = nn.NLLLoss(weight=target_weights, reduction='none')

optimizer_args = [
                {'params': model.base.parameters(), 'lr': 3e-5},
                {'params': model.classifier.parameters(), 'lr': 1e-4}
            ]
optimizer = torch.optim.Adam(optimizer_args)
sched = LinearScheduler(optimizer, 500)
num_epochs = 1

In [83]:
# torch.save(model.state_dict(), 'model_state_dict.pt')
# torch.save(optimizer.state_dict(), 'optimizer_state_dict.pt')

In [84]:
for epoch in range(num_epochs):
    model.train()
    torch.save(model.state_dict(), 'model_state_dict_roberta.pt')
    torch.save(optimizer.state_dict(), 'optimizer_state_dict_roberta.pt')
    with tqdm(enumerate(train_loader), total=len(train_loader)) as pbar:
        for i, data in pbar:
            optimizer.zero_grad()
            text, targets = data
            preds, binary_preds = model(text.to(device))

            # Mask some "empty" targets
            mask = ((targets == 0) & (np.random.rand(*targets.shape) < .1)) | (targets > 0)
            mask = mask.to(device)

            # Do not predict output after tokens which are not the end of a word
            not_a_word_mask = (targets == -1).to(device)
            word_mask = ~not_a_word_mask
            targets[not_a_word_mask] = 0

            losses = criterion(preds.reshape(-1, 6), targets.to(device).reshape(-1))
            losses = losses.reshape(text.size(0), text.size(1))
            mask = word_mask * mask
            
            loss = torch.sum(losses * mask) / torch.sum(mask)
            loss.backward()
            
            pbar.set_description(f"loss: {loss.item()}, lr: {optimizer.param_groups[0]['lr']}")

            nn.utils.clip_grad_norm_(model.parameters(), 1.5)
            optimizer.step()
            sched.step()

        # Save model every epoch
        torch.save(model.state_dict(), 'model_state_dict_roberta.pt')
        torch.save(optimizer.state_dict(), 'optimizer_state_dict_roberta.pt')

loss: 0.22112272679805756, lr: 3e-05:   5%| | 7114/138954 [16:33<5:07:39,  7.14iIOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

loss: 0.1052561029791832, lr: 3e-05:  11%| | 15114/138954 [35:08<4:48:32,  7.15iIOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

loss: 0.1377411037683487, lr: 3e-05:  17%|▏| 22990/138954 [53:24<4:28:40,  7.19iIOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order t

In [85]:
1

1

In [86]:
from sklearn.metrics import (classification_report, confusion_matrix,
                f1_score, roc_auc_score, precision_score, recall_score)

def get_classification_report(target, preds):
    report = classification_report(target, preds, output_dict=True)
    report_print = classification_report(target, preds, digits=3)
    return report, report_print


def get_eval_metrics(targets, preds):
    metrics = {}

    preds = np.exp(preds)
    preds = preds.reshape(-1, 6)
    targets = targets.reshape(-1)
    pred_index = preds.argmax(-1)

    cls_report, cls_report_print = get_classification_report(targets, pred_index)
    print(cls_report_print)
    metrics['cls_report'] = cls_report


    macro_precision = precision_score(targets, pred_index, average='macro')
    metrics['precision'] = macro_precision

    macro_recall = recall_score(targets, pred_index, average='macro')
    metrics['recall'] = macro_recall

    macro_f1_score = f1_score(targets, pred_index, average='macro')
    metrics['f_score'] = macro_f1_score

    auc_score = roc_auc_score(targets, preds, average='macro', multi_class='ovo')
    metrics['auc'] = auc_score
    return metrics



In [87]:
# Valid loop

model.eval()
valid_loss = 0
all_valid_preds = []
all_valid_targets = []
for data in tqdm(valid_loader):

    text, targets = data
    with torch.no_grad():
        preds, _ = model(text.to(device))

    word_mask = targets != -1
    preds = preds[word_mask]
    targets = targets[word_mask]

    loss = criterion(preds.view(-1, 6), targets.to(device).view(-1))
    valid_loss += loss.mean().item()
    all_valid_preds.append(preds.detach().cpu().numpy())
    all_valid_targets.append(targets)


valid_loss /= len(valid_loader)
all_valid_preds = np.concatenate(all_valid_preds)
all_valid_targets = np.concatenate(all_valid_targets)

metrics = get_eval_metrics(all_valid_targets, all_valid_preds)
metrics["loss"] = valid_loss

100%|█████████████████████████████████████████| 781/781 [00:24<00:00, 32.38it/s]


              precision    recall  f1-score   support

           0      0.998     0.908     0.951    504793
           1      0.779     0.948     0.855     26685
           2      0.424     0.600     0.497        70
           3      0.479     0.953     0.638     40426
           4      0.896     0.893     0.894      1154
           5      0.514     0.436     0.472       204

    accuracy                          0.913    573332
   macro avg      0.682     0.790     0.718    573332
weighted avg      0.951     0.913     0.924    573332



In [6]:
import os

import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from torch.nn.utils.rnn import pad_sequence
N=512
class BertDataset(Dataset):
    def __init__(self, path, path_targets, is_train=False, pred_len=N):

        self.is_train = is_train
        self.encoded_texts = [0] * N
        self.targets = [-1] * N
        self.pred_len = pred_len
        with open(path, 'r') as f:
            for text in f.readlines():
                self.encoded_texts.extend(list(map(int, text.split())))
        with open(path_targets, 'r') as ft:
            for text in ft.readlines():
                self.targets.extend(list(map(int, text.split())))
        self.encoded_texts.extend([0] * N)
        self.targets.extend([-1] * N)
        idxs = []
        
        for i, (text, target) in enumerate(zip(self.encoded_texts, self.targets)):
            if target >= 1:
                idxs.append(i)
                self.targets[i - 1] = target

        self.encoded_texts = np.delete(self.encoded_texts, idxs)
        self.targets = np.delete(self.targets, idxs)

            
    def __getitem__(self, idx):
        start_idx = idx * self.pred_len
        start_idx = max(0, start_idx)
        end_idx = start_idx + N
        return torch.LongTensor(self.encoded_texts[start_idx: end_idx]),\
               torch.LongTensor(self.targets[start_idx: end_idx])

    def __len__(self):
        return (len(self.encoded_texts) - 512)//self.pred_len - 1


def collate(batch):
    texts, targets = zip(*batch)
    return torch.stack(texts), torch.stack(targets)

def get_datasets(pred_len):
    train_dataset = BertDataset('processed_train_words_ro.txt', 'processed_train_targets_ro.txt', is_train=True)
    valid_dataset = BertDataset('processed_val_words_ro.txt', 'processed_val_targets_ro.txt', pred_len=pred_len)
    return train_dataset, valid_dataset


def get_data_loaders(train_dataset, valid_dataset):
    train_loader = DataLoader(train_dataset, batch_size=2, num_workers=0, collate_fn=collate, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=4, collate_fn=collate)
    return train_loader, valid_loader

In [20]:
import os
from glob import glob
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch.optim import AdamW
from torch import nn

import numpy as np
import pickle

from torch.utils.data import Dataset, DataLoader
from itertools import product

def collate(batch):
    texts, targets = zip(*batch)
    try:
        texts, targets = torch.stack(texts), torch.stack(targets)
    except Exception:
        return texts[0][None, ...], targets[0][None, ...]
    return texts, targets

def combine(pred_num, preds):

    ps = []
    for i in range(preds.shape[0]):
        start_idx = max(0, i-512//pred_num+1)
        end_idx = min(preds.shape[0], i+1)

        p = []
        for j, k in enumerate(range(start_idx, end_idx)):
            j = end_idx - start_idx - j - 1
            p.append(preds[k][j*pred_num:(j+1)*pred_num])
        p = np.stack(p)
        if p.shape[0] > 2:
            p = p[1:-1, :, :]
            
        ps.append(np.log(np.exp(p).mean(0)))
    ps = np.concatenate(ps)
    return ps

device = torch.device('cuda:0')
torch.cuda.set_device(device)



In [10]:
class Classifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.x = nn.Sequential(
            nn.Dropout(0,2),
            nn.Linear(768, 512),
            nn.ReLU(),
            nn.Dropout(0,2),
            nn.Linear(512, 1))
        self.linear = nn.Linear(768, 6)

    def forward(self, x):
        binary_output = torch.sigmoid(self.x(x))
        x = self.linear(x)
        return x, binary_output

In [11]:
class BertPunctuator(nn.Module):
    def __init__(self):
        super().__init__()
        config = AutoConfig.from_pretrained('roberta-base')
        self.base = AutoModel.from_pretrained('roberta-base')
        self.classifier = Classifier()

    def forward(self, x):
        embedding = self.base(x).last_hidden_state

        output, binary_output = self.classifier(embedding)
        output = F.log_softmax(output, dim=-1)
        return output, binary_output

    def train(self, mode=True):
        if mode:
            self.base.train()
            self.classifier.train()
        else:
            self.base.eval()
            self.classifier.eval()
        return self

    def eval(self):
        self.train(False)
        return self

In [18]:
import nltk
from pprint import pprint
from transformers import AutoModel, AutoConfig, AutoTokenizer

import torch.nn.functional as F

In [121]:
model = BertPunctuator()
model.to(device)
model.load_state_dict(torch.load('model_state_dict_roberta.pt', map_location=device))

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [23]:
from sklearn.metrics import (classification_report, confusion_matrix,
                f1_score, roc_auc_score, precision_score, recall_score)

def get_classification_report(target, preds):
    report = classification_report(target, preds, output_dict=True)
    report_print = classification_report(target, preds)
    return report, report_print


def get_eval_metrics(targets, preds, make_paragraphs = True, short_sentenses = False):
    # TODO: get the desired metric list from config-frozen.yaml
    """
    Calculates metrics on validation data
    """
    metrics = {}
    if make_paragraphs == False:
        preds = preds[:, :-1]
    preds = np.exp(preds)
    preds = preds.reshape(-1, 6)
    targets = targets.reshape(-1)
    pred_index = preds.argmax(-1)
    
    if short_sentenses == True:
        new_preds = []
        for i in range(len(pred_index)):
            cur_pred_idx = pred_index[i]
            if cur_pred_idx == 3:
                next_index = preds[i][preds[i].argsort()[:-2]]
                if preds[i, cur_pred_idx] - preds[i, next_index] < 0.3:
                    cur_pred_idx = next_index
            new_preds.append(cur_pred_idx)
        new_preds = np.array(new_preds)
        pred_index = new_preds

    cls_report, cls_report_print = get_classification_report(targets, pred_index)
    print(cls_report_print)
    metrics['cls_report'] = cls_report

In [25]:
def make_multi_preds(N_PREDICTIONS_FOR_TOKEN, model):
    PREDICTION_NUM = N_PREDICTIONS_FOR_TOKEN
    WINDOW_SHIFT = 512 // PREDICTION_NUM
    train, test_dataset = get_datasets(pred_len=WINDOW_SHIFT)
    train_loader, test_loader = get_data_loaders(train, test_dataset)
    model.eval()
    all_test_preds = []

    for data in tqdm(test_loader):
        text, targets = data
        with torch.no_grad():
            preds, _ = model(text.to(device))

        all_test_preds.append(preds.detach().cpu().numpy())


    all_valid_target = test_dataset.targets
    all_valid_preds = np.concatenate(all_test_preds)
    ps = combine(512 // PREDICTION_NUM, all_valid_preds)
    _targets = np.array(all_valid_target[:ps.shape[0]])

    ps = ps[_targets != -1]
    _targets = _targets[_targets != -1]

    return(get_eval_metrics(_targets, ps), ps)


    
1 .
    
2 ?
    
3 ,
    
4 -
    
5 !

In [26]:
rev, ps1 = make_multi_preds(1, model)

100%|█████████████████████████████████████████| 391/391 [00:23<00:00, 16.49it/s]


              precision    recall  f1-score   support

           0       1.00      0.91      0.95    504793
           1       0.78      0.95      0.86     26685
           2       0.42      0.60      0.50        70
           3       0.48      0.95      0.64     40426
           4       0.90      0.89      0.89      1154
           5       0.51      0.44      0.47       204

    accuracy                           0.91    573332
   macro avg       0.68      0.79      0.72    573332
weighted avg       0.95      0.91      0.92    573332



In [28]:
rev, ps2 = make_multi_preds(2, model)

100%|█████████████████████████████████████████| 782/782 [00:47<00:00, 16.50it/s]


              precision    recall  f1-score   support

           0       1.00      0.91      0.95    504976
           1       0.79      0.95      0.86     26691
           2       0.47      0.63      0.54        70
           3       0.48      0.96      0.64     40439
           4       0.90      0.89      0.89      1154
           5       0.50      0.45      0.47       204

    accuracy                           0.91    573534
   macro avg       0.69      0.80      0.73    573534
weighted avg       0.95      0.91      0.93    573534



In [29]:
rev, ps4 = make_multi_preds(4, model)

100%|███████████████████████████████████████| 1564/1564 [01:34<00:00, 16.46it/s]


              precision    recall  f1-score   support

           0       1.00      0.91      0.95    505176
           1       0.79      0.95      0.86     26697
           2       0.49      0.66      0.56        70
           3       0.49      0.96      0.64     40447
           4       0.90      0.88      0.89      1154
           5       0.50      0.44      0.47       204

    accuracy                           0.92    573748
   macro avg       0.69      0.80      0.73    573748
weighted avg       0.95      0.92      0.93    573748



In [30]:
rev, ps8 = make_multi_preds(8, model)

100%|███████████████████████████████████████| 3129/3129 [03:10<00:00, 16.44it/s]


              precision    recall  f1-score   support

           0       1.00      0.91      0.95    505221
           1       0.79      0.95      0.86     26698
           2       0.49      0.66      0.56        70
           3       0.49      0.96      0.65     40452
           4       0.90      0.88      0.89      1154
           5       0.53      0.46      0.49       204

    accuracy                           0.92    573799
   macro avg       0.70      0.80      0.74    573799
weighted avg       0.95      0.92      0.93    573799



In [31]:
rev, ps16 = make_multi_preds(16, model)

100%|███████████████████████████████████████| 6257/6257 [06:20<00:00, 16.44it/s]


              precision    recall  f1-score   support

           0       1.00      0.91      0.95    505264
           1       0.80      0.95      0.87     26701
           2       0.51      0.66      0.57        70
           3       0.49      0.96      0.65     40453
           4       0.90      0.89      0.90      1154
           5       0.53      0.45      0.49       204

    accuracy                           0.92    573846
   macro avg       0.70      0.80      0.74    573846
weighted avg       0.95      0.92      0.93    573846



In [32]:
rev, ps32 = make_multi_preds(32, model)

100%|█████████████████████████████████████| 12515/12515 [12:41<00:00, 16.44it/s]


              precision    recall  f1-score   support

           0       1.00      0.91      0.95    505275
           1       0.80      0.95      0.87     26701
           2       0.50      0.66      0.57        70
           3       0.49      0.96      0.65     40453
           4       0.90      0.89      0.90      1154
           5       0.52      0.45      0.48       204

    accuracy                           0.92    573857
   macro avg       0.70      0.80      0.74    573857
weighted avg       0.95      0.92      0.93    573857



In [None]:
accuracy 0.92
macro avg 0.74
weighted avg 0.93

In [33]:
rev, ps64 = make_multi_preds(64, model)

100%|█████████████████████████████████████| 25029/25029 [25:22<00:00, 16.44it/s]


              precision    recall  f1-score   support

           0       1.00      0.91      0.95    505282
           1       0.80      0.95      0.87     26702
           2       0.51      0.66      0.57        70
           3       0.49      0.96      0.65     40453
           4       0.91      0.89      0.90      1154
           5       0.53      0.46      0.49       204

    accuracy                           0.92    573865
   macro avg       0.70      0.80      0.74    573865
weighted avg       0.95      0.92      0.93    573865



In [312]:
text='a book display with works on critical race theory critical race theory (crt) is a cross disciplinary intellectual and social movement of civil rights scholars and activists who seek to examine the intersection of race society and law in the united states and to challenge mainstream american liberal approaches to racial justice the word critical in its name is an academic term that refers to critical thinking critical theory and scholarly criticism rather than criticizing or blaming people crt is also used in sociology to explain social political and legal structures and power distribution through the lens of race for example the crt conceptual framework is one way to study racial bias in laws and institutions such as the how and why of incarceration rates and how sentencing differs among racial groups in the united states it first arose in the 1970s like other critical schools of thought such as critical legal studies which examines how legal rules protect the status quo a key crt concept is intersectionality the way in which different forms of inequality and identity are affected by interconnections of race class gender and disability scholars of crt view race as a social construct with no biological basis one tenet of crt is that racism and disparate racial outcomes are the result of complex changing and often subtle social and institutional dynamics rather than explicit and intentional prejudices of individuals crt scholars argue that the social and legal construction of race advances the interests of white people at the expense of people of color and that the liberal notion of u.s. law as "neutral" plays a significant role in maintaining a racially unjust social order, where formally color,blind laws continue to have racially discriminatory outcomes'

In [314]:
predict_model(text)

'a book, display, with works on critical race theory. critical race theory (crt) is a cross, disciplinary, intellectual, and social movement of civil rights scholars, and activists, who seek to examine the intersection of race, society, and law in the united states, and to challenge mainstream, american, liberal approaches to racial justice. the word critical, in its name, is an academic term that refers to critical thinking, critical theory, and scholarly criticism, rather than criticizing or blaming people. crt is also used in sociology to explain social, political, and legal structures, and power distribution, through the lens of race. for example, the crt conceptual framework is one way to study racial bias in laws and institutions, such as the how, and why of incarceration rates, and how sentencing differs among racial groups. in the united states. it first arose in the 1970s. like other critical schools of thought, such as critical legal studies, which examines how legal rules pr

In [47]:
prepared_text=clean_text(text)

In [48]:
prepared_text

'a book display with works on critical race theory critical race theory (crt) is a cross,disciplinary intellectual and social movement of civil,rights scholars and activists who seek to examine the intersection of race, society, and law in the united states and to challenge mainstream american liberal approaches to racial justice. the word critical in its name is an academic term that refers to critical thinking, critical theory, and scholarly criticism, rather than criticizing or blaming people. crt is also used in sociology to explain social, political, and legal structures and power distribution through the lens of race. for example, the crt conceptual framework is one way to study racial bias in laws and institutions, such as the how and why of incarceration rates and how sentencing differs among racial groups in the united states. it first arose in the 1970s, like other critical schools of thought, such as critical legal studies, which examines how legal rules protect the status q

In [299]:
def decode_symb(symb):
    if symb == -1:
        return ''
    if symb == 0:
        return ' '
    decoded = tokenizer.decode(target2id[symb])
    return decoded + ' '

def predict_model(text, short_sentenses=False):
    encoded_input, targets = create_target(text)
    encoded_input_tens = torch.LongTensor([encoded_input]).to(device)
    with torch.no_grad():
        output = model(encoded_input_tens)[0]
    output = output[0]
    output = torch.softmax(output, dim=-1)
    output_idx = torch.argmax(output, dim=-1)
    if short_sentenses:
        new_preds = []
        for i in range(len(output_idx)):
            cur_pred_idx = output_idx[i]
            next_index = output[i].argsort()[-2]
            if output[i, 1] > 0.004:
                cur_pred_idx = torch.LongTensor([1])[0]
            
            new_preds.append(cur_pred_idx.detach().cpu().numpy().tolist())
        new_preds = np.array(new_preds)
        output_idx = torch.from_numpy(new_preds)
        pred_index = new_preds


    array = []
    
    for token, symb, target in zip(encoded_input[1:-1], output_idx.cpu().numpy().tolist()[1:-1], targets[1:-1]):
        if target == -1:
            vale = tokenizer.decode(token)
        elif target == 0:
            vale = tokenizer.decode(token) + decode_symb(symb)
        else:
            token_dec = tokenizer.decode(token)
            target_dec = tokenizer.decode(target2id[target])
            vale = token_dec
            if token_dec.strip()[-1] != target_dec.strip()[-1]:
                vale = vale + target_dec
            vale = vale + ' '
        array.append(vale)
    return ''.join(array)

In [300]:
predict_model('If i could save time in the bottle first thing that i would to do is to save '
              'every day till eternaty passes away')

'If i could save time in the bottle, first thing that i would to do, is, to save every day, till eternaty passes away. '

In [301]:
predict_model('If i could save time in the bottle first thing that i would to do is to save '
              'every day till eternaty passes away', short_sentenses=True)

'If i could save time. in the bottle. first thing that i would to do, is. to save. every day. till eternaty. passes away. '

In [253]:
import re

In [None]:
get_eval_metrics()

In [308]:
text ='I hate him because of his stubbornness How do I handle this'
prepared_text=clean_text(text)

In [311]:
prepared_text

'i hate him because of his stubbornness how do i handle this'

In [310]:
predict_model(prepared_text)

'i hate him, because of his stubbornness. how do i handle this? '

In [70]:
a, b = model(torch.LongTensor([encoded_input]))

In [74]:
 model(torch.LongTensor([encoded_input]))

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.0395,  0.0281, -0.0182,  ..., -0.2276, -0.0229,  0.0374],
         [-0.2320, -0.5142, -0.1307,  ..., -0.2004,  0.0440,  0.3378],
         [-0.0358, -0.0342,  0.1395,  ..., -0.4272,  0.0173,  0.3511],
         ...,
         [ 0.1921,  0.1700, -0.0792,  ..., -0.0158,  0.0190,  0.2110],
         [-0.0427,  0.0319, -0.0388,  ..., -0.2669, -0.0295,  0.0111],
         [-0.0277, -0.0830,  0.1402,  ..., -0.1500, -0.0749,  0.1609]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-5.5298e-03, -2.4003e-01, -1.8299e-01, -1.2790e-01,  1.5247e-01,
          2.2248e-01,  2.3798e-01, -7.0888e-02, -9.6683e-02, -1.8161e-01,
          2.7784e-01,  5.1404e-02, -1.4909e-01,  5.5379e-02, -1.3808e-01,
          5.0662e-01,  2.9180e-01, -4.7211e-01,  6.5626e-02,  8.2297e-03,
         -2.2727e-01,  9.9285e-02,  4.8312e-01,  2.9684e-01,  1.3221e-01,
          1.0195e-01, -1.8300e-01, -1.1211e-02,  1.4573e-01,  2.194

In [73]:
 model(torch.LongTensor([encoded_input]))[0]

tensor([[[-0.0395,  0.0281, -0.0182,  ..., -0.2276, -0.0229,  0.0374],
         [-0.2320, -0.5142, -0.1307,  ..., -0.2004,  0.0440,  0.3378],
         [-0.0358, -0.0342,  0.1395,  ..., -0.4272,  0.0173,  0.3511],
         ...,
         [ 0.1921,  0.1700, -0.0792,  ..., -0.0158,  0.0190,  0.2110],
         [-0.0427,  0.0319, -0.0388,  ..., -0.2669, -0.0295,  0.0111],
         [-0.0277, -0.0830,  0.1402,  ..., -0.1500, -0.0749,  0.1609]]],
       grad_fn=<NativeLayerNormBackward0>)

In [151]:
def make_multi_preds(N_PREDICTIONS_FOR_TOKEN, model):
    PREDICTION_NUM = N_PREDICTIONS_FOR_TOKEN
    WINDOW_SHIFT = 512 // PREDICTION_NUM
    train, test_dataset = get_datasets(pred_len=WINDOW_SHIFT)
    train_loader, test_loader = get_data_loaders(train, test_dataset)
    model.eval()
    all_test_preds = []

    for data in tqdm(test_loader):
        text, targets = data
        with torch.no_grad():
            preds, _ = model(text.to(device))

        all_test_preds.append(preds.detach().cpu().numpy())


    all_valid_target = test_dataset.targets
    all_valid_preds = np.concatenate(all_test_preds)
    ps = combine(512 // PREDICTION_NUM, all_valid_preds)
    _targets = np.array(all_valid_target[:ps.shape[0]])

    ps = ps[_targets != -1]
    _targets = _targets[_targets != -1]

    return get_eval_metrics(_targets, ps, short_sentenses=True)

In [153]:
metrics = make_multi_preds(1, model)

  0%|          | 0/391 [00:00<?, ?it/s]

IndexError: arrays used as indices must be of integer (or boolean) type

In [None]:
metrics