In [12]:
from tokenizers import ByteLevelBPETokenizer
from transformers import BertTokenizerFast
from pathlib import Path

In [13]:
path_tscpt = '/Users/akshaykekuda/Desktop/CSR-SA/manual_score_transcriptions/single_ch/'

In [14]:
paths = [str(x) for x in Path(path_tscpt).glob("**/*.txt")]

In [15]:
paths

['/Users/akshaykekuda/Desktop/CSR-SA/manual_score_transcriptions/single_ch/IRCall_200169348620200825_singlech.txt',
 '/Users/akshaykekuda/Desktop/CSR-SA/manual_score_transcriptions/single_ch/IRCall_200120093160210610_singlech.txt',
 '/Users/akshaykekuda/Desktop/CSR-SA/manual_score_transcriptions/single_ch/IRCall_100178121860210225_singlech.txt',
 '/Users/akshaykekuda/Desktop/CSR-SA/manual_score_transcriptions/single_ch/IRCall_100195082760200317_singlech.txt',
 '/Users/akshaykekuda/Desktop/CSR-SA/manual_score_transcriptions/single_ch/IRCall_3001137036D0200720_singlech.txt',
 '/Users/akshaykekuda/Desktop/CSR-SA/manual_score_transcriptions/single_ch/IRCall_200133287620201003_singlech.txt',
 '/Users/akshaykekuda/Desktop/CSR-SA/manual_score_transcriptions/single_ch/IRCall_2001504400C0200708_singlech.txt',
 '/Users/akshaykekuda/Desktop/CSR-SA/manual_score_transcriptions/single_ch/IRCall_200189949860200403_singlech.txt',
 '/Users/akshaykekuda/Desktop/CSR-SA/manual_score_transcriptions/single_

In [16]:
len(paths)

14081

In [17]:
from tokenizers import BertWordPieceTokenizer
tokenizer = BertWordPieceTokenizer()
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
    "[CLS]",
    "[PAD]",
    "[UNK]",
    "[MASK]",
    "[SEP]",
], show_progress=True)






In [5]:
tokenizer

Tokenizer(vocabulary_size=23063, model=BertWordPiece, unk_token=[UNK], sep_token=[SEP], cls_token=[CLS], pad_token=[PAD], mask_token=[MASK], clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True, wordpieces_prefix=##)

In [24]:
model_path = '../attention_model/sa_tokenizer/'
tokenizer.save_model(model_path)

['../attention_model/sa_tokenizer/vocab.txt']

In [25]:
import os, json
with open(os.path.join(model_path, "config.json"), "w") as f:
  tokenizer_cfg = {
      "do_lower_case": True,
      "unk_token": "[UNK]",
      "sep_token": "[SEP]",
      "pad_token": "[PAD]",
      "cls_token": "[CLS]",
      "mask_token": "[MASK]",
      "model_max_length": 512,
      "max_len": 512,
  }
  json.dump(tokenizer_cfg, f)

In [26]:
tokenizer = BertTokenizerFast.from_pretrained(model_path, model_max_len=512, add_special_tokens=True)

In [27]:
tokenizer

PreTrainedTokenizerFast(name_or_path='../attention_model/sa_tokenizer/', vocab_size=23063, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [28]:
from transformers import DataCollatorForLanguageModeling

In [29]:
from torch.utils.data import Dataset, DataLoader, random_split

class MLMDataSet(Dataset):
    def __init__(self, paths, tokenizer):
        self.sentences = []
        for file in paths:
            with open(file, 'r') as f:
                sent = f.readlines()
                self.sentences.extend([line.strip('\n') for line in sent])
        self.tokenizer = tokenizer
    def __len__(self):
        return len(self.sentences)
    def __getitem__(self, idx):
        return self.tokenizer(self.sentences[idx])

In [31]:
mlm_ds = MLMDataSet(paths[:10], tokenizer)

In [32]:
train_size = int(0.8 * len(mlm_ds))
test_size = len(mlm_ds) - train_size
train_dataset, test_dataset = random_split(mlm_ds, [train_size, test_size])


In [43]:
collator = DataCollatorForLanguageModeling(tokenizer)
dataloader_transcripts_train = DataLoader(train_dataset, batch_size=4, shuffle=True,
                                                  collate_fn=collator.torch_call)
dataloader_transcripts_test = DataLoader(test_dataset, batch_size=4, shuffle=True,
                                                  collate_fn=collator.torch_call)

In [34]:
import torch.nn as nn
class MLMNetwork(nn.Module):
    def __init__(self, embedding_size, tokenizer, dropout_rate,num_heads):
        super(MLMNetwork, self).__init__()
        vocab_size = len(tokenizer)
        pad_idx = tokenizer.pad_token_id
        self.embedding = nn.Embedding(vocab_size, embedding_size, padding_idx = pad_idx)
        self.multihead_attn = nn.MultiheadAttention(embedding_size, dropout=dropout_rate, num_heads=num_heads, batch_first=True)
        self.fcn = nn.Linear(embedding_size, vocab_size)

    def forward(self, inputs):
        embed_output = self.embedding(inputs['input_ids'])
        padding_mask = inputs['attention_mask']==0
        query = key = value = embed_output
        attn_out, wt = self.multihead_attn(query, key, value, key_padding_mask=padding_mask)
        masked_indices = inputs['labels']!=-100
        
        masked_out = attn_out[masked_indices]
        out = self.fcn(masked_out)
        return out

In [35]:
import torch.optim as optim

model = MLMNetwork(128, tokenizer, 0.5, 2)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-4)

In [36]:
len(dataloader_transcripts_train)

119

In [37]:
len(dataloader_transcripts_test)

119

In [38]:
from tqdm import tqdm
loss_arr = []
model.train()
for i in range(1):
    epoch_loss = 0
    for batch in tqdm(dataloader_transcripts_train):
        output = model(batch)
        labels = batch['labels'][batch['labels']!=-100]
        loss = criterion(output, labels)
        optimizer.zero_grad()
        epoch_loss += loss.detach().item()
        loss.backward()
        optimizer.step()
        loss_arr.append(loss.detach().item())

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|█████████████████████████████████████████| 119/119 [00:02<00:00, 40.04it/s]


In [None]:
from matplotlib import pyplot as plt

In [None]:
plt.plot(loss_arr)
plt.show()

In [44]:
ex = next(iter(dataloader_transcripts_test))

In [45]:
ex['labels']

tensor([[-100, -100, -100,  101, -100, -100,   22, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100],
        [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100,  456, -100, -100, -100, -100, -100, -100,  105, -100, -100,
         -100, -100, -100, -100],
        [-100, 1853,    7, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100],
        [-100, -100,    5,  480,  101, 1042, -100, -100, -100, -100,  699, -100,
         -100, -100, -100, -100, -100, -100, -100, -100,  786, -100, -100, -100,
         -100, -100, -100, -100]])

In [48]:
[tokenizer.convert_ids_to_tokens(id) for id in ex['input_ids']]

[['[CLS]',
  'what',
  'is',
  'it',
  'i',
  "'",
  '[MASK]',
  'sorry',
  'oh',
  'okay',
  '.',
  '[SEP]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]'],
 ['[CLS]',
  'today',
  'is',
  'the',
  'second',
  'so',
  'your',
  'payment',
  'is',
  'gonna',
  'your',
  'cancellation',
  'dates',
  'are',
  '[MASK]',
  'be',
  'the',
  'first',
  'of',
  'every',
  'month',
  '[MASK]',
  'you',
  'run',
  'and',
  'get',
  '.',
  '[SEP]'],
 ['[CLS]',
  '[MASK]',
  '[MASK]',
  '[SEP]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]',
  '[PAD]'],
 ['[CLS]',
  'yeah',
  '[MASK]',
  '[MASK]',
  '[MASK]',
  '[MASK]',
  'make',
  'it',
  'would',
  'be',
  '[MASK]'

In [None]:
[id for id in ex['input_ids']]

In [53]:
ex['labels'][ex['labels']!=-100]

[tensor([[False, False, False,  True, False, False,  True, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False],
         [False, False, False, False, False, False, False, False, False, False,
          False, False, False, False,  True, False, False, False, False, False,
          False,  True, False, False, False, False, False, False],
         [False,  True,  True, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False, False, False,
          False, False, False, False, False, False, False, False],
         [False, False,  True,  True,  True,  True, False, False, False, False,
           True, False, False, False, False, False, False, False, False, False,
           True, False, False, False, False, False, False, False]])]

In [49]:
[tokenizer.convert_ids_to_tokens(id) for id in [ex['labels'][ex['labels']!=-100]]]

[['it',
  'm',
  'gonna',
  'so',
  'cat',
  '.',
  "'",
  'cause',
  'it',
  'might',
  'able',
  'check']]

In [51]:
tokenizer.convert_ids_to_tokens(ex['labels'][ex['labels']!=-100])

['it',
 'm',
 'gonna',
 'so',
 'cat',
 '.',
 "'",
 'cause',
 'it',
 'might',
 'able',
 'check']

In [6]:
import torch
model.eval()
with torch.no_grad():
    for item in dataloader_transcripts_train:
        encoded_txt = item['input_ids']
        print("target", tokenizer.convert_ids_to_tokens(item['labels'][item['labels']!=-100]))
        pred = model(item)
        pred_ids = torch.argmax(pred, dim=-1)
        print("pred", tokenizer.convert_ids_to_tokens(pred_ids))

NameError: name 'model' is not defined

In [7]:
model

NameError: name 'model' is not defined

In [None]:
pred_ids = torch.argmax(pred, dim=-1)