In [3]:
import torch
import torch.nn as nn
from pytorch_pretrained_bert import BertTokenizer, BertModel

In [2]:
!pip install pytorch_pretrained_bert

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch_pretrained_bert
  Downloading pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123 kB)
[K     |████████████████████████████████| 123 kB 5.1 MB/s 
Collecting boto3
  Downloading boto3-1.25.0-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 40.6 MB/s 
Collecting botocore<1.29.0,>=1.28.0
  Downloading botocore-1.28.0-py3-none-any.whl (9.3 MB)
[K     |████████████████████████████████| 9.3 MB 53.7 MB/s 
[?25hCollecting s3transfer<0.7.0,>=0.6.0
  Downloading s3transfer-0.6.0-py3-none-any.whl (79 kB)
[K     |████████████████████████████████| 79 kB 7.5 MB/s 
[?25hCollecting jmespath<2.0.0,>=0.7.1
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting urllib3<1.27,>=1.25.4
  Downloading urllib3-1.26.12-py2.py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 59.6 MB/s 
  Downloading urllib3-1.25.11-py2.py3-none-any.

In [8]:
# Load the pretrained tokenizer that matches the multilingual uncased BERT checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
# Example sentence used throughout the exploratory cells.
phrase = 'He ran quickly after the red bus and caught it'
# Split the sentence into token strings (ids are computed in a later cell).
tokenized_phrase = tokenizer.tokenize(phrase)

In [9]:
# Display the token strings produced by the tokenizer for the example phrase.
tokenized_phrase

['he', 'ran', 'quickly', 'after', 'the', 'red', 'bus', 'and', 'caught', 'it']

In [10]:
# Load the pretrained multilingual uncased BERT encoder and put it in eval mode,
# since it is only used for inference (feature extraction) in this notebook.
bert_model = BertModel.from_pretrained('bert-base-multilingual-uncased').eval()

100%|██████████| 623743758/623743758 [00:18<00:00, 34612487.77B/s]


In [14]:
# Rebind `tokenized_phrase` from token strings to their integer vocabulary ids;
# the model call in the following cell consumes these ids.
tokenized_phrase = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(phrase))

In [15]:
# Display the integer ids (one per token of the example phrase).
tokenized_phrase

[10191, 15695, 23559, 10515, 10103, 10452, 15952, 10110, 34576, 10197]

In [None]:
# Turn the id list into a tensor and add a leading batch axis of size 1.
input_ids = torch.tensor(tokenized_phrase).unsqueeze(0)

# Run the encoder without tracking gradients; we only want the activations.
with torch.no_grad():
    embeddings = bert_model(input_ids)


In [22]:
# Inspect the shape of one hidden-state tensor: (batch=1, tokens=10, hidden=768).
# NOTE(review): presumably embeddings[0] is the list of per-layer outputs, which
# would make this the first layer rather than the last — confirm against the
# pytorch_pretrained_bert BertModel documentation.
embeddings[0][0].size()

torch.Size([1, 10, 768])

In [23]:
# Linear head applied independently to every token vector: 768 features in, 20 scores out.
classifier = nn.Linear(768, 20)

# `embeddings` is the nested output of bert_model (embeddings[0][0] is a
# [1, 10, 768] tensor), so it cannot be passed to nn.Linear directly.
# Feed the last entry of embeddings[0], i.e. the final hidden-state tensor.
predict = classifier(embeddings[0][-1])

10

In [None]:
import torch
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence

class DatasetSeq(Dataset):
    """Sequence-labelling dataset holding one precomputed BERT vector per word.

    The training file ``data_dir + train_lang + '.train'`` is read as blocks
    separated by a blank line (one block per sentence); every non-empty line
    of a block is ``"<word> <label>"``. Blocks containing ``'_ '`` are dropped.

    Attributes:
        word_vocab: ``{word: id}``, ids start at 1 (0 is reserved for padding).
        target_vocab: ``{label: id}``, ids start at 1 (0 is reserved for padding).
        embeddings: list of float tensors, one per sentence, each of shape
            ``(n_words, hidden)``.
        encoded_targets: list of label-id lists, aligned word-for-word with
            ``embeddings``.

    NOTE(review): sentences are not truncated; presumably none exceeds the
    encoder's maximum input length — confirm against the data.
    """

    def __init__(self, data_dir, train_lang='en'):
        """Read the training file and embed every sentence with BERT.

        Args:
            data_dir: path prefix of the data files; it is concatenated
                directly with ``train_lang`` (no separator is inserted).
            train_lang: language code used as the file stem.
        """
        tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
        bert_model = BertModel.from_pretrained('bert-base-multilingual-uncased').eval()

        # Explicit encoding so multilingual text is read the same on every platform.
        with open(data_dir + train_lang + '.train', 'r', encoding='utf-8') as f:
            train = f.read().split('\n\n')

        # delete extra tag markup
        train = [x for x in train if '_ ' not in x]

        # vocabularies for encoding {<str> token: <int> id}; id 0 is padding
        self.target_vocab = {}
        self.word_vocab = {}

        # processed data, one entry per sentence
        self.embeddings = []
        self.encoded_targets = []

        for line in train:
            words = []
            target = []
            for item in line.split('\n'):
                if item != '':
                    word, label = item.split(' ')
                    # New entries get the next free id (ids are 1-based).
                    self.word_vocab.setdefault(word, len(self.word_vocab) + 1)
                    self.target_vocab.setdefault(label, len(self.target_vocab) + 1)

                    # Keep the word itself: the tokenizer needs text, not vocab ids.
                    words.append(word)
                    target.append(self.target_vocab[label])

            # A block without any "<word> <label>" line has nothing to embed.
            if not words:
                continue

            self.embeddings.append(self._embed_words(words, tokenizer, bert_model))
            self.encoded_targets.append(target)

    @staticmethod
    def _embed_words(words, tokenizer, bert_model):
        """Return one final-layer vector per word as a ``(len(words), hidden)`` tensor.

        The tokenizer may split a word into several sub-tokens (e.g. ``##``
        continuation pieces). Each word is represented by the vector of its
        first sub-token so that the result stays aligned with the word labels.
        """
        pieces = []
        first_piece_idx = []
        for word in words:
            # Fall back to the unknown token if the tokenizer yields nothing.
            word_pieces = tokenizer.tokenize(word) or ['[UNK]']
            first_piece_idx.append(len(pieces))
            pieces.extend(word_pieces)

        input_ids = torch.tensor(tokenizer.convert_tokens_to_ids(pieces)).unsqueeze(0)
        with torch.no_grad():
            # Output element 0 holds the per-layer hidden states; take the last
            # layer and drop the batch axis -> (n_pieces, hidden).
            last_layer = bert_model(input_ids)[0][-1].squeeze(0)

        return last_layer[torch.tensor(first_piece_idx, dtype=torch.long)]

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.embeddings)

    def __getitem__(self, index):
        """Return the embedded sentence and its label ids at ``index``."""
        return {
            'data': self.embeddings[index],         # tensor (n_words, hidden)
            'target': self.encoded_targets[index],  # list of n_words label ids
        }

In [None]:
def collate_fn(batch):
    """Merge a list of dataset items into one zero-padded batch.

    Each item is a dict with a 'data' tensor and a 'target' sequence of ints.
    Both are padded along the first (sequence) axis to the longest item in
    the batch, with the batch axis first.
    """
    features = [sample['data'] for sample in batch]
    labels = [torch.as_tensor(sample['target']) for sample in batch]

    return {
        'data': pad_sequence(features, batch_first=True, padding_value=0.),
        'target': pad_sequence(labels, batch_first=True, padding_value=0),
    }