In [1]:
!pip install matplotlib pandas razdel pytorch_pretrained_bert transformers
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from razdel import tokenize
from tqdm.auto import tqdm
from torch.utils.data import Dataset, DataLoader
import torch
from pytorch_pretrained_bert import BertModel, BertTokenizer, BertForMaskedLM, BertConfig
from transformers import BertTokenizer
import joblib

[33mYou are using pip version 10.0.1, however version 19.3.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
def tokenize_text(text):
    """Токенизация"""
    words = [_.text for _ in list(tokenize(text))]
    return words


device = ('cuda') if torch.cuda.is_available() else torch.device('cpu')


class QADataset(Dataset):

    def __init__(self, tokenizer: BertTokenizer,
                 paragraph_tokens: list,
                 question_tokens: list,
                 answer_spans: list,
                 word2index: dict,
                 verbose=True,
                 max_seq_len=512,
                pad_token="[PAD]"):

        super().__init__()
        self.tokenizer = tokenizer
        self.word2index = word2index
        self.word2bert_tokens = {word: self.tokenizer.tokenize(word) for word
                                 in tqdm(list(self.word2index.keys())[1:])}
        self.word2bert_indices = {
            word: [self.tokenizer.vocab[bert_token] for bert_token in
                   self.word2bert_tokens[word]] for word in
            self.word2bert_tokens.keys()}

        self.sequence_length = max_seq_len
        self.pad_index = self.word2index[pad_token]

        self.x_data = []
        self.y_data = []
        self.bert_spans = []
        self.load_x_y(paragraph_tokens, question_tokens, answer_spans)

    def load_x_y(self, paragraphs, questions, spans, verbose=True):
        for par, quest, span in tqdm(zip(paragraphs, questions, spans),
                                     desc="Loading data", disable=not verbose):
            tokens = ["[CLS]"] + par + ["[SEP]"] + quest + ["[SEP]"]
            start, end = span.split(",")
            start, end = int(start), int(end)
            bert_tokens = [self.word2bert_indices[word] for word in tokens]
            bert_span_start = sum(len(x) for x in bert_tokens[:start + 1])
            bert_span_end = sum(len(x) for x in bert_tokens[:end + 1]) # прибавляем 1, т.к. у нас в начале есть еще токен CLS
            
            bert_tokens = sum(bert_tokens, [])
            if len(bert_tokens) > 512:
                
                par_tokens = [self.word2bert_indices[word] for word in ["[CLS]"] + par]
                quest_tokens = [self.word2bert_indices[word] for word in ["[SEP]"] + quest + ["[SEP]"]]
                if bert_span_start <= len(par_tokens)/2: # если спан в первой половине параграфа
                    slice_ = len(bert_tokens) - 512
                    bert_tokens = sum(par_tokens[:-slice_] + quest_tokens, [])
                elif bert_span_start > len(par_tokens)/2:
                    slice_ = len(bert_tokens) - 512
                    bert_span_start = bert_span_start - slice_ 
                    bert_span_end = bert_span_end - slice_
                    bert_tokens = sum(par_tokens[slice_:] + quest_tokens, [])
                elif bert_span_start > 1024:
                    factor = floor(len(bert_tokens)/512)
                    slice_ = len(bert_tokens) - 512*factor
                    bert_tokens = sum(par_tokens[slice_:] + que_tokens, [])
                    bert_span_start = bert_span_start - slice_
                    bert_span_end = bert_span_end - slice_
      
                
            bert_span = (abs(bert_span_start), abs(bert_span_end-1))

            
            target_1 = [0] * self.sequence_length
            if bert_span_start < self.sequence_length:
                target_1[bert_span_start] = 1

            target_2 = [0] * self.sequence_length
            if bert_span_end < self.sequence_length:
                target_2[bert_span_end-1] = 1
            
            target = [target_1, target_2]
            
            self.x_data.append(bert_tokens)
            self.y_data.append(target)
            self.bert_spans.append(bert_span)

    def padding(self, sequence):
        if len(sequence) > self.sequence_length:
            sequence = sequence[: self.sequence_length]
        elif len(sequence) < self.sequence_length:
            sequence += [self.pad_index for i in
                         range(self.sequence_length - len(sequence))]
        return sequence

    def __len__(self):
        return len(self.x_data)

    def __getitem__(self, idx):

        x = self.x_data[idx]
        x = self.padding(x)
        x = torch.Tensor(x).long()
        y = self.y_data[idx]
        y = torch.Tensor(y).long()
        bert_span = self.bert_spans[idx]
        bert_span = torch.Tensor(bert_span).long()

        return x, y, bert_span

In [3]:
data = pd.read_csv("sdsj2017_sberquad_with_spans(minus_30_examples).csv")
data['span_len'] = data.apply(lambda row: int(row.word_answer_span.split(",")[1]) - int(row.word_answer_span.split(",")[0]), axis = 1) 
data['span_avg'] = data.apply(lambda row: (int(row.word_answer_span.split(",")[1])+int(row.word_answer_span.split(",")[0]))/2, axis = 1)
# data["span_start"] = data.apply(lambda row: int(row.word_answer_span.split(",")[0]), axis = 1)
# data["span_end"] = data.apply(lambda row: int(row.word_answer_span.split(",")[1]), axis = 1)
# data = data[(data.span_len <= 10) & (data.span_avg <= 150) & (data.span_end <= 150)]
data = data[(data.span_len <= 10) & (data.span_avg <= 150)]
# data = data.sample(frac=0.11, random_state=42) 

tokenizer = BertTokenizer.from_pretrained("lm", do_lower_case=False)

In [4]:
# 47681
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47681 entries, 0 to 50333
Data columns (total 10 columns):
paragraph_id        47681 non-null int64
question_id         47681 non-null int64
paragraph           47681 non-null object
question            47681 non-null object
answer              47681 non-null object
char_answer_span    47681 non-null object
paragraph_tokens    47681 non-null object
word_answer_span    47681 non-null object
span_len            47681 non-null int64
span_avg            47681 non-null float64
dtypes: float64(1), int64(3), object(6)
memory usage: 4.0+ MB


In [5]:
train_on_gpu = torch.cuda.is_available()
if not train_on_gpu:
    print('Bummer!  Training on CPU ...')
else:
    print('You are good to go!  Training on GPU ...')

You are good to go!  Training on GPU ...


In [6]:
from sklearn.model_selection import train_test_split


train, temp = train_test_split(data, test_size=0.2, random_state=42)
dev, test = train_test_split(temp, test_size=0.5, random_state=42)
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)
dev = dev.reset_index(drop=True)

In [7]:
par_tokens_train = [i.split() for i in train.paragraph_tokens]
que_tokens_train = [tokenize_text(i) for i in train.question]
answer_spans_train = train.word_answer_span

par_tokens_test = [i.split() for i in test.paragraph_tokens]
que_tokens_test = [tokenize_text(i) for i in test.question]
answer_spans_test = test.word_answer_span

par_tokens_dev = [i.split() for i in dev.paragraph_tokens]
que_tokens_dev = [tokenize_text(i) for i in dev.question]
answer_spans_dev = dev.word_answer_span

In [8]:
word2index = {"[PAD]":0, "[CLS]":1, "[SEP]":2}

par_tokens = [i.split() for i in data.paragraph_tokens]
que_tokens = [tokenize_text(i) for i in data.question]
answer_spans = data.word_answer_span


for sent in par_tokens:
    for token in sent:
        if token not in word2index:
            word2index[token] = len(word2index)

for que in que_tokens:
    for token in que:
        if token not in word2index:
            word2index[token] = len(word2index)

In [9]:
train_data = QADataset(tokenizer=tokenizer,
                           paragraph_tokens=par_tokens_train,
                           question_tokens=que_tokens_train,
                           answer_spans=answer_spans_train,
                           word2index=word2index)

test_data = QADataset(tokenizer=tokenizer,
                          paragraph_tokens=par_tokens_test,
                          question_tokens=que_tokens_test,
                          answer_spans=answer_spans_test,
                          word2index=word2index)
    
dev_data = QADataset(tokenizer=tokenizer,
                   paragraph_tokens=par_tokens_dev,
                   question_tokens=que_tokens_dev,
                   answer_spans=answer_spans_dev,
                   word2index=word2index)

train_loader = DataLoader(train_data, batch_size=32, drop_last=True)
test_loader = DataLoader(test_data, batch_size=32, drop_last=True)
dev_loader = DataLoader(dev_data, batch_size=32, drop_last=True)

HBox(children=(FloatProgress(value=0.0, max=154578.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Loading data', max=1.0, style=ProgressS…




HBox(children=(FloatProgress(value=0.0, max=154578.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Loading data', max=1.0, style=ProgressS…




HBox(children=(FloatProgress(value=0.0, max=154578.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Loading data', max=1.0, style=ProgressS…




In [11]:
class Classifier(torch.nn.Module):
    
    def __init__(self, 
               hidden_size=768,
               linear_out=2,
               batch_first=True):
  
        super(Classifier, self).__init__()
            
        self.output_model_file = "lm/pytorch_model.bin"
        self.output_config_file = "lm/config.json"
        self.tokenizer = BertTokenizer.from_pretrained("lm/", do_lower_case=False)
        self.config = BertConfig.from_json_file(self.output_config_file)
        self.model = BertForMaskedLM(self.config)
        device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        self.state_dict = torch.load(self.output_model_file, map_location=device)
        self.model.load_state_dict(self.state_dict)
        self.layernorm = torch.nn.LayerNorm(hidden_size)
        self.drop = torch.nn.Dropout(p=0.5)
        self.lstm = torch.nn.LSTM(hidden_size, 300, bidirectional=True)
        self.linear = torch.nn.Linear(600, linear_out)  
        self.relu = torch.nn.ReLU()
    
    def get_embeddings(self, x_instance):
        indexed_tokens = x_instance.tolist()
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_ids = [1] * len(indexed_tokens)
        segments_tensors = torch.tensor([segments_ids])
        self.model.eval()
        with torch.no_grad():
            encoded_layers, _ = self.model.bert(tokens_tensor.to(device),
                                       segments_tensors.to(device))
        token_embeddings = torch.stack(encoded_layers, dim=0)
        token_embeddings = torch.squeeze(token_embeddings, dim=1)
        token_embeddings = token_embeddings.permute(1, 0, 2)
        token_vecs_cat = []
        for token in token_embeddings:
            # cat_vec = torch.cat((token[-1], token[-2], token[-3], token[-4]), dim=0)
            # token_vecs_cat.append(cat_vec)
            cat_vec = torch.stack((token[-1], token[-2], token[-3], token[-4]))
            mean_vec = torch.mean(cat_vec, 0)
            token_vecs_cat.append(mean_vec)
        token_vecs_cat = torch.stack(token_vecs_cat, dim=0)
        return token_vecs_cat

    
    def embed_data(self, x): 
        entries = [] 
        for entry in x:
            emb = self.get_embeddings(entry.to(device)).to(device)
            entries.append(emb)
        return torch.stack(entries)
    
        
    def forward(self, x):

        h = self.embed_data(x)
        h = self.layernorm(h)
        h = self.drop(h)
        h = h.permute(1, 0, 2)
        output, _ = self.lstm(h)
        output = self.drop(output)
        output = self.relu(output)
        pred = self.linear(output)
        pred = pred.permute(1, 0, 2)
        
        return pred

In [12]:
def train_model(model, epochs, train_loader, dev_loader, optimizer, criterion):
    
    train_losses = []
    val_losses = []
    
    mean_val_loss = 0
    best_val_loss = 7.0
    
    for n_epoch in range(epochs):
        
        all_f_scores = []
        train_f_scores = []
        
        progress_bar = tqdm(total=len(train_loader.dataset),
                            desc='Epoch {}'.format(n_epoch + 1))

        for x, y, bert_span in train_loader:
            optimizer.zero_grad()
            pred = model.forward(x.to(device))
            
            start_logits, end_logits = pred.permute(0,2,1).split(1, dim=1)
            start_logits = start_logits.squeeze(1)
            end_logits = end_logits.squeeze(1)
            
            start_positions, end_positions = bert_span.split(1, dim=-1)
            start_positions = start_positions.squeeze(-1)
            end_positions = end_positions.squeeze(-1)
            
            start_loss = criterion(start_logits.to(device), start_positions.to(device))
            end_loss = criterion(end_logits.to(device), end_positions.to(device))
            loss = (start_loss + end_loss) / 2            
            
            loss.backward()
            optimizer.step()
            # scheduler.step()
            
            train_losses.append(loss.item())
            progress_bar.set_postfix(loss=np.mean(train_losses[-500:]))
            progress_bar.update(x.shape[0])
            
            
            # evaluate
            pred_spans = []
            real_tokens = []
                
            start = torch.argmax(start_logits, dim=1).cpu().tolist()
            end = torch.argmax(end_logits, dim=1).cpu().tolist()
                                
            for i,(s,e) in enumerate(zip(start, end)):
                pred_spans.append(x[i].tolist()[s:e+1])
                
            for i,span in enumerate(bert_span):
                span = span.tolist()
                real = x[i].tolist()[int(span[0]):int(span[1])+1]
                real_tokens.append(real)
                                
            f_scores = []
            for r,p in zip(real_tokens, pred_spans):
                f_score = compute_f1(p, r)
                f_scores.append(f_score)
                    
            mean_f_scores = np.mean(f_scores)
            train_f_scores.append(mean_f_scores)
                    
        print("Evaluating: train F1-score = {:.3f} ".format(np.mean(train_f_scores)))
            
            
        
        progress_bar.close()
        
        # model.eval()    
        
        with torch.no_grad():
            progress_bar = tqdm(total=len(dev_loader.dataset), desc='Validation:')
            
            for x, y, bert_span in dev_loader:
                pred = model.forward(x.to(device))
                
                start_logits, end_logits = pred.permute(0,2,1).split(1, dim=1)
                start_logits = start_logits.squeeze(1)
                end_logits = end_logits.squeeze(1)
            
                start_positions, end_positions = bert_span.split(1, dim=-1)
                start_positions = start_positions.squeeze(-1)
                end_positions = end_positions.squeeze(-1)
            
                start_loss = criterion(start_logits.to(device), start_positions.to(device))
                end_loss = criterion(end_logits.to(device), end_positions.to(device))
                loss = (start_loss + end_loss) / 2            
                
                val_losses.append(loss.item()) 
                mean_val_loss = np.mean(val_losses[-500:])
                progress_bar.set_postfix(loss=mean_val_loss)
                progress_bar.update(x.shape[0])
                
                
                # evaluate
                pred_spans = []
                real_tokens = []
                
                start = torch.argmax(start_logits, dim=1).cpu().tolist()
                end = torch.argmax(end_logits, dim=1).cpu().tolist()
                                
                for i,(s,e) in enumerate(zip(start, end)):
                    pred_spans.append(x[i].tolist()[s:e+1])
                
                for i,span in enumerate(bert_span):
                    span = span.tolist()
                    real = x[i].tolist()[int(span[0]):int(span[1])+1]
                    real_tokens.append(real)
                                
                f_scores = []
                for r,p in zip(real_tokens, pred_spans):
                    f_score = compute_f1(r, p)
                    f_scores.append(f_score)
                    
                mean_f_scores = np.mean(f_scores)
                all_f_scores.append(mean_f_scores)
                    
            print("Evaluating: dev F1-score = {:.3f} ".format(np.mean(all_f_scores)))
                
                
            progress_bar.close()

        torch.save(model, "classifier.pkl")
        joblib.dump(train_losses, "train_losses.pkl")
        joblib.dump(val_losses, "val_losses.pkl")

        if mean_val_loss < best_val_loss:
            best_val_loss = mean_val_loss
        else:
            print('Early stopping')
            torch.save(model, "classifier-final.pkl")
            joblib.dump(train_losses, "train_losses-final.pkl")
            joblib.dump(val_losses, "val_losses-final.pkl")
            break
            
    return train_losses, val_losses

In [13]:
from sklearn.metrics import accuracy_score, f1_score
from collections import Counter
from tqdm.auto import tqdm
import torch
import numpy as np



def compute_f1(gold_toks, pred_toks):
    common = Counter(gold_toks) & Counter(pred_toks)
    num_same = sum(common.values())
    if len(gold_toks) == 0 or len(pred_toks) == 0:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return int(gold_toks == pred_toks)
    if num_same == 0:
         return 0
    precision = 1.0 * num_same / len(pred_toks)
    recall = 1.0 * num_same / len(gold_toks)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

In [16]:
epochs = 15

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = Classifier().to(device)

criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), 
                             lr=8e-6)

In [None]:
print("Training the model...")
train_losses, val_losses = train_model(model=model, epochs=epochs, optimizer=optimizer,
                         criterion=criterion, train_loader=train_loader, dev_loader=dev_loader)

Training the model...


HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=38144.0, style=ProgressStyle(description_wi…

In [None]:
# 1 epoch - loss = 5.34


In [14]:
model = torch.load("classifier-4.pkl")
model.eval()

In [41]:
# import joblib
# losses = joblib.load('val_losses.pkl')



In [19]:
from sklearn.metrics import accuracy_score, f1_score
from collections import Counter
from tqdm.auto import tqdm
import torch
import numpy as np



def compute_f1(gold_toks, pred_toks):
    common = Counter(gold_toks) & Counter(pred_toks)
    num_same = sum(common.values())
    if len(gold_toks) == 0 or len(pred_toks) == 0:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return int(gold_toks == pred_toks)
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(pred_toks)
    recall = 1.0 * num_same / len(gold_toks)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1



def test_model(model, test_loader, device):
    
    count = 0

    all_real_spans = []
    all_pred_spans = []
    
    model.eval()
    
    for x, y, bert_span in tqdm(test_loader):
        
        print(bert_span[0])
            
        with torch.no_grad():
            
            pred = model.forward(x.to(device))
                
            start_logits, end_logits = pred.permute(0,2,1).split(1, dim=1)
            start_logits = start_logits.squeeze(1)
            end_logits = end_logits.squeeze(1)
            
            start_positions, end_positions = bert_span.split(1, dim=-1)
            start_positions = start_positions.squeeze(-1)
            end_positions = end_positions.squeeze(-1)
            
            start_loss = criterion(start_logits.to(device), start_positions.to(device))
            end_loss = criterion(end_logits.to(device), end_positions.to(device))
            loss = (start_loss + end_loss) / 2            
                    
            # save predictions
                
            start = torch.argmax(start_logits, dim=1).cpu().tolist()
            end = torch.argmax(end_logits, dim=1).cpu().tolist()
                                
            for i,(s,e) in enumerate(zip(start, end)):
                all_pred_spans.append(x[i].tolist()[s:e+1])
                
            for i,span in enumerate(bert_span):
                span = span.tolist()
                real = x[i].tolist()[int(span[0]):int(span[1])+1]
                all_real_spans.append(real)
            
            count += 1
            if count == 10:
                break
                                                                
    return all_real_spans, all_pred_spans

In [34]:
ll_real_spans, all_pred_spans = test_model(model, test_loader, device)

HBox(children=(FloatProgress(value=0.0, max=149.0), HTML(value='')))

tensor([204, 211])
tensor([73, 89])
tensor([188, 192])
tensor([ 99, 104])
tensor([189, 194])
tensor([202, 204])
tensor([111, 117])
tensor([211, 217])
tensor([ 98, 109])
tensor([124, 133])


In [35]:
answers = []
for i,(x,y) in enumerate(zip(ll_real_spans, all_pred_spans)):
    answers.append([tokenizer.decode(ll_real_spans[i]), tokenizer.decode(all_pred_spans[i])])

In [36]:
for i in answers:
    print(i)

['В опытах Кольхерстера', 'в начале XX века']
['К IV веку', 'древний греческий город Византий, расположенный на европейском побережье Босфора']
['В болотистых лесах', '']
['через аксиомы теории множеств', 'счётным или несчётным']
['аутофагосомами', 'аутофагосомами']
['импровизации', '']
['Пэт Куинн', 'Пол Морис']
['в зарубежной деловой практике', '']
['экономических методов управления', '']
['В октябре 1901 года', '']
['Армянскую грамматику', 'Армянскую грамматику, который, однако, ещё свободна от сильной латинизации и рационализации грамматических правил, характерных последующим авторам. В 1637 году Симеон Джугаеци ( кон. XVI века [UNK] 1657 год. ) пишет труд Книга называемая грамматикой, в котором подробно анализировал все аспекты армянского языка [UNK] грамматику, фонетику и т. д. Следующим исследователем стал Клемент Галанус, издавший в 1645 году Грамматику и логику, так же посвященную армянскому языку. Последний пользовался трудами предшествующих армянских грамматиков. Воскан Ерев

In [48]:
criterion = torch.nn.CrossEntropyLoss(ignore_index=-1).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5,
                                 weight_decay=0.01)

In [49]:
(all_f1_scores, 
 all_test_answers, 
 all_real_answers, 
 all_test_answer_spans, 
 all_real_answer_spans, 
 mean_test_losses) = test_model(model=model, 
                                test_loader=train_loader, 
                                criterion=criterion,
                               device=device)

HBox(children=(FloatProgress(value=0.0, max=1192.0), HTML(value='')))

tensor([1, 9])
Testing: test loss = 0.046 
[15, 205]
Evaluating: test accuracy = 0.000 
Evaluating: test F1-score = 0.062 
tensor([165, 168])
Testing: test loss = 0.059 
[294, 222]
Evaluating: test accuracy = 0.000 
Evaluating: test F1-score = 0.038 
tensor([65, 72])
Testing: test loss = 0.118 
[118, 202]
Evaluating: test accuracy = 0.000 
Evaluating: test F1-score = 0.075 
