<a href="https://colab.research.google.com/github/annnyway/QA-for-Russian/blob/master/Baseline_Lorenzo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip3 install razdel pytorch_pretrained_bert regex
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from razdel import tokenize
from tqdm.auto import tqdm
from torch.utils.data import Dataset, DataLoader
import torch
from pytorch_pretrained_bert import BertModel, BertTokenizer, BertForMaskedLM, BertConfig
from google.colab import drive
drive.mount('/content/drive')

Collecting razdel
  Downloading https://files.pythonhosted.org/packages/cf/f0/664eb27854d7de7c3605b5cd2a155cf069143fb00902ac479325bf1a98b7/razdel-0.4.0-py2.py3-none-any.whl
Collecting pytorch_pretrained_bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K     |████████████████████████████████| 133kB 3.0MB/s 
Installing collected packages: razdel, pytorch-pretrained-bert
Successfully installed pytorch-pretrained-bert-0.6.2 razdel-0.4.0
Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww

In [0]:
def tokenize_text(text):
    """Токенизация"""
    words = [_.text for _ in list(tokenize(text))]
    return words


class QADataset(Dataset):

    def __init__(self, tokenizer: BertTokenizer,
                 paragraph_tokens: list,
                 question_tokens: list,
                 answer_spans: list,
                 word2index: dict,
                 verbose=True,
                 max_seq_len=512,
                pad_token="[PAD]"):

        super().__init__()
        self.tokenizer = tokenizer
        self.word2index = word2index
        self.word2bert_tokens = {word: self.tokenizer.tokenize(word) for word
                                 in tqdm(list(self.word2index.keys())[1:])}
        self.word2bert_indices = {
            word: [self.tokenizer.vocab[bert_token] for bert_token in
                   self.word2bert_tokens[word]] for word in
            self.word2bert_tokens.keys()}

        self.sequence_length = max_seq_len
        self.pad_index = self.word2index[pad_token]

        self.x_data = []
        self.y_data = []
        self.load_x_y(paragraph_tokens, question_tokens, answer_spans)

    def load_x_y(self, paragraphs, questions, spans, verbose=True):
        for par, quest, span in tqdm(zip(paragraphs, questions, spans),
                                     desc="Loading data", disable=not verbose):
            tokens = ["[CLS]"] + par + ["[SEP]"] + quest + ["[SEP]"]
            start, end = span.split(",")
            start, end = int(start), int(end)
            bert_tokens = [self.word2bert_indices[word] for word in tokens]
            bert_span_start = sum(len(x) for x in bert_tokens[:start + 1])
            bert_span_end = sum(len(x) for x in bert_tokens[:end + 1]) # прибавляем 1, т.к. у нас в начале есть еще токен CLS
            span = (bert_span_start, bert_span_end) 
            
            bert_tokens = sum(bert_tokens, [])
            par_tokens = [self.word2bert_indices[word] for word in ["[CLS]"] + par]
            que_tokens = [self.word2bert_indices[word] for word in ["[SEP]"] + quest + ["[SEP]"]]
            if len(bert_tokens) + 3 > 512:
                if bert_span_start <= len(par_tokens)/2: # если спан в первой половине параграфа
                    slice_ = len(bert_tokens) - 512
                    bert_tokens = sum(par_tokens[:-slice_] + que_tokens, [])
                elif bert_span_start > len(par_tokens)/2:
                    slice_ = len(bert_tokens) - 512
                    bert_tokens = sum(par_tokens[slice_:] + que_tokens, [])
                    bert_span_start = bert_span_start - slice_
                    bert_span_end = bert_span_end - slice_
                    span = (bert_span_start, bert_span_end)
                elif bert_span_start > 1024:
                    factor = floor(len(bert_tokens)/512)
                    slice_ = len(bert_tokens) - 512*factor
                    bert_tokens = sum(par_tokens[slice_:] + que_tokens, [])
                    bert_span_start = bert_span_start - slice_
                    bert_span_end = bert_span_end - slice_
      
            self.x_data.append(bert_tokens)
            self.y_data.append(span)

    def padding(self, sequence):
        if len(sequence) > self.sequence_length:
            sequence = sequence[: self.sequence_length]
        elif len(sequence) < self.sequence_length:
            sequence += [self.pad_index for i in
                         range(self.sequence_length - len(sequence))]
        return sequence

    def create_y_matrix(self, span):
        startspan = span[0]
        endspan = span[1]
        ansten = torch.zeros(512).long()
        for i in range(startspan, endspan):
            if i <= 512:
                ansten[i] = 1
            else:
                continue
        return ansten  


    def __len__(self):
        return len(self.x_data)

    def __getitem__(self, idx):

        x = self.x_data[idx]
        x = self.padding(x)
        x = torch.Tensor(x).long()
        y = self.y_data[idx]
        return x, y


In [0]:
data = pd.read_csv("/content/drive/My Drive/COLABDRIVE/sdsj2017_sberquad_with_spans(minus_30_examples).csv")
data['span_len'] = data.apply(lambda row: int(row.word_answer_span.split(",")[1]) - int(row.word_answer_span.split(",")[0]), axis = 1) 
data['span_avg'] = data.apply(lambda row: (int(row.word_answer_span.split(",")[1])+int(row.word_answer_span.split(",")[0]))/2, axis = 1)
data = data[(data.span_len <= 10) & (data.span_avg <= 150)]
data = data.sample(frac=0.11, random_state=42) 

tokenizer = BertTokenizer.from_pretrained("/content/drive/My Drive/COLABDRIVE/", do_lower_case=False)

In [69]:
train_on_gpu = torch.cuda.is_available()
if not train_on_gpu:
    print('Bummer!  Training on CPU ...')
else:
    print('You are good to go!  Training on GPU ...')

You are good to go!  Training on GPU ...


In [70]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(data, test_size=0.2, random_state=42)
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

par_tokens_train = [i.split() for i in train.paragraph_tokens]
que_tokens_train = [tokenize_text(i) for i in train.question]
answer_spans_train = train.word_answer_span

par_tokens_test = [i.split() for i in test.paragraph_tokens]
que_tokens_test = [tokenize_text(i) for i in test.question]
answer_spans_test = test.word_answer_span

word2index = {"[PAD]":0, "[CLS]":1, "[SEP]":2}

for sent in par_tokens_train:
    for token in sent:
        if token not in word2index:
            word2index[token] = len(word2index)

for sent in par_tokens_test:
    for token in sent:
        if token not in word2index:
            word2index[token] = len(word2index)
            
for que in que_tokens_train:
    for token in que:
        if token not in word2index:
            word2index[token] = len(word2index)

for que in que_tokens_test:
    for token in que:
        if token not in word2index:
            word2index[token] = len(word2index)

train_data = QADataset(tokenizer=tokenizer,
                   paragraph_tokens=par_tokens_train,
                   question_tokens=que_tokens_train,
                   answer_spans=answer_spans_train,
                   word2index=word2index)

test_data = QADataset(tokenizer=tokenizer,
                   paragraph_tokens=par_tokens_test,
                   question_tokens=que_tokens_test,
                   answer_spans=answer_spans_test,
                   word2index=word2index)

HBox(children=(IntProgress(value=0, max=89660), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', description='Loading data', max=1, style=ProgressStyle(d…

HBox(children=(IntProgress(value=0, max=89660), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', description='Loading data', max=1, style=ProgressStyle(d…

In [0]:
torch.set_default_tensor_type(torch.cuda.FloatTensor)


class Classifier(torch.nn.Module):
    
#from Softmax we get the probability of word being in the answer, 
#we need as Y for loss a matrix (word positionally encoded) of 0 and 1 with 1 being words in span.
    def __init__(self, 
               hidden_size=3072,  
               linear_out=24,
               batch_first=True):
  
        super(Classifier, self).__init__()
            
        self.output_model_file = "/content/drive/My Drive/COLABDRIVE/pytorch_model.bin"
        self.output_config_file = "/content/drive/My Drive/COLABDRIVE/config.json"
        self.tokenizer = BertTokenizer.from_pretrained("/content/drive/My Drive/COLABDRIVE/", do_lower_case=False)
        self.config = BertConfig.from_json_file(self.output_config_file)
        self.model = BertForMaskedLM(self.config)
        device = ('cuda') if torch.cuda.is_available() else torch.device('cpu')
        self.state_dict = torch.load(self.output_model_file, map_location=device)
        self.model.load_state_dict(self.state_dict)

        if torch.cuda.is_available():       
            self.simplelayer = torch.nn.Linear(in_features = hidden_size, out_features = linear_out).cuda()
            self.dropout = torch.nn.Dropout(p=0.3).cuda()
            self.relu = torch.nn.LeakyReLU().cuda()
            self.simplelayer2 = torch.nn.Linear(in_features = linear_out, out_features=1).cuda()
        else:
            self.simplelayer = torch.nn.Linear(in_features = hidden_size, out_features = linear_out)
            self.dropout = torch.nn.Dropout(p=0.3)
            self.relu = torch.nn.LeakyReLU()
            self.simplelayer2 = torch.nn.Linear(in_features = linear_out, out_features=1)
        
    
    def get_embeddings(self, x_instance):   
        indexed_tokens = x_instance.tolist()
        break_sentence = indexed_tokens.index(102)
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_ids = [0] * (break_sentence+1)
        segments_ids += [1] * (len(indexed_tokens) - break_sentence - 1) 
        segments_tensors = torch.tensor([segments_ids])
        self.model.eval()
        with torch.no_grad():
            encoded_layers, _ = self.model.bert(tokens_tensor,
                                       segments_tensors)
        token_embeddings = torch.stack(encoded_layers, dim=0)
        token_embeddings = torch.squeeze(token_embeddings, dim=1)
        token_embeddings = token_embeddings.permute(1, 0, 2)
        token_vecs_cat = []
        for token in token_embeddings:
            cat_vec = torch.cat((token[-1], token[-2], token[-3], token[-4]),
                                dim=0)
            token_vecs_cat.append(cat_vec)
        token_vecs_cat = torch.stack(token_vecs_cat, dim=0)
        return token_vecs_cat

    
    def embed_data(self, x): 
        entries = [] 
        #data_iterator = tqdm(x, desc='Loading embeddings')    
        for entry in x:
            emb = self.get_embeddings(entry)
            entries.append(emb)
        return torch.stack(entries)

  
    def forward(self, x):    #super(CNN, self).__init__()
        
        h = self.embed_data(x)
        h = self.dropout(h)
        h = self.simplelayer(h)
        h = self.relu(h)
        pred = self.simplelayer2(h)
        
        return pred

In [0]:
train_loader = DataLoader(train_data, batch_size=32, drop_last=True)
test_loader = DataLoader(test_data, batch_size=32, drop_last=True)

epochs = 3

device = ('cuda') if torch.cuda.is_available() else torch.device('cpu')

model1 = Classifier()
model2 = Classifier()
criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model1.parameters(), lr=0.001)

iteration_losses = []
epoch_train_losses = []
epoch_test_losses = []
accuracies = []
f_scores = []

#state = torch.load("/content/drive/My Drive/COLABDRIVE/sberqa_model.pt")
#model.load_state_dict(state['state_dict'])
#optimizer.load_state_dict(state['optimizer'])

In [0]:
for n_epoch in range(epochs):

    try:
            
        train_losses = []
        test_losses = []
        test_targets = []
        test_preds = []
    
        progress_bar = tqdm(total=len(train_loader.dataset), desc='Epoch {}'.format(n_epoch + 1))

        for x, y in train_loader:
    
            optimizer.zero_grad()
            pred = model1.forward(x.to(device))
            loss = criterion(pred.to(device), y[0].unsqueeze(1).long().to(device))
            loss.backward()
            optimizer.step()
            metric = np.mean(iteration_losses[-500:])
            iteration_losses.append(loss.item())
            train_losses.append(loss.item())
            progress_bar.set_postfix(loss=metric)
            progress_bar.update(x.shape[0])
        
        progress_bar.close()
        epoch_train_losses.append(np.mean(train_losses))

    except KeyboardInterrupt:

        progress_bar.close()
    
        break
    
    state = {
    'epoch': n_epoch,
    'state_dict': model1.state_dict,
    'optimizer': optimizer.state_dict,
    'loss': epoch_train_losses
    }
    torch.save(state, "/content/drive/My Drive/COLABDRIVE/sberqa_model.pt")

plt.title('Training Process')
plt.xlabel('Iterations')
plt.ylabel('Loss')
plt.grid()
plt.plot(iteration_losses)

HBox(children=(IntProgress(value=0, description='Epoch 1', max=4196, style=ProgressStyle(description_width='in…

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [127]:
spanlist = []
predictedlist = []
test_loss = []
with torch.no_grad():
      progress_bar = tqdm(total=len(test_loader.dataset), desc='Test')

      for datatest in test_loader:
          texts, span = datatest
          pred = model1.forward(x.to(device))
          test_loss.append(float(torch.nn.functional.cross_entropy(pred.to(device), span[0].unsqueeze(1).long().to(device)).cpu().numpy())) # sum up batch loss
          spanlist.append(span[0].unsqueeze(1).long().unsqueeze().tolist())
          predictedlist.append(pred.unsqueeze().tolist())
          progress_bar.set_postfix(loss=np.mean(test_loss))
          progress_bar.update(texts.shape[0])
      progress_bar.close()

HBox(children=(IntProgress(value=0, description='Test', max=1049, style=ProgressStyle(description_width='initi…

In [0]:
#torch.save(model, "/content/drive/My Drive/COLABDRIVE/sber_model2.pt")

In [129]:
#pred.permute(0,2,1).shape, y[0].unsqueeze(1).shape
for i in zip(spanlist[3], predictedlist[3]):
   print(i)


(tensor([ 62,  60,  51, 219,  60,  73,  29,  71,  29, 138, 203,  54,  44,  13,
         78, 158,  74, 154, 166,  60,  53, 104,  52,  41, 250, 112,  86,  15,
         44,  18, 133,  66]), [[-2.1560466289520264], [-0.4421525001525879], [0.33259937167167664], [-1.7611263990402222], [0.7883318066596985], [-6.427777290344238], [-7.024852752685547], [-0.9606631398200989], [-5.970176696777344], [-7.25921106338501], [-8.20097827911377], [-2.3909122943878174], [3.855077028274536], [-1.8192896842956543], [-0.932801365852356], [3.165069341659546], [-2.3229541778564453], [2.7586779594421387], [1.6572129726409912], [1.7737575769424438], [-5.951364994049072], [2.1464734077453613], [-0.450021892786026], [-6.6325531005859375], [-2.6952524185180664], [-6.612743377685547], [1.541932463645935], [-4.681996822357178], [-1.355236530303955], [2.190624713897705], [4.452929496765137], [3.440922260284424], [-2.3457276821136475], [-3.8599183559417725], [-4.2422943115234375], [-2.5083250999450684], [-4.5356607437

In [50]:
aaa = [2, 3, 4, 6]
aaa.index(3)

1