In [None]:
!pip install transformers

In [1]:
import pandas as pd
import string
import re

from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch import nn
import torch.nn.functional as F
import torch

import nltk
from nltk.tokenize import word_tokenize
import random
import numpy as np
from tqdm import tqdm_notebook, tqdm

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import metrics
import transformers

import os
import pickle
from IPython.display import clear_output
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
#os.chdir('/content/drive/My Drive/Huawei_for_job')

In [3]:
punctuation = set('!,.-?:;')
nothing = ''
labels = [nothing] + list(punctuation)

le = preprocessing.LabelEncoder()
le.fit(labels)

LabelEncoder()

### main part

In [4]:
tokenizer = transformers.BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')

In [5]:
def data_encoding(data_path):
  with open(data_path, 'r', encoding='utf-8') as f:
    data = f.readlines()

  X = []
  Y = []

  for line in tqdm_notebook(data):
    word, punc = line.split('\t')
    if word in string.punctuation:
      continue
    punc = punc.strip()
    tokens = tokenizer.tokenize(word)
    x = tokenizer.convert_tokens_to_ids(tokens)
    y = [int(punc)]
    if len(x) > 0:
      if len(x) > 1:
        y = (len(x)-1)*[0]+y
      X += x
      Y += y
  return X, Y

In [6]:
with open('data_X.pickle', 'rb') as f:
  X = pickle.load(f)

with open('data_y.pickle', 'rb') as f:
  y = pickle.load(f)

In [7]:
SEGMENT_SIZE = 32

def create_segments(x, segment_size = SEGMENT_SIZE):
    X = []
    #для первого и последнего слова добавляем слова
    x_pad = x[-((segment_size-1)//2-1):]+x+x[:segment_size//2]

    for i in tqdm_notebook(range(len(x_pad)-segment_size+2)):
        segment = x_pad[i:i+segment_size-1]
        X.append(segment)

    return np.array(X)

In [8]:
from utils import check_one, gold_str

# функция для проверки пунктуации
def check_metric(model, device):

  model.eval()

  tokenized_input = list(filter(lambda x: x not in string.punctuation, word_tokenize(gold_str)))
  tokens = tokenizer.convert_tokens_to_ids(tokenized_input)
  segments = torch.tensor(create_segments(tokens), dtype = torch.long).to(device)
  outputs = le.inverse_transform(model(segments).argmax(dim=-1).cpu()).tolist()

  i = 0
  j = 0
  while j<len(outputs):
    tokenized_input.insert(i+1, outputs[j])
    i += 2
    j += 1

  hypothesis = ' '.join(tokenized_input)
  print(hypothesis)
  return check_one(gold_str, hypothesis)

In [None]:
check_metric(model, torch.device('cuda'))

In [9]:
from transformers import BertModel

class BertPunc(nn.Module):  
    
    def __init__(self, dropout, seq_len = SEGMENT_SIZE):
        super(BertPunc, self).__init__()
        self.bert = BertModel.from_pretrained('DeepPavlov/rubert-base-cased')
        self.bert_vocab_size = tokenizer.vocab_size
        self.bn = nn.BatchNorm1d((seq_len-1)*768)
        self.fc1 = nn.Linear((seq_len-1)*768, ((seq_len-1)*768)//2)
        self.fc2 = nn.Linear(((seq_len-1)*768)//2, len(labels))
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids = None):
        x = self.bert(input_ids = input_ids)[0]
        x = x.view(x.shape[0], -1)
        x = self.fc2(self.dropout(F.relu(self.fc1(self.dropout(self.bn(x))))))
        return x

### Sampling

In [None]:
# under sampling
full_indexes = []
y = np.array(y)
n_samples = 20000000
max_len = 80000
for i in tqdm_notebook(range(8)):
  indexes = []
  for idx, j in enumerate(y[:n_samples]):
    if len(indexes) >= max_len:
      continue
    if j == i:
      indexes.append(idx)
  full_indexes += indexes

full_indexes = np.array(full_indexes)
X = create_segments(X[:n_samples])
X = X[full_indexes]
y = y[full_indexes]

In [11]:
X = X[:500000]
y = y[:500000]

In [12]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.1, stratify = y)
X_train = torch.from_numpy(X_train).long()
X_val = torch.from_numpy(X_val).long()
y_train = torch.tensor(y_train, dtype = torch.long)
y_val = torch.tensor(y_val, dtype = torch.long)

In [13]:
BATCH_SIZE = 64

train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

### Training

In [14]:
def train(model, dataloader, optimizer, criterion, clip):
    
    model.train()
    epoch_loss = 0.
    acc = 0.
    with tqdm(desc="batch", total=len(dataloader)) as pbar_outer:
      for i, batch in enumerate(dataloader, 1):

            optimizer.zero_grad()     

            x, labels = batch    
            x = x.to(device)
            labels = labels.to(device)             
            
            outputs = model(x)
            
            loss = criterion(outputs, labels)
            
            loss.backward()
            
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
            
            optimizer.step()
            
            epoch_loss += loss.item()
            acc += metrics.accuracy_score(labels.cpu().data.numpy().flatten(), outputs.argmax(dim=1).cpu().data.numpy().flatten())

            if i%100 == 0:
              pbar_outer.update(100)
              tqdm.write("train_loss:{}".format(loss.item()))

            if i%5000 == 0:
              torch.save(model.state_dict(), 'puncBert_{}.pth'.format(i))
        
    train_loss = round( (epoch_loss / len(dataloader)), 3)
    train_acc = round( (acc / len(dataloader)), 3)
        
    return train_loss, train_acc

In [15]:
def evaluate(model, dataloader, criterion):
    
    model.eval()
    epoch_loss = 0
    acc = 0.
    
    with torch.no_grad():
      with tqdm(desc="batch", total=len(dataloader)) as pbar_outer:
        for i, batch in enumerate(dataloader, 1):

            x, labels = batch    
            x = x.to(device)
            labels = labels.to(device)             
            
            outputs = model(x)
        
            loss = criterion(outputs, labels)
            
            epoch_loss += loss.item()
            acc += metrics.accuracy_score(labels.cpu().data.numpy().flatten(), outputs.argmax(dim=1).cpu().data.numpy().flatten())
            
            if i%100 == 0:
              pbar_outer.update(100)
              tqdm.write("val_loss:{}  val_acc:{}".format(loss.item(), acc/i))
            
        valid_loss = round((epoch_loss / len(dataloader)), 3)
        metric = check_metric(model, device)
        val_acc = round( (acc / len(dataloader)), 3)
  
    return valid_loss, metric, val_acc

In [None]:
import matplotlib.pyplot as plt
import time


device = torch.device('cuda')
#metrics = -100

model = BertPunc(0.3).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr = 1e-4)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, 0.75)

clip = 3
num_epochs = 50
best_loss = 100
history_val = []
history_train = []

for epoch in tqdm_notebook(range(num_epochs)):
  
  train_loss, train_acc = train(model, train_dataloader, optimizer, criterion, clip)
  valid_loss, metric, val_acc = evaluate(model, val_dataloader, criterion)
  scheduler.step()

  history_val.append(valid_loss)
  history_train.append(train_loss)

  clear_output(True)
  plt.plot(history_train, label = 'train_loss')
  plt.plot(history_val, label = 'valid_loss')
  plt.legend()
  plt.show()
  time.sleep(3)


  print('Epoch: {} \n Train Loss {}  Val loss {}  Train acuracy {}  Val acuracy {}  Metric {}'.format(epoch + 1, train_loss, valid_loss, train_acc, val_acc, metric))

  if valid_loss < best_loss:
    best_loss = valid_loss
    torch.save(model.state_dict(), 'puncBert.pth')

In [33]:
X, y = next(iter(val_dataloader))

In [17]:
check_metric(model, device)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=51.0), HTML(value='')))


Начиная  жизнеописание , героя ? моего , Алексея , Федоровича ? Карамазова , нахожусь  в  некотором , недоумении . А  именно : хотя  я , и  называю : Алексея , Федоровича : моим  героем . но , однако , сам  знаю , что : человек . он , отнюдь , не  великий , а  посему , и  предвижу . неизбежные . вопросы , вроде : таковых : чем  же ? замечателен ? ваш ? Алексей , Федорович , что ? вы ? выбрали ? его ? своим  героем ?


0.8771929824561403

In [None]:
tqdm._instances.clear()

In [42]:
torch.cuda.empty_cache()

In [43]:
!nvidia-smi

Sun Jul 19 22:38:30 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.51.05    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P0    33W / 250W |  16265MiB / 16280MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces