# Векторное представление текстов

## Библиотеки

In [None]:
!pip install --quiet dvc[gdrive] fasttext

In [2]:
from copy import deepcopy

import fasttext
import fasttext.util
import matplotlib.pyplot as plt
from matplotlib.image import imread
from mpl_toolkits import mplot3d
from matplotlib import gridspec
from PIL import Image
import io
import os
from urllib.request import urlopen
from skimage.segmentation import mark_boundaries
from nltk.tokenize import RegexpTokenizer

from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import requests
from scipy.stats import norm
import torch

import dvc.api

from sklearn.metrics import classification_report
from torch.utils.tensorboard import SummaryWriter

from torchvision import datasets, transforms

In [3]:
import warnings
# Install a global filter that ignores every warning raised from here on,
# to keep the notebook output uncluttered.
warnings.filterwarnings("ignore")

In [4]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Bare expression: the notebook displays the selected device.
device

'cpu'

## Код для обучения

In [5]:
def train_on_batch(model, x_batch, y_batch, optimizer, loss_function):
    """Perform a single optimisation step on one mini-batch.

    The model is switched to training mode, the optimizer's gradients are
    cleared, both batches are moved to ``model.device``, the loss
    ``loss_function(model(x_batch), y_batch)`` is back-propagated and one
    optimizer step is applied.

    Args:
        model: module exposing a ``device`` attribute and callable on a batch.
        x_batch: input tensor for the batch.
        y_batch: target tensor for the batch.
        optimizer: already-instantiated optimizer over the model parameters.
        loss_function: callable ``(output, target) -> scalar tensor``.

    Returns:
        The batch loss as a Python float.
    """
    model.train()
    optimizer.zero_grad()

    target_device = model.device
    predictions = model(x_batch.to(target_device))
    batch_loss = loss_function(predictions, y_batch.to(target_device))

    batch_loss.backward()
    optimizer.step()

    return batch_loss.cpu().item()

In [6]:
def train_epoch(train_generator, model, loss_function, optimizer, callback = None):
    """Train ``model`` for one pass over ``train_generator``.

    Every ``(x, y)`` pair yielded by ``train_generator`` is fed to
    ``train_on_batch``. When ``callback`` is given it is invoked as
    ``callback(model, batch_loss)`` after each batch, with autograd disabled.

    Returns:
        The mean loss over the epoch, where each batch loss is weighted by
        the number of samples in that batch.
    """
    weighted_loss_sum = 0
    seen_samples = 0

    for batch_of_x, batch_of_y in train_generator:
        loss_value = train_on_batch(
            model, batch_of_x, batch_of_y, optimizer, loss_function)

        if callback is not None:
            with torch.no_grad():
                callback(model, loss_value)

        # Weight by batch size so a shorter final batch does not skew the mean.
        samples_in_batch = len(batch_of_x)
        weighted_loss_sum += loss_value * samples_in_batch
        seen_samples += samples_in_batch

    return weighted_loss_sum / seen_samples

In [7]:
def trainer(count_of_epoch,
            batch_size,
            dataset,
            model,
            loss_function,
            optimizer,
            lr = 0.001,
            callback = None):
    """Train ``model`` on ``dataset`` for ``count_of_epoch`` epochs.

    Args:
        count_of_epoch: number of full passes over ``dataset``.
        batch_size: mini-batch size used by the data loader.
        dataset: dataset accepted by ``torch.utils.data.DataLoader``; must
            support ``len``.
        model: module to optimise.
        loss_function: callable ``(output, target) -> scalar tensor``.
        optimizer: optimizer *class* (or factory); it is instantiated here as
            ``optimizer(model.parameters(), lr=lr)``.
        lr: learning rate passed to the optimizer.
        callback: optional per-batch hook forwarded to ``train_epoch``.

    The mean loss of the latest epoch is shown in the progress-bar postfix;
    nothing is returned.
    """
    optima = optimizer(model.parameters(), lr=lr)

    # Number of batches per epoch, i.e. ceil(len(dataset) / batch_size).
    batches_per_epoch = -(-len(dataset) // batch_size)

    epoch_bar = tqdm(range(count_of_epoch), desc='epoch')
    epoch_bar.set_postfix({'train epoch loss': np.nan})
    for _ in epoch_bar:
        # A fresh shuffled loader (and inner progress bar) for every epoch.
        loader = torch.utils.data.DataLoader(
            dataset=dataset,
            batch_size=batch_size,
            shuffle=True,
            pin_memory=True)
        batch_bar = tqdm(loader, leave=False, total=batches_per_epoch)

        mean_loss = train_epoch(
            train_generator=batch_bar,
            model=model,
            loss_function=loss_function,
            optimizer=optima,
            callback=callback)

        epoch_bar.set_postfix({'train epoch loss': mean_loss})

## Что это и зачем нужно?

## Пример классификации твитов

### Загрузим выборку
Рекомендую всем ознакомиться с dvc (если возникла проблема с аутентификацией, перезапустите ядро Jupyter)

In [None]:
# Open the DVC-tracked CSV from the seminar repository and parse it
# directly from the file handle into a DataFrame.
with dvc.api.open(
        'sem17/data/dataset.csv',
        repo='https://github.com/andriygav/MachineLearningSeminars',
        ) as f:
        dataset = pd.read_csv(f)

### Посмотрим на данные

In [9]:
# Keep only the rows where both `tag` and `message` are non-null.
dataset = dataset[dataset[['tag', 'message']].notnull().all(1)]

In [10]:
# Draw a reproducible 125k-row subsample.
dataset = dataset.sample(125000, random_state=42)
# Seed the train/test mask as well: with the global unseeded RNG the split
# (and therefore every reported metric) changed on each kernel restart.
split_rng = np.random.RandomState(42)
# Each row lands in the training part with probability 0.8.
train_mask = split_rng.rand(len(dataset)) < 0.8
dataset_train = dataset[train_mask]
dataset_test = dataset[~train_mask]

In [11]:
dataset_train.sample(5, random_state=42)

Unnamed: 0,tag,message
227654,1.0,@JoelMadden YAYYY! proven I'm stronger in watc...
407532,1.0,"@SamLuminate Isnt Ponderosa beautiful, ill be ..."
13992,0.0,&quot;the show&quot; is playing and @leelonn i...
548653,0.0,as u can see my pic aint working
526933,0.0,@wyndwalker dang sorry buddy i missed this som...


In [12]:
dataset_train.describe()

Unnamed: 0,tag
count,100018.0
mean,0.49986
std,0.500002
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


### Построим модель RNN (как 2 семинара назад)


In [13]:
class RNNclassifier(torch.nn.Module):
    """LSTM sequence classifier operating on integer token ids.

    Pipeline: embedding lookup -> multi-layer LSTM -> linear layer applied to
    the concatenation of the final hidden and cell states of every layer and
    direction.
    """

    def __init__(self, vocab_dim, output_dim, emb_dim = 10, hidden_dim = 10,
                 num_layers = 3, bidirectional = False, p=0.7):
        """Build the embedding, the LSTM encoder and the output layer.

        Args:
            vocab_dim: number of rows in the embedding table.
            output_dim: number of output logits.
            emb_dim: embedding width (LSTM input size).
            hidden_dim: LSTM hidden size.
            num_layers: number of stacked LSTM layers.
            bidirectional: whether the LSTM runs in both directions.
            p: dropout probability passed to the LSTM.
        """
        super().__init__()
        num_directions = int(bidirectional + 1)

        self.embedding = torch.nn.Embedding(vocab_dim, emb_dim)
        self.encoder = torch.nn.LSTM(
            emb_dim, hidden_dim, num_layers,
            bidirectional=bidirectional,
            batch_first=True,
            dropout=p)
        # Both h and c contribute num_layers * num_directions * hidden_dim
        # features per sample, hence the leading factor of 2.
        state_features = 2 * num_layers * num_directions * hidden_dim
        self.linear = torch.nn.Linear(state_features, output_dim)

    @property
    def device(self):
        """Device on which the first model parameter is stored."""
        return next(self.parameters()).device

    def forward(self, input):
        """Return logits for a batch of token-id sequences."""
        embedded = self.embedding(input)
        _, (h_n, c_n) = self.encoder(embedded)
        # Stack the final states, then bring the batch axis to the front.
        states = torch.cat((h_n, c_n), dim=0).transpose(0, 1)
        flat_states = states.reshape(embedded.shape[0], -1)
        return self.linear(flat_states)

In [14]:
class Tokenizer(object):
    """Turn raw sentences into a rectangular tensor of vocabulary ids.

    ``word_to_ind`` maps tokens to integer ids and must contain ``'[UNK]'``
    plus the ``'[CLS]'``, ``'[SEP]'`` and ``'[PAD]'`` markers (missing markers
    fall back to the ``'[UNK]'`` id). ``tokenizer`` must provide
    ``tokenize_sents(sentences)`` returning one token list per sentence.
    """

    def __init__(self, word_to_ind, tokenizer):
        self.word_to_ind = word_to_ind
        self.tokenizer = tokenizer

    def __call__(self, sentences, max_length = 10, pad_to_max_length = False):
        """Encode ``sentences`` as a tensor with ``max_length + 2`` columns.

        Unless ``pad_to_max_length`` is set, ``max_length`` is first reduced
        to the longest tokenised sentence when that is shorter. Every row is
        ``[CLS] tokens [SEP]``, truncated to ``max_length`` tokens or
        right-padded with ``[PAD]``.
        """
        split_sents = self.tokenizer.tokenize_sents(sentences)

        if not pad_to_max_length:
            longest = max(len(words) for words in split_sents)
            max_length = min(max_length, longest)

        ids = []
        for words in split_sents:
            if len(words) < max_length:
                padding = ['[PAD]'] * (max_length - len(words))
                framed = ['[CLS]'] + words + ['[SEP]'] + padding
            else:
                framed = ['[CLS]'] + words[:max_length] + ['[SEP]']
            # Unknown tokens map to the id of '[UNK]'.
            ids.append([self.word_to_ind.get(w, self.word_to_ind['[UNK]'])
                        for w in framed])

        return torch.tensor(ids)

### Разбиение на слова --- токенайзер

In [None]:
# Special tokens take the first consecutive ids. Every later word receives
# id == len(word_to_ind), so the special ids must be exactly 0..3: with a gap
# (e.g. '[CLS]': 3, '[SEP]': 4) the first corpus word would be assigned id 4
# and silently share it with '[SEP]'.
word_to_ind = {'[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[SEP]': 3}
# Build the tokenizer once rather than once per sentence; the raw string keeps
# \w, \s and \d as regex classes instead of invalid Python string escapes.
word_tokenizer = RegexpTokenizer(r'[a-zA-Z]+|[^\w\s]|\d+')
# Column 1 of the frame holds the message text.
for sent in tqdm(dataset_train.values[:, 1]):
    for word in word_tokenizer.tokenize(sent):
        if word not in word_to_ind:
            word_to_ind[word] = len(word_to_ind)

In [12]:
len(word_to_ind)

109561

In [13]:
len(set(dataset_train.values[:, 0]))

2

In [15]:
# Raw string: \w, \s and \d are regex character classes, not Python string
# escapes (the non-raw form triggers invalid-escape warnings).
tokenizer = Tokenizer(word_to_ind, RegexpTokenizer(r'[a-zA-Z]+|[^\w\s]|\d+'))

In [16]:
# Encode the message column (column 1) of each split as an id tensor.
# Tokenizer is called with its defaults (max_length=10, no forced padding).
train_data_sent = tokenizer(dataset_train.values[:, 1])
test_data_sent = tokenizer(dataset_test.values[:, 1])

In [17]:
# Pair each encoded sentence with its label from column 0 of the frame;
# .long() casts the labels to an integer tensor.
dataset_train_pt = torch.utils.data.TensorDataset(
    train_data_sent, torch.tensor(dataset_train.values[:, 0].tolist()).long())
dataset_test_pt = torch.utils.data.TensorDataset(
    test_data_sent, torch.tensor(dataset_test.values[:, 0].tolist()).long())

### Инициализация модели

In [38]:
# Keyword arguments forwarded verbatim to RNNclassifier.__init__.
config = {
    'vocab_dim': len(word_to_ind),
    # One output logit per distinct value in column 0 of the frame.
    'output_dim': len(set(dataset.values[:, 0])),
    'emb_dim': 100,
    'hidden_dim': 10,
    'num_layers': 3,
    'bidirectional': False,
    'p': 0.7,
}

model = RNNclassifier(**config)
_ = model.to(device)

### Качество до обучения

In [42]:
# Score the (still untrained) classifier on the held-out split.
batch_generator = torch.utils.data.DataLoader(
    dataset=dataset_test_pt,
    batch_size=64,
    pin_memory=True)

pred = []
real = []
model.eval()
# Inference only: no autograd graph is needed for any batch.
with torch.no_grad():
    for it, (x_batch, y_batch) in enumerate(batch_generator):
        x_batch = x_batch.to(device)
        output = model(x_batch)

        # Predicted class = index of the largest logit.
        pred += output.argmax(dim=-1).cpu().tolist()
        real += y_batch.cpu().tolist()

print(classification_report(real, pred))

              precision    recall  f1-score   support

           0       0.54      0.04      0.08     12474
           1       0.50      0.96      0.66     12501

    accuracy                           0.50     24975
   macro avg       0.52      0.50      0.37     24975
weighted avg       0.52      0.50      0.37     24975



### Обучение модели

In [43]:
# Cross-entropy over the model's output logits.
loss_function = torch.nn.CrossEntropyLoss()
# The optimizer *class* is stored here; `trainer` instantiates it with the
# model parameters and the learning rate.
optimizer = torch.optim.Adam

In [None]:
# Train for 5 epochs with mini-batches of 64 samples; no per-batch callback.
trainer(count_of_epoch=5, 
        batch_size=64, 
        dataset=dataset_train_pt,
        model=model, 
        loss_function=loss_function,
        optimizer = optimizer,
        lr=0.001,
        callback=None)

### Качество после обучения

In [45]:
# Score the trained classifier on the held-out split.
batch_generator = torch.utils.data.DataLoader(dataset=dataset_test_pt,
                                              batch_size=64,
                                              pin_memory=True)

pred = []
real = []
# The never-read `test_loss` accumulator and the unused enumerate index
# were dropped.
model.eval()
# Inference only: no autograd graph is needed for any batch.
with torch.no_grad():
    for x_batch, y_batch in batch_generator:
        output = model(x_batch.to(device))

        # Predicted class = index of the largest logit.
        pred.extend(torch.argmax(output, dim=-1).cpu().numpy().tolist())
        real.extend(y_batch.cpu().numpy().tolist())

print(classification_report(real, pred))

              precision    recall  f1-score   support

           0       0.65      0.73      0.69     12474
           1       0.69      0.61      0.65     12501

    accuracy                           0.67     24975
   macro avg       0.67      0.67      0.67     24975
weighted avg       0.67      0.67      0.67     24975



## Word2Vec (на основе vec формата fasttext)

Используя опыт предыдущего семинара, хочется "дообучать" нейросеть вместо того, чтобы обучать её с нуля.

Предлагается, к примеру, использовать предобученный слой nn.Embedding.

### Скачивание модели

In [None]:
!dvc get https://github.com/andriygav/MachineLearningSeminars sem17/data/cc.en.10.bin

### Загрузка fasttext модели

In [None]:
ft = fasttext.load_model('cc.en.10.bin', )

### Генерация VEC формата

In [None]:
# Build a word -> row-index mapping together with the matching embedding matrix.
word_to_ind = dict()
matrix_fasttext = []
for w in tqdm(ft.get_words(on_unicode_error='replace')):
    if w not in word_to_ind:
        # Use the row about to be appended as the id. The enumerate index
        # used previously drifts away from the matrix rows as soon as one
        # duplicate word is skipped.
        word_to_ind[w] = len(matrix_fasttext)
        matrix_fasttext.append(ft.get_word_vector(w))
# Service tokens get their own zero-vector rows at the end of the matrix.
for w in ['[PAD]', '[UNK]', '[CLS]', '[SEP]']:
    word_to_ind[w] = len(matrix_fasttext)
    matrix_fasttext.append(np.zeros_like(matrix_fasttext[-1]))


### Получение векторизованных данных

In [20]:
# Raw string: \w, \s and \d are regex character classes, not Python string
# escapes (the non-raw form triggers invalid-escape warnings).
tokenizer = Tokenizer(word_to_ind, RegexpTokenizer(r'[a-zA-Z]+|[^\w\s]|\d+'))

In [21]:
# Re-encode the message column (column 1) of each split with the current
# `tokenizer`, using its default arguments (max_length=10).
train_data_sent = tokenizer(dataset_train.values[:, 1])
test_data_sent = tokenizer(dataset_test.values[:, 1])

In [22]:
# Pair each encoded sentence with its label from column 0 of the frame;
# .long() casts the labels to an integer tensor.
dataset_train_pt = torch.utils.data.TensorDataset(
    train_data_sent, torch.tensor(dataset_train.values[:, 0].tolist()).long())
dataset_test_pt = torch.utils.data.TensorDataset(
    test_data_sent, torch.tensor(dataset_test.values[:, 0].tolist()).long())

### Инициализация моделей

In [23]:
# Keyword arguments forwarded verbatim to RNNclassifier.__init__.
config = {
    'vocab_dim': len(word_to_ind),
    # One output logit per distinct value in column 0 of the frame.
    'output_dim': len(set(dataset.values[:, 0])),
    'emb_dim': 10,
    'hidden_dim': 10,
    'num_layers': 3,
    'bidirectional': False,
    'p': 0.7,
}

model = RNNclassifier(**config)
_ = model.to(device)

### Использование VEC-формата fasttext в модели

In [27]:
# Copy the fasttext vectors into the embedding table. Stacking the rows into
# one ndarray first avoids the very slow torch.tensor conversion of a Python
# list holding one small array per vocabulary entry.
pretrained = torch.from_numpy(np.stack(matrix_fasttext))
model.embedding.weight.data.copy_(pretrained)
# Freeze the embedding so that training updates only the LSTM and the
# linear layer.
for param in model.embedding.parameters():
    param.requires_grad = False
model.to(device)

RNNclassifier(
  (embedding): Embedding(2000004, 10)
  (encoder): LSTM(10, 10, num_layers=3, batch_first=True, dropout=0.7)
  (linear): Linear(in_features=60, out_features=2, bias=True)
)

### Качество до обучения

In [28]:
# Score the classifier with frozen fasttext embeddings before any training.
batch_generator = torch.utils.data.DataLoader(
    dataset=dataset_test_pt,
    batch_size=64,
    pin_memory=True)

pred = []
real = []
model.eval()
# Inference only: no autograd graph is needed for any batch.
with torch.no_grad():
    for it, (x_batch, y_batch) in enumerate(batch_generator):
        x_batch = x_batch.to(device)
        output = model(x_batch)

        # Predicted class = index of the largest logit.
        pred += output.argmax(dim=-1).cpu().tolist()
        real += y_batch.cpu().tolist()

print(classification_report(real, pred))

              precision    recall  f1-score   support

           0       0.49      0.98      0.66     12339
           1       0.56      0.02      0.04     12643

    accuracy                           0.50     24982
   macro avg       0.53      0.50      0.35     24982
weighted avg       0.53      0.50      0.34     24982



### Обучение модели

In [29]:
# Cross-entropy over the model's output logits.
loss_function = torch.nn.CrossEntropyLoss()
# The optimizer *class* is stored here; `trainer` instantiates it with the
# model parameters and the learning rate.
optimizer = torch.optim.Adam

In [None]:
# Train for 5 epochs with mini-batches of 64 samples; no per-batch callback.
trainer(count_of_epoch=5, 
        batch_size=64, 
        dataset=dataset_train_pt,
        model=model, 
        loss_function=loss_function,
        optimizer = optimizer,
        lr=0.001,
        callback=None)

### Качество после обучения

In [31]:
# Score the trained classifier on the held-out split.
batch_generator = torch.utils.data.DataLoader(
    dataset=dataset_test_pt,
    batch_size=64,
    pin_memory=True)

pred = []
real = []
model.eval()
# Inference only: no autograd graph is needed for any batch.
with torch.no_grad():
    for it, (x_batch, y_batch) in enumerate(batch_generator):
        x_batch = x_batch.to(device)
        output = model(x_batch)

        # Predicted class = index of the largest logit.
        pred += output.argmax(dim=-1).cpu().tolist()
        real += y_batch.cpu().tolist()

print(classification_report(real, pred))

              precision    recall  f1-score   support

           0       0.63      0.62      0.62     12339
           1       0.63      0.64      0.64     12643

    accuracy                           0.63     24982
   macro avg       0.63      0.63      0.63     24982
weighted avg       0.63      0.63      0.63     24982



## Полноценный fasttext

### Задание модели

In [74]:
class RNNclassifierFastText(torch.nn.Module):
    """LSTM sequence classifier that consumes ready-made word vectors.

    There is no embedding layer: ``forward`` feeds its input straight into
    the LSTM and classifies from the concatenated final hidden and cell
    states of every layer and direction.
    """

    def __init__(self, output_dim, emb_dim = 10, hidden_dim = 10,
                 num_layers = 3, bidirectional = False, p=0.7):
        """Build the LSTM encoder and the output layer.

        Args:
            output_dim: number of output logits.
            emb_dim: width of the incoming word vectors (LSTM input size).
            hidden_dim: LSTM hidden size.
            num_layers: number of stacked LSTM layers.
            bidirectional: whether the LSTM runs in both directions.
            p: dropout probability passed to the LSTM.
        """
        super().__init__()
        num_directions = int(bidirectional + 1)

        self.encoder = torch.nn.LSTM(
            emb_dim, hidden_dim, num_layers,
            bidirectional=bidirectional,
            batch_first=True,
            dropout=p)
        # Both h and c contribute num_layers * num_directions * hidden_dim
        # features per sample, hence the leading factor of 2.
        state_features = 2 * num_layers * num_directions * hidden_dim
        self.linear = torch.nn.Linear(state_features, output_dim)

    @property
    def device(self):
        """Device on which the first model parameter is stored."""
        return next(self.parameters()).device

    def forward(self, input):
        """Return logits for a batch of word-vector sequences."""
        _, (h_n, c_n) = self.encoder(input)
        # Stack the final states, then bring the batch axis to the front.
        states = torch.cat((h_n, c_n), dim=0).transpose(0, 1)
        flat_states = states.reshape(len(input), -1)
        return self.linear(flat_states)

In [64]:
class TokenizerFastText(object):
    """Turn raw sentences into a tensor of fasttext word vectors.

    ``ft`` must provide ``get_word_vector(word)`` and ``tokenizer`` must
    provide ``tokenize_sents(sentences)`` returning one token list per
    sentence. The ``[CLS]``, ``[SEP]`` and ``[PAD]`` markers are looked up
    through ``ft.get_word_vector`` exactly like ordinary tokens.
    """
    def __init__(self, ft, tokenizer):
        self.ft = ft
        self.tokenizer = tokenizer
    def __call__(self, sentences, max_length = 10, pad_to_max_length = False):
        """Encode ``sentences`` as a ``(n_sentences, max_length + 2, dim)`` tensor.

        Unless ``pad_to_max_length`` is set, ``max_length`` is first reduced
        to the longest tokenised sentence when that is shorter. Every row is
        ``[CLS] tokens [SEP]``, truncated to ``max_length`` tokens or
        right-padded with ``[PAD]``.
        """
        tokens = self.tokenizer.tokenize_sents(sentences)
        if not pad_to_max_length:
            max_length = min(max_length, max(map(len, tokens)))
        tokens = [['[CLS]']+s+['[SEP]'] + ['[PAD]']*(max_length-len(s)) \
                  if len(s) < max_length \
                  else ['[CLS]']+s[:max_length]+['[SEP]'] \
                  for s in tokens ]
        vectors = [[self.ft.get_word_vector(w) for w in sent] for sent in tokens]
        if not vectors:
            # Nothing to stack; keep the original empty-tensor result.
            return torch.tensor(vectors)
        # Collapse the nested lists into a single ndarray before handing it
        # to torch: converting a Python list of many small ndarrays with
        # torch.tensor is extremely slow.
        return torch.from_numpy(np.asarray(vectors))

### Векторизация всех текстов

In [65]:
# Raw string: \w, \s and \d are regex character classes, not Python string
# escapes (the non-raw form triggers invalid-escape warnings).
tokenizer = TokenizerFastText(ft, RegexpTokenizer(r'[a-zA-Z]+|[^\w\s]|\d+'))

In [70]:
# Convert the message column (column 1) of each split into a tensor of word
# vectors, using the tokenizer's default arguments (max_length=10).
train_data_sent = tokenizer(dataset_train.values[:, 1])
test_data_sent = tokenizer(dataset_test.values[:, 1])

In [72]:
# Pair each vectorised sentence with its label from column 0 of the frame;
# .long() casts the labels to an integer tensor.
dataset_train_pt = torch.utils.data.TensorDataset(
    train_data_sent, torch.tensor(dataset_train.values[:, 0].tolist()).long())
dataset_test_pt = torch.utils.data.TensorDataset(
    test_data_sent, torch.tensor(dataset_test.values[:, 0].tolist()).long())

### Инициализация модели

In [75]:
# Keyword arguments forwarded verbatim to RNNclassifierFastText.__init__.
config = {
    # One output logit per distinct value in column 0 of the frame.
    'output_dim': len(set(dataset.values[:, 0])),
    'emb_dim': 10,
    'hidden_dim': 10,
    'num_layers': 3,
    'bidirectional': False,
    'p': 0.7,
}

model = RNNclassifierFastText(**config)
_ = model.to(device)

### Качество до обучения

In [76]:
# Score the (still untrained) fasttext-input classifier on the held-out split.
batch_generator = torch.utils.data.DataLoader(
    dataset=dataset_test_pt,
    batch_size=64,
    pin_memory=True)

pred = []
real = []
model.eval()
# Inference only: no autograd graph is needed for any batch.
with torch.no_grad():
    for it, (x_batch, y_batch) in enumerate(batch_generator):
        x_batch = x_batch.to(device)
        output = model(x_batch)

        # Predicted class = index of the largest logit.
        pred += output.argmax(dim=-1).cpu().tolist()
        real += y_batch.cpu().tolist()

print(classification_report(real, pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00     12339
           1       0.51      1.00      0.67     12643

    accuracy                           0.51     24982
   macro avg       0.25      0.50      0.34     24982
weighted avg       0.26      0.51      0.34     24982



### Обучение модели

In [77]:
# Cross-entropy over the model's output logits.
loss_function = torch.nn.CrossEntropyLoss()
# The optimizer *class* is stored here; `trainer` instantiates it with the
# model parameters and the learning rate.
optimizer = torch.optim.Adam

In [None]:
# Train for 5 epochs with mini-batches of 64 samples; no per-batch callback.
trainer(count_of_epoch=5, 
        batch_size=64, 
        dataset=dataset_train_pt,
        model=model, 
        loss_function=loss_function,
        optimizer = optimizer,
        lr=0.001,
        callback=None)

### Качество после обучения

In [79]:
# Score the trained fasttext-input classifier on the held-out split.
batch_generator = torch.utils.data.DataLoader(
    dataset=dataset_test_pt,
    batch_size=64,
    pin_memory=True)

pred = []
real = []
model.eval()
# Inference only: no autograd graph is needed for any batch.
with torch.no_grad():
    for it, (x_batch, y_batch) in enumerate(batch_generator):
        x_batch = x_batch.to(device)
        output = model(x_batch)

        # Predicted class = index of the largest logit.
        pred += output.argmax(dim=-1).cpu().tolist()
        real += y_batch.cpu().tolist()

print(classification_report(real, pred))

              precision    recall  f1-score   support

           0       0.59      0.78      0.67     12339
           1       0.68      0.46      0.55     12643

    accuracy                           0.62     24982
   macro avg       0.63      0.62      0.61     24982
weighted avg       0.63      0.62      0.61     24982



### Репрезентация слов (к сожалению, пример вышел плохим из-за reduce)

In [57]:
ft.get_analogies("ios", "google", "android", )

[(0.992003321647644, 'quickpet'),
 (0.9900119304656982, 'WOTA'),
 (0.9890460968017578, 'EBF4'),
 (0.9875317215919495, 'Photshop'),
 (0.9871889352798462, 'n810'),
 (0.9865942597389221, 'Snowtrooper'),
 (0.9860414266586304, 'Hamachi'),
 (0.9857523441314697, 'SKII'),
 (0.9834288954734802, 'PMD2'),
 (0.9827696084976196, 'CM10.2')]

In [55]:
ft.get_nearest_neighbors('king')

[(0.9711152911186218, 'prince'),
 (0.9525136351585388, 'centurian'),
 (0.9459097981452942, 'knight'),
 (0.9448733329772949, 'musketeers'),
 (0.9438545107841492, 'bellringer'),
 (0.9397228360176086, 'victorius'),
 (0.9386382102966309, 'reverred'),
 (0.9370817542076111, 'rennaisance'),
 (0.9363556504249573, 'peasent'),
 (0.9356678128242493, 'halycon')]

## Приёмы unsupervised-обучения эмбеддингов. На основе BERT.

Основное преимущество векторного представления в том, что оно обучается независимо от задачи.

Для обучения представления используются вспомогательные задачи.

### Предсказание токена на основе окрестности

![image](images/img1.png)

### Предсказание, что предложение следует за предыдущим

![image](images/img2.png)

### Другие задачи, которые можно дообучать на основе предобученных векторов

#### Выбор варианта из списка альтернатив
Примерный формат данных:
* Premise: The man broke his toe. What was the CAUSE of this?
* Alternative 1: He got a hole in his sock. 
* Alternative 2: He dropped a hammer on his foot.

#### Recognizing Textual Entailment
Примерный формат данных:
* Premise: If you help the needy, God will reward you.
* Hypothesis: Giving money to a poor man has good consequences.

#### Word in Context
Примерный формат данных:
* Context 1: There's a lot of trash on the **bed** of the river.
* Context 2: I keep a glass of water next to my **bed** when I sleep.

#### Answer To Passage
Примерный формат данных:
* Question: Is france the same timezone as the uk.
* Hypothesis: At the Liberation of France in the summer of 1944, Metropolitan France kept GMT+2 as it was the time then used by the Allies (British Double Summer Time). In the winter of 1944--1945, Metropolitan France switched to GMT+1, same as in the United Kingdom, and switched again to GMT+2 in April 1945 like its British ally. In September 1945, Metropolitan France returned to GMT+1 (pre-war summer time), which the British had already done in July 1945. Metropolitan France was officially scheduled to return to GMT+0 on November 18, 1945 (the British returned to GMT+0 on October 7, 1945), but the French government canceled the decision on November 5, 1945, and GMT+1 has since then remained the official time of Metropolitan France.

#### Более подробно для русского и английского языка
* [SuperGLUE](https://super.gluebenchmark.com)
* [Russian SuperGLUE](https://russiansuperglue.com)