In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer, BertModel
import pandas as pd
from datasets import Dataset
import numpy as np
import torch
import random

In [None]:
checkpoint = 'cointegrated/rubert-tiny'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = BertModel.from_pretrained(checkpoint)

In [15]:
df = pd.read_csv('siamese_dataset')

raw_dataset = Dataset.from_pandas(df)

In [45]:
def preprocess_data(raw_dataset):
    batch_number = len(raw_dataset['f_sents'])//16

    f_sent_raw_batches = [raw_dataset['f_sents'][16*i: 16*(i+1)] for i in range(batch_number)]
    s_sent_raw_batches = [raw_dataset['s_sents'][16*i: 16*(i+1)] for i in range(batch_number)]
    labels_batches = [raw_dataset['similarity'][16*i: 16*(i+1)] for i in range(batch_number)]

    f_sent_tokenized_batches = [tokenizer(batch, max_length=20, padding='max_length', truncation=True, return_tensors='pt') for batch in f_sent_raw_batches]
    s_sent_tokenized_batches = [tokenizer(batch, max_length=20, padding='max_length', truncation=True, return_tensors='pt') for batch in s_sent_raw_batches]

    idx = [i for i in range(batch_number)]
    random.shuffle(idx)

    f_sent_shuffled = []
    s_sent_shuffled = []
    labels_shuffled = []

    for i in idx:
        f_sent_shuffled.append(f_sent_tokenized_batches[i])
        s_sent_shuffled.append(s_sent_tokenized_batches[i])
        labels_shuffled.append(labels_batches[i])

    return f_sent_shuffled, s_sent_shuffled, labels_shuffled

In [46]:
f_sents, s_sents, labels = preprocess_data(raw_dataset)

In [7]:
class Lambda(torch.nn.Module):
    def __init__(self, lambd):
        super().__init__()
        self.lambd = lambd
    
    def forward(self, x):
         return self.lambd(x)

In [8]:
class SiameseNN(torch.nn.Module):
    def __init__(self):
        super(SiameseNN, self).__init__()
        l1_norm = lambda x: 1 - torch.abs(x[0] - x[1])
        self.encoder = BertModel.from_pretrained(checkpoint)
        self.merged = Lambda(l1_norm)
        self.fc1 = torch.nn.Linear(312, 2)
        self.softmax = torch.nn.Softmax()

    
    def forward(self, x):
        first_encoded = self.encoder(**x[0]).pooler_output
        #print("First: ", first_encoded)
        second_encoded = self.encoder(**x[1]).pooler_output
        l1_distance = self.merged([first_encoded, second_encoded])
        #print(l1_distance.shape)
        fc1 = self.fc1(l1_distance)
        fc1 = self.softmax(fc1)
        return fc1

In [9]:
model = SiameseNN()

Some weights of the model checkpoint at cointegrated/rubert-tiny were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [49]:
optimizer = torch.optim.Adam(model.parameters(), lr=3e-5)
loss_fn = torch.nn.CrossEntropyLoss()
num_epochs = 20

In [59]:
def train_model(model, optimizer, loss_fn, num_epochs):
    for epoch in range(num_epochs):
        for s_batch, f_batch, target in zip(f_sents, s_sents, labels):
            output = model([s_batch, f_batch])
            loss = loss_fn(output, torch.tensor(target))
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            print(loss.item())

In [None]:
train_model(model, optimizer, loss_fn, num_epochs)

In [42]:
model([f_sents[0], s_sents[0]])

  fc1 = self.softmax(fc1)


tensor([[0.0035, 0.9965],
        [0.0063, 0.9937],
        [0.0043, 0.9957],
        [0.0055, 0.9945],
        [0.0041, 0.9959],
        [0.0041, 0.9959],
        [0.0043, 0.9957],
        [0.0041, 0.9959],
        [0.0037, 0.9963],
        [0.0048, 0.9952],
        [0.0036, 0.9964],
        [0.0044, 0.9956],
        [0.0036, 0.9964],
        [0.0063, 0.9937],
        [0.0035, 0.9965],
        [0.0038, 0.9962]], grad_fn=<SoftmaxBackward0>)

In [58]:
def get_similarity(f_sent, s_sent):
        f_sent = tokenizer(f_sent, max_length=20, padding='max_length', return_tensors='pt')
        s_sent = tokenizer(s_sent, max_length=20, padding='max_length', return_tensors='pt')
        return model([f_sent, s_sent])


f_sent = "Нарисуй Альберта Эйнштейна в стиле Ван Гога"
s_sent = "Нарисуй изображение"


get_similarity(f_sent, s_sent)

  fc1 = self.softmax(fc1)


tensor([[0.0832, 0.9168]], grad_fn=<SoftmaxBackward0>)

In [332]:
torch.save(model.state_dict(), 'siamese_state')