In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer, BertModel
import pandas as pd
from datasets import Dataset
import numpy as np
import torch
import random

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
checkpoint = 'cointegrated/rubert-tiny'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = BertModel.from_pretrained(checkpoint)

Some weights of the model checkpoint at cointegrated/rubert-tiny were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [294]:
with open('siamese_dataset', 'r') as f:
    text = f.readlines()
    text = text[1:]

In [201]:
df = pd.read_csv('siamese_dataset')

In [244]:
#proper processing from text file
raw_dataset = Dataset.from_pandas(df)

In [295]:
#long processing from the text file
first_column = []
for i in text:
    test = i.split(',')
    test[0] = test[0].replace('"', '')
    first_column.append(test[0])

second_column = []
for i in text:
    test = i.split(',')
    test[1] = test[1].replace('"', '')
    second_column.append(test[1])

labels_column = []
for i in text:
    test = i.split(',')
    test[2] = test[2].replace('"', '')
    test[2] = test[2].replace('\n', '')
    test[2] = int(test[2])
    labels_column.append(test[2])

df = pd.DataFrame({'first': first_column, 'second': second_column, 'labels': labels_column})
raw_dataset = Dataset.from_pandas(df)

def tokenize_function(example):
    return tokenizer(example['first'], example['second'])

dataset = raw_dataset.map(tokenize_function, batched=True)
dataset = dataset.remove_columns(['first', 'second'])



In [296]:
class Lambda(torch.nn.Module):
    def __init__(self, lambd):
        super().__init__()
        self.lambd = lambd
    
    def forward(self, x):
         return self.lambd(x)

In [298]:
class SiameseNN(torch.nn.Module):
    def __init__(self):
        super(SiameseNN, self).__init__()
        l1_norm = lambda x: 1 - torch.abs(x[0] - x[1])
        self.encoder = BertModel.from_pretrained(checkpoint)
        self.merged = Lambda(l1_norm)
        self.fc1 = torch.nn.Linear(312, 2)
        self.softmax = torch.nn.Softmax()

    
    def forward(self, x):
        first_encoded = self.encoder(**x[0]).pooler_output
        #print("First: ", first_encoded)
        second_encoded = self.encoder(**x[1]).pooler_output
        l1_distance = self.merged([first_encoded, second_encoded])
        #print(l1_distance.shape)
        fc1 = self.fc1(l1_distance)
        fc1 = self.softmax(fc1)
        return fc1


In [299]:
model = SiameseNN()

Some weights of the model checkpoint at cointegrated/rubert-tiny were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [300]:
first_sent_tokenized = tokenizer(first_column, max_length=20, padding='max_length', truncation=True, return_tensors='pt')
second_sent_tokenized = tokenizer(first_column, max_length=20, padding='max_length', truncation=True, return_tensors='pt')

In [302]:
batch_number = len(first_sent_tokenized['input_ids'])//16

In [305]:
f_sent_raw_batches = [raw_dataset['first'][16*i: 16*(i+1)] for i in range(batch_number)]
s_sent_raw_batches = [raw_dataset['second'][16*i: 16*(i+1)] for i in range(batch_number)]
labels_batches = [labels_column[16*i: 16*(i+1)] for i in range(batch_number)]

In [307]:
f_sent_tokenized_batches = [tokenizer(batch, max_length=20, padding='max_length', truncation=True, return_tensors='pt') for batch in f_sent_raw_batches]
s_sent_tokenized_batches = [tokenizer(batch, max_length=20, padding='max_length', truncation=True, return_tensors='pt') for batch in s_sent_raw_batches]

In [309]:
idx = [i for i in range(batch_number)]
random.shuffle(idx)

[14, 13, 3, 5, 10, 2, 4, 9, 6, 1, 15, 7, 16, 0, 12, 8, 17, 11]

In [310]:
f_sent_shuffled = []
s_sent_shuffled = []
labels_shuffled = []

for i in idx:
    f_sent_shuffled.append(f_sent_tokenized_batches[i])
    s_sent_shuffled.append(s_sent_tokenized_batches[i])
    labels_shuffled.append(labels_batches[i])

In [313]:
optimizer = torch.optim.Adam(model.parameters(), lr=3e-5)
loss_fn = torch.nn.CrossEntropyLoss()

In [314]:
num_epochs = 40

for epoch in range(num_epochs):
    for s_batch, f_batch, target in zip(f_sent_tokenized_batches, s_sent_tokenized_batches, labels_batches):
        output = model([s_batch, f_batch])
        loss = loss_fn(output, torch.tensor(target))
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(loss.item())

  fc1 = self.softmax(fc1)


0.9397059082984924
0.9196892976760864
0.9065229296684265
0.9207167029380798
0.9066671133041382
0.8932352066040039
0.8718301057815552
0.8498995900154114
0.7771111726760864
0.5950617790222168
0.6093410849571228
0.60396808385849
0.6023807525634766
0.5955376625061035
0.587152898311615
0.553960919380188
0.5671082735061646
0.5453800559043884
0.8297969698905945
0.8353330492973328
0.8189462423324585
0.7687628269195557
0.7558953762054443
0.7529784440994263
0.7450742125511169
0.7289241552352905
0.6758740544319153
0.5421608686447144
0.5407372117042542
0.5329168438911438
0.5207831859588623
0.5119249224662781
0.5083759427070618
0.5344517827033997
0.5161306262016296
0.5303639769554138
0.8851749897003174
0.8798651695251465
0.7489895224571228
0.6072186827659607
0.5973057746887207
0.5865492820739746
0.5790407657623291
0.562112033367157
0.5325314998626709
0.49003252387046814
0.48698562383651733
0.4844975173473358
0.4781672954559326
0.4698370099067688
0.46951255202293396
0.5170429348945618
0.489005357027

In [316]:
model([f_sent_tokenized_batches[0], s_sent_tokenized_batches[0]])

  fc1 = self.softmax(fc1)


tensor([[0.0049, 0.9951],
        [0.0061, 0.9939],
        [0.0083, 0.9917],
        [0.0053, 0.9947],
        [0.0062, 0.9938],
        [0.0063, 0.9937],
        [0.0053, 0.9947],
        [0.0076, 0.9924],
        [0.0055, 0.9945],
        [0.0042, 0.9958],
        [0.0086, 0.9914],
        [0.0044, 0.9956],
        [0.0051, 0.9949],
        [0.0043, 0.9957],
        [0.0044, 0.9956],
        [0.0048, 0.9952]], grad_fn=<SoftmaxBackward0>)

In [333]:
f_sent = "Нарисуй Альберта Эйнштейна в стиле Ван Гога"
s_sent = "Нарисуй изображение"

f_sent = tokenizer(f_sent, max_length=20, padding='max_length', return_tensors='pt')
s_sent = tokenizer(s_sent, max_length=20, padding='max_length', return_tensors='pt')

model([f_sent, s_sent])

  fc1 = self.softmax(fc1)


tensor([[0.0112, 0.9888]], grad_fn=<SoftmaxBackward0>)

In [332]:
torch.save(model.state_dict(), 'siamese_state')