In [1]:
import torch
import numpy as np
import sentence_transformers

from util import parse_db, Document, read_markup_tsv
from torch.utils.data import Dataset, DataLoader

import random

# Load the parsed records; later cells look them up by their "url" field
# and read their "title" and "text" fields.
records = parse_db("data_/0525_parsed.db")

In [2]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load 'pretrain' when it comes from a trusted source.
with open('pretrain','rb') as fp:
    pretrain_data = pickle.load(fp)

In [3]:
# Sentence encoder used to embed texts for the nearest-neighbour index and queries.
model_use = sentence_transformers.SentenceTransformer('distiluse-base-multilingual-cased-v2')

I0316 18:54:47.177613 10656 SentenceTransformer.py:39] Load pretrained SentenceTransformer: distiluse-base-multilingual-cased-v2
I0316 18:54:47.179576 10656 SentenceTransformer.py:43] Did not find folder distiluse-base-multilingual-cased-v2
I0316 18:54:47.180572 10656 SentenceTransformer.py:49] Try to download model from server: https://sbert.net/models/distiluse-base-multilingual-cased-v2.zip
I0316 18:54:47.183564 10656 SentenceTransformer.py:100] Load SentenceTransformer from folder: C:\Users\Ellie/.cache\torch\sentence_transformers\sbert.net_models_distiluse-base-multilingual-cased-v2
I0316 18:54:48.903966 10656 SentenceTransformer.py:124] Use pytorch device: cuda


In [4]:
from tqdm.notebook import tqdm
from hnswlib import Index as HnswIndex

def model_embed_text(model, text):
    """Embed a single text with ``model``.

    The text is wrapped in a one-element list before being handed to
    ``model.encode`` (with the progress bar disabled); whatever the encoder
    returns for that one-item batch is passed back unchanged.
    """
    single_item_batch = [text]
    return model.encode(single_item_batch, show_progress_bar=False)

class Hnsw:
    """Wrapper that embeds texts with ``model`` and stores them in an hnswlib index.

    Args:
        model: encoder handed to ``model_embed_text`` for every text.
        vector_dim: dimensionality of the embeddings produced by ``model``
            (defaults to 512, the value that used to be hard-coded).
        space: hnswlib distance space (defaults to ``'l2'``).
    """

    def __init__(self, model, vector_dim=512, space='l2'):
        self.model = model
        self.vector_dim = vector_dim
        self.hnsw = HnswIndex(space=space, dim=self.vector_dim)

    def build_hnsw(self, texts, ef_construction=100, M=16):
        """Embed every text and add the resulting vectors to the index.

        Args:
            texts: sized sequence of strings; the index capacity is set to
                ``len(texts)``.
            ef_construction: hnswlib construction-time parameter.
            M: hnswlib graph-connectivity parameter.

        Items are added without explicit ids.
        NOTE(review): presumably hnswlib then labels them 0..n-1 in insertion
        order, which is what callers rely on when they use the returned labels
        as positions in the source list -- confirm against the installed version.
        """
        n = len(texts)
        self.hnsw.init_index(max_elements=n, ef_construction=ef_construction, M=M)
        # A float32 staging buffer needs half the memory of the float64 default.
        # NOTE(review): hnswlib is expected to keep vectors as 32-bit floats
        # anyway, so nothing should be lost -- confirm.
        embeddings = np.zeros((n, self.vector_dim), dtype=np.float32)
        for i, text in enumerate(tqdm(texts)):
            embeddings[i] = model_embed_text(self.model, text)
        self.hnsw.add_items(embeddings)


In [5]:
use_hnsw = Hnsw(model_use)
# Sort the pretrain records in place by their "date" field.
# NOTE(review): later cells use positions in this sorted list as record
# indices (pretrain_data[index]), so the sort has to happen before the
# index is built from pretrain_data.
pretrain_data.sort(key=lambda x: x.get("date")) 

In [6]:
# Index the "title text" string of the first 120000 pretrain records.
use_hnsw.build_hnsw([r["title"] + ' ' + r["text"] for r in pretrain_data[:120000]])

HBox(children=(FloatProgress(value=0.0, max=120000.0), HTML(value='')))




In [7]:
# Seed Python's `random` module, which drives random.shuffle / random.randint below.
random.seed(0)

def separate_samples(markup, train_percent=0.85):
    """Randomly split ``markup`` into a train part and a test part.

    A permutation of the positions of ``markup`` is drawn with the
    module-level ``random`` generator. The first
    ``round(len(markup) * train_percent)`` positions of that permutation go
    to the train part, the remaining ones to the test part.

    Returns:
        ``(train_markup, test_markup)`` as two lists.
    """
    total = len(markup)
    order = list(range(total))
    random.shuffle(order)

    # A negative boundary means "nothing goes to train", hence the clamp
    # (a plain negative slice bound would count from the end instead).
    boundary = max(round(total * train_percent), 0)

    train_part = [markup[pos] for pos in order[:boundary]]
    test_part = [markup[pos] for pos in order[boundary:]]
    return train_part, test_part


def get_data(markup, records):
    """Turn labelled URL pairs into parallel lists of texts and binary labels.

    Args:
        markup: iterable of mappings whose values are, in order,
            ``(first_url, second_url, quality)``.
        records: iterable of mappings with "url", "title" and "text" keys.

    Returns:
        ``(input_samples, other_samples, qualities)``: the "title text" string
        of the first URL, the "title text" string of the second URL, and
        1 when the quality is ``'OK'`` (0 otherwise), one entry per pair.

    Raises:
        KeyError: if a URL from ``markup`` is not present in ``records``.
    """
    # Build the lookup from the argument instead of depending on a notebook
    # global, so the function works on whatever records it is given.
    # As with a plain loop, a later duplicate URL overrides an earlier one.
    record_by_url = {record["url"]: record for record in records}

    def as_text(url):
        rec = record_by_url[url]
        return rec['title'] + ' ' + rec['text']

    input_samples, other_samples, qualities = [], [], []

    for mrkp in markup:
        # Relies on the value order of each markup entry.
        first_url, second_url, quality = mrkp.values()

        input_samples.append(as_text(first_url))
        other_samples.append(as_text(second_url))
        qualities.append(1 if quality == 'OK' else 0)

    return input_samples, other_samples, qualities

# Index the parsed records by URL (a later duplicate URL overrides an earlier one).
url2record = {rec["url"]: rec for rec in records}

# Labelled URL pairs: split into train/test, then expand to texts + 0/1 labels.
markup = read_markup_tsv("data_/ru_clustering_0525_urls.tsv")
train_markup, test_markup = separate_samples(markup)

train_inputs, train_others, train_qualities = get_data(train_markup, records)
test_inputs, test_others, test_qualities = get_data(test_markup, records)


In [8]:
from sentence_transformers import CrossEncoder
from torch.utils.data import DataLoader

model_name = 'bert-base-multilingual-uncased'

# The pair list is the same for every ensemble member, so build it once.
test_pairs = list(zip(test_inputs, test_others))
ensemble_models_predicts = []

# Score the test pairs with each of the five cross-encoders stored under
# '<model_name>UNC/<model_no>'.
for model_no in range(5):
    model = CrossEncoder(f'{model_name}UNC/{model_no}')
    member_scores = model.predict(test_pairs)
    ensemble_models_predicts.append(member_scores)

I0316 19:15:23.272417 10656 CrossEncoder.py:54] Use pytorch device: cuda


HBox(children=(FloatProgress(value=0.0, description='Batches', max=70.0, style=ProgressStyle(description_width…




I0316 19:15:40.096313 10656 CrossEncoder.py:54] Use pytorch device: cuda


HBox(children=(FloatProgress(value=0.0, description='Batches', max=70.0, style=ProgressStyle(description_width…




I0316 19:15:56.921633 10656 CrossEncoder.py:54] Use pytorch device: cuda


HBox(children=(FloatProgress(value=0.0, description='Batches', max=70.0, style=ProgressStyle(description_width…




I0316 19:16:13.738384 10656 CrossEncoder.py:54] Use pytorch device: cuda


HBox(children=(FloatProgress(value=0.0, description='Batches', max=70.0, style=ProgressStyle(description_width…




I0316 19:16:30.689315 10656 CrossEncoder.py:54] Use pytorch device: cuda


HBox(children=(FloatProgress(value=0.0, description='Batches', max=70.0, style=ProgressStyle(description_width…




In [9]:
def entropy(preds):
    """Binary (Bernoulli) entropy, in nats, of each predicted probability.

    Args:
        preds: array-like of probabilities.

    Returns:
        np.ndarray shaped like ``preds`` holding
        ``-p*log(p) - (1-p)*log(1-p)`` for every entry. Probabilities of
        exactly 0 or 1 give 0 (the limit value) instead of nan; values outside
        [0, 1] still give nan.
    """
    probs = np.asarray(preds)

    # log(0) is -inf and 0 * -inf is nan; silence the warnings here and patch
    # the two boundary cases explicitly below.
    with np.errstate(divide='ignore', invalid='ignore'):
        entropies = np.array(
            -probs * np.log(probs) - (1 - probs) * np.log(1 - probs)
        )

    # A fully confident prediction carries no uncertainty.
    entropies[(probs == 0) | (probs == 1)] = 0

    return entropies

In [10]:
def get_uncertainty(predicts):
    """Decompose the uncertainty of an ensemble from its members' predictions.

    Args:
        predicts: sequence holding one entry of predictions per ensemble
            member; entries are added with ``sum`` and passed to ``entropy``.

    Returns:
        Tuple ``(total, total_uncertainty, expected_data_uncertainty,
        knowledge_uncertainty)``: the mean prediction, the entropy of that
        mean, the mean of the per-member entropies, and the difference
        between the latter two.
    """
    n_members = len(predicts)

    total = sum(predicts) / n_members
    total_uncertainty = entropy(total)

    per_member_entropy = [entropy(member_predict) for member_predict in predicts]
    expected_data_uncertainty = sum(per_member_entropy) / len(per_member_entropy)

    knowledge_uncertainty = total_uncertainty - expected_data_uncertainty

    return total, total_uncertainty, expected_data_uncertainty, knowledge_uncertainty

In [11]:
# Only the averaged prediction and its entropy are used below; the two
# remaining uncertainty terms are discarded.
total, total_uncertainty, _, _ = get_uncertainty(ensemble_models_predicts)

In [52]:
# Convert to arrays so pairs can be selected with a boolean mask.
test_inputs = np.array(test_inputs)
test_others = np.array(test_others)

# Keep the test pairs whose ensemble-mean entropy exceeds 0.05.
is_uncertain = total_uncertainty > 0.05
uncert_inp = test_inputs[is_uncertain]
uncert_oth = test_others[is_uncertain]

In [53]:
# Number of test pairs selected as uncertain.
uncert_inp.shape

(122,)

In [54]:
# One flat array of query texts: all first sides of the uncertain pairs,
# followed by all second sides.
uncert_queries = np.append(uncert_inp, uncert_oth)

In [55]:
# Both sides of every uncertain pair, i.e. twice the number of uncertain pairs.
uncert_queries.shape

(244,)

In [56]:
def get_samples(queries, hard_negatives=1, soft_negatives=1, k=20, pool_size=120000):
    """Pick pretrain-record indices for every query text.

    For each query, ``hard_negatives`` indices are taken from its nearest
    neighbours in ``use_hnsw`` (every second neighbour: ranks 0, 2, 4, ...)
    and ``soft_negatives`` indices are drawn uniformly at random with the
    ``random`` module. Reads the notebook globals ``model_use``, ``use_hnsw``
    and ``pretrain_data``.

    Args:
        queries: iterable of query texts.
        hard_negatives: number of nearest-neighbour picks per query.
        soft_negatives: number of random picks per query.
        k: minimum number of neighbours requested from the index. It is
            raised automatically when ``hard_negatives`` needs a deeper rank
            (with a fixed 20, more than 10 hard negatives raised IndexError).
        pool_size: random picks are drawn from the first ``pool_size``
            pretrain records (defaults to the 120000 previously hard-coded).

    Returns:
        Flat list of ``int`` indices ordered pick by pick: first pick 0 for
        all queries, then pick 1 for all queries, and so on (nearest-neighbour
        picks before random picks).
    """
    # The deepest rank read below is 2 * (hard_negatives - 1).
    k = max(k, 2 * hard_negatives - 1)

    # Same value as len(pretrain_data[:pool_size]), without copying the list
    # on every random draw.
    n_candidates = min(len(pretrain_data), pool_size)

    samples = [[] for _ in range(hard_negatives + soft_negatives)]

    for row in tqdm(queries):
        vector = model_embed_text(model_use, row)

        # knn_query returns (labels, distances); take the labels of the
        # single query vector.
        labels = list(use_hnsw.hnsw.knn_query(vector, k=k)[0][0])

        for i in range(hard_negatives):
            # Plain int keeps the returned list homogeneous with the random picks.
            samples[i].append(int(labels[i * 2]))

        for i in range(soft_negatives):
            samples[i + hard_negatives].append(random.randint(0, n_candidates - 1))

    return [index for pick in samples for index in pick]

In [57]:
# For every uncertain query: 3 nearest-neighbour picks and 2 random picks.
bad_samples_indexes = get_samples(uncert_queries, 3, 2)

# Render every selected pretrain record as "title text".
train_oth_new_pairs = [
    pretrain_data[index]['title'] + ' ' + pretrain_data[index]['text']
    for index in bad_samples_indexes
]
    

HBox(children=(FloatProgress(value=0.0, max=244.0), HTML(value='')))




In [58]:
# Every mined pair gets label 0, the value get_data assigns to non-'OK' pairs.
new_qual = np.zeros_like(bad_samples_indexes)

In [59]:
# One label per mined sample.
new_qual.shape

(1220,)

In [60]:
# get_samples returns its picks one pick-slot at a time (all queries for
# pick 0, then all queries for pick 1, ...), so the query array is repeated
# once per pick to line up with the mined samples.
picks_per_query = len(bad_samples_indexes) // len(uncert_queries)
tile_uncert_inp = np.tile(uncert_queries, picks_per_query)

In [61]:
# Append the mined (query, pretrain text, label 0) triples to the training data.
# NOTE(review): this cell rebinds the train_* names to their own extended
# versions, so it is not idempotent -- running it twice appends the mined
# pairs twice.
# NOTE(review): the appended queries were selected from the test split
# (uncert_inp / uncert_oth come from test_inputs / test_others), so test texts
# end up in the training set -- confirm this is intended.
train_inputs = np.append(np.array(train_inputs), tile_uncert_inp)
train_others = np.append(np.array(train_others), train_oth_new_pairs)
train_qualities = np.append(np.array(train_qualities), new_qual)

In [62]:
# The three training arrays must stay aligned element by element.
assert train_inputs.shape == train_others.shape == train_qualities.shape

In [64]:
from sentence_transformers import InputExample
from sentence_transformers.cross_encoder.evaluation import CEBinaryAccuracyEvaluator

# Binary accuracy is evaluated on the test pairs and their 0/1 labels.
eval_pairs = list(zip(test_inputs, test_others))
evaluator = CEBinaryAccuracyEvaluator(eval_pairs, test_qualities)

# One InputExample per training pair; labels are passed as floats.
train_examples = [
    InputExample(texts=[inp, oth], label=float(qual))
    for inp, oth, qual in zip(train_inputs, train_others, train_qualities)
]

In [None]:
from sentence_transformers import CrossEncoder
from torch.utils.data import DataLoader

model_name = 'bert-base-multilingual-uncased'

# Train five ensemble members from the same base checkpoint; each one gets a
# fresh model, a fresh shuffled dataloader and its own output folder.
for model_no in range(5):
    model = CrossEncoder(model_name, num_labels=1)
    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

    output_dir = f'{model_name}UNC2_/{model_no}'
    model.fit(
        train_dataloader=train_dataloader,
        evaluator=evaluator,
        warmup_steps=600,
        epochs=9,
        output_path=output_dir,
    )