In [1]:
from util import parse_db, Document, read_markup_tsv
import random
from torch.utils.data import Dataset, DataLoader

# Load the parsed records from the local DB file; later cells read each
# record's "url", "title" and "text" fields.
records = parse_db("data_/0525_parsed.db")

class CustomDataset(Dataset):
    """Dataset over two parallel sequences.

    Item ``i`` is the tuple ``(x[i], y[i])``; the reported length is
    ``len(x)``, captured once at construction time.
    """

    def __init__(self, x, y):
        # Keep references to both sequences; no copy is made.
        self.data_x, self.data_y = x, y
        self.len = len(x)

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        # Same position in both sequences forms one (input, target) pair.
        return self.data_x[index], self.data_y[index]


In [2]:
# Seed Python's `random` module so the shuffle inside pick_ok_samples is repeatable.
random.seed(0)

# Keep only the "OK" markup pairs for training and split the data into train/test
def pick_ok_samples(markup, train_percent=0.85):
    """Split markup pairs into a train list and a test list.

    Pairs whose ``'quality'`` is ``'OK'`` are shuffled with the global
    ``random`` state; the first ``round(n_ok * train_percent)`` of them
    (in shuffled order) go to train, the rest to test. Non-OK pairs are
    then appended to the test list, in their original order, for as long
    as the test list holds at most twice the number of OK test pairs.

    NOTE(review): because the cap is checked with a strict ``>``, the test
    list can end up with one more non-OK pair than OK pairs - confirm
    whether an exactly balanced test set was intended.

    Args:
        markup: iterable of dict-like items with a ``'quality'`` key.
        train_percent: fraction of OK pairs assigned to train.

    Returns:
        ``(train_markup, test_markup)`` as two lists.
    """
    accepted, rejected = [], []
    for pair in markup:
        bucket = accepted if pair['quality'] == 'OK' else rejected
        bucket.append(pair)

    count_samples = len(accepted)
    order = list(range(count_samples))
    random.shuffle(order)
    train_len = round(count_samples * train_percent)

    # Positions below train_len in the shuffled order form the train split.
    train_markup = [accepted[j] for pos, j in enumerate(order) if pos < train_len]
    test_markup = [accepted[j] for pos, j in enumerate(order) if pos >= train_len]

    # Top the test split up with non-OK pairs until it exceeds the cap.
    cap = 2 * (count_samples - train_len)
    for pair in rejected:
        if len(test_markup) <= cap:
            test_markup.append(pair)
        else:
            break

    return train_markup, test_markup


def get_train_data(markup, records):
    """Build parallel lists of texts for every 'OK' markup pair.

    For each pair marked ``'OK'``, the text of the first URL goes to the
    first list and the text of the second URL to the second list, where a
    text is ``title + ' ' + text`` of the matching record.

    The URL lookup is built from ``records`` rather than from a
    module-level dict, so the function only depends on its arguments.
    Pair fields are read by key, so the result does not depend on the key
    order of the markup dicts.

    Args:
        markup: iterable of dicts with ``'first_url'``, ``'second_url'``
            and ``'quality'`` keys.
        records: iterable of dict-like records with ``'url'``, ``'title'``
            and ``'text'`` fields.

    Returns:
        ``(input_samples, pos_samples)`` - two lists of equal length.

    Raises:
        KeyError: if an 'OK' pair references a URL missing from ``records``.
    """
    # If a URL occurs more than once in `records`, the last record wins.
    record_by_url = {record['url']: record for record in records}

    def _as_text(url):
        record = record_by_url[url]
        return record['title'] + ' ' + record['text']

    input_samples, pos_samples = [], []
    for mrkp in markup:
        if mrkp['quality'] != 'OK':
            continue
        input_samples.append(_as_text(mrkp['first_url']))
        pos_samples.append(_as_text(mrkp['second_url']))

    return input_samples, pos_samples

def get_test_records(markup, records):
    """Collect the records referenced by the given markup pairs.

    A record is emitted once for every markup pair that mentions its URL
    as either ``'first_url'`` or ``'second_url'``, so a record referenced
    by several pairs appears several times. The output follows the order
    of ``records``, with repeats adjacent.

    The markup is scanned once to count references per URL, replacing a
    nested scan of every record against every pair.

    NOTE(review): the repeated records are kept to match the existing
    behaviour - confirm whether downstream clustering really wants
    duplicates.

    Args:
        markup: iterable of dicts with ``'first_url'`` and ``'second_url'``.
        records: iterable of dict-like records with a ``'url'`` field.

    Returns:
        List of records (possibly with repeats).
    """
    hits_per_url = {}
    for mrkp in markup:
        # A set, so a pair whose two URLs coincide counts only once.
        for url in {mrkp['first_url'], mrkp['second_url']}:
            hits_per_url[url] = hits_per_url.get(url, 0) + 1

    test_records = []
    for record in records:
        test_records.extend([record] * hits_per_url.get(record['url'], 0))

    return test_records


# Index every parsed record by its URL (a later duplicate URL overwrites an earlier one).
url2record = {}
for record in records:
    url2record[record["url"]] = record

# Read the pair markup, split it, and derive the training texts and test records.
markup = read_markup_tsv("data_/ru_clustering_0525_urls.tsv")
train_markup, test_markup = pick_ok_samples(markup)
input_samples, pos_samples = get_train_data(train_markup, records)
test_records = get_test_records(test_markup, records)


In [3]:
# Sanity-check the split: number of test records and of test markup pairs,
# followed by the first and the last test pair.
print(len(test_records))
print(len(test_markup))

print(test_markup[0])
print(test_markup[-1])

4382
2191
{'first_url': 'https://lenta.ru/news/2020/05/25/cubinka/?utm_medium=social&utm_source=telegram', 'second_url': 'https://www.kp.ru/daily/27134/4223524/', 'quality': 'OK'}
{'first_url': 'https://www.facenews.ua/news/2020/481127/', 'second_url': 'https://368.media/2020/05/25/v-odesskoj-oblasti-blokirovali-nezakonnyj-sbyt-kontrabandnyh-sigaret-iz-postsovetskih-stran/', 'quality': 'BAD'}


In [4]:
from collections import Counter
from statistics import median, mean
from sklearn.cluster import AgglomerativeClustering
from purano.clusterer.metrics import calc_metrics

def get_quality(markup, embeds, records, dist_threshold, print_result=False):
    """Cluster the embeddings and score the clusters against the markup.

    Runs average-linkage agglomerative clustering with cosine distance and
    the given distance threshold, maps each record URL to its cluster
    label and passes the result to ``calc_metrics``.

    Args:
        markup: markup pairs forwarded unchanged to ``calc_metrics``.
        embeds: one embedding row per entry of ``records``, same order.
        records: sequence of dict-like records with a ``'url'`` field.
        dist_threshold: ``distance_threshold`` for the clustering model.
        print_result: when True, print a metrics report and return None.

    Returns:
        ``metrics["1"]["f1-score"]`` when ``print_result`` is False,
        otherwise ``None`` (the report is printed instead).

    Raises:
        ValueError: if the number of cluster labels differs from the
            number of records.
    """
    clustering_params = dict(
        n_clusters=None,
        distance_threshold=dist_threshold,
        linkage="average",
    )
    # scikit-learn 1.2 renamed `affinity` to `metric` and 1.4 removed the
    # old name; older releases reject `metric` with a TypeError.
    try:
        clustering_model = AgglomerativeClustering(metric="cosine", **clustering_params)
    except TypeError:
        clustering_model = AgglomerativeClustering(affinity="cosine", **clustering_params)

    clustering_model.fit(embeds)
    labels = clustering_model.labels_

    if len(labels) != len(records):
        raise ValueError(
            "embeds and records must have the same length, got {} and {}".format(
                len(labels), len(records)
            )
        )

    # For a URL that occurs several times, the last record/label wins.
    url2record = {record["url"]: record for record in records}
    url2label = {record["url"]: label for record, label in zip(records, labels)}

    # Only the first element returned by calc_metrics is used here.
    metrics = calc_metrics(markup, url2record, url2label)[0]
    if print_result:
        print()
        print("Accuracy: {:.1f}".format(metrics["accuracy"] * 100.0))
        print("Positives Recall: {:.1f}".format(metrics["1"]["recall"] * 100.0))
        print("Positives Precision: {:.1f}".format(metrics["1"]["precision"] * 100.0))
        print("Positives F1: {:.1f}".format(metrics["1"]["f1-score"] * 100.0))
        print("Distance: ", dist_threshold)
        sizes = list(Counter(labels).values())
        print("Max cluster size: ", max(sizes))
        print("Median cluster size: ", median(sizes))
        print("Avg cluster size: {:.2f}".format(mean(sizes)))

        # Kept as-is: report mode returns nothing so the cell shows no Out[] value.
        return
    return metrics["1"]["f1-score"]


In [30]:
import sentence_transformers
from tqdm.notebook import tqdm

# Load the pretrained sentence encoder; its outputs are stored in 512-wide rows below.
use = sentence_transformers.SentenceTransformer('distiluse-base-multilingual-cased-v2')

import numpy as np

def use_get_embedding(text, model):
    """Encode a single text with ``model`` and return the encoder output.

    The text is wrapped in a one-element batch and the progress bar is
    disabled; whatever ``model.encode`` returns is passed through as-is.
    """
    single_item_batch = [text]
    return model.encode(single_item_batch, show_progress_bar=False)

def use_records_to_embeds(records, model):
    """Embed every record's ``title + " " + text`` into a ``(len(records), 512)`` array.

    Records are encoded one at a time, with a tqdm progress bar over the
    records; row ``i`` of the result belongs to ``records[i]``.
    """
    embeddings = np.zeros((len(records), 512))
    progress = tqdm(records)
    for row_idx, record in enumerate(progress):
        text = " ".join((record["title"], record["text"]))
        embeddings[row_idx] = use_get_embedding(text, model)
    return embeddings

# Baseline: embed the test records with the encoder alone (no extra head) and
# print clustering quality at distance threshold 0.37.
# NOTE(review): despite its name, `train_use_embeddings` holds embeddings of `test_records`.
train_use_embeddings = use_records_to_embeds(test_records, use)
get_quality(test_markup, train_use_embeddings, test_records, 0.37, print_result=True)

HBox(children=(FloatProgress(value=0.0, max=4382.0), HTML(value='')))



Accuracy: 94.0
Positives Recall: 90.7
Positives Precision: 97.2
Positives F1: 93.8
Distance:  0.37
Max cluster size:  25
Median cluster size:  2
Avg cluster size: 2.33


In [5]:
from torch.utils.data import Dataset, DataLoader


class CustomDataset(Dataset):
    """Dataset over two parallel sequences.

    Item ``i`` is the tuple ``(x[i], y[i])``; the reported length is
    ``len(x)``, captured once at construction time.

    NOTE(review): this repeats the definition from the first cell and
    shadows it when the cell runs.
    """

    def __init__(self, x, y):
        # Keep references to both sequences; no copy is made.
        self.data_x, self.data_y = x, y
        self.len = len(x)

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        # Same position in both sequences forms one (input, target) pair.
        return self.data_x[index], self.data_y[index]


# Training pairs: item i is (input_samples[i], pos_samples[i]).
dataset = CustomDataset(input_samples, pos_samples)

In [6]:
import math

import torch
from torch import nn

class GELU(nn.Module):
    """Element-wise GELU activation computed with the tanh approximation."""

    def forward(self, x):
        # 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
        scale = math.sqrt(2 / math.pi)
        cubic_term = 0.044715 * torch.pow(x, 3)
        gate = torch.tanh(scale * (x + cubic_term))
        return 0.5 * x * (1 + gate)


class FeedForwardTop(nn.Module):
    """Two-layer feed-forward head mapping 512-wide inputs to 512-wide outputs.

    Pipeline: Linear(512, 512) -> GELU -> Dropout -> Linear(512, 512).

    Args:
        dropout: probability passed to ``nn.Dropout``.
    """

    def __init__(self, dropout = 0.05):
        super().__init__()
        width = 512
        # Sub-modules are registered in the same order as before so that
        # parameter ordering (and any order-dependent init) is unchanged.
        self.f1 = nn.Linear(width, width)
        self.dropout = nn.Dropout(dropout)
        self.gelu = GELU()
        self.f2 = nn.Linear(width, width)

    def forward(self, x):
        hidden = self.f1(x)
        hidden = self.gelu(hidden)
        hidden = self.dropout(hidden)
        return self.f2(hidden)


In [7]:
from tqdm.notebook import tqdm

def use_get_embedding(text, model, ff):
    """Encode one text with ``model``, pass it through ``ff``, return a NumPy array.

    The forward pass runs under ``torch.no_grad()`` because the result is
    only used for evaluation. The encoded input is moved to the device of
    ``ff``'s first parameter (when it has one) and the output is moved
    back to the CPU before conversion, so the function also works when
    ``ff`` lives on a GPU.

    Args:
        text: the text to embed.
        model: encoder exposing ``encode(list_of_texts, show_progress_bar=...)``.
        ff: callable (typically an ``nn.Module``) applied to the encoded tensor.

    Returns:
        NumPy array holding ``ff``'s output for the one-element batch.
    """
    encoded = torch.tensor(model.encode([text], show_progress_bar = False))

    # Plain callables and parameter-free modules have no device to follow.
    parameters = getattr(ff, "parameters", None)
    first_param = next(iter(parameters()), None) if callable(parameters) else None
    if first_param is not None:
        encoded = encoded.to(first_param.device)

    with torch.no_grad():
        projected = ff(encoded)

    return projected.detach().cpu().numpy()

def use_records_to_embeds(records, model, ff):
    """Embed every record's ``title + " " + text`` through ``model`` and ``ff``.

    Records are processed one at a time; row ``i`` of the returned
    ``(len(records), 512)`` array belongs to ``records[i]``.
    """
    embeddings = np.zeros((len(records), 512))
    for row_idx, record in enumerate(records):
        text = " ".join((record["title"], record["text"]))
        embeddings[row_idx] = use_get_embedding(text, model, ff)
    return embeddings

In [9]:
import sentence_transformers
from tqdm.notebook import tqdm

# Load the pretrained sentence encoder (same model name as in the baseline cell above).
use = sentence_transformers.SentenceTransformer('distiluse-base-multilingual-cased-v2')


I0228 20:10:15.523839 10136 SentenceTransformer.py:39] Load pretrained SentenceTransformer: distiluse-base-multilingual-cased-v2
I0228 20:10:15.524805 10136 SentenceTransformer.py:43] Did not find folder distiluse-base-multilingual-cased-v2
I0228 20:10:15.524805 10136 SentenceTransformer.py:49] Try to download model from server: https://sbert.net/models/distiluse-base-multilingual-cased-v2.zip
I0228 20:10:15.526800 10136 SentenceTransformer.py:100] Load SentenceTransformer from folder: C:\Users\Ellie/.cache\torch\sentence_transformers\sbert.net_models_distiluse-base-multilingual-cased-v2
I0228 20:10:17.277148 10136 SentenceTransformer.py:124] Use pytorch device: cuda


In [None]:
import torch.nn.functional as F
import numpy as np
from tqdm.notebook import tqdm


# Grid search over learning rate, batch size and dropout for the feed-forward
# head trained on top of the sentence encoder. After every epoch the test
# records are re-embedded and the best clustering F1 over a range of distance
# thresholds is printed.
torch.manual_seed(0)

device = 'cpu'
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

EPOCHES = 5

for lr in [0.001, 0.005, 0.01, 0.05, 0.1]:
    for batch_size in [16, 32, 64]:
        for dropout in [0.05, 0.15, 0.5]:
            # Pass the swept value through; otherwise every run silently
            # trains with the class default and the dropout axis is a no-op.
            net = FeedForwardTop(dropout=dropout).to(device)

            # Xavier-initialise weight matrices; 1-D parameters (biases) keep their defaults.
            for p in net.parameters():
                if p.dim() > 1:
                    torch.nn.init.xavier_uniform_(p)

            optimizer = torch.optim.Adam(net.parameters(), lr=lr)
            train_loader = DataLoader(dataset, batch_size=batch_size)

            for epoch in range(EPOCHES):
                total_loss = 0
                net.train()
                # Batch variables get their own names so the global
                # `input_samples` / `pos_samples` lists are not overwritten.
                for query_batch, positive_batch in tqdm(train_loader):
                    optimizer.zero_grad()

                    q_vectors = torch.tensor(use.encode(query_batch, show_progress_bar = False)).to(device)
                    ctx_vectors = torch.tensor(use.encode(positive_batch, show_progress_bar = False)).to(device)

                    # L2-normalise so the dot products below are cosine similarities.
                    q_vectors = F.normalize(net(q_vectors), p=2, dim = -1)
                    ctx_vectors = F.normalize(net(ctx_vectors), p=2, dim = -1)

                    # scores[i, j] = similarity of query i to positive j.
                    scores = torch.matmul(q_vectors, torch.transpose(ctx_vectors, 0, 1))
                    softmax_scores = F.log_softmax(scores, dim=1)

                    # The correct positive for query i sits at column i; the
                    # other columns in the batch act as negatives.
                    pos_idx_per_question = torch.arange(q_vectors.size(0), device=device)

                    loss = F.nll_loss(softmax_scores,
                               pos_idx_per_question,
                               reduction='mean')

                    loss.backward()
                    optimizer.step()

                    # detach() instead of the legacy .data accessor; same value, no graph.
                    total_loss += loss.detach()

                net.eval()
                test_embeds = use_records_to_embeds(test_records, use, net)

                print(f'EPOCH {epoch} lr: {lr} batch_size: {batch_size}, dropout: {dropout}, loss_on_epoch: {total_loss/len(train_loader)}')

                # F1 for each candidate distance threshold, keyed by str(threshold).
                f1_scores = {}

                for dist in np.linspace(0.15, 0.37, 11):
                    f1_scores[str(dist)] = get_quality(test_markup, test_embeds, test_records, dist, print_result=False)

                # Report every threshold that reaches the best F1 (ties included).
                f1_max_score = max(f1_scores.values())
                for dist, f1_score in f1_scores.items():
                    if f1_score == f1_max_score:
                        print(f'F1-score: {f1_max_score * 100} dist: {dist}')
                print()

            print('\n !!!!!!!!!! \n')

HBox(children=(FloatProgress(value=0.0, max=388.0), HTML(value='')))


EPOCH 0 lr: 0.001 batch_size: 16, dropout: 0.05, loss_on_epoch: 2.034864902496338
F1-score: 82.35294117647058 dist: 0.172



HBox(children=(FloatProgress(value=0.0, max=388.0), HTML(value='')))


EPOCH 1 lr: 0.001 batch_size: 16, dropout: 0.05, loss_on_epoch: 1.9880783557891846
F1-score: 83.45187292555713 dist: 0.194



HBox(children=(FloatProgress(value=0.0, max=388.0), HTML(value='')))


EPOCH 2 lr: 0.001 batch_size: 16, dropout: 0.05, loss_on_epoch: 1.977678894996643
F1-score: 83.59191349318289 dist: 0.194



HBox(children=(FloatProgress(value=0.0, max=388.0), HTML(value='')))


