In [1]:
import torch
import numpy as np
import sentence_transformers

from util import parse_db, Document, read_markup_tsv
from torch.utils.data import Dataset, DataLoader

import random

records = parse_db("data_/0525_parsed.db")


In [104]:
from collections import Counter
from statistics import median, mean
from sklearn.cluster import AgglomerativeClustering
from purano.clusterer.metrics import calc_metrics


def get_quality(markup, dist_matrix, records, dist_threshold, print_result=False):
    """Cluster ``records`` from a precomputed distance matrix and score the clusters.

    Runs average-linkage agglomerative clustering cut at ``dist_threshold``,
    maps every record URL to its cluster label and hands that mapping to
    ``calc_metrics`` together with ``markup``.

    Args:
        markup: Labelled pairs, forwarded unchanged to ``calc_metrics``.
        dist_matrix: Precomputed pairwise distances; row/column ``i`` is
            paired with ``records[i]``.
        records: Sequence of mappings that each provide a ``"url"`` key.
        dist_threshold: Value passed as ``distance_threshold`` to the clusterer.
        print_result: When true, print a report instead of returning a score.

    Returns:
        ``metrics["1"]["f1-score"]`` when ``print_result`` is false,
        otherwise ``None`` (the report is printed).
    """
    clusterer = AgglomerativeClustering(
        affinity="precomputed",
        linkage="average",
        distance_threshold=dist_threshold,
        n_clusters=None,
    )
    clusterer.fit(dist_matrix)
    labels = clusterer.labels_

    # Position in ``records`` -> URL, and URL -> record (a later duplicate URL wins).
    url2record = {record["url"]: record for record in records}
    idx2url = {idx: record["url"] for idx, record in enumerate(records)}
    # Label i belongs to the record at position i.
    url2label = {idx2url[idx]: label for idx, label in enumerate(labels)}

    metrics = calc_metrics(markup, url2record, url2label)[0]
    if not print_result:
        return metrics["1"]["f1-score"]

    print(metrics)
    print("Accuracy: {:.1f}".format(metrics["accuracy"] * 100.0))
    positives = metrics["1"]
    print("Positives Recall: {:.1f}".format(positives["recall"] * 100.0))
    print("Positives Precision: {:.1f}".format(positives["precision"] * 100.0))
    print("Positives F1: {:.1f}".format(positives["f1-score"] * 100.0))
    print("Distance: ", dist_threshold)

    # Cluster-size statistics over the predicted labels.
    sizes = list(Counter(labels).values())
    print("Max cluster size: ", max(sizes))
    print("Median cluster size: ", median(sizes))
    print("Avg cluster size: {:.2f}".format(mean(sizes)))

In [3]:
from sentence_transformers import InputExample

# Seed Python's `random` module so the shuffle inside separate_samples is repeatable.
random.seed(0)

def separate_samples(markup, train_percent=0.85):
    """Randomly split ``markup`` into a training part and a test part.

    The item positions are shuffled with the global ``random`` state; the first
    ``round(len(markup) * train_percent)`` shuffled items form the training
    part and the remainder forms the test part.

    Args:
        markup: Indexable sequence of labelled samples.
        train_percent: Fraction of the samples that goes to the training part.

    Returns:
        A ``(train_markup, test_markup)`` tuple of lists.
    """
    order = list(range(len(markup)))
    random.shuffle(order)

    # Clamp at 0 so a negative split point sends everything to the test part
    # instead of being interpreted as a negative slice index.
    cut = max(round(len(markup) * train_percent), 0)

    train_markup = [markup[idx] for idx in order[:cut]]
    test_markup = [markup[idx] for idx in order[cut:]]
    return train_markup, test_markup


def get_data(markup, records):
    """Turn labelled URL pairs into parallel lists of texts and binary labels.

    Args:
        markup: Iterable of mappings whose values are, in order, the first
            URL, the second URL and the quality verdict of a pair.
        records: Iterable of mappings with ``"url"``, ``"title"`` and
            ``"text"`` keys; it must cover every URL referenced by ``markup``.

    Returns:
        A ``(input_samples, other_samples, qualities)`` tuple of equally long
        lists. Each text is ``title + ' ' + text`` of the matching record and
        each quality is ``1`` when the verdict equals ``'OK'``, else ``0``.

    Raises:
        KeyError: If a URL from ``markup`` has no record in ``records``.
    """
    # Build the lookup from the ``records`` argument so the function does not
    # depend on a notebook-global ``url2record`` having been defined first.
    by_url = {record["url"]: record for record in records}

    def as_text(url):
        # Title and body joined by a single space form the model input.
        record = by_url[url]
        return record['title'] + ' ' + record['text']

    input_samples, other_samples, qualities = [], [], []
    for mrkp in markup:
        first_url, second_url, quality = mrkp.values()
        input_samples.append(as_text(first_url))
        other_samples.append(as_text(second_url))
        qualities.append(1 if quality == 'OK' else 0)
    return input_samples, other_samples, qualities


# URL -> record lookup used by later cells (a later duplicate URL wins).
url2record = {record["url"]: record for record in records}

# Load the labelled pairs, split them, and build texts/labels for each split.
markup = read_markup_tsv("data_/ru_clustering_0525_urls.tsv")
train_markup, test_markup = separate_samples(markup)
train_inputs, train_others, train_qualities = get_data(train_markup, records)
test_inputs, test_others, test_qualities = get_data(test_markup, records)

In [39]:
def get_data_quality(markup, url2record):
    """Collect the records behind both URLs of every markup pair.

    Args:
        markup: Iterable of mappings whose values are, in order, the first
            URL, the second URL and the quality verdict (the verdict is unused).
        url2record: Mapping from URL to record.

    Returns:
        A flat list ordered as first record, second record of pair 0, then
        first record, second record of pair 1, and so on.
    """
    quality_records = []
    for pair in markup:
        first_url, second_url, _quality = pair.values()
        quality_records.extend((url2record[first_url], url2record[second_url]))
    return quality_records

# Number of held-out pairs used for the clustering check, clamped so that
# 2 * valid_size always equals the number of validation texts.
valid_size = min(50, len(test_markup))
valid_records = get_data_quality(test_markup[:valid_size], url2record)

# Texts must follow the same order as valid_records (first0, second0, first1,
# second1, ...), because row i of the distance matrix is later paired with
# valid_records[i]. Concatenating all first texts followed by all second texts
# would attach cluster labels to the wrong URLs.
temp = [
    text
    for pair in zip(test_inputs[:valid_size], test_others[:valid_size])
    for text in pair
]
assert len(temp) == len(valid_records) == 2 * valid_size

# Every ordered pair of validation texts, row-major: entry (i, j) is [temp[i], temp[j]].
valid_samples = [[left, right] for left in temp for right in temp]


In [59]:
from sentence_transformers import CrossEncoder
from torch.utils.data import DataLoader

# Cross-encoder built on the multilingual BERT checkpoint with a single output label.
model = CrossEncoder('bert-base-multilingual-uncased', num_labels= 1)

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

In [None]:
from sentence_transformers.cross_encoder.evaluation import CEBinaryAccuracyEvaluator
# Binary-accuracy evaluator over the held-out text pairs and their 0/1 labels.
evaluator = CEBinaryAccuracyEvaluator(list(zip(test_inputs, test_others)), test_qualities)

# One training example per labelled pair of texts.
train_examples = [
    InputExample(texts=[first_text, second_text], label=target)
    for first_text, second_text, target in zip(train_inputs, train_others, train_qualities)
]
train_dataloader = DataLoader(train_examples, batch_size=16, shuffle=True)

model.fit(
    train_dataloader=train_dataloader,
    evaluator=evaluator,
    epochs=1,
    warmup_steps=600,
)

In [86]:
# Score every ordered pair in valid_samples; the next cell reshapes `result`
# into a (2 * valid_size) x (2 * valid_size) matrix, i.e. one value per pair.
result = model.predict(valid_samples)

HBox(children=(FloatProgress(value=0.0, description='Batches', max=313.0, style=ProgressStyle(description_widt…




In [99]:
# Build the square distance matrix `res` that the evaluation cell reads.
# `result` itself is left untouched so re-running this cell gives the same
# matrix instead of flipping between similarity and distance.
side = 2 * valid_size
scores = result.reshape((side, side))

# Each pair was scored in both orders; average the two scores and turn the
# symmetric similarity into a distance.
res = 1.0 - (scores + scores.T) / 2.0

# A text is at distance 0 from itself.
np.fill_diagonal(res, 0)


In [105]:
# Cluster the validation records with a distance threshold of 0.001 and print the report.
# NOTE(review): in the recorded output every cluster has size 1 and positive
# recall is 0, so this threshold merges nothing — consider sweeping larger values.
get_quality(test_markup[:valid_size], res, valid_records,0.001, print_result= True)

{'0': {'precision': 0.46, 'recall': 1.0, 'f1-score': 0.6301369863013699, 'support': 23}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 27}, 'accuracy': 0.46, 'macro avg': {'precision': 0.23, 'recall': 0.5, 'f1-score': 0.31506849315068497, 'support': 50}, 'weighted avg': {'precision': 0.2116, 'recall': 0.46, 'f1-score': 0.28986301369863016, 'support': 50}}
Accuracy: 46.0
Positives Recall: 0.0
Positives Precision: 0.0
Positives F1: 0.0
Distance:  0.001
Max cluster size:  1
Median cluster size:  1.0
Avg cluster size: 1.00


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
