In [1]:
import torch
import numpy as np
import sentence_transformers

from util import parse_db, Document, read_markup_tsv
from torch.utils.data import Dataset, DataLoader

import random

records = parse_db("data_/0525_parsed.db")


In [2]:
from collections import Counter
from statistics import median, mean
from sklearn.cluster import AgglomerativeClustering
from purano.clusterer.metrics import calc_metrics


def get_quality(markup, dist_matrix, records, dist_threshold, print_result=False):
    clustering_model = AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=dist_threshold,
        linkage="average",
        affinity="precomputed"
    )

    clustering_model.fit(dist_matrix)
    labels = clustering_model.labels_
    print(labels)
    idx2url = dict()
    url2record = dict()
    for i, record in enumerate(records):
        idx2url[i] = record["url"]
        url2record[record["url"]] = record

    url2label = dict()
    for i, label in enumerate(labels):
        url2label[idx2url[i]] = label
        
    metrics = calc_metrics(markup, url2record, url2label)[0]
    if print_result:
        print()
        print("Accuracy: {:.1f}".format(metrics["accuracy"] * 100.0))
        print("Positives Recall: {:.1f}".format(metrics["1"]["recall"] * 100.0))
        print("Positives Precision: {:.1f}".format(metrics["1"]["precision"] * 100.0))
        print("Positives F1: {:.1f}".format(metrics["1"]["f1-score"] * 100.0))
        print("Distance: ", dist_threshold)
        sizes = list(Counter(labels).values())
        print("Max cluster size: ", max(sizes))
        print("Median cluster size: ", median(sizes))
        print("Avg cluster size: {:.2f}".format(mean(sizes)))
        return
    return metrics["1"]["f1-score"]

In [3]:
from sentence_transformers import InputExample

random.seed(0)

def separate_samples(markup, train_percent=0.85):

    count_samples = len(markup)
    shuffled_ids = list(range(count_samples))
    random.shuffle(shuffled_ids)
    train_len = round(count_samples * train_percent)

    train_markup, test_markup = [], []

    for i, id in enumerate(shuffled_ids):
        if i < train_len:
            train_markup.append(markup[id])
        else:
            test_markup.append(markup[id])        
           
    return train_markup, test_markup


def get_data(markup, records):
    input_samples, other_samples, qualities = [], [], []

    for mrkp in markup:
        first_url, second_url, quality = mrkp.values()

        input_samples.append(url2record[first_url]['title'] + ' ' + url2record[first_url]['text'])
        other_samples.append(url2record[second_url]['title'] + ' ' + url2record[second_url]['text'])
        if quality == 'OK':
            qualities.append(1)
        else:
            qualities.append(0)
    return input_samples, other_samples, qualities


url2record = dict()

for i, record in enumerate(records):
    url2record[record["url"]] = record


markup = read_markup_tsv("data_/ru_clustering_0525_urls.tsv")
train_markup, test_markup = separate_samples(markup)
train_inputs, train_others, train_qualities  = get_data(train_markup, records)
test_inputs, test_others, test_qualities  = get_data(test_markup, records)

In [4]:
print(test_inputs[3])
print(test_others[3])
print(test_qualities[3])

Киностудия Disney выпустила мультфильм о гее По сюжету парень хочет рассказать правду о своей сексуальной жизни родителям, но не может решиться. 
       Disney в партнерстве с Pixar впервые в своей истории выпустили девятиминутный мультфильм с главным героем-геем — Out, пишет  The New York Times . Главный герой Грег хочет переехать в другой город со своим партнером, но никак не может решиться на то, чтобы рассказать семье о своей сексуальной ориентации. Режиссером анимационной ленты стал Стивен Клэй Хантер, который ранее работал над мультфильмами «В поисках Немо» и «ВАЛЛ-И».
Pixar выпустила короткометражку с главным героем геем Студия Pixar впервые в своей истории выпустила мультфильм, главным героем которого стал гей. Короткометражка называется Out, длится девять минут и рассказывает историю персонажа по имени Грег. Он не знает, как сообщить родителям о своей сексуальной ориентации. Грег собирается переехать к своему парню Мануэлю, а с переездом ему решают помочь родители. Дальше уже 

In [29]:
def get_data_quality(markup, url2record):
    quality_records = []
    for mrkp in markup:
        first_url, second_url, quality = mrkp.values()
        quality_records.append(url2record[first_url])
        quality_records.append(url2record[second_url])
    return quality_records     

valid_size = 3
valid_records = get_data_quality(test_markup[:valid_size], url2record)

temp = test_inputs[:valid_size] + test_others[:valid_size]

valid_samples = []

for i in temp:
    for j in temp:
        valid_samples.append([i,j])        


In [22]:
from sentence_transformers import CrossEncoder
from torch.utils.data import DataLoader

model = CrossEncoder('bert-base-multilingual-uncased', num_labels= 1)

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

In [23]:
from sentence_transformers.cross_encoder.evaluation import CEBinaryAccuracyEvaluator
evaluator = CEBinaryAccuracyEvaluator(list(zip(test_inputs, test_others)), test_qualities)

train_examples = [InputExample(texts=[inp, oth], label = qual) for inp, oth, qual in zip(train_inputs, train_others, train_qualities)]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
model.fit(train_dataloader=train_dataloader,
         evaluator = evaluator,
         warmup_steps = 600,
         epochs=1)

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=789.0, style=ProgressStyle(description_wi…

I0303 16:59:15.612779 10360 CEBinaryAccuracyEvaluator.py:48] CESoftmaxAccuracyEvaluator: Evaluating the model on  dataset after epoch 0:





I0303 16:59:29.173570 10360 CEBinaryAccuracyEvaluator.py:56] Accuracy: 95.82





In [27]:
result = model.predict(valid_samples)

HBox(children=(FloatProgress(value=0.0, description='Batches', max=2.0, style=ProgressStyle(description_width=…




In [31]:
print(result)

[0.9965984  0.00215674 0.00219849 0.00216658 0.00220205 0.00221573
 0.00220085 0.99673826 0.00212976 0.00212844 0.0025838  0.00214319
 0.00222246 0.00225734 0.99670213 0.00213076 0.0022059  0.99646276
 0.00228272 0.00218905 0.00216477 0.99657935 0.00217327 0.00217405
 0.00231127 0.00336298 0.00226796 0.00225861 0.9967333  0.00229788
 0.00231042 0.0021926  0.99509084 0.00214962 0.00220766 0.99642557]


In [32]:
result = result.reshape((2*valid_size, 2*valid_size))
print(result)

[[0.9965984  0.00215674 0.00219849 0.00216658 0.00220205 0.00221573]
 [0.00220085 0.99673826 0.00212976 0.00212844 0.0025838  0.00214319]
 [0.00222246 0.00225734 0.99670213 0.00213076 0.0022059  0.99646276]
 [0.00228272 0.00218905 0.00216477 0.99657935 0.00217327 0.00217405]
 [0.00231127 0.00336298 0.00226796 0.00225861 0.9967333  0.00229788]
 [0.00231042 0.0021926  0.99509084 0.00214962 0.00220766 0.99642557]]


In [33]:
result = 1.0 - (result + np.transpose(result))/2.0

for i in range(2*valid_size):
    for j in range(2*valid_size):
        if i == j:
            result[i,j] = 0

print(result)

[[0.         0.9978212  0.9977895  0.9977754  0.99774337 0.99773693]
 [0.9978212  0.         0.99780643 0.99784124 0.9970266  0.9978321 ]
 [0.9977895  0.99780643 0.         0.9978522  0.9977631  0.00422323]
 [0.9977754  0.99784124 0.9978522  0.         0.9977841  0.99783814]
 [0.99774337 0.9970266  0.9977631  0.9977841  0.         0.99774724]
 [0.99773693 0.9978321  0.00422323 0.99783814 0.99774724 0.        ]]


In [34]:
print(len(result[result>0.5]))
print(len(result[result<0.5]))

28
8


In [65]:
clustering_model = AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=0.5,
        linkage="single",
        affinity="precomputed"
    )

In [66]:
clustering_model.fit(result)

AgglomerativeClustering(affinity='precomputed', distance_threshold=0.5,
                        linkage='single', n_clusters=None)

In [67]:
labels = clustering_model.labels_

In [78]:
len(set(labels))

5

In [74]:
#Мы дали 6 предложений на вход, 2 предложения имеют взаимную метку "OK" - то есть один кластер, и еще 4 предложения, которые  
# никак к другу не относятся, в итоге как раз 5. То есть до этого момента все работает.


In [75]:
from sklearn.metrics import classification_report


def calc_metrics(markup, url2record, labels):
    not_found_count = 0
    for record in markup:
        first_url = record["first_url"]
        second_url = record["second_url"]
        not_found_in_labels = first_url not in labels or second_url not in labels
        not_found_in_records = first_url not in url2record or second_url not in url2record
        if not_found_in_labels or not_found_in_records:
            not_found_count += 1
            markup.remove(record)
    if not_found_count != 0:
        print("Not found {} pairs from markup".format(not_found_count))

    targets = []
    predictions = []
    errors = []
    for record in markup:
        first_url = record["first_url"]
        second_url = record["second_url"]
        target = int(record["quality"] == "OK")
        prediction = int(labels[first_url] == labels[second_url])
        first = url2record.get(first_url)
        second = url2record.get(second_url)
        targets.append(target)
        predictions.append(prediction)
        if target == prediction:
            continue
        errors.append({
            "target": target,
            "prediction": prediction,
            "first_url": first_url,
            "second_url": second_url,
            "first_title": first["title"],
            "second_title": second["title"],
            "first_text": first["text"],
            "second_text": second["text"]
        })

    metrics = classification_report(targets, predictions, output_dict=True)
    return metrics, errors

In [76]:
idx2url = dict()
url2record = dict()

for i, record in enumerate(valid_records):
    idx2url[i] = record["url"]
    url2record[record["url"]] = record
    
url2label = dict()

for i, label in enumerate(labels):
    url2label[idx2url[i]] = label
    
metrics = calc_metrics(test_markup[:valid_size], url2record, url2label)[0]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [77]:
print(metrics)

{'0': {'precision': 0.6666666666666666, 'recall': 1.0, 'f1-score': 0.8, 'support': 2}, '1': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 1}, 'accuracy': 0.6666666666666666, 'macro avg': {'precision': 0.3333333333333333, 'recall': 0.5, 'f1-score': 0.4, 'support': 3}, 'weighted avg': {'precision': 0.4444444444444444, 'recall': 0.6666666666666666, 'f1-score': 0.5333333333333333, 'support': 3}}
