In [1]:
from fastcoref import spacy_component
import spacy
from fastcoref import FCoref

# Start from a blank Thai spaCy pipeline and attach the fastcoref component to it.
nlp = spacy.blank("th")
# add_pipe() is the last expression of the cell, so its return value (the
# component instance) is what the notebook displays.
nlp.add_pipe(
    "fastcoref",
    config={
        "model_architecture": "FCoref",
        # Machine-specific absolute path to the model directory.
        "model_path": "/home/poomphob/Desktop/Thesis/fastcoref/wangchan_6616/model",
        # NOTE(review): hard-coded to the first CUDA device — presumably fails on a
        # machine without a GPU; confirm whether a CPU fallback is wanted.
        "device": "cuda:0",
    },
)

2023-10-05 15:44:49.258293: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-10-05 15:44:50.339513: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-10-05 15:44:50.339815: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1956] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your 

<fastcoref.spacy_component.spacy_component.FastCorefResolver at 0x7fe5de45cfd0>

In [2]:
# Thai example input passed to the coreference pipeline in the next cell.
text = "สาวแค้นโดนบอกเลิก ขับรถเบนซ์หรูของแฟนหนุ่มลงสระ เผยปมเหตุขอเงินทำธุรกิจแต่ผู้ชายไม่ให้ #ต่างประเทศ #เรื่องเล่าเช้านี้"

In [3]:
# Run the pipeline on the example text; the coreference result is read from
# the `coref_clusters` extension attribute on the returned Doc.
doc = nlp(text)
# Each cluster prints as a list of (start, end) pairs — presumably character
# offsets into `text`; TODO confirm against the fastcoref documentation.
print(doc._.coref_clusters)


10/05/2023 15:44:58 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

10/05/2023 15:44:58 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

[[(0, 7), (7, 10), (18, 23), (48, 51)], [(34, 44), (74, 83)]]


# Eval pipeline

## Function

In [17]:
import logging
from fastcoref.utilities.metrics import MentionEvaluator, CorefEvaluator
from transformers import AutoConfig, AutoTokenizer
from fastcoref.coref_models.modeling_fcoref import FCorefModel
from fastcoref.utilities.util import create_mention_to_antecedent, create_clusters, output_evaluation_metrics, update_metrics
from tqdm.auto import tqdm
import numpy as np
import torch
# Logger used by the helper functions defined in this cell.
logger = logging.getLogger(__name__)
# Timestamped, INFO-level log format.
# NOTE(review): basicConfig() does nothing when the root logger already has
# handlers; earlier cells already emit log lines in this same format, so this
# call may have no effect here — confirm.
logging.basicConfig(format='%(asctime)s - %(levelname)s - \t %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO)


def load_f_coref_model(args):
    """Load an FCoref model and its tokenizer from ``args.model_name_or_path``.

    The transformer config is loaded first and extended with a ``coref_head``
    section assembled from the head hyper-parameters on ``args``. The tokenizer
    and the model are then loaded from the same location and cache directory.
    The parameter counts reported by the model are logged in millions.

    Args:
        args: Object exposing ``model_name_or_path``, ``cache_dir``,
            ``max_span_length``, ``top_lambda``, ``ffnn_size``,
            ``dropout_prob`` and ``max_segment_len``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    logger.info(f'Loading FCoref model with underlying transformer {args.model_name_or_path}')

    checkpoint = args.model_name_or_path
    cache = args.cache_dir

    config = AutoConfig.from_pretrained(checkpoint, cache_dir=cache)
    # Copy the coref-head hyper-parameters from args onto the config, in this order.
    head_fields = ("max_span_length", "top_lambda", "ffnn_size", "dropout_prob", "max_segment_len")
    config.coref_head = {field: getattr(args, field) for field in head_fields}

    tokenizer = AutoTokenizer.from_pretrained(
        checkpoint,
        use_fast=True,
        add_prefix_space=True,
        cache_dir=cache,
    )

    model = FCorefModel.from_pretrained(
        checkpoint,
        output_loading_info=False,
        config=config,
        cache_dir=cache,
    )

    # num_parameters() is unpacked as two counts (transformer, coref head).
    t_params, h_params = (count / 1000000 for count in model.num_parameters())
    logger.info(f'FCoref Parameters: {t_params + h_params:.1f}M, '
                f'Transformer: {t_params:.1f}M, Coref head: {h_params:.1f}M')

    return model, tokenizer

def evaluate(eval_sampler, model, prefix=''):
    """Run ``model`` over ``eval_sampler`` and collect metrics plus raw predictions.

    Args:
        eval_sampler: Iterable of batches that also exposes a ``dataset``
            attribute supporting ``len()``, or ``None`` to skip evaluation.
            Each batch is read through the keys ``'doc_key'``,
            ``'gold_clusters'`` and ``'input_ids'`` and is passed whole to
            the model.
        model: Model called as
            ``model(batch, gold_clusters=..., return_all_outputs=True)``; its
            output is unpacked as ``(loss, span_starts, span_ends,
            mention_logits, coref_logits)``.
        prefix: Passed through to ``output_evaluation_metrics``.

    Returns:
        A 3-tuple ``(results, error_analysis, doc_key_to_token_ids)``:

        * ``results`` - the value returned by ``output_evaluation_metrics``.
        * ``error_analysis`` - dict with the parallel lists ``"doc_key"``,
          ``"prediction_clusters"`` and ``"gold_clusters"`` (one entry per
          document, in evaluation order).
        * ``doc_key_to_token_ids`` - list with one ``(doc_keys, input_ids)``
          pair per batch, ``input_ids`` converted to a NumPy array.

        When ``eval_sampler`` is ``None`` the same 3-tuple shape is returned
        with empty containers, so callers can always unpack three values.
    """
    dataset_str = "val_set"
    if eval_sampler is None:
        logger.info(f'Skipping evaluation. {dataset_str} is None')
        # Keep the arity identical to the normal return path.
        return {}, {"doc_key": [], "prediction_clusters": [], "gold_clusters": []}, []

    model.eval()

    logger.info(f"***** Running evaluation on {dataset_str} - {len(eval_sampler.dataset)} documents *****")

    metrics_dict = {
        "loss": 0.0,
        "post_pruning": MentionEvaluator(),
        "mentions": MentionEvaluator(),
        "zero_mentions": MentionEvaluator(),
        "normal_mentions": MentionEvaluator(),
        "coref": CorefEvaluator(),
        "wozp_evaluator": CorefEvaluator(),
    }

    all_prediction_clusters = []
    all_gold_clusters = []
    doc_key_list = []
    doc_key_to_token_ids = []

    with tqdm(desc="Inference", total=len(eval_sampler.dataset)) as progress_bar:
        for batch in eval_sampler:
            doc_keys = batch['doc_key']
            gold_clusters = batch['gold_clusters']
            # Keep the raw input ids so spans can be decoded back to text later.
            doc_key_to_token_ids.append((doc_keys, batch['input_ids'].cpu().numpy()))

            with torch.no_grad():
                outputs = model(batch, gold_clusters=gold_clusters, return_all_outputs=True)

            # Everything below works on NumPy copies of the model outputs.
            outputs_np = tuple(tensor.cpu().numpy() for tensor in outputs)
            gold_clusters = gold_clusters.cpu().numpy()
            loss, span_starts, span_ends, _mention_logits, coref_logits = outputs_np
            metrics_dict['loss'] += loss.item()

            doc_indices, mention_to_antecedent = create_mention_to_antecedent(span_starts, span_ends, coref_logits)

            for i, doc_key in enumerate(doc_keys):
                # Select the antecedent links that belong to the i-th document of the batch.
                doc_mention_to_antecedent = mention_to_antecedent[np.nonzero(doc_indices == i)]
                predicted_clusters = create_clusters(doc_mention_to_antecedent)

                all_prediction_clusters.append(predicted_clusters)
                all_gold_clusters.append(gold_clusters[i])
                doc_key_list.append(doc_key)

                update_metrics(metrics_dict, span_starts[i], span_ends[i], gold_clusters[i], predicted_clusters)

            progress_bar.update(n=len(doc_keys))

    results = output_evaluation_metrics(
        metrics_dict=metrics_dict, prefix=prefix
    )
    error_analysis = {
        "doc_key": doc_key_list,
        "prediction_clusters": all_prediction_clusters,
        "gold_clusters": all_gold_clusters,
    }
    return results, error_analysis, doc_key_to_token_ids

In [18]:
from dataclasses import dataclass
from typing import Optional


@dataclass
class TrainingArgs:
    """Settings bundle handed to the model-loading and batching helpers.

    Only ``model_name_or_path`` is required; every other field has a default.
    Fields that default to ``None`` are annotated ``Optional`` so the
    annotations match the values the class actually holds.
    """
    # Location passed to the ``from_pretrained`` loaders.
    model_name_or_path: str
    overwrite_output_dir: bool = False
    learning_rate: float = 1e-5
    head_learning_rate: float = 3e-4
    # Copied into ``config.coref_head`` by ``load_f_coref_model``.
    dropout_prob: float = 0.3
    weight_decay: float = 0.01
    adam_beta1: float = 0.9
    adam_beta2: float = 0.98
    adam_epsilon: float = 1e-6
    epochs: float = 3
    # Copied into ``config.coref_head`` by ``load_f_coref_model``.
    ffnn_size: int = 1024
    logging_steps: int = 500
    eval_steps: int = 500
    seed: int = 42
    # Copied into ``config.coref_head`` by ``load_f_coref_model``.
    max_span_length: int = 30
    # Copied into ``config.coref_head`` by ``load_f_coref_model``.
    top_lambda: float = 0.4
    # Cache directory passed to the ``from_pretrained`` loaders.
    cache_dir: str = 'cache'
    # Copied into ``config.coref_head``; also given to the collator and sampler.
    max_segment_len: int = 512
    max_doc_len: Optional[int] = None
    # Given to the batch sampler as ``max_tokens``.
    max_tokens_in_batch: int = 5000
    device: Optional[str] = None

## Pipeline

In [19]:
# Device string reused for the model and for the batch collator.
device = "cuda:0"

# Only the checkpoint path and device differ from the TrainingArgs defaults;
# max_segment_len and max_tokens_in_batch are spelled out but equal the defaults.
args = TrainingArgs(
    "/home/poomphob/Desktop/Thesis/fastcoref/wangchan_6616/model",
    device = device,
    max_segment_len = 512,
    max_tokens_in_batch = 5000,
)

In [20]:
# Load the model and tokenizer from the configured checkpoint, then move the
# model onto the selected device.
model, tokenizer = load_f_coref_model(args)
model.to(device)
print("Model loaded")

10/05/2023 15:49:55 - INFO - 	 Loading FCoref model with underlying transformer /home/poomphob/Desktop/Thesis/fastcoref/wangchan_6616/model
10/05/2023 15:49:56 - INFO - 	 FCoref Parameters: 113.7M, Transformer: 105.2M, Coref head: 8.4M


Model loaded


In [21]:
import sys
import os
# Add the local fastcoref checkout directory to the module search path.
# NOTE(review): `fastcoref` was already imported in earlier cells and append()
# places this directory last on sys.path, so this presumably does not change
# which fastcoref package is in use — confirm the intent.
sys.path.append(os.path.abspath("/home/poomphob/Desktop/Thesis/fastcoref/"))

In [22]:
from fastcoref.utilities import coref_dataset
import spacy
from fastcoref.utilities.collate import DynamicBatchSampler,LeftOversCollator

# Validation split in JSON-lines format (machine-specific absolute path).
dataset_path = "/home/poomphob/Desktop/Thesis/s2e_coref/data/01_10_2023_doccano/val_tokens.jsonl"

# NOTE(review): this rebinds `nlp`, replacing the fastcoref-equipped pipeline
# built in the first cell with a plain blank Thai pipeline. Re-running the
# earlier inference cell afterwards would pick up this one — confirm intended.
nlp = spacy.blank("th")

# Build the evaluation dataset from the file, the model's tokenizer and the
# blank pipeline.
dataset = coref_dataset.create(
    dataset_path,
    tokenizer,
    nlp,
)

# Collator configured with the tokenizer, the target device and the segment length.
collator = LeftOversCollator(
    tokenizer=tokenizer,
    device=device,
    max_segment_len=args.max_segment_len
)

# Sampler over the dataset, limited by the token budget and segment length
# taken from `args`.
sampler = DynamicBatchSampler(
    dataset=dataset,
    collator=collator,
    max_tokens=args.max_tokens_in_batch,
    max_segment_len=args.max_segment_len
)



In [23]:
# Evaluate on the validation sampler. Besides the metrics, keep the
# per-document clusters and the per-batch input ids for the error analysis.
result, error_analysis, doc_key_to_token_ids = evaluate(sampler, model, prefix='val')

10/05/2023 15:50:06 - INFO - 	 ***** Running evaluation on val_set - 73 documents *****


Inference:   0%|          | 0/73 [00:00<?, ?it/s]

10/05/2023 15:50:06 - INFO - 	 ***** Eval results val *****
10/05/2023 15:50:06 - INFO - 	   eval_loss                      = 4.863
10/05/2023 15:50:06 - INFO - 	   normal mention precision       = 0.881
10/05/2023 15:50:06 - INFO - 	   normal mention recall          = 0.790
10/05/2023 15:50:06 - INFO - 	   normal mention f1              = 0.833
10/05/2023 15:50:06 - INFO - 	   zero mention precision         = 0.866
10/05/2023 15:50:06 - INFO - 	   zero mention recall            = 0.612
10/05/2023 15:50:06 - INFO - 	   zero mention f1                = 0.717
10/05/2023 15:50:06 - INFO - 	   mention precision              = 0.934
10/05/2023 15:50:06 - INFO - 	   mention recall                 = 0.759
10/05/2023 15:50:06 - INFO - 	   mention f1                     = 0.838
10/05/2023 15:50:06 - INFO - 	   wozp precision                 = 0.769
10/05/2023 15:50:06 - INFO - 	   wozp recall                    = 0.691
10/05/2023 15:50:06 - INFO - 	   wozp f1                        = 0.727
10/0

In [24]:
# Flatten the per-batch (doc_keys, input_ids) pairs into a single lookup:
# document key -> that document's entry in the batch's input-id array.
doc_key_to_token_ids_map = {}
for doc_keys, token_ids in doc_key_to_token_ids:
    # Iterating the array walks its first axis, one entry per document key.
    doc_key_to_token_ids_map.update(zip(doc_keys, token_ids))

In [25]:
def preprocess_cluster(clusters):
    """Strip ``-1`` padding from one document's cluster array.

    Args:
        clusters: Object exposing ``tolist()`` that yields a nested list of
            clusters, each a list of ``[start, end]`` mentions.

    Returns:
        A list of clusters in the original order. Mentions whose start or end
        equals ``-1`` are dropped, and clusters left without any mention are
        removed entirely.
    """
    cleaned = []
    for cluster in clusters.tolist():
        kept = []
        for mention in cluster:
            start, end = mention[0], mention[1]
            if start == -1 or end == -1:
                continue  # padding entry
            kept.append(mention)
        if kept:
            cleaned.append(kept)
    return cleaned

In [26]:
# Unpack the evaluation by-products into parallel per-document lists.
doc_keys = error_analysis["doc_key"]
prediction_clusters = error_analysis["prediction_clusters"]
# Run every document's gold array through preprocess_cluster to drop -1 entries.
gold_clusters = list(map(preprocess_cluster, error_analysis["gold_clusters"]))

In [27]:
# Inspect the cleaned gold clusters of the first evaluated document.
gold_clusters[0]

[[[7, 7], [2, 4]], [[12, 15], [9, 9], [1, 4]]]

In [28]:
import json
# Re-read the raw validation file (same path as `dataset_path`) so each
# document's original tokens can be looked up by line number.
# The file holds Thai text, so decode it explicitly as UTF-8 instead of relying
# on the platform's default encoding.
with open("/home/poomphob/Desktop/Thesis/s2e_coref/data/01_10_2023_doccano/val_tokens.jsonl", "r", encoding="utf-8") as f:
    # One JSON document per line, keyed by its 0-based line index.
    data = {i: json.loads(line) for i, line in enumerate(f)}

In [29]:
# For every evaluated document, print its full text followed by the gold and
# predicted clusters decoded back to strings.
for doc_key, gold_cluster, prediction_cluster in zip(doc_keys, gold_clusters, prediction_clusters):
    tokens = doc_key_to_token_ids_map[doc_key]
    # The document key is used as the line index into the raw JSONL data.
    print("full text: ", "".join(data[int(doc_key)]["tokens"]))
    for label, clusters in (("Gold cluster: ", gold_cluster), ("Pred cluster: ", prediction_cluster)):
        decoded = []
        for cluster in clusters:
            # Slice the span out of every row, then keep the first decoded row.
            decoded.append([tokenizer.batch_decode(tokens[:, start:stop])[0] for start, stop in cluster])
        print(label, decoded)
    print("====================================")


full text:  ลูกแม่ซาร่า มาแล้ว ตั้งชื่อว่า น้องไอร่า
Gold cluster:  [['', 'แม่ซาร่า'], ['น้องไอร่า', '', 'ลูก แม่ซาร่า']]
Pred cluster:  [['แม่ซาร่า', ''], ['ลูก แม่ซาร่า', '']]
full text:  จ๊ะ อาร์สยาม เข้าให้ปากคำ ดีเอสไอ คดีแชร์แม่มณี ในฐานะพยาน
Gold cluster:  [['จ๊ะ อาร์สยาม'], ['ดีเอสไอ'], ['แม่มณี']]
Pred cluster:  []
full text:  หนุ่มเทคนิคทำปืนลั่นใส่เพื่อนตายกลางถนน ส่วนตัวเองเจ็บนิ้วหวิดขาด
Gold cluster:  [['หนุ่มเทคนิค', 'ตัวเอง'], ['เพื่อน']]
Pred cluster:  [['หนุ่มเทคนิค', 'ตัวเอง']]
full text:  สตีฟ บัลเมอร์ รับ ผมเป็นสัญลักษณ์ของไมโครซอฟท์ยุคเก่า และถึงเวลาต้องไป
Gold cluster:  [['สตีฟ บัลเมอร์', '', 'ผม'], ['ไมโครซอฟท์']]
Pred cluster:  [['สตีฟ บัลเมอร์', 'ผม', '']]
full text:  สาว 17 ซิ่งเบนซ์หรูแหกโค้ง พุ่งทะลุบ้าน พังยับ คนขับบาดเจ็บเล็กน้อย
Gold cluster:  [['คนขับ', 'สาว 17']]
Pred cluster:  [['สาว 17', '', '', 'คนขับ']]
full text:  Jon Rubinstein ซีอีโอของ Palm บอก ผมไม่เคยใช้ iPhone
Gold cluster:  [['<unk>on <unk>ubinstein', 'ผม'], ['<unk>alm']]
Pred cluster:  [['