In [1]:
import pandas as pd
import spacy

from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
from functools import lru_cache

from backend.AI_services.ai_services.vector_storage import VectorStorage
from backend.AI_services.ai_services.models.fact_checker import FactCheckerPipeline
from backend.AI_services.ai_services.preprocessing import get_default_coref_pipeline
from backend.AI_services.ai_services.utils import disable_fastcoref_progress_bar
from setup_clearml_env import setup
from metric import *

# Register tqdm's pandas integration (adds `progress_apply` and friends to pandas objects).
tqdm.pandas()
# Project helper from ai_services.utils; by its name it silences fastcoref's
# progress bars — NOTE(review): confirm against the helper's implementation.
disable_fastcoref_progress_bar()
# Project helper from setup_clearml_env, called with a fixed seed of 42.
# Presumably seeds the RNGs and/or configures the ClearML environment — TODO confirm.
setup(seed=42)

In [2]:
# Load the evaluation set and replace every missing cell (in any column) with
# the literal string "[]".
# NOTE(review): presumably "[]" stands for "no annotated errors" in the
# `errors_in_sentences` column — confirm that no other column contains NaN.
data = pd.read_csv("../data/test.csv").fillna("[]")

In [10]:
# --- Embedding model -------------------------------------------------------
# Name and device handed to SentenceTransformer when the embedder is created.
sentence_transformer_model = "intfloat/e5-base-v2"
sentence_transformer_device = "cpu"

# --- Fact checker ----------------------------------------------------------
# Keyword arguments forwarded to FactCheckerPipeline by evaluate_config;
# "processing_device" is also used to build the coreference pipeline.
fact_checker_base = dict(
    processing_device="cpu",
    device="cpu",
    get_explanation=False,
    automatic_contextualisation=True,
)

# --- Retrieval -------------------------------------------------------------
# Vector-storage search settings forwarded to FactCheckerPipeline.
search_params = dict(
    storage_search_k=3,
    storage_search_threshold=0.2,
)

# Path passed to `storage.load(...)` before each evaluation run.
vector_storage_file = "../data/vector_storages/storage-chunk_2_processed"

In [4]:
# Sentence-embedding model; wrapped by `get_sentence_embeddings` and used to
# size the vector storage.
model = SentenceTransformer(
    sentence_transformer_model,
    device=sentence_transformer_device,
)

# spaCy English pipeline; handed to `sentence_classification_report` during evaluation.
nlp = spacy.load("en_core_web_sm")

05/07/2025 23:18:10 - INFO - 	 Load pretrained SentenceTransformer: intfloat/e5-base-v2


In [5]:
@lru_cache(maxsize=None)
def get_sentence_embeddings(text: str, **kwargs):
    """Encode ``text`` with the module-level ``model`` and memoise the result.

    Args:
        text: Input forwarded to ``model.encode``. Because of ``lru_cache`` it
            must be hashable, so a plain ``list`` of sentences raises ``TypeError``.
        **kwargs: Extra options forwarded unchanged to ``model.encode``; they
            are part of the cache key and must be hashable as well.

    Returns:
        Whatever ``model.encode`` returns for the given arguments. The cache is
        unbounded and hands back the *same* object on every hit, so callers
        must not modify the result in place.

    NOTE(review): the E5 model card describes "query: " / "passage: " input
    prefixes; this function adds none — verify whether the caller does.
    """
    embedding = model.encode(text, **kwargs)
    return embedding

In [6]:
# Size the storage from the embedding model so both always agree on the
# vector dimension, and plug in the cached encoder as the embedder.
embedding_dim = model.get_sentence_embedding_dimension()
storage = VectorStorage(dim=embedding_dim, embedder=get_sentence_embeddings)

In [7]:
# Coreference preprocessing pipeline, built on the device configured as the
# fact checker's "processing_device".
coref_pipeline = get_default_coref_pipeline(
    device=fact_checker_base["processing_device"],
)

In [11]:
def evaluate_config(dataset):
    """Run the fact checker over ``dataset`` and print the evaluation reports.

    The function loads ``vector_storage_file`` into the module-level
    ``storage``, builds a ``FactCheckerPipeline`` from ``fact_checker_base``
    and ``search_params``, evaluates every text and prints, in order, the
    paragraph-level report, the sentence-level report and the suggestion-level
    metrics.

    Args:
        dataset: DataFrame with a ``"text"`` column (strings to check) and an
            ``"errors_in_sentences"`` column (ground truth; stringified and
            parsed with ``parse_suggestions_column``). It is not modified.

    Returns:
        None. The reports are written to stdout.
    """
    storage.load(vector_storage_file)

    fact_checker = FactCheckerPipeline(
        vector_storage=storage,
        processing_pipeline=coref_pipeline,
        processing_device=fact_checker_base["processing_device"],
        device=fact_checker_base["device"],
        get_explanation=fact_checker_base["get_explanation"],
        storage_search_k=search_params["storage_search_k"],
        storage_search_threshold=search_params["storage_search_threshold"],
        automatic_contextualisation=fact_checker_base["automatic_contextualisation"],
    )

    texts = dataset["text"].tolist()

    # Predictions are collected positionally in a plain list. Growing a
    # DataFrame with `.loc[i] = ...` reallocates on every row and, when the
    # index contains repeated labels, silently overwrites earlier rows so that
    # predictions end up attached to the wrong texts.
    pred_suggestions = []
    for text in tqdm(texts, total=len(texts), desc="Evaluating"):
        # The checker receives a lower-cased, stripped, single-line version of
        # the text; the reports below still use the original text.
        predictions = fact_checker.evaluate_text(
            text.lower().strip().replace("\n", " "),
        )
        # `fact.index + 1` shifts the reported index by one — presumably to
        # match 1-based sentence numbers in the annotations; TODO confirm.
        flagged = str([s.fact.index + 1 for s in predictions])
        pred_suggestions.append(parse_suggestions_column(flagged))

    true_suggestions = (
        dataset["errors_in_sentences"]
        .apply(str)
        .apply(parse_suggestions_column)
        .tolist()
    )

    print(paragraph_classification_report(true_suggestions, pred_suggestions))
    print(sentence_classification_report(texts, true_suggestions, pred_suggestions, nlp))
    print(suggestion_level_metrics(true_suggestions, pred_suggestions))


In [12]:
# Evaluate the current configuration on the test set; prints the paragraph-level
# report, the sentence-level report and the suggestion-level metrics.
# NOTE(review): in the recorded run the "error" class has precision/recall 0.0
# at every level, i.e. the checker flagged nothing — investigate before
# trusting these numbers (e.g. search threshold, storage contents, input casing).
evaluate_config(data)

Evaluating:   0%|          | 0/50 [00:00<?, ?it/s]

              precision    recall  f1-score   support

    no_error      0.500     1.000     0.667        25
       error      0.000     0.000     0.000        25

    accuracy                          0.500        50
   macro avg      0.250     0.500     0.333        50
weighted avg      0.250     0.500     0.333        50

              precision    recall  f1-score   support

    no_error      0.702     1.000     0.825       139
       error      0.000     0.000     0.000        59

    accuracy                          0.702       198
   macro avg      0.351     0.500     0.412       198
weighted avg      0.493     0.702     0.579       198

{'precision': 0.0, 'recall': 0.0, 'f1': 0.0}
