In [3]:
import pandas as pd

from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm

from backend.AI_services.ai_services.vector_storage import VectorStorage
from backend.AI_services.ai_services.models.fact_checker import FactCheckerPipeline
from backend.AI_services.ai_services.preprocessing import get_default_coref_pipeline
from backend.AI_services.ai_services.utils import disable_fastcoref_progress_bar
from metric import *

# Register tqdm's pandas integration (adds `progress_apply` & co. to pandas objects).
tqdm.pandas()
# Project helper; presumably silences fastcoref's own per-call progress bars so they
# do not flood the notebook output — TODO confirm against ai_services.utils.
disable_fastcoref_progress_bar()

In [4]:
# Sentence encoder used to embed queries for the vector storage; pinned to CPU.
model = SentenceTransformer(
    "intfloat/e5-base-v2",
    device="cpu",
)

05/04/2025 18:21:43 - INFO - 	 Load pretrained SentenceTransformer: intfloat/e5-base-v2


model.safetensors:  34%|###3      | 147M/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

In [5]:
# Vector index whose dimensionality is taken from the encoder and whose
# embedder is the encoder's `encode` method, so the two always agree.
storage = VectorStorage(embedder=model.encode, dim=model.get_sentence_embedding_dimension())

# Populate the storage from the previously saved index on disk.
storage.load("../data/rd2indexflat")

In [6]:
# Project-default coreference pipeline, built on CPU; it is handed to the
# fact checker below as its `processing_pipeline`.
coref_pipeline = get_default_coref_pipeline(device="cpu")

In [7]:
# Fact-checking pipeline under evaluation. Everything runs on CPU.
fact_checker = FactCheckerPipeline(
    # Components built in the cells above.
    vector_storage=storage,
    processing_pipeline=coref_pipeline,
    # Device placement.
    device="cpu",
    processing_device="cpu",
    # Retrieval settings — presumably top-k neighbours and a distance cut-off;
    # TODO confirm semantics/units against FactCheckerPipeline.
    storage_search_k=5,
    storage_search_threshold=1.5,
    # Behaviour switches: explanations are turned off for this evaluation run.
    automatic_contextualisation=True,
    get_explanation=False,
)

In [8]:
# Labelled evaluation set (path is relative to the notebook's directory).
data = pd.read_csv(filepath_or_buffer="../data/test.csv")

In [9]:
# Replace every missing value with the string "[]" (an empty-list literal).
# NOTE(review): this applies to ALL columns, not only `errors_in_sentences` —
# confirm that is intended for `text` as well.
data = data.fillna(value="[]")

In [10]:
# Run the fact checker over every test paragraph and collect one prediction
# record per row. Records are accumulated in a plain list and turned into a
# DataFrame once at the end: growing a frame with `preds.loc[i] = ...` inside
# the loop reallocates on every insert and forces object dtype on all columns.
pred_records = []
for text in tqdm(data["text"], total=len(data)):
    # The checker receives a lower-cased, stripped copy; the original text is
    # what gets stored so it can be matched against the ground truth later.
    predictions = fact_checker.evaluate_text(text.lower().strip())
    pred_records.append(
        {
            "text": text,
            # A paragraph counts as erroneous when at least one suggestion is returned.
            "is_error_in_paragraphs": len(predictions) != 0,
            # Stored as the string form of a list, mirroring the CSV ground-truth column.
            "errors_in_sentences": str([s.fact.index for s in predictions]),
        }
    )

# Reuse the test set's index so predictions stay aligned with `data` row by row.
preds = pd.DataFrame(
    pred_records,
    index=data.index,
    columns=["text", "is_error_in_paragraphs", "errors_in_sentences"],
)

  0%|          | 0/50 [00:00<?, ?it/s]

IndexError: list index out of range

In [46]:
# Normalise the raw `errors_in_sentences` column to strings and parse it into
# `suggestions_json`, first for the predictions and then for the ground truth,
# so both sides go through the same parser.
preds["errors_in_sentences"] = preds["errors_in_sentences"].apply(str)
preds["suggestions_json"] = preds["errors_in_sentences"].apply(
    parse_suggestions_column
)

data["errors_in_sentences"] = data["errors_in_sentences"].apply(str)
data["suggestions_json"] = data["errors_in_sentences"].apply(
    parse_suggestions_column
)

In [47]:
import spacy

# Refuse to score when some test rows have no prediction. The column assignment
# below aligns on the index, so a partially filled `preds` (e.g. after the
# prediction loop was interrupted) would silently inject NaN predictions and
# produce misleading metrics instead of failing.
rows_without_prediction = data.index.difference(preds.index)
if len(rows_without_prediction) > 0:
    raise ValueError(
        f"preds is missing {len(rows_without_prediction)} of {len(data)} test rows; "
        "re-run the prediction cell to completion before evaluating."
    )

# Work on copies so the evaluation columns do not leak into `data` / `preds`.
df_true = data.copy()
df_pred = preds.copy()

# Put ground truth and predictions side by side (index-aligned) in one frame.
df_true["true_sugs"] = df_true["suggestions_json"]
df_true["pred_sugs"] = df_pred["suggestions_json"]

texts = df_true["text"].tolist()
true_suggestions = df_true["true_sugs"].tolist()
pred_suggestions = df_true["pred_sugs"].tolist()

# spaCy model handed to the sentence-level report (presumably used for
# sentence splitting — confirm in `metric`).
nlp = spacy.load("en_core_web_sm")

print("=== Paragraph-level ===")
print(paragraph_classification_report(true_suggestions, pred_suggestions))

print("=== Sentence-level ===")
print(sentence_classification_report(
    texts,
    true_suggestions,
    pred_suggestions,
    nlp
))

print("=== Suggestion-level ===")
scores = suggestion_level_metrics(true_suggestions, pred_suggestions)
print(f"Precision: {scores['precision']:.3f}")
print(f"Recall:    {scores['recall']:.3f}")
print(f"F1-score:  {scores['f1']:.3f}")

=== Paragraph-level ===
              precision    recall  f1-score   support

    no_error      0.000     0.000     0.000        25
       error      0.500     1.000     0.667        25

    accuracy                          0.500        50
   macro avg      0.250     0.500     0.333        50
weighted avg      0.250     0.500     0.333        50

=== Sentence-level ===


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

    no_error      1.000     0.518     0.682       139
       error      0.468     1.000     0.638        59

    accuracy                          0.662       198
   macro avg      0.734     0.759     0.660       198
weighted avg      0.842     0.662     0.669       198

=== Suggestion-level ===
Precision: 0.368
Recall:    0.747
F1-score:  0.493
