In [None]:
# Mount Google Drive at /content/drive; the following cell copies its inputs from there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!cp ./drive/MyDrive/backend.zip ./backend.zip
!cp ./drive/MyDrive/metric.py ./metric.py
!cp ./drive/MyDrive/setup_clearml_env.py ./setup_clearml_env.py
!cp ./drive/MyDrive/test.csv ./test.csv
!unzip -qq ./backend.zip
!cp -r ./drive/MyDrive/sae_data/vector_storages4 ./vector_storages
!rm -rf ./backend6.zip
!pip install faiss-cpu clearml datasets fastcoref pydantic==1.10.19 pydantic_core==2.23.4
!spacy download en_core_web_trf

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting clearml
  Downloading clearml-2.0.0-py2.py3-none-any.whl.metadata (17 kB)
Collecting fastcoref
  Downloading fastcoref-2.1.6.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pydantic==1.10.19
  Downloading pydantic-1.10.19-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (152 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m152.6/152.6 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pydantic_core==2.23.4
  Downloading pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting furl>=2.0.0 (from clearml)
  Downloading furl-2.1.4-py2.py3-none-any.whl.metadata (25 kB)
Collecting pathlib2>=2.3.0 (from clearml)
  Downloading pathlib2-2.3.7.post1-py2.py3-none-any.whl.metadata (3.5 kB)
Collecting orderedmultidict>=1.0.1 (from furl>=2.0.0->clearml)

In [None]:
import pandas as pd
import torch
import itertools
import copy
import spacy

from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
from functools import lru_cache
from clearml import Task, Logger

from backend.AI_services.ai_services.vector_storage import VectorStorage
from backend.AI_services.ai_services.models.fact_checker import FactCheckerPipeline
from backend.AI_services.ai_services.preprocessing import get_default_coref_pipeline
from backend.AI_services.ai_services.models.coref import CorefResolver
from backend.AI_services.ai_services.preprocessing import Pipeline
from backend.AI_services.ai_services.utils import disable_fastcoref_progress_bar
from setup_clearml_env import setup
from metric import *

# Ask spaCy to use the GPU when one is available.
spacy.prefer_gpu()
# Register tqdm's progress_apply / progress_map helpers on pandas objects.
tqdm.pandas()
# Project helper; presumably silences fastcoref's own progress bars — confirm in backend utils.
disable_fastcoref_progress_bar()
# Project helper from setup_clearml_env.py; presumably seeds the RNGs and
# configures ClearML credentials — TODO confirm.
setup(seed=42)

In [None]:
import gc
import torch

def clear_torch_caches():
    """Free cached Python and CUDA memory between grid-search runs.

    Runs the garbage collector and, only when CUDA is usable, empties the
    CUDA caching allocator and collects CUDA IPC memory. The cuDNN
    ``benchmark`` flag is toggled off and then restored to the value it had
    before the call, so this helper never silently enables the cuDNN
    autotuner (which would undermine a seeded, reproducible setup).
    """
    gc.collect()

    # The CUDA calls are skipped on CPU-only runtimes, where ipc_collect()
    # would otherwise fail while trying to initialise CUDA.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

        if hasattr(torch.cuda, "ipc_collect"):
            torch.cuda.ipc_collect()

    # Toggle the flag (presumably meant to reset cuDNN autotuner state —
    # unverified) but put back whatever the caller had configured.
    previous_benchmark = torch.backends.cudnn.benchmark
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.benchmark = previous_benchmark

In [None]:
# Model id of the sentence encoder; also recorded in the ClearML task parameters.
sentence_transformer_model = 'intfloat/e5-base-v2'

In [None]:
# Load the sentence encoder on the GPU (device is hard-coded to "cuda").
model = SentenceTransformer(sentence_transformer_model, device="cuda")

In [None]:
@lru_cache(maxsize=None)
def get_sentence_embeddings(text: str, **kwargs):
    """Encode ``text`` with the notebook-level sentence encoder, memoising the result.

    The cache is unbounded and keyed on ``text`` plus the keyword arguments,
    so every keyword value must be hashable. Repeated calls with the same
    arguments return the same cached object.
    """
    embedding = model.encode(text, **kwargs)
    return embedding

In [None]:
# Shared vector store: sized to the encoder's embedding dimension and using the
# memoised encoder wrapper as its embedder. evaluate_config() loads a storage
# file into this same object for every configuration.
storage = VectorStorage(
    dim=model.get_sentence_embedding_dimension(),
    embedder=get_sentence_embeddings,
)

In [None]:
# Start the ClearML task that collects every scalar reported by this notebook.
task = Task.init(
    project_name="sae",
    task_name="GridSearch Notebook",
    task_type=Task.TaskTypes.optimizer
)

ClearML Task: created new task id=3819814abb954a26acd9adf8f4f95b6b
2025-05-30 02:15:32,170 - clearml.Task - INFO - Storing jupyter notebook directly as code
ClearML results page: https://app.clear.ml/projects/c8e43e61eb7548a781d200ce5184864c/experiments/3819814abb954a26acd9adf8f4f95b6b/output/log


In [None]:
# Record the fixed experiment settings on the ClearML task.
# NOTE(review): "cross_encoder" is logged as stsb-roberta-base, while
# fact_checker_base["model_name"] is cross-encoder/nli-deberta-v3-base —
# confirm these describe different components and the logged value is current.
task.connect({
    "sentence_transformer": sentence_transformer_model,
    "test_dataset_version": "v4",
    "coref": "default",
    "cross_encoder": "cross-encoder/stsb-roberta-base",
    "nlp": "en_core_web_trf",
    "paragraph_level": False,
    "splitter": "\\n"
})

{'sentence_transformer': 'intfloat/e5-base-v2',
 'test_dataset_version': 'v4',
 'coref': 'default',
 'cross_encoder': 'cross-encoder/stsb-roberta-base',
 'nlp': 'en_core_web_trf',
 'paragraph_level': False,
 'splitter': '\\n'}

In [None]:
# Load the labelled test set; evaluate_config() reads its "text" and
# "errors_in_sentences" columns.
data = pd.read_csv("./test.csv")

In [None]:
# Replace every missing value with the string "[]" (an empty list literal).
# Note this applies to all columns, not only the annotation columns.
data = data.fillna("[]")

In [None]:

# Fixed FactCheckerPipeline settings shared by every grid-search run.
fact_checker_base = dict(
    processing_device="cuda",
    device="cuda",
    get_explanation=False,
    automatic_contextualisation=True,
    ner_corpus="en_core_web_trf",
    model_name="cross-encoder/nli-deberta-v3-base",
)

# Hyper-parameter axes explored by the grid search.
search_params = dict(
    storage_search_k=[3, 5, 7, 9, 11, 15],
    storage_search_threshold=[0.85, 0.9, 0.95, 1.1],
    enable_ner=[True, False],
)

# Pre-built vector stores to evaluate; chunks 5 and 6 are currently disabled.
vector_storage_files = [
    "./vector_storages/storage-chunk_1_processed",
    "./vector_storages/storage-chunk_2_processed",
    "./vector_storages/storage-chunk_3_processed",
    "./vector_storages/storage-chunk_4_processed",
]

# Cartesian product in the order (storage file, k, threshold, enable_ner).
_axis_order = ("storage_search_k", "storage_search_threshold", "enable_ner")
param_combos = [
    combo
    for combo in itertools.product(
        vector_storage_files,
        *(search_params[axis] for axis in _axis_order),
    )
]
del _axis_order


In [None]:
# Preprocessing pipeline with a coreference resolver, built once and reused by
# every FactCheckerPipeline created in evaluate_config(). Both the resolver and
# the pipeline run on the device configured in fact_checker_base.
coref_pipeline = Pipeline(
    coref=CorefResolver(
        sentence_splitter="en_core_web_trf",
        device=fact_checker_base["processing_device"],
    ),
    device=fact_checker_base["processing_device"],
)

2025-05-30 02:15:38,658 - clearml.model - INFO - Selected model id: b18ed9cce8c34b009b0daddccbe26ff0


In [None]:
# spaCy pipeline passed to sentence_classification_report() in evaluate_config().
nlp = spacy.load(fact_checker_base["ner_corpus"])

In [None]:
def log_metrics_group(group_name: str, metrics: dict, base_iteration: int = 0):
    """Report every entry of ``metrics`` to the current ClearML logger.

    All scalars are plotted under the title ``group_name`` at iteration
    ``base_iteration``.

    Args:
        group_name: Plot title shared by all reported series.
        metrics: Mapping of label to either a scalar or a nested dict. A
            nested dict is expanded into one ``"<label>/<metric>"`` series per
            inner item; inner values greater than 1 are skipped (presumably
            count-like entries such as support — confirm against metric.py).
            A non-dict value is reported unfiltered under its label.
        base_iteration: Iteration number attached to every reported point.
    """
    reporter = Logger.current_logger()

    for label, values in metrics.items():
        if not isinstance(values, dict):
            reporter.report_scalar(
                title=group_name,
                series=label,
                iteration=base_iteration,
                value=values,
            )
            continue

        for metric, value in values.items():
            # Phrased as "not >" so NaN values are still reported.
            if not value > 1:
                reporter.report_scalar(
                    title=group_name,
                    series=f"{label}/{metric}",
                    iteration=base_iteration,
                    value=value,
                )


def evaluate_config(config, iteration: int = 0):
    """Evaluate one fact-checker configuration on the test set and log its metrics.

    Loads ``config["vector_storage_path"]`` into the shared ``storage``,
    builds a ``FactCheckerPipeline`` from ``config``, runs it over every row
    of ``data`` and compares the predictions with the labelled
    ``errors_in_sentences`` column.

    Args:
        config: Dict holding the FactCheckerPipeline settings
            (``processing_device``, ``device``, ``get_explanation``,
            ``storage_search_k``, ``storage_search_threshold``,
            ``automatic_contextualisation``, ``ner_corpus``, ``model_name``,
            ``enable_ner``) plus ``vector_storage_path``.
        iteration: ClearML iteration under which the metrics are reported.

    Returns:
        Tuple ``(scores, para, sent)``: the suggestion-level metrics, the
        paragraph-level report and the sentence-level report.

    Raises:
        TypeError: If the predictions for a row cannot be converted into
            sentence indices. The error carries the row index and the raw
            predictions instead of dropping into an interactive debugger,
            which would stall an unattended run and leave the row missing
            from the predictions frame.
    """
    df_true = data.copy()

    storage.load(config["vector_storage_path"])

    fact_checker = FactCheckerPipeline(
        vector_storage=storage,
        processing_pipeline=coref_pipeline,
        processing_device=config["processing_device"],
        device=config["device"],
        get_explanation=config["get_explanation"],
        storage_search_k=config["storage_search_k"],
        storage_search_threshold=config["storage_search_threshold"],
        automatic_contextualisation=config["automatic_contextualisation"],
        ner_corpus=config["ner_corpus"],
        model_name=config["model_name"],
        enable_ner=config["enable_ner"]
    )

    # Collect the rows first and build the frame once; assigning with .loc
    # inside the loop grows the DataFrame one row at a time.
    pred_rows = []
    for row_index, text in df_true["text"].items():
        predictions = fact_checker.evaluate_text(
            text.lower().strip().replace("\n", " "),
        )
        try:
            has_errors = len(predictions) != 0
            # Presumably converts 0-based fact indices to 1-based sentence
            # numbers — confirm against the labels in test.csv.
            flagged_sentences = str([s.fact.index + 1 for s in predictions])
        except TypeError as exc:
            raise TypeError(
                f"Could not derive sentence indices for row {row_index!r}; "
                f"predictions were: {predictions!r}"
            ) from exc
        pred_rows.append((text, has_errors, flagged_sentences))

    preds = pd.DataFrame(
        pred_rows,
        columns=["text", "is_error_in_paragraphs", "errors_in_sentences"],
        index=df_true.index,
    )
    preds["suggestions_json"] = preds["errors_in_sentences"].apply(parse_suggestions_column)

    df_true["errors_in_sentences"] = df_true["errors_in_sentences"].apply(str)
    df_true["suggestions_json"] = df_true["errors_in_sentences"].apply(parse_suggestions_column)

    # Both frames share df_true's index, so this assignment aligns row by row.
    df_true["true_sugs"] = df_true["suggestions_json"]
    df_true["pred_sugs"] = preds["suggestions_json"]

    texts = df_true["text"].tolist()
    true_suggestions = df_true["true_sugs"].tolist()
    pred_suggestions = df_true["pred_sugs"].tolist()

    para = paragraph_classification_report(true_suggestions, pred_suggestions, output_dict=True)
    sent = sentence_classification_report(texts, true_suggestions, pred_suggestions, nlp, output_dict=True)
    scores = suggestion_level_metrics(true_suggestions, pred_suggestions)

    log_metrics_group("paragraph_level", para, base_iteration=iteration)
    log_metrics_group("sentence_level", sent, base_iteration=iteration)
    log_metrics_group("suggestion_level", scores, base_iteration=iteration)

    return scores, para, sent

In [None]:
# Grid search: evaluate every (storage file, k, threshold, enable_ner)
# combination, report its suggestion-level F1 to ClearML and keep a summary row.
results = []
max_f1 = 0
last_f1 = 0

with tqdm(total=len(param_combos), desc="f1=0.000") as pbar:
    for i, (storage_file, search_k, threshold, enable_ner) in enumerate(param_combos):
        # The postfix shows the F1 of the previous configuration while this one runs.
        pbar.set_postfix({"config": f"{i+1}/{len(param_combos)}", "f1": last_f1})

        # Start from the shared base settings and overlay this combination.
        config = copy.deepcopy(fact_checker_base)
        config["storage_search_k"] = search_k
        config["storage_search_threshold"] = threshold
        config["vector_storage_path"] = storage_file
        config["enable_ner"] = enable_ner

        scores, para, sent = evaluate_config(config, i)

        f1 = scores["f1"]
        max_f1 = max(max_f1, f1)
        last_f1 = f1
        # One ClearML series per configuration, named after its parameters.
        task.get_logger().report_scalar(
            "suggestion_level_f1",
            f"{search_k}-{threshold}-{enable_ner}@{storage_file.split('/')[-1]}",
            iteration=i,
            value=f1
        )
        pbar.set_description(f"max_f1={max_f1:.3f}")

        # enable_ner is recorded too; without it, two runs that differ only in
        # that flag are indistinguishable in the summary.
        results.append({
            "storage": storage_file,
            "k": search_k,
            "threshold": threshold,
            "enable_ner": enable_ner,
            "f1": f1
        })
        pbar.update(1)
        # Free GPU memory held by this configuration's pipeline before the next run.
        clear_torch_caches()

f1=0.000:   0%|          | 0/192 [00:00<?, ?it/s]