In [1]:
# Mount Google Drive at /content/drive (Colab-only; `google.colab` is not
# available in other Jupyter environments).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Copy the project archive, metric code, data and helper script from the
# mounted Drive into the working directory.
!cp ./drive/MyDrive/backend-3.zip ./backend.zip
!cp ./drive/MyDrive/metric.py ./metric.py
!cp ./drive/MyDrive/chunks.zip ./chunks.zip
!cp ./drive/MyDrive/setup_clearml_env.py ./setup_clearml_env.py
!cp ./drive/MyDrive/test.csv ./test.csv
# Unpack the chunk archive first: raw_data3.csv is copied into the ./chunks
# directory it creates.
!unzip -qq ./chunks.zip
!cp ./drive/MyDrive/raw_data3.csv ./chunks/raw_data3.csv
!unzip -qq ./backend.zip
!cp -r ./drive/MyDrive/vector_storages ./
# Remove the archives once extracted. The backend archive was copied to
# ./backend.zip above, so that is the file to delete.
!rm -rf ./chunks.zip
!rm -rf ./backend.zip
# %pip (rather than !pip) installs into the environment of the running kernel.
# NOTE(review): pydantic 1.x is pinned next to pydantic_core 2.x — confirm both
# pins are still required by the backend package.
%pip install faiss-cpu clearml datasets fastcoref pydantic==1.10.19 pydantic_core==2.23.4

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting clearml
  Downloading clearml-1.18.0-py2.py3-none-any.whl.metadata (18 kB)
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fastcoref
  Downloading fastcoref-2.1.6.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pydantic==1.10.19
  Downloading pydantic-1.10.19-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (152 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m152.6/152.6 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pydantic_core==2.23.4
  Downloading pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting furl>=2.0.0 (from clearml)
  Downloading furl-2.1.4-py2.py3-none-any.whl.metadata (25 kB)
Collecting pathlib2>=2.3.0 (from clearml)
  Downloading pathlib2-2.3.7.post1-py2.py3-none-any.

In [2]:
import pandas as pd
import itertools
import copy
import spacy

from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
from functools import lru_cache
from clearml import Task, Logger

from backend.AI_services.ai_services.vector_storage import VectorStorage
from backend.AI_services.ai_services.models.fact_checker import FactCheckerPipeline
from backend.AI_services.ai_services.preprocessing import get_default_coref_pipeline
from backend.AI_services.ai_services.utils import disable_fastcoref_progress_bar
from setup_clearml_env import setup
from metric import *

# Register tqdm's pandas integration (adds the `progress_*` methods to pandas objects).
tqdm.pandas()
# Project helper; judging by its name it silences fastcoref's progress bars — TODO confirm.
disable_fastcoref_progress_bar()
# Project helper from setup_clearml_env.py, called with a fixed seed of 42
# (presumably seeds RNGs and configures the ClearML environment — verify in that module).
setup(seed=42)

In [3]:
# Model id of the sentence encoder; used to build `model` and recorded on the ClearML task.
sentence_transformer_model = 'intfloat/e5-base-v2'

In [4]:
# Load the sentence encoder onto the GPU (device="cuda"), so a CUDA runtime is required.
model = SentenceTransformer(sentence_transformer_model, device="cuda")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
@lru_cache(maxsize=None)
def get_sentence_embeddings(text: str, **kwargs):
    """Encode ``text`` with the global ``model``, memoising the result.

    The cache is unbounded (``maxsize=None``) and keyed on ``text`` together
    with the keyword arguments, so every argument must be hashable. A cache
    hit returns the very same object that was produced by the first call;
    callers should therefore treat the result as read-only.

    Args:
        text: The string to embed.
        **kwargs: Forwarded unchanged to ``model.encode``.

    Returns:
        Whatever ``model.encode(text, **kwargs)`` returns.
    """
    embedding = model.encode(text, **kwargs)
    return embedding

In [6]:
# Vector store sized to the encoder's embedding dimension; it is handed the
# lru_cache-wrapped `get_sentence_embeddings` function as its embedder.
storage = VectorStorage(
    dim=model.get_sentence_embedding_dimension(),
    embedder=get_sentence_embeddings,
)

In [7]:
# Create the ClearML task that tracks this sweep (project "sae", task type "optimizer").
# The returned `task` is used below to attach parameters and to report scalars.
task = Task.init(
    project_name="sae",
    task_name="GridSearch Notebook",
    task_type=Task.TaskTypes.optimizer
)

ClearML Task: created new task id=e2bf0a80a0c648e79c3ff3cb0e197551
2025-05-07 20:49:46,811 - clearml.Task - INFO - Storing jupyter notebook directly as code
ClearML results page: https://app.clear.ml/projects/c8e43e61eb7548a781d200ce5184864c/experiments/e2bf0a80a0c648e79c3ff3cb0e197551/output/log


In [8]:
# Project-provided default coreference pipeline, created on the GPU; it is
# passed to FactCheckerPipeline as `processing_pipeline`.
coref_pipeline = get_default_coref_pipeline(device="cuda")

2025-05-07 20:49:55,434 - clearml.model - INFO - Selected model id: b18ed9cce8c34b009b0daddccbe26ff0


In [9]:
# Attach the fixed settings of this run to the ClearML task as a parameter dict.
# NOTE(review): "coref" and "cross_encoder" are recorded as literals; nothing in
# this notebook passes them to FactCheckerPipeline, so confirm they match what
# the pipeline actually loads.
task.connect({
    "sentence_transformer": sentence_transformer_model,
    "test_dataset_version": "1.0.0",
    "coref": "default",
    "cross_encoder": "cross-encoder/stsb-roberta-base"
})

{'sentence_transformer': 'intfloat/e5-base-v2',
 'test_dataset_version': '1.0.0',
 'coref': 'default',
 'cross_encoder': 'cross-encoder/stsb-roberta-base'}

In [10]:
# Labelled evaluation set; evaluate_config reads its "text" and
# "errors_in_sentences" columns.
data = pd.read_csv("./test.csv")

In [11]:
# Replace every missing value with the string "[]" (presumably so rows without
# annotated errors parse as an empty list — TODO confirm).
# NOTE(review): this applies to all columns, including "text".
data = data.fillna("[]")

In [12]:

# Settings that stay constant for every configuration in the sweep; each run
# copies this dict and adds the swept values on top.
fact_checker_base = {
    "processing_device": "cuda",
    "device": "cuda",
    "get_explanation": False,
    "automatic_contextualisation": True,
}

# One pre-built vector storage per processed data chunk.
vector_storage_files = [
    "./vector_storages/storage-chunk_1_processed",
    "./vector_storages/storage-chunk_2_processed",
    "./vector_storages/storage-chunk_3_processed",
    "./vector_storages/storage-chunk_4_processed",
    "./vector_storages/storage-chunk_5_processed",
    "./vector_storages/storage-chunk_6_processed",
]

# Candidate values for each swept hyper-parameter.
# NOTE(review): 1.5 in "storage_search_threshold" sits well apart from the
# other values — presumably an "effectively no threshold" setting; confirm.
search_params = {
    "storage_search_k": [3, 5, 7, 9, 11],
    "storage_search_threshold": [0.85, 0.9, 0.95, 1.5],
    "cross_encoder_threshold": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
}

# Full Cartesian grid as (storage path, k, storage threshold, cross-encoder
# threshold) tuples; the right-most parameter varies fastest.
param_combos = [
    (storage_path, top_k, storage_threshold, ce_threshold)
    for storage_path in vector_storage_files
    for top_k in search_params["storage_search_k"]
    for storage_threshold in search_params["storage_search_threshold"]
    for ce_threshold in search_params["cross_encoder_threshold"]
]


In [13]:
# spaCy "en_core_web_sm" pipeline; evaluate_config passes it to
# sentence_classification_report.
nlp = spacy.load("en_core_web_sm")

In [14]:
def log_metrics_group(group_name: str, metrics: dict, base_iteration: int = 0):
    """Report every value in ``metrics`` as a scalar on the current logger.

    Args:
        group_name: Used as the ``title`` of every reported scalar.
        metrics: Mapping of label to either a single value or a nested dict
            of ``{metric: value}``. Nested entries are reported under the
            series name ``"<label>/<metric>"``; plain entries use the label
            itself as the series.
        base_iteration: Iteration number attached to every reported scalar.
    """
    active_logger = Logger.current_logger()

    def _series_value_pairs():
        # Flatten one level of nesting, preserving the mapping's own order.
        for label, values in metrics.items():
            if not isinstance(values, dict):
                yield label, values
                continue
            for metric, value in values.items():
                yield f"{label}/{metric}", value

    for series_name, scalar in _series_value_pairs():
        active_logger.report_scalar(
            title=group_name,
            series=series_name,
            iteration=base_iteration,
            value=scalar,
        )


def evaluate_config(config, iteration: int = 0):
    """Run the fact checker over the evaluation set for one configuration.

    Loads the vector storage named in ``config`` into the shared ``storage``
    object, builds a ``FactCheckerPipeline`` from the remaining settings,
    evaluates every row of the global ``data`` frame, computes paragraph-,
    sentence- and suggestion-level metrics and logs all three groups through
    ``log_metrics_group``.

    Args:
        config: Mapping with the keys ``vector_storage_path``,
            ``processing_device``, ``device``, ``get_explanation``,
            ``storage_search_k``, ``storage_search_threshold``,
            ``automatic_contextualisation`` and ``cross_encoder_threshold``.
        iteration: Iteration number attached to the logged metrics.

    Returns:
        Tuple ``(scores, para, sent)``: the results of
        ``suggestion_level_metrics``, ``paragraph_classification_report`` and
        ``sentence_classification_report`` respectively.

    Raises:
        TypeError: If the predictions for a row cannot be converted to
            sentence indices. The error names the offending row instead of
            dropping into a debugger and silently skipping the row, which
            would leave the predictions misaligned with the ground truth.
    """
    df_true = data.copy()

    # `storage` is shared across configurations; only its contents change.
    storage.load(config["vector_storage_path"])

    fact_checker = FactCheckerPipeline(
        vector_storage=storage,
        processing_pipeline=coref_pipeline,
        processing_device=config["processing_device"],
        device=config["device"],
        get_explanation=config["get_explanation"],
        storage_search_k=config["storage_search_k"],
        storage_search_threshold=config["storage_search_threshold"],
        automatic_contextualisation=config["automatic_contextualisation"],
        cross_encoder_threshold=config["cross_encoder_threshold"]
    )

    # Collect one record per row and build the frame once, rather than
    # growing a DataFrame row by row.
    row_labels = []
    records = []
    for row_label, text in df_true["text"].items():
        predictions = fact_checker.evaluate_text(
            text.lower().strip().replace("\n", " "),
        )
        try:
            has_error = len(predictions) != 0
            # Indices are shifted by one (presumably 0-based -> 1-based to
            # match the labels in the test set — TODO confirm).
            flagged_sentences = [s.fact.index + 1 for s in predictions]
        except TypeError as exc:
            raise TypeError(
                f"Could not derive sentence indices for row {row_label!r}; "
                f"predictions were: {predictions!r}"
            ) from exc
        row_labels.append(row_label)
        records.append({
            "text": text,
            "is_error_in_paragraphs": has_error,
            "errors_in_sentences": str(flagged_sentences),
        })

    preds = pd.DataFrame(
        records,
        index=row_labels,
        columns=["text", "is_error_in_paragraphs", "errors_in_sentences"],
    )
    preds["suggestions_json"] = preds["errors_in_sentences"].apply(parse_suggestions_column)

    df_true["errors_in_sentences"] = df_true["errors_in_sentences"].apply(str)
    df_true["suggestions_json"] = df_true["errors_in_sentences"].apply(parse_suggestions_column)

    df_true["true_sugs"] = df_true["suggestions_json"]
    # Index-aligned assignment: `preds` carries the same row labels as `df_true`.
    df_true["pred_sugs"] = preds["suggestions_json"]

    texts = df_true["text"].tolist()
    true_suggestions = df_true["true_sugs"].tolist()
    pred_suggestions = df_true["pred_sugs"].tolist()

    para = paragraph_classification_report(true_suggestions, pred_suggestions, output_dict=True)
    sent = sentence_classification_report(texts, true_suggestions, pred_suggestions, nlp, output_dict=True)
    scores = suggestion_level_metrics(true_suggestions, pred_suggestions)

    log_metrics_group("paragraph_level", para, base_iteration=iteration)
    log_metrics_group("sentence_level", sent, base_iteration=iteration)
    log_metrics_group("suggestion_level", scores, base_iteration=iteration)

    return scores, para, sent

In [None]:
# Exhaustive sweep over `param_combos`: evaluate each configuration, report
# its suggestion-level F1 to ClearML and collect one summary record per run.
results = []
max_f1 = 0
last_f1 = 0

# The bar description always shows the best F1 so far, so it starts out with
# the same "max_f1=" label that is used after every update.
with tqdm(total=len(param_combos), desc="max_f1=0.000") as pbar:
    for i, (storage_file, search_k, threshold, ce_threshold) in enumerate(param_combos):
        # Postfix shows the position in the grid and the F1 of the previous run.
        pbar.set_postfix({"config": f"{i+1}/{len(param_combos)}", "f1": last_f1})

        config = copy.deepcopy(fact_checker_base)
        config.update({
            "storage_search_k": search_k,
            "storage_search_threshold": threshold,
            "vector_storage_path": storage_file,
            "cross_encoder_threshold": ce_threshold,
        })

        scores, para, sent = evaluate_config(config, i)
        f1 = scores["f1"]
        max_f1 = max(max_f1, f1)
        last_f1 = f1

        # One series per configuration, named after its parameters and storage file.
        task.get_logger().report_scalar(
            "suggestion_level_f1",
            f"{search_k}-{threshold}-{ce_threshold}@{storage_file.split('/')[-1]}",
            iteration=i,
            value=f1
        )
        pbar.set_description(f"max_f1={max_f1:.3f}")

        # Record every swept parameter; without the cross-encoder threshold,
        # runs that differ only in that value could not be told apart.
        results.append({
            "storage": storage_file,
            "k": search_k,
            "threshold": threshold,
            "cross_encoder_threshold": ce_threshold,
            "f1": f1
        })
        pbar.update(1)

f1=0.000:   0%|          | 0/720 [00:00<?, ?it/s]