In [1]:
import os
import re
from typing import Any, Dict, Optional, Tuple

import numpy as np
import pandas as pd
from flair.data import Sentence
from flair.models import SequenceTagger
from flair.nn import Model
from pandas import DataFrame
from tqdm import tqdm

from codealltag_data_processor_v2025 import CodealltagDataProcessor

In [2]:
# Data processor pinned to data version '20220513'; `config_path` points it at
# codealltag_data_processor.yml in the working directory.
cdp_2022 = CodealltagDataProcessor(data_version='20220513', config_path=['codealltag_data_processor.yml'])

In [3]:
# Selects which artefacts are used: the CSV file name built in the next cell embeds
# these values as '<sample_size // 1000>K' and 'k<k>'.
sample_size = 10_000
# NOTE(review): presumably a fold / split identifier of the training run — confirm.
k = 5

In [4]:
# File name pattern: PredictedText_DF_<data version>_<sample size in thousands>K_k<k>.csv
predicted_text_df_name = f'PredictedText_DF_{cdp_2022.get_data_version()}_{sample_size // 1000}K_k{k}.csv'
# The first CSV column becomes the row index. Later cells read the `FilePath` and
# `PseudonymizedText` columns of this frame.
predicted_text_df = pd.read_csv(predicted_text_df_name, index_col=0)

In [5]:
# Build the run directory from the config cell instead of hard-coding "10K"/"k5", so the
# loaded model always matches the sample size and k that are encoded in the CSV file name.
# With sample_size = 10_000 and k = 5 this is still logs/GELECTRA/NER/10K/k5.
model_dir_path = os.path.join("logs", "GELECTRA", "NER", f"{sample_size // 1000}K", f"k{k}")

In [6]:
# Load the checkpoint file best-model.pt from the run directory chosen above.
model = SequenceTagger.load(os.path.join(model_dir_path, 'best-model.pt'))

2025-02-25 15:47:34,320 loading file logs/GELECTRA/NER/10K/k5/best-model.pt
2025-02-25 15:47:50,052 SequenceTagger predicts: Dictionary with 57 tags: O, S-MALE, B-MALE, E-MALE, I-MALE, S-FAMILY, B-FAMILY, E-FAMILY, I-FAMILY, S-URL, B-URL, E-URL, I-URL, S-EMAIL, B-EMAIL, E-EMAIL, I-EMAIL, S-PHONE, B-PHONE, E-PHONE, I-PHONE, S-CITY, B-CITY, E-CITY, I-CITY, S-DATE, B-DATE, E-DATE, I-DATE, S-STREET, B-STREET, E-STREET, I-STREET, S-ZIP, B-ZIP, E-ZIP, I-ZIP, S-STREETNO, B-STREETNO, E-STREETNO, I-STREETNO, S-FEMALE, B-FEMALE, E-FEMALE, I-FEMALE, S-UFID, B-UFID, E-UFID, I-UFID, S-ORG


In [7]:
def _locate_label_text(content: str, text: str, cursor: int) -> Optional[Tuple[int, int]]:
    """Find the character span of a predicted label's text inside `content`.

    The search never looks before `cursor`. Three strategies are tried in order:

    1. the label text verbatim;
    2. the label's space-separated pieces in sequence, with any amount of
       whitespace (including none) between neighbouring pieces;
    3. the first piece alone, spanning as many characters as the label has once
       its spaces are removed (the original best-effort heuristic).

    Args:
        content: The raw text the label was predicted on.
        text: The label's text as reported by the tagger.
        cursor: Offset into `content` where the search starts.

    Returns:
        ``(start, end)`` as absolute offsets into `content`, or ``None`` when
        none of the strategies finds anything.
    """
    start = content.find(text, cursor)
    if start != -1:
        return start, start + len(text)
    if ' ' not in text:
        return None

    # The verbatim text is absent although it contains spaces -- presumably the
    # spacing differs from the raw text (pieces glued together there, or separated
    # by a newline / several blanks). Match the pieces with flexible whitespace so
    # the span covers exactly the characters that are really in `content`.
    pieces = [piece for piece in text.split(' ') if piece]
    if pieces:
        pattern = r'\s*'.join(re.escape(piece) for piece in pieces)
        match = re.compile(pattern).search(content, cursor)
        if match is not None:
            return match.start(), match.end()

    # Last resort: anchor on the first piece and estimate the length.
    start = content.find(text.split(' ')[0], cursor)
    if start == -1:
        return None
    # Clamp so the estimated end can never point past the end of the text.
    return start, min(start + len(text.replace(' ', '')), len(content))


def get_annotation_df_from_model_prediction(email_text: str, model: Model, cdp: CodealltagDataProcessor) -> DataFrame:
    """Tag `email_text` with `model` and return the predictions with character offsets.

    The text is tokenized with ``cdp.tokenize_with_somajo`` and wrapped in a single
    ``Sentence`` before prediction. Each predicted label is then located in the raw
    text, scanning forward only: the search for a label starts where the previously
    located label ended, so a label whose text only occurs earlier is reported as
    not located.

    Args:
        email_text: Raw text to annotate.
        model: Tagger whose ``predict`` attaches labels to the sentence.
        cdp: Data processor providing ``tokenize_with_somajo``.

    Returns:
        DataFrame with one row per predicted label, in the order returned by
        ``sentence.get_labels()``, and the columns

        * ``Token_ID`` -- ``'T1'``, ``'T2'``, ... numbered in row order,
        * ``Label``    -- ``label.value``,
        * ``Start`` / ``End`` -- offsets into `email_text`, or ``-1`` / ``-1`` when
          the label text could not be located,
        * ``Token``    -- ``email_text[Start:End]``, or the label's own unmodified
          text when it could not be located.
    """
    sentences = [Sentence(cdp.tokenize_with_somajo(email_text))]
    model.predict(sentences)

    rows = []
    cursor = 0  # absolute offset where the search for the next label begins
    for sentence in sentences:
        for label in sentence.get_labels():
            text = label.data_point.text
            token_id = 'T' + str(len(rows) + 1)

            span = _locate_label_text(email_text, text, cursor)
            if span is None:
                # Keep the tagger's text untouched so the row shows what was searched for.
                rows.append((token_id, label.value, -1, -1, text))
                continue

            start, end = span
            cursor = end
            rows.append((token_id, label.value, start, end, email_text[start:end]))

    return pd.DataFrame(
        rows,
        columns=["Token_ID", "Label", "Start", "End", "Token"]
    )

In [8]:
def collect_annotation_df_for_pseudonymized_text() -> Dict[str, Dict[str, Dict[str, Any]]]:
    """Annotate every pseudonymized text and key the results by file path.

    Works on the notebook-level ``predicted_text_df``, ``model`` and ``cdp_2022``.
    Rows are processed in positional order; for each one the ``PseudonymizedText``
    is passed to ``get_annotation_df_from_model_prediction`` and the resulting
    frame is stored in its ``DataFrame.to_dict()`` form. A progress bar advances
    once per row.

    Returns:
        Mapping from each row's ``FilePath`` to the dict form of its annotation
        frame. If a ``FilePath`` occurs more than once, the last row wins.
    """
    adf_dict_by_path: Dict[str, Dict[str, Dict[str, Any]]] = {}
    row_count = len(predicted_text_df)

    with tqdm(total=row_count, smoothing=0) as bar:
        for position in range(row_count):
            row = predicted_text_df.iloc[position]
            annotation_df = get_annotation_df_from_model_prediction(
                email_text=row.PseudonymizedText,
                model=model,
                cdp=cdp_2022
            )
            adf_dict_by_path[row.FilePath] = annotation_df.to_dict()
            bar.update(1)

    return adf_dict_by_path

In [9]:
# Runs the tagger over every row of predicted_text_df; the result maps each FilePath
# to the dict form of that row's annotation frame.
file_path_gelectra_adf_dict = collect_annotation_df_for_pseudonymized_text()

100%|███████████████████████████████████████| 2000/2000 [02:40<00:00, 12.43it/s]


In [10]:
# New column: for each row, the annotation dict stored under its FilePath
# (None when the path has no entry in the lookup).
predicted_text_df['GELECTRA_PseudonymizedTextADF'] = np.array(
    list(map(file_path_gelectra_adf_dict.get, predicted_text_df['FilePath']))
)

In [11]:
# Writes to the same path the frame was read from, so the input CSV is replaced by
# a copy that also carries the GELECTRA_PseudonymizedTextADF column.
predicted_text_df.to_csv(predicted_text_df_name)