In [1]:
import numpy as np
import pandas as pd

from __future__ import annotations
from codealltag_data_processor_v2025 import CodealltagDataProcessor
from concurrent.futures import ThreadPoolExecutor, as_completed
from pandas import DataFrame
from pandas.core.series import Series
from tqdm import tqdm
from typing import Dict, Generator, List, Tuple

In [2]:
# Processor instance for data version '20220513'; its settings are read from
# the YAML file(s) listed in `config_path`.
cdp_2022 = CodealltagDataProcessor(
    data_version='20220513',
    config_path=['codealltag_data_processor.yml'],
)

In [3]:
# Run parameters; both are embedded in the name of the CSV file loaded below.
# NOTE(review): `k` is presumably a fold/split count - confirm against the
# notebook that produced the CSV.
sample_size, k = 10_000, 5

In [4]:
# The file name encodes the data version, the sample size in thousands and k.
predicted_text_df_name = f'PredictedText_DF_{cdp_2022.get_data_version()}_{sample_size // 1000}K_k{k}.csv'

# The first CSV column is restored as the frame's index.
predicted_text_df = pd.read_csv(
    filepath_or_buffer=predicted_text_df_name,
    index_col=0,
)

In [5]:
# Entity labels recognised when parsing predicted text.
labels = [
    'CITY',
    'DATE',
    'EMAIL',
    'FAMILY',
    'FEMALE',
    'MALE',
    'ORG',
    'PHONE',
    'STREET',
    'STREETNO',
    'UFID',
    'URL',
    'USER',
    'ZIP',
]

In [6]:
def get_annotation_df_with_input_text_and_predicted_text(input_text: str, 
                                                         predicted_text: str,
                                                         labels: List[str]) -> DataFrame:
    """Locate the entities listed in ``predicted_text`` inside ``input_text``.

    ``predicted_text`` is parsed as items separated by ``"; "``. Each item is
    expected to look like ``LABEL: token **pseudonym**``; the pseudonym part is
    optional and defaults to an empty string.

    Items are matched from left to right: every search starts right after the
    end of the previous match, so a repeated token maps to successive
    occurrences. An item is skipped when it does not start with a known label,
    when its token is blank, or when the token cannot be located.

    If a token containing spaces is not found verbatim, its first non-empty
    space-separated word is used as the anchor and the span length is that of
    the token with all spaces removed. This is a best-effort fallback: the
    text inside that span is not verified against the token.

    Args:
        input_text: Text in which the tokens are searched.
        predicted_text: Serialized predictions in the format described above.
        labels: Accepted labels. When several labels prefix an item
            (e.g. ``STREET`` and ``STREETNO``) the longest one is used,
            regardless of the order of this list.

    Returns:
        DataFrame with the columns ``Token_ID`` (``T1``, ``T2``, ...),
        ``Label``, ``Start``, ``End`` (offsets into ``input_text``, end
        exclusive), ``Token`` (``input_text[Start:End]``) and ``Pseudonym``.
    """
    item_delim = "; "
    token_delim = ": "
    pseudonym_delim = " **"
    pseudonym_suffix = pseudonym_delim.strip()

    tuples = []
    token_id = 0
    cursor = 0  # absolute offset in input_text where the next search begins

    for item in predicted_text.split(item_delim):

        # Longest matching label wins, so the result does not depend on the
        # order in which overlapping labels (STREET / STREETNO) are listed.
        label = max(
            (candidate for candidate in labels if item.startswith(candidate)),
            key=len,
            default="",
        )
        if label == "" or (label + token_delim) not in item:
            continue

        token_pseudonym = item.split(label + token_delim)[1]

        pseudonym = ""
        if pseudonym_delim in token_pseudonym and token_pseudonym.endswith(pseudonym_suffix):
            pseudonym_splits = token_pseudonym.split(pseudonym_delim)
            token = pseudonym_splits[0]
            # Drop the closing "**" marker.
            pseudonym = pseudonym_splits[1][:-len(pseudonym_suffix)]
        else:
            token = token_pseudonym

        if len(token.strip()) == 0:
            continue

        start = input_text.find(token, cursor)
        if start == -1 and ' ' in token:
            # Anchor on the first NON-EMPTY word: an empty anchor (token with a
            # leading space) would make str.find() report a match at the
            # cursor for any text.
            anchor = next(word for word in token.split(' ') if word)
            start = input_text.find(anchor, cursor)
            token = token.replace(' ', '')

        if start == -1:
            continue

        end = start + len(token)
        cursor = end
        token_id += 1

        tuples.append((
            'T' + str(token_id),
            label,
            start,
            end,
            input_text[start:end],
            pseudonym
        ))

    return pd.DataFrame(
        tuples,
        columns=["Token_ID", "Label", "Start", "End", "Token", "Pseudonym"]
    )

In [7]:
def get_original_adf_and_mT5_original_text_adf(predicted_text_df: DataFrame, 
                                               idx: int, 
                                               cdp: CodealltagDataProcessor) -> Dict[str, Tuple]:
    """Build the reference and the predicted annotation tables for one row.

    The row at position ``idx`` supplies ``FilePath`` and ``UseVersion``;
    ``UseVersion`` names the column of that same row holding the predicted
    text to parse. The predicted text is located inside the email text using
    the module-level ``labels`` list.

    Args:
        predicted_text_df: Frame with at least the columns ``FilePath``,
            ``UseVersion`` and the column(s) ``UseVersion`` refers to.
        idx: Positional row index (used with ``iloc``).
        cdp: Data processor used to read the email and its annotations.

    Returns:
        A single-entry dict mapping the file path to the pair
        ``(original annotations, predicted annotations)``, each converted
        with ``DataFrame.to_dict()``.
    """
    record: Series = predicted_text_df.iloc[idx]
    path: str = record.FilePath

    # Element 1 of read_email()'s result is used as the email text.
    # NOTE(review): the meaning of the other elements is not visible here.
    email_text = cdp.read_email(path)[1]
    reference_adf: DataFrame = cdp.get_annotation_df_by_file(path).drop(columns="FilePath")

    predicted_adf = get_annotation_df_with_input_text_and_predicted_text(
        email_text,
        record[record.UseVersion],
        labels,
    )

    return {path: (reference_adf.to_dict(), predicted_adf.to_dict())}

In [8]:
def collect_original_adf_and_mT5_original_text_adf(max_workers: int = 10) -> Generator[Dict[str, Tuple], None, None]:
    """Yield per-row annotation dicts for every row of ``predicted_text_df``.

    One task per row of the module-level ``predicted_text_df`` is submitted to
    a thread pool, using the module-level ``cdp_2022`` processor. Results are
    yielded in completion order, which is not necessarily row order.

    Args:
        max_workers: Size of the thread pool.

    Yields:
        The single-entry dict returned by
        ``get_original_adf_and_mT5_original_text_adf`` for one row.

    Raises:
        Any exception raised by a worker task is re-raised when its result is
        retrieved.
    """
    total = len(predicted_text_df)
    with tqdm(total=total, smoothing=0) as progress_bar:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [
                executor.submit(get_original_adf_and_mT5_original_text_adf, predicted_text_df, idx, cdp_2022)
                for idx in range(total)
            ]
            for future in as_completed(futures):
                # Retrieve the result first so a failed task is not counted
                # as completed on the progress bar.
                result = future.result()
                progress_bar.update(1)
                yield result

In [9]:
# Merge the single-entry dicts yielded per row into one mapping keyed by file
# path. If the same file path is yielded more than once, the entry merged last
# wins.
merged_dict: Dict[str, Tuple] = {}
for result in collect_original_adf_and_mT5_original_text_adf():
    merged_dict.update(result)

100%|██████████████████████████████████████| 2000/2000 [00:03<00:00, 512.44it/s]


In [10]:
# For every row, look up the pair stored under its file path and write the two
# elements into the two new columns, in row order.
predicted_text_df[['OriginalADF', 'MT5_OriginalTextADF']] = np.array(
    list(map(merged_dict.get, predicted_text_df['FilePath']))
)

In [11]:
# Write the extended frame back to the same CSV file it was loaded from.
predicted_text_df.to_csv(path_or_buf=predicted_text_df_name)