In [1]:
import ast
import glob
import json
import os
import random
import re
import requests

import numpy as np
import pandas as pd

from __future__ import annotations
from codealltag_data_processor_v2025 import CodealltagDataProcessor
from concurrent.futures import ThreadPoolExecutor, as_completed
from Levenshtein import ratio
from pandas import DataFrame
from pandas.core.series import Series
from tqdm import tqdm
from typing import Any, Dict, Generator, List, Tuple

In [2]:
cdp_2022 = CodealltagDataProcessor(data_version='20220513', config_path=['codealltag_data_processor_v2025.yml'])

In [3]:
sample_df = pd.read_csv("data_utility_dataset_3500.csv", index_col=0)

In [4]:
sample_df['FilePath'] = sample_df['FilePath'].str.replace('/', '\\', regex=False)

In [5]:
url = "http://localhost:11434/api/chat"

In [6]:
system_prompt = '''
IDDENTITY AND PURPOSE
--------------------
You are an EXPERT in text PSEUDONYMIZATION.
Your task is to ONLY DETECT defined entites and PRODUCE type-compliant PSEUDONYMs.
You will be provided some SAMPLE INPUT and corresponding SAMPLE_OUTPUT to be used as examples and guide for you.
You will also be provided the list of total 14 different ENTITY TYPES AND THEIR DEFINITIONS to be used as knowledge.
You will ONLY output in a format similar to SAMPLE OUTPUT format, no ADDITIONAL text or EXPLANATIONS.


ENTITY TYPES AND THEIR DEFINITIONS
----------------------------------
1. CITY = stands for villages, towns, cities, metropolitan areas and regions smaller than a state
2. DATE = covers all sorts of date descriptions
3. EMAIL = covers all types of email addresses in the texts
4. FAMILY = covers all family names
5. FEMALE = female given names, includes nicknames and initials
6. MALE = male given names, includes nicknames and initials
7. ORG = includes all types of legal actors such as companies, brands, institutions and agencies, etc.
8. PHONE = includes phone numbers and fax numbers
9. STREET = includes all kinds of street names
10. STREETNO = street numbers that appear in location details
11. UFID = to capture persons (students, customers, employees, members of social security systems, authors, etc.)
12. URL = includes other forms of domain names
13. USER = covers all kinds of invented usernames for IT systems and platforms
14. ZIP = zip codes in location details



SAMPLE INPUT: 1
---------------
Besonders nicht bei Italo!


-- 
Zitat:
ACHTUNG "SuPer Plexer Trottel"
Niedernbergweg 5, 91160 Jeggen, 130/3177345
http://onn.mmewxds.sps/lucy-o93112.jeaj


SAMPLE OUTPUT: 1
----------------
MALE: Italo **Fernando**; STREET: Niedernbergweg **Blütenring**; STREETNO: 5 **7**; ZIP: 91160 **88521**; CITY: Jeggen **Nonnenberg**; PHONE: 130/3177345 **664/8651272**; URL: http://onn.mmewxds.sps/lucy-o93112.jeaj **http://leb.uizotxi.kba/dabw-w08293.apqp**

SAMPLE INPUT: 2
---------------
genau, das ist kurz nach dem Kamelmarkt.


lol


-- 
Zitat:
ACHTUNG "Turbotrottel"
Exerzierplatzstraße 5, 91386 Oberbaumgarten, 312/4603663
http://oqq.yyzlnom.zbm/hiog-z30270.ubgo


SAMPLE OUTPUT: 2
----------------
STREET: Exerzierplatzstraße **Töpelstraße**; STREETNO: 5 **5**; ZIP: 91386 **52118**; CITY: Oberbaumgarten **Kotzenbüll**; PHONE: 312/4603663 **644/1281306**; URL: http://oqq.yyzlnom.zbm/hiog-z30270.ubgo **http://gdv.doxulye.doz/fwqj-g78597.nqju**

SAMPLE INPUT: 3
---------------
Das ist das Problem von HKV und Rossner überhaupt.

--=20
Zitat:
ACHTUNG "Turbotrottel"
Jeuststraße 8, 85283 Baldern, 278/9147652
http://yci.lovvwoz.tvw/zkau-v03379.uhmv


SAMPLE OUTPUT: 3
----------------
ORG: HKV **Triagon**; ORG: Rossner **Arzum**; STREET: Jeuststraße **Dreijochgasse**; STREETNO: 8 **8**; ZIP: 85283 **38524**; CITY: Baldern **Ahmsen**; PHONE: 278/9147652 **001/0373780**; URL: http://yci.lovvwoz.tvw/zkau-v03379.uhmv **http://ruj.vftcqyi.cyb/jovm-t59381.pfuk**

SAMPLE INPUT: 4
---------------
* Ilka Ullenboom <Dtmvh.Zneiwl@h-jsrfcx.wg>:

[matrix]

Du magst auch MiB nicht, oder?

Henning
-- 
cross veinless


SAMPLE OUTPUT: 4
----------------
FEMALE: Ilka **Carole**; FAMILY: Ullenboom **Ulferts**; EMAIL: Dtmvh.Zneiwl@h-jsrfcx.wg **Zdcxc.Axdfyh@u-cuebhp.we**; MALE: Henning **Valerian**

SAMPLE INPUT: 5
---------------
Findet ihr unter:

http://bgl.dwkmuvqugt-hfmtsqaj.zd/Dsmpgv


Schaut mal rein
Rjaffc


SAMPLE OUTPUT: 5
----------------
URL: http://bgl.dwkmuvqugt-hfmtsqaj.zd/Dsmpgv **http://jmr.bquhhzahku-xsfnqcua.ry/Scgjyp**; USER: Rjaffc **Xlaczq**

SAMPLE INPUT: 6
---------------
und 

pennymarkt v.94 o.O.
WPK 138076

Prinzipiell interessieren mich aber auch andere Genuss-Scheine

Stefan


SAMPLE OUTPUT: 6
----------------
ORG: pennymarkt **Zeitungen&Zeitschriften**; UFID: 138076 **GKE 330952**; MALE: Stefan **Ulfert**

SAMPLE INPUT: 7
---------------
On Fri, 12. 02. 22 19:54:25 +0100, Anton Hauptmanns

Ja wer macht denn sowas ?

-- 
Artur Lüdeck
jlcrjl@dqyjm.gf
http://bre.gedzrmlsq.qc/
Mobile: 0656-5242408


SAMPLE OUTPUT: 7
----------------
DATE: 12. 02. 22 **03. 06. 20**; MALE: Anton **Otmar**; FAMILY: Hauptmanns **Olte**; MALE: Artur **Oswald**; FAMILY: Lüdeck **Freischläger**; EMAIL: jlcrjl@dqyjm.gf **lzvjme@nylof.of**; URL: http://bre.gedzrmlsq.qc/ **http://ojf.oxewmmrcr.kq/**; PHONE: 0656-5242408 **0028-3487683**


INPUT
-----
The following is the text for which you will provide output:

'''

In [7]:
model = "llama3.1:8b" # [llama3.1:8b] [gemma:7b]  [mistral:7b] [llama3.3:70b] [gemma2:27b] [gemma2:9b]

In [8]:
def get_llm_output(ollama_api_url: str, model_tag: str, system_prompt: str, user_prompt: str) -> str:
    payload = {
        "model": model_tag,
        "messages": [
            {
                "role": "system",
                "content": system_prompt
            },
            {
                "role": "user",
                "content": user_prompt
            }
        ],
        "stream": False
    }
    response = requests.post(ollama_api_url, json=payload).json()
    return response.get('message', {}).get('content', response.get('error'))

In [28]:
def get_llm_output_for_one_sample_by_index(sample_df: DataFrame,
                                           idx: int,
                                           cdp: CodealltagDataProcessor,
                                           ollama_api_url: str,
                                           model_tag: str,
                                           system_prompt: str) -> Dict[str, Tuple]:
    
    file_path = sample_df.iloc[idx].FilePath
    input_text = cdp.read_email(file_path)[1]
    orig_adf = pd.DataFrame(ast.literal_eval(sample_df.iloc[idx].OA))
    orig_ltps = orig_adf[['Label', 'Token']].agg(': '.join, axis=1).tolist()
    max_score = 0.0
    llm_output_with_max_score = ''
    for r_count in range(5):
        llm_output = get_llm_output(ollama_api_url, model_tag, system_prompt, input_text)
        found_ltps_count = sum([1 if ltp in llm_output else 0 for ltp in orig_ltps])
        score = found_ltps_count / len(orig_ltps)
        if score > max_score:
            max_score = score
            llm_output_with_max_score = llm_output
    return {file_path: (file_path, model_tag, llm_output_with_max_score, max_score)}

In [29]:
def collect_llm_output_for_sample_df(max_workers: int = 3) -> Generator[Dict[str, Tuple]]:
    with tqdm(total=len(sample_df), smoothing=0) as progress_bar:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [
                executor.submit(get_llm_output_for_one_sample_by_index, sample_df, idx, cdp_2022, url, model, system_prompt)
                for idx in range(0, len(sample_df))
            ]
            for future in as_completed(futures):
                progress_bar.update(1)
                yield future.result()

In [30]:
merged_dict_llama: Dict[str, Tuple] = {}
for result in collect_llm_output_for_sample_df():
    merged_dict_llama.update(result)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3500/3500 [2:56:05<00:00,  3.02s/it]


In [36]:
sum([1 if merged_dict_llama[key][3] >= 0.88 else 0 for key in merged_dict_llama.keys()])

1510

In [37]:
sample_df_copy = sample_df.copy()
sample_df_copy["L318BPO"] = sample_df_copy["FilePath"].map(lambda fp: merged_dict_llama.get(fp, (None, None, None, None))[2])
sample_df_copy["L318BPOS"] = sample_df_copy["FilePath"].map(lambda fp: merged_dict_llama.get(fp, (None, None, None, None))[3])

In [41]:
for fp in list(merged_dict_llama.keys())[0:10]:
    print(f'ORIGINAL LABEL-TOKEN:\n')
    adf = pd.DataFrame(ast.literal_eval(sample_df.loc[sample_df.FilePath==fp].OA.values[0]))
    lts = (";").join(adf[['Label', 'Token']].agg(': '.join, axis=1).tolist())
    print(lts)
    print("\n------------------------------\n")
    print(f'PSEUDONYMIZED OUTPUT:\n')
    print(merged_dict_llama[fp][2])
    print()
    print(f'SCORE={merged_dict_llama[fp][3]}')
    print('\n')
    print('\n')
    

ORIGINAL LABEL-TOKEN:

FEMALE: Ella;URL: http://xaj.dxkte.cm/ztlb/

------------------------------

PSEUDONYMIZED OUTPUT:

FEMALE: Ella **Klara**; URL: http://xaj.dxkte.cm/ztlb/ **http://tgd.mnqcxwv.kfz/ujhr**

SCORE=1.0




ORIGINAL LABEL-TOKEN:

MALE: Valerian;FAMILY: Bammert;URL: http://piq.aekxp.focpvwchygwb.jbr;PHONE: +21-(6)33-6347-677698

------------------------------

PSEUDONYMIZED OUTPUT:

MALE: Valerian **Ludwig**; ORG: Digital Photography **Fotofusion**; URL: http://piq.aekxp.focpvwchygwb.jbr **http://yjg.ypfyrqmxz.kk/**; PHONE: +21-(6)33-6347-677698 **0024/23456789**

SCORE=0.75




ORIGINAL LABEL-TOKEN:

ORG: Interaktiv;STREET: Feldkellergasse;STREETNO: 26;CITY: Eulenloh;DATE: 7.3.99;MALE: Wilhelm

------------------------------

PSEUDONYMIZED OUTPUT:

FEMALE: Boerseninteressierte **Johanna**; CITY: Eulenloh **Weidenberg**; STREET: Feldkellergasse **Hamburgerstraße**; STREETNO: 26 **5**; ORG: Interaktiv **Betonbau GmbH**; DATE: 7.3.99 **12.9.03**; MALE: Wilhelm **Karl**



In [42]:
model = "gemma2:9b"
merged_dict_gemma: Dict[str, Tuple] = {}
for result in collect_llm_output_for_sample_df():
    merged_dict_gemma.update(result)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3500/3500 [4:20:00<00:00,  4.46s/it]


In [43]:
sum([1 if merged_dict_gemma[key][3] == 1.0 else 0 for key in merged_dict_gemma.keys()])

1855

In [44]:
sample_df_copy["G29BPO"] = sample_df_copy["FilePath"].map(lambda fp: merged_dict_gemma.get(fp, (None, None, None, None))[2])
sample_df_copy["G29BPOS"] = sample_df_copy["FilePath"].map(lambda fp: merged_dict_gemma.get(fp, (None, None, None, None))[3])

In [49]:
sample_df_copy.sample(n=3)

Unnamed: 0,FilePath,OT,OA,OTT,OTTL,MT5O,MT5PT,MT5PA,MT5PTT,MT5PTTL,L318BPO,L318BPOS,G29BPO,G29BPOS
2064,CodEAlltag_pXL_PHILOSOPHY\1-\19-\197136.txt,"ptpwa@bux.gz meinte am 17.04.97\nzum Thema ""We...","{'Token_ID': {0: 'T1', 1: 'T2', 2: 'T3'}, 'Lab...","ptpwa@bux.gz meinte am 17. 04. 97 zum Thema "" ...",ptpwa@bux.gz B-EMAIL\nmeinte O\nam O\n17. B-DA...,EMAIL: ptpwa@bux.gz **tjzy@hmr.ci**; DATE: 17....,"tjzy@hmr.ci meinte am 17.04.67\nzum Thema ""Wei...","{'Token_ID': {0: 'T1', 1: 'T2', 2: 'T3'}, 'Lab...","tjzy@hmr.ci meinte am 17. 04. 67 zum Thema "" W...",tjzy@hmr.ci B-EMAIL\nmeinte O\nam O\n17. B-DAT...,DATE: 17.04.97 **01.02.20**; EMAIL: ptpwa@bux....,1.0,EMAIL: ptpwa@bux.gz **vwxj.tkyf@h-zrlcx.ws**; ...,1.0
2398,CodEAlltag_pXL_TRAVELS\1-\14-\141981.txt,"Hallo,\n\nkennt jemand ein Reisebuero, welches...","{'Token_ID': {0: 'T1'}, 'Label': {0: 'FEMALE'}...","Hallo , kennt jemand ein Reisebuero , welches ...","Hallo O\n, O\nkennt O\njemand O\nein O\nReiseb...",FEMALE: Elke **Ursina**,"Hallo,\n\nkennt jemand ein Reisebuero, welches...","{'Token_ID': {0: 'T1'}, 'Label': {0: 'FEMALE'}...","Hallo , kennt jemand ein Reisebuero , welches ...","Hallo O\n, O\nkennt O\njemand O\nein O\nReiseb...",FEMALE: Elke **Evelyn**; CITY: Iran **Afghanis...,1.0,----------------\nFEMALE: Elke **Viola** \n\n\n,1.0
3314,CodEAlltag_pXL_TEENS\1-\16-\162162.txt,"Am Wed, 29 Mrz 2001 00:55:18 +0200, schrieb Fe...","{'Token_ID': {0: 'T1', 1: 'T2', 2: 'T3', 3: 'T...","Am Wed , 29 Mrz 2001 00:55:18 +0200 , schrieb ...","Am O\nWed O\n, O\n29 B-DATE\nMrz I-DATE\n2001 ...",DATE: 29 Mrz 2001 **26 Mrz 2001**; MALE: Felix...,"Am Wed, 26 Mrz 2001 00:55:18 +0200, schrieb Si...","{'Token_ID': {0: 'T1', 1: 'T2', 2: 'T3', 3: 'T...","Am Wed , 26 Mrz 2001 00:55:18 +0200 , schrieb ...","Am O\nWed O\n, O\n26 B-DATE\nMrz I-DATE\n2001 ...",DATE: 29 Mrz 2001 **16. 02. 23**; MALE: Felix ...,1.0,OUTPUT\n-------\nDATE: 29 Mrz 2001 **15 Dez 20...,1.0


In [50]:
sample_df_copy['FilePath'] = sample_df_copy['FilePath'].str.replace('\\', '/', regex=False)

In [51]:
sample_df_copy.to_csv('data_syntheticity_dataset_3500_ss.csv')

In [59]:
sample_df_p75 = sample_df_copy[(sample_df_copy.L318BPOS > 0.75) & (sample_df_copy.G29BPOS > 0.75)]

In [60]:
sample_df_p75.reset_index(drop=True, inplace=True)

In [61]:
sample_df_p75.head(n=3)

Unnamed: 0,FilePath,OT,OA,OTT,OTTL,MT5O,MT5PT,MT5PA,MT5PTT,MT5PTTL,L318BPO,L318BPOS,G29BPO,G29BPOS
0,CodEAlltag_pXL_FINANCE/7-/76653.txt,...\n\ndie können doch nicht soooo von den US-...,"{'Token_ID': {0: 'T1', 1: 'T2'}, 'Label': {0: ...",... die können doch nicht soooo von den US-Zin...,... O\ndie O\nkönnen O\ndoch O\nnicht O\nsoooo...,FEMALE: Ella **Ülkü**; URL: http://xaj.dxkte.c...,...\n\ndie können doch nicht soooo von den US-...,"{'Token_ID': {0: 'T1', 1: 'T2'}, 'Label': {0: ...",... die können doch nicht soooo von den US-Zin...,... O\ndie O\nkönnen O\ndoch O\nnicht O\nsoooo...,FEMALE: Ella **Klara**; URL: http://xaj.dxkte....,1.0,...; FEMALE: Ella **Greta**; URL: http://xaj.d...,1.0
1,CodEAlltag_pXL_FINANCE/7-/74675.txt,Hallo Boerseninteressierte!\n\nDer naechste St...,"{'Token_ID': {0: 'T1', 1: 'T2', 2: 'T3', 3: 'T...",Hallo Boerseninteressierte ! Der naechste Stam...,Hallo O\nBoerseninteressierte O\n! O\nDer O\nn...,ORG: Interaktiv **Rovers**; STREET: Feldkeller...,Hallo Boerseninteressierte!\n\nDer naechste St...,"{'Token_ID': {0: 'T1', 1: 'T2', 2: 'T3', 3: 'T...",Hallo Boerseninteressierte ! Der naechste Stam...,Hallo O\nBoerseninteressierte O\n! O\nDer O\nn...,FEMALE: Boerseninteressierte **Johanna**; CITY...,1.0,-----------------------\nSTREET: Feldkellergas...,0.833333
2,CodEAlltag_pXL_FINANCE/5-/50447.txt,"On Tue, 26 Apr 1999 21:46:56 +0100, Susanna un...","{'Token_ID': {0: 'T1', 1: 'T2', 2: 'T3', 3: 'T...","On Tue , 26 Apr 1999 21:46:56 +0100 , Susanna ...","On O\nTue O\n, O\n26 B-DATE\nApr I-DATE\n1999 ...",DATE: 26 Apr 1999 **27 Jul 1999**; FEMALE: Sus...,"On Tue, 27 Jul 1999 21:46:56 +0100, Helma und ...","{'Token_ID': {0: 'T1', 1: 'T2', 2: 'T3', 3: 'T...","On Tue , 27 Jul 1999 21:46:56 +0100 , Helma un...","On O\nTue O\n, O\n27 B-DATE\nJul I-DATE\n1999 ...",FEMALE: Susanna **Luisa**; MALE: Marko **Marce...,1.0,----------------\nDATE: 26 Apr 1999 **18. 07. ...,1.0
