In [1]:
# Show the GPU, driver and CUDA version visible to this kernel before the model is loaded.
!nvidia-smi

Tue Feb 11 01:25:34 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.127.08             Driver Version: 550.127.08     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          On  |   00000000:47:00.0 Off |                    0 |
| N/A   30C    P0             63W /  400W |       1MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
import os
import re
from pathlib import Path
from typing import List

import matplotlib.pyplot as plt
import pandas as pd
from flair.data import Sentence
from flair.models import SequenceTagger
from flair.nn import Model
from pandas import DataFrame
from somajo import SoMaJo

from codealltag_data_processor_v2025 import CodealltagDataProcessor

In [3]:
# Experiment configuration: together these values select the checkpoint directory
# logs/<arch>/NER/<sample size in thousands>K/k<k>.
sample_size = 10_000
k = 5
arch = "GELECTRA"
model_dir_path = os.path.join(
    "logs",
    arch,
    "NER",
    f"{sample_size // 1000}K",
    f"k{k}",
)

In [4]:
# Load the checkpoint stored as best-model.pt under model_dir_path. The log lines in
# this cell's output list the tag dictionary of the loaded tagger; going by their
# timestamps, loading took well over a minute in the recorded run.
model = SequenceTagger.load(os.path.join(model_dir_path, 'best-model.pt'))

2025-02-11 01:26:12,132 loading file logs/GELECTRA/NER/10K/k5/best-model.pt
2025-02-11 01:27:56,313 SequenceTagger predicts: Dictionary with 57 tags: O, S-MALE, B-MALE, E-MALE, I-MALE, S-FAMILY, B-FAMILY, E-FAMILY, I-FAMILY, S-URL, B-URL, E-URL, I-URL, S-EMAIL, B-EMAIL, E-EMAIL, I-EMAIL, S-PHONE, B-PHONE, E-PHONE, I-PHONE, S-CITY, B-CITY, E-CITY, I-CITY, S-DATE, B-DATE, E-DATE, I-DATE, S-STREET, B-STREET, E-STREET, I-STREET, S-ZIP, B-ZIP, E-ZIP, I-ZIP, S-STREETNO, B-STREETNO, E-STREETNO, I-STREETNO, S-FEMALE, B-FEMALE, E-FEMALE, I-FEMALE, S-UFID, B-UFID, E-UFID, I-UFID, S-ORG


In [5]:
# One processor per data_version; both are configured from the same YAML file.
# In this notebook cdp_2020 is only used as an argument when the
# train/dev/test splits are requested from cdp_2022 further down.
cdp_2022 = CodealltagDataProcessor(data_version='20220513', config_path=['codealltag_data_processor.yml'])
cdp_2020 = CodealltagDataProcessor(data_version='20200518', config_path=['codealltag_data_processor.yml'])

In [6]:
# Frame returned by get_category_path_df(); its FilePath column is used below to
# exclude every file that belongs to the train/dev/test splits.
category_path_df = cdp_2022.get_category_path_df()

In [7]:
# Request the splits for the configured sample size and fold, then convert each
# split ("train", "dev", "test") to a pandas frame.
dataset = cdp_2022.get_train_dev_test_datasetdict_for_sample_size(cdp_2020, sample_size, k)
train_df, dev_df, test_df = (
    dataset[split_name].to_pandas() for split_name in ("train", "dev", "test")
)

In [8]:
# Keep only the rows whose FilePath appears in none of the three splits, i.e. files
# that were not part of the sample the model was trained and evaluated on.
outside_10K = category_path_df.loc[
    ~category_path_df["FilePath"].isin(
        [*train_df["FilePath"], *dev_df["FilePath"], *test_df["FilePath"]]
    )
]

In [9]:
# Take the FilePath of the row at position 2050 (positional lookup via iloc) of the
# held-out frame.
# NOTE(review): the index 2050 is hard-coded with no stated rationale — presumably an
# arbitrary example; confirm, or promote it to a named constant.
selected_file_path = outside_10K.iloc[2050].FilePath

In [10]:
# read_email returns a pair; only its second item is kept, as the e-mail text.
# With show=True the file path and the text also appear in the cell output.
# NOTE(review): assigning to `_` rebinds the name IPython uses for the last cell
# output — confirm that nothing later in the session relies on it.
_, email_text = cdp_2022.read_email(selected_file_path, show=True)

../../data/CodEAlltag_pXL_20220513/CodEAlltag_pXL_EVENTS/6-/6731.txt
--------------------------------------------------------------------

cnmbhglcvmu@fgxm-vmla.rbh (Lmqwewtqrx Mühln), oder vielmehr 
  Fabian Riemert, dessen mgl. Wesenszug einer hinterfotzigen verlogenen 
  Drecksau ich auf Anraten von Dr. Benno Steinhäuser [<http://reatxmo.wgj/6yp4g3m>]


  Das sind doch garkeine Suedlaender, Fabian Riemert!
  Du bringst wieder alles durcheinander!
  Wenns um Nichtsuedlaender geht, dann musst Du auch konsequent sein,
  Du Dreckarsch:

  https://ocb.uufcclz.fos/rtohg?x=unEKn7ROTMK

-- 
[Fabian Riemerts Version von 'ich hab wieder nix kapiert, moechte 
aber unbedingt mitreden' in <p2wmq998ir4u.gjoqck3y03gr$.wsy@68hriw.ifb>]



In [11]:
# Display the annotations stored for the selected file, for comparison with the
# model predictions produced in the cells below.
cdp_2022.get_annotation_df_by_file(selected_file_path)

Unnamed: 0,FilePath,Token_ID,Label,Start,End,Token
0,CodEAlltag_pXL_EVENTS/6-/6731.ann,T1,EMAIL,0,25,cnmbhglcvmu@fgxm-vmla.rbh
1,CodEAlltag_pXL_EVENTS/6-/6731.ann,T2,EMAIL,27,37,Lmqwewtqrx
2,CodEAlltag_pXL_EVENTS/6-/6731.ann,T3,FAMILY,38,43,Mühln
3,CodEAlltag_pXL_EVENTS/6-/6731.ann,T4,MALE,63,69,Fabian
4,CodEAlltag_pXL_EVENTS/6-/6731.ann,T5,FAMILY,70,77,Riemert
5,CodEAlltag_pXL_EVENTS/6-/6731.ann,T6,MALE,169,174,Benno
6,CodEAlltag_pXL_EVENTS/6-/6731.ann,T7,FAMILY,175,186,Steinhäuser
7,CodEAlltag_pXL_EVENTS/6-/6731.ann,T8,URL,189,217,http://reatxmo.wgj/6yp4g3m>]
8,CodEAlltag_pXL_EVENTS/6-/6731.ann,T9,MALE,258,264,Fabian
9,CodEAlltag_pXL_EVENTS/6-/6731.ann,T10,FAMILY,265,272,Riemert


In [12]:
def get_somajo_tokenized_sentences(text: str) -> List[Sentence]:
    """Split ``text`` into sentences and tokens with SoMaJo and wrap them for flair.

    The whole text is handed to a ``de_CMC`` SoMaJo tokenizer (camel-case splitting
    disabled) as a single paragraph. Each sentence it yields becomes one flair
    ``Sentence`` built from the token texts, in the order SoMaJo produced them.

    Args:
        text: Raw text to tokenize.

    Returns:
        One flair ``Sentence`` per sentence found by SoMaJo.
    """
    somajo_tokenizer = SoMaJo("de_CMC", split_camel_case=False)
    return [
        Sentence([somajo_token.text for somajo_token in somajo_sentence])
        for somajo_sentence in somajo_tokenizer.tokenize_text([text])
    ]

In [13]:
# Tokenize the selected e-mail into flair Sentence objects.
sentences = get_somajo_tokenized_sentences(email_text)

In [14]:
# Tag the sentences. No return value is used: the next cell reads the predicted
# labels off the same Sentence objects.
model.predict(sentences)

In [15]:
# Print every sentence together with the labels attached to it by the prediction above.
for sentence in sentences:
    print(sentence.to_tagged_string())

Sentence: "cnmbhglcvmu@fgxm-vmla.rbh ( Lmqwewtqrx Mühln ) , oder vielmehr Fabian Riemert , dessen mgl. Wesenszug einer hinterfotzigen verlogenen Drecksau ich auf Anraten von Dr. Benno Steinhäuser [ < http://reatxmo.wgj/6yp4g3m>] Das sind doch garkeine Suedlaender , Fabian Riemert !" → ["cnmbhglcvmu@fgxm-vmla.rbh"/EMAIL, "Lmqwewtqrx"/EMAIL, "Mühln"/FAMILY, "Fabian"/MALE, "Riemert"/FAMILY, "Benno"/MALE, "Steinhäuser"/FAMILY, "<"/URL, "http://reatxmo.wgj/6yp4g3m>]"/URL, "Fabian"/MALE, "Riemert"/FAMILY]
Sentence: "Du bringst wieder alles durcheinander !"
Sentence: "Wenns um Nichtsuedlaender geht , dann musst Du auch konsequent sein , Du Dreckarsch : https://ocb.uufcclz.fos/rtohg?x=unEKn7ROTMK -- [ Fabian Riemerts Version von ' ich hab wieder nix kapiert , moechte aber unbedingt mitreden ' in < p2wmq998ir4u . gjoqck3y03gr $ . wsy@68hriw.ifb > ]" → ["https://ocb.uufcclz.fos/rtohg?x=unEKn7ROTMK"/URL, "Fabian"/MALE, "Riemerts"/FAMILY, "p2wmq998ir4u"/EMAIL, ". gjoqck3y03gr"/URL, "$ . wsy@68hriw

In [16]:
def get_annotation_df_from_model_prediction(email_text: str, model: Model = None) -> DataFrame:
    """Tag ``email_text`` with ``model`` and return the predicted entities as a frame.

    The text is split into sentences with ``get_somajo_tokenized_sentences``,
    ``model.predict`` is run on them, and every predicted label is mapped back to
    character offsets in ``email_text``.

    Offsets are recovered by searching for the labelled text from the end of the
    previously located entity onwards, so repeated strings are resolved in reading
    order. The labelled text is the tokenizer's view of the span (tokens joined by
    single spaces), which need not match the raw text character for character: the
    tokenizer may have split a contiguous string, and the raw text may separate the
    tokens by several spaces or a line break. The search therefore allows any amount
    of whitespace (including none) between the tokens of a span, and the reported
    ``End``/``Token`` come from what was actually matched in ``email_text``.

    Args:
        email_text: Raw text to annotate.
        model: Loaded flair model exposing ``predict``. Required despite the
            ``None`` default, which is kept for signature compatibility.

    Returns:
        DataFrame with the columns ``Token_ID`` ("T1", "T2", ... in prediction
        order), ``Label``, ``Start``, ``End`` and ``Token``. ``Token`` is
        ``email_text[Start:End]``. A prediction that cannot be located keeps its
        ``Token_ID``, gets ``Start == End == -1`` and carries the model's span text
        as ``Token``.

    Raises:
        ValueError: If ``model`` is ``None``.
    """
    if model is None:
        raise ValueError("model must be a loaded flair model, got None")

    sentences = get_somajo_tokenized_sentences(email_text)
    model.predict(sentences)

    tuples = list()
    token_id = 0
    # Absolute offset in email_text just past the last located entity; every search
    # starts here so earlier occurrences of the same string are not matched again.
    cursor = 0
    for sentence in sentences:
        for label in sentence.get_labels():
            token_id += 1
            text = label.data_point.text

            # Literal tokens of the span, separated by optional whitespace.
            span_pattern = re.compile(
                r"\s*".join(re.escape(part) for part in text.split())
            )
            match = span_pattern.search(email_text, cursor)

            if match is not None:
                start, end = match.span()
                cursor = end
                token = email_text[start:end]
            else:
                # Keep the prediction visible, but do not move the cursor: later
                # entities are still searched from the last confirmed position.
                start, end, token = -1, -1, text

            tuples.append(('T' + str(token_id), label.value, start, end, token))

    return pd.DataFrame(
        tuples,
        columns=["Token_ID", "Label", "Start", "End", "Token"]
    )

In [17]:
# Predicted annotations for the corpus e-mail selected above; the columns mirror the
# stored annotations displayed earlier (without FilePath), so the two can be compared.
get_annotation_df_from_model_prediction(email_text, model)

Unnamed: 0,Token_ID,Label,Start,End,Token
0,T1,EMAIL,0,25,cnmbhglcvmu@fgxm-vmla.rbh
1,T2,EMAIL,27,37,Lmqwewtqrx
2,T3,FAMILY,38,43,Mühln
3,T4,MALE,63,69,Fabian
4,T5,FAMILY,70,77,Riemert
5,T6,MALE,169,174,Benno
6,T7,FAMILY,175,186,Steinhäuser
7,T8,URL,188,189,<
8,T9,URL,189,217,http://reatxmo.wgj/6yp4g3m>]
9,T10,MALE,258,264,Fabian


In [18]:
# Rebind email_text (until now the corpus e-mail) to a German e-mail written inline;
# it is the input of the next prediction call.
# NOTE(review): the literal starts with a newline, so the character offsets reported
# for it include that leading newline.
email_text = '''
Betreff: Einladung zur Fachkonferenz am 15. März 2024 in München

Von: Dr. Maximilian Weber <max.weber@example.com>
An: Frau Julia Schneider <julia.schneider@firma.de>

Sehr geehrte Frau Schneider,

ich hoffe, diese E-Mail erreicht Sie wohlbehalten. Ich möchte Sie herzlich zur Jahresfachkonferenz für Digitale Innovationen, organisiert von der Deutschen Gesellschaft für Technologie e.V., einladen. Die Veranstaltung findet am 15. März 2024 in München statt.

Hier die Veranstaltungsdetails:
Ort: Innovationszentrum München, Maximilianstraße 45, 80539 München
Telefon: +49 89 1234 5678
Webseite: www.dgti-ev.de

Unsere Keynote-Speaker sind unter anderem Herr Dr. Tobias Lehmann und Frau Julia Schneider. Sie können sich mit Ihrem Benutzername JSchneider_92 auf unserer Webseite registrieren.

Falls Sie Fragen haben, erreichen Sie mich unter max.weber@example.com oder telefonisch. Mein ORCID iD lautet 0000-0001-2345-6789.

Wir freuen uns auf Ihre Teilnahme!

Mit freundlichen Grüßen
Dr. Maximilian Weber
Projektleiter, Deutsche Gesellschaft für Technologie e.V.
'''

In [19]:
# Same prediction call as before, now on the inline e-mail defined in the previous cell.
get_annotation_df_from_model_prediction(email_text, model)

Unnamed: 0,Token_ID,Label,Start,End,Token
0,T1,DATE,41,54,15. März 2024
1,T2,MALE,76,86,Maximilian
2,T3,FAMILY,87,92,Weber
3,T4,MALE,94,115,max.weber@example.com
4,T5,FEMALE,126,131,Julia
5,T6,FAMILY,132,141,Schneider
6,T7,FEMALE,143,167,julia.schneider@firma.de
7,T8,FAMILY,188,197,Schneider
8,T9,DATE,429,442,15. März 2024
9,T10,STREET,527,543,Maximilianstraße


In [20]:
import torch
import gc

# Release the tagger's memory now that inference is finished.
# Module.to() moves the weights in place, so the GPU copies are dropped even if
# another reference to the same model object is still alive somewhere in the kernel.
# After this cell `model` no longer exists; re-run the loading cell to use it again.
model = model.to("cpu")
del model

# Collect unreachable objects first, so that any GPU tensors they still hold are
# returned to PyTorch's caching allocator before the cache is handed back to the driver.
# The trailing semicolon keeps the collector's object count out of the cell output.
gc.collect();

# Skip the CUDA calls on machines without a usable GPU, where they have nothing to free.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

4239