# In [None]:
import os
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from TorchCRF import CRF
import json

# --------------------
# 1. Daten laden
# --------------------
def load_conll_data(file_path):
    """Read a CoNLL-style file into parallel token and tag sequences.

    Every non-blank line is split on whitespace; its first field is taken as
    the token and its last field as the tag (a single-field line therefore
    yields the same string for both). Whitespace-only lines end the current
    sentence; consecutive blank lines do not produce empty sentences.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        A tuple ``(all_tokens, all_tags)`` of two equally long lists, where
        entry ``i`` holds the tokens / tags of the ``i``-th sentence.
    """
    sentences, tag_sequences = [], []
    current_tokens, current_tags = [], []

    with open(file_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            fields = raw_line.split()
            if fields:
                current_tokens.append(fields[0])
                current_tags.append(fields[-1])
                continue
            # Blank line: sentence boundary. Only flush when something was
            # collected, so runs of blank lines are harmless.
            if current_tokens:
                sentences.append(current_tokens)
                tag_sequences.append(current_tags)
                current_tokens, current_tags = [], []

    # The file may end without a trailing blank line.
    if current_tokens:
        sentences.append(current_tokens)
        tag_sequences.append(current_tags)

    return sentences, tag_sequences


data_path = "/bachelorarbeit-ner/data/annotated"
all_sentences, all_tags = [], []
# os.listdir() returns entries in arbitrary, filesystem-dependent order.
# Iterating in sorted order fixes the order of the corpus, which the seeded
# train/validation/test splits depend on to be reproducible across machines.
for file in sorted(os.listdir(data_path)):
    if file.endswith('.conll'):
        sents, tgs = load_conll_data(os.path.join(data_path, file))
        all_sentences.extend(sents)
        all_tags.extend(tgs)

# --------------------
# 2. Vokabular
# --------------------
# Sort the vocabularies so that index assignment is deterministic: iterating
# a set of strings yields a different order per interpreter run (string hash
# randomisation), which would change the word/tag -> index mappings between
# runs even though all RNG seeds are fixed.
words = sorted({word for sentence in all_sentences for word in sentence})
tag_list = sorted({tag for tag_seq in all_tags for tag in tag_seq})
tag_list.append("PAD")  # explicit PAD tag used to pad label sequences

# Indices 0 and 1 are reserved for the padding and unknown-word entries.
# NOTE: a corpus token spelled exactly "PAD" or "UNK" is overwritten by the
# two assignments below and ends up sharing the reserved index.
word2idx = {w: i for i, w in enumerate(words, start=2)}
word2idx["PAD"] = 0
word2idx["UNK"] = 1

tag2idx = {t: i for i, t in enumerate(tag_list)}
idx2tag = {i: t for t, i in tag2idx.items()}

pad_word_idx = word2idx["PAD"]
pad_tag_idx = tag2idx["PAD"]
o_tag_idx = tag2idx["O"]  # may be needed for masking

# Encode the whole corpus as index sequences.
X = [[word2idx.get(w, word2idx["UNK"]) for w in s] for s in all_sentences]
y = [[tag2idx[t] for t in ts] for ts in all_tags]
# --------------------
# 3. Dataset
# --------------------
class NERDataset(Dataset):
    """Dataset that pairs each sentence with its tag sequence by position."""

    def __init__(self, sentences, tags):
        """Keep references to the two parallel sequences.

        Args:
            sentences: Indexable collection of sentences.
            tags: Indexable collection of tag sequences, aligned with
                ``sentences``.
        """
        self.sentences, self.tags = sentences, tags

    def __len__(self):
        """Return the number of stored sentences."""
        n_items = len(self.sentences)
        return n_items

    def __getitem__(self, idx):
        """Return the ``(sentence, tags)`` pair stored at position ``idx``."""
        sentence = self.sentences[idx]
        sentence_tags = self.tags[idx]
        return sentence, sentence_tags

def collate_fn(batch):
    """Pad a batch of ``(sentence, tags)`` pairs and stack it into tensors.

    Every sequence is right-padded to the length of the longest sentence in
    the batch, using the module-level ``pad_word_idx`` for words and
    ``pad_tag_idx`` for tags.

    Args:
        batch: Iterable of ``(sentence, tags)`` pairs, each a list of ints.

    Returns:
        A tuple ``(words, tags, mask)`` of tensors, each transposed to shape
        ``(max_length, batch_size)``. ``words`` and ``tags`` are ``long``;
        ``mask`` is ``uint8`` with 1 on real tokens and 0 on padding.
    """
    sentences, tags = zip(*batch)
    lengths = [len(sentence) for sentence in sentences]
    longest = max(lengths)

    # Amount of padding each sequence needs to reach the longest one.
    gaps = [longest - n for n in lengths]

    word_rows = [s + [pad_word_idx] * gap for s, gap in zip(sentences, gaps)]
    tag_rows = [t + [pad_tag_idx] * gap for t, gap in zip(tags, gaps)]
    mask_rows = [[1] * n + [0] * gap for n, gap in zip(lengths, gaps)]

    # Rows are built batch-first; transpose to sequence-first.
    words_tensor = torch.tensor(word_rows, dtype=torch.long).transpose(0, 1)
    tags_tensor = torch.tensor(tag_rows, dtype=torch.long).transpose(0, 1)
    mask_tensor = torch.tensor(mask_rows, dtype=torch.uint8).transpose(0, 1)
    return (words_tensor, tags_tensor, mask_tensor)

# --------------------
# 4. Modell
# --------------------
class BiLSTMCRF(nn.Module):
    """Sequence tagger: embedding -> bidirectional LSTM -> linear -> CRF."""

    def __init__(self, vocab_size, embed_dim, hidden_dim, tagset_size, pad_idx=0):
        """Build the layers.

        Args:
            vocab_size: Number of rows in the embedding table.
            embed_dim: Size of each word embedding.
            hidden_dim: Hidden size of the LSTM per direction; the linear
                layer therefore receives ``hidden_dim * 2`` features.
            tagset_size: Number of tags scored by the linear layer and CRF.
            pad_idx: Word index passed to the embedding as ``padding_idx``.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, tagset_size)
        self.crf = CRF(tagset_size)

    def forward(self, input_ids, tags=None, mask=None):
        """Compute the training loss or decode tag sequences.

        Args:
            input_ids: Word indices laid out sequence-first, i.e.
                ``(seq_len, batch)``, because the LSTM is built without
                ``batch_first``.
            tags: Gold tag indices with the same layout as ``input_ids``.
                When given, the loss is returned instead of predictions.
            mask: Optional tensor marking real tokens (non-zero) versus
                padding, same layout as ``input_ids``. When omitted, every
                position is treated as a real token.

        Returns:
            With ``tags``: the negated result of the CRF call with
            ``reduction='mean'`` (with pytorch-crf this is the batch-mean
            negative log-likelihood -- confirm for the installed CRF
            package). Without ``tags``: whatever ``self.crf.decode``
            returns for the emissions.
        """
        x = self.embedding(input_ids)
        # NOTE: the LSTM also runs over padded positions; the mask is only
        # applied inside the CRF.
        x, _ = self.lstm(x)
        emissions = self.fc(x)

        # ``mask`` defaults to None, so ``mask.bool()`` must not be called
        # unconditionally; fall back to an all-ones mask instead.
        if mask is None:
            mask = torch.ones_like(input_ids, dtype=torch.bool)
        else:
            mask = mask.bool()

        if tags is not None:
            return -self.crf(emissions, tags, mask, reduction='mean')
        return self.crf.decode(emissions, mask)

# --------------------
# 5. Trainingsfunktion
# --------------------
def run_experiment(seed, split_seed=42, epochs=5, batch_size=32, lr=2e-3):
    """Train one BiLSTM-CRF and evaluate it token-wise on the validation split.

    Args:
        seed: Seed for the Python, NumPy and PyTorch RNGs. It is set before
            the model is created and before the shuffled training loader is
            iterated.
        split_seed: ``random_state`` for both ``train_test_split`` calls. The
            default of 42 yields the same train/validation/test split for
            every ``seed``; pass ``split_seed=seed`` to draw a new split per
            run.
        epochs: Number of passes over the training data.
        batch_size: Batch size of the training and validation loaders.
        lr: Learning rate of the Adam optimizer.

    Returns:
        A tuple ``(summary, report)``. ``summary`` is a dict with the keys
        ``seed``, ``macro_precision``, ``macro_recall``, ``macro_f1`` and
        ``accuracy``; ``report`` is the full dict produced by
        ``classification_report(..., output_dict=True)``.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # 15 % of the data is split off as a test set (not used in this
    # function); 17.65 % of the remaining 85 % (about 15 % of the total)
    # becomes the validation set. The split depends on ``split_seed`` only,
    # not on ``seed``.
    X_trainval, _, y_trainval, _ = train_test_split(
        X, y, test_size=0.15, random_state=split_seed)
    X_train_, X_val_, y_train_, y_val_ = train_test_split(
        X_trainval, y_trainval, test_size=0.1765, random_state=split_seed)

    train_loader = DataLoader(NERDataset(X_train_, y_train_), batch_size=batch_size,
                              shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(NERDataset(X_val_, y_val_), batch_size=batch_size,
                            shuffle=False, collate_fn=collate_fn)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BiLSTMCRF(len(word2idx), 100, 128, len(tag2idx), pad_idx=pad_word_idx).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Training
    model.train()
    for _ in range(epochs):
        for batch in train_loader:
            input_ids, tags, mask = [x.to(device) for x in batch]
            optimizer.zero_grad()
            loss = model(input_ids, tags, mask)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

    # Evaluation
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids, tags, mask = [x.to(device) for x in batch]
            predictions = model(input_ids, mask=mask)
            # Convert once per batch instead of calling .item() per token.
            # Tensors are sequence-first, so column i belongs to sequence i.
            lengths = mask.sum(dim=0).tolist()
            gold_rows = tags.t().tolist()
            for seq_len, gold_seq, pred_seq in zip(lengths, gold_rows, predictions):
                # Cut off padding so only real tokens are scored.
                y_true.extend(idx2tag[t] for t in gold_seq[:seq_len])
                y_pred.extend(idx2tag[p] for p in pred_seq[:seq_len])

    report = classification_report(y_true, y_pred, output_dict=True, digits=4)
    return {
        "seed": seed,
        "macro_precision": report["macro avg"]["precision"],
        "macro_recall": report["macro avg"]["recall"],
        "macro_f1": report["macro avg"]["f1-score"],
        "accuracy": report["accuracy"]
    }, report

# --------------------
# 6. Alle 10 Durchläufe
# --------------------
results = []
detailed = {}

# Create the output directory before training starts, so that a missing
# folder does not discard ten finished runs at the final write.
os.makedirs("/bachelorarbeit-ner/results", exist_ok=True)

for s in range(1, 11):
    print(f"===> Running seed {s}")
    res, full = run_experiment(s)
    results.append(res)
    detailed[f"seed_{s}"] = full

# Save the per-seed summary metrics.
results_df = pd.DataFrame(results)
results_df.to_csv("/bachelorarbeit-ner/results/blstm_eval_10runs.csv", index=False)

# Also persist the full per-class reports, which are collected in
# ``detailed`` above.
with open("/bachelorarbeit-ner/results/blstm_eval_10runs_detailed.json",
          "w", encoding="utf-8") as json_file:
    json.dump(detailed, json_file, ensure_ascii=False, indent=2)

print("Alle Ergebnisse gespeichert.")