# In [None]:
import os
import numpy as np
import pandas as pd
import random
import torch
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, AutoModelForTokenClassification,
    Trainer, TrainingArguments, DataCollatorForTokenClassification
)
from sklearn.model_selection import train_test_split
import evaluate

# 1. Set seeds for reproducibility
def set_seed(seed):
    """Seed every random number generator this script relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (the default
    generator and all CUDA devices) with ``seed``, then sets cuDNN to
    deterministic mode with benchmarking switched off.

    Args:
        seed: Integer seed passed unchanged to each generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # cuDNN settings: benchmarking off, deterministic mode on.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# 2. Load CoNLL files
def load_conll_data(file_path):
    """Read a whitespace-separated CoNLL file into sentences and tag sequences.

    Every non-blank line must contain at least two columns: the first column
    is taken as the token and the last column as its NER tag. Blank lines
    separate sentences; consecutive blank lines do not produce empty
    sentences.

    Args:
        file_path: Path of the UTF-8 encoded CoNLL file.

    Returns:
        A tuple ``(all_tokens, all_tags)`` of two parallel lists. Each element
        of ``all_tokens`` is the token list of one sentence and the element
        at the same index of ``all_tags`` is its tag list.

    Raises:
        ValueError: If a non-blank line has fewer than two columns. Without
            this check the token would be used as its own tag and end up in
            the label set.
    """
    all_tokens, all_tags = [], []
    tokens, ner_tags = [], []
    with open(file_path, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            columns = line.split()
            if not columns:
                # Blank line: close the current sentence, if there is one.
                if tokens:
                    all_tokens.append(tokens)
                    all_tags.append(ner_tags)
                    tokens, ner_tags = [], []
                continue
            if len(columns) < 2:
                raise ValueError(
                    f"{file_path}:{line_no}: expected a token and a tag, "
                    f"got {line.strip()!r}"
                )
            tokens.append(columns[0])
            ner_tags.append(columns[-1])
    # The file may end without a trailing blank line.
    if tokens:
        all_tokens.append(tokens)
        all_tags.append(ner_tags)
    return all_tokens, all_tags

# 3. Load all CoNLL files
data_dir = "/bachelorarbeit-ner/data/annotated"
all_tokens, all_tags = [], []
# Iterate the directory listing in sorted order so the concatenated
# sentence order does not depend on the order os.listdir returns.
for file_name in sorted(os.listdir(data_dir)):
    if not file_name.endswith(".conll"):
        continue
    tokens, tags = load_conll_data(os.path.join(data_dir, file_name))
    all_tokens += tokens
    all_tags += tags

# 4. Split into train/val/test
# First hold out 30 % of the sentences, then halve that hold-out into
# validation and test. Both splits use the fixed random_state 42.
train_tokens, temp_tokens, train_tags, temp_tags = train_test_split(
    all_tokens, all_tags, test_size=0.3, random_state=42
)
val_tokens, test_tokens, val_tags, test_tags = train_test_split(
    temp_tokens, temp_tags, test_size=0.5, random_state=42
)

# 5. Tag mapping
# Ids are assigned in sorted tag order, computed over all splits.
unique_tags = sorted({tag for doc in all_tags for tag in doc})
tag2id = dict(zip(unique_tags, range(len(unique_tags))))
id2tag = dict(enumerate(unique_tags))

def create_dataset(tokens, tags):
    """Build a ``Dataset`` from token sequences and their string tags.

    Args:
        tokens: List of sentences, each a list of token strings.
        tags: List of tag sequences parallel to ``tokens``; every tag must be
            a key of the module-level ``tag2id`` mapping.

    Returns:
        A ``Dataset`` with a ``"tokens"`` column and a ``"ner_tags"`` column
        holding the tags converted to integer ids via ``tag2id``.
    """
    encoded_tags = []
    for tag_seq in tags:
        encoded_tags.append([tag2id[tag] for tag in tag_seq])
    return Dataset.from_dict({"tokens": tokens, "ner_tags": encoded_tags})

# Bundle the three splits into one DatasetDict.
datasets = DatasetDict({
    "train": create_dataset(train_tokens, train_tags),
    "validation": create_dataset(val_tokens, val_tags),
    "test": create_dataset(test_tokens, test_tags)
})

# 6. Tokenization
model_name = "google-bert/bert-base-uncased"
# Load the tokenizer from model_name (the same name the model is loaded from)
# instead of repeating the literal, so the two cannot drift apart when the
# checkpoint is changed.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align tags to sub-tokens.

    The sentences in ``examples["tokens"]`` are passed to the module-level
    ``tokenizer`` as pre-split words, truncated and padded to 128 positions.
    For each sentence a label list is built from the tokenizer's word ids:
    the first sub-token of every word receives that word's tag id from
    ``examples["ner_tags"]``; positions whose word id is ``None`` and the
    remaining sub-tokens of a word receive the sentinel ``-100``.

    Args:
        examples: Batch mapping with the keys ``"tokens"`` and ``"ner_tags"``.

    Returns:
        The tokenizer output with an additional ``"labels"`` entry.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        max_length=128,
        padding="max_length"
    )

    labels = []
    for batch_idx, word_tags in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=batch_idx)
        # Pair every position with the word id of the position before it
        # (None for the very first position).
        preceding_ids = [None] + word_ids[:-1]
        aligned = []
        for prev_id, current_id in zip(preceding_ids, word_ids):
            if current_id is None or current_id == prev_id:
                # No word behind this position, or a continuation sub-token.
                aligned.append(-100)
            else:
                aligned.append(word_tags[current_id])
        labels.append(aligned)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Tokenize every split in batches and drop the raw "tokens" / "ner_tags"
# columns from the result.
tokenized_datasets = datasets.map(
    tokenize_and_align_labels,
    remove_columns=["tokens", "ner_tags"],
    batched=True,
)

# 7. Metrics
metric = evaluate.load("seqeval")

def compute_metrics(p):
    """Compute overall seqeval precision, recall, F1 and accuracy.

    Args:
        p: Pair ``(predictions, labels)``. The predicted class is taken as the
            argmax over axis 2 of ``predictions``
            (assumes a (batch, sequence, num_labels) layout; TODO confirm).

    Returns:
        Dict with the keys ``"precision"``, ``"recall"``, ``"f1"`` and
        ``"accuracy"`` taken from the ``overall_*`` entries of the
        module-level ``metric`` result.
    """
    scores, gold = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions, true_labels = [], []
    for pred_row, gold_row in zip(predicted_ids, gold):
        # Keep only positions whose gold label is not the -100 sentinel.
        kept = [
            (pred_id, gold_id)
            for pred_id, gold_id in zip(pred_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append([id2tag[pred_id] for pred_id, _ in kept])
        true_labels.append([id2tag[gold_id] for _, gold_id in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        name: results[f"overall_{name}"]
        for name in ("precision", "recall", "f1", "accuracy")
    }

# 8. Training loop over 10 seeds
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; confirm against the installed version before upgrading.
all_results = []
for seed in range(1, 11):
    print(f"\n===== Training with seed {seed} =====")
    set_seed(seed)

    # Load the model from model_name again for every seed.
    model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(tag2id),
    )

    training_args = TrainingArguments(
        output_dir=f"./results/bert_seed_{seed}",
        logging_dir=f"./logs/bert_seed_{seed}",
        seed=seed,
        evaluation_strategy="epoch",
        save_strategy="no",
        num_train_epochs=4,
        learning_rate=3e-5,
        weight_decay=0.01,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        data_collator=DataCollatorForTokenClassification(tokenizer),
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Validation is evaluated every epoch during training; the test split is
    # evaluated once afterwards and recorded together with the seed.
    eval_results = dict(trainer.evaluate(tokenized_datasets["test"]), seed=seed)
    all_results.append(eval_results)

# 9. Save results
# One row per seed; the columns are the keys of the collected result dicts.
results_df = pd.DataFrame(all_results)
results_path = "/bachelorarbeit-ner/results/bert_base_eval_10runs.csv"
# Create the target directory first: to_csv fails if it does not exist,
# which would discard the metrics of all completed runs.
os.makedirs(os.path.dirname(results_path), exist_ok=True)
results_df.to_csv(results_path, index=False)
print(f"\nGesamtergebnis gespeichert unter: {results_path}")