# In [None]:
import os
import random
import numpy as np
import torch
from datasets import Dataset, DatasetDict, load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    Trainer,
    TrainingArguments,
    DataCollatorForTokenClassification
)
import evaluate
from sklearn.model_selection import train_test_split

# 1. Seeding helper for reproducibility
def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Also switches cuDNN to deterministic mode and turns off its benchmark
    mode by setting the corresponding ``torch.backends.cudnn`` flags.

    Args:
        seed: Integer seed passed unchanged to every generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Base seed of the script: used as random_state for the data splits, as the
# shuffle seed for the training subsets, and as the offset for per-run seeds.
GLOBAL_SEED = 42

# 2. Load CoNLL data
def load_conll_data(file_path):
    """Read a whitespace-separated CoNLL file into sentences and tag sequences.

    Every non-blank line contributes one token (its first column) and one
    tag (its last column). Blank or whitespace-only lines end the current
    sentence; consecutive blank lines do not produce empty sentences. A final
    sentence without a trailing blank line is still returned.

    Args:
        file_path: Path of the UTF-8 encoded CoNLL file.

    Returns:
        A tuple ``(sentences, tag_sequences)`` of two parallel lists, each
        holding one list of strings per sentence.
    """
    sentences = []
    tag_sequences = []
    words, labels = [], []

    with open(file_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            columns = raw_line.split()
            if columns:
                words.append(columns[0])
                labels.append(columns[-1])
                continue

            # Blank line: close the sentence collected so far, if any.
            if words:
                sentences.append(words)
                tag_sequences.append(labels)
            words, labels = [], []

    # The file may end without a terminating blank line.
    if words:
        sentences.append(words)
        tag_sequences.append(labels)

    return sentences, tag_sequences

# Collect the sentences of every ".conll" file in the annotation folder.
# Files are visited in sorted name order so the corpus order is stable.
data_dir = r"C:\Users\Yannek Dirksen\Desktop\Uni\Bachelorarbeit\For Real\Annotierte Pamphletes"
all_tokens, all_tags = [], []

for file_name in sorted(os.listdir(data_dir)):
    if not file_name.endswith(".conll"):
        continue
    tokens, tags = load_conll_data(os.path.join(data_dir, file_name))
    all_tokens += tokens
    all_tags += tags

# 3. Split into training, validation and test data (70/15/15):
# first hold out 30 % of the sentences, then halve that hold-out set.
train_tokens, temp_tokens, train_tags, temp_tags = train_test_split(
    all_tokens,
    all_tags,
    test_size=0.3,
    random_state=GLOBAL_SEED,
)
val_tokens, test_tokens, val_tags, test_tags = train_test_split(
    temp_tokens,
    temp_tags,
    test_size=0.5,
    random_state=GLOBAL_SEED,
)

# 4. Map NER tags to integer IDs
# The tag set is sorted so that every interpreter session assigns the same ID
# to the same tag. Iterating a plain set of strings yields an order that can
# change between sessions (string hash randomization), which would make the
# label mapping - and with it checkpoints and results - non-reproducible.
unique_tags = sorted({tag for doc in all_tags for tag in doc})
tag2id = {tag: idx for idx, tag in enumerate(unique_tags)}
id2tag = dict(enumerate(unique_tags))

def create_dataset(tokens, tags):
    """Build a ``Dataset`` with the columns ``tokens`` and ``ner_tags``.

    Args:
        tokens: List of sentences, each a list of token strings.
        tags: List of tag-string sequences parallel to ``tokens``.

    Returns:
        A ``Dataset`` in which every tag string has been replaced by its
        integer ID from the module-level ``tag2id`` mapping.
    """
    encoded_tags = []
    for tag_seq in tags:
        encoded_tags.append([tag2id[tag] for tag in tag_seq])

    return Dataset.from_dict({"tokens": tokens, "ner_tags": encoded_tags})

# Convert each split into a Dataset and bundle the three splits by name.
train_dataset, val_dataset, test_dataset = (
    create_dataset(split_tokens, split_tags)
    for split_tokens, split_tags in (
        (train_tokens, train_tags),
        (val_tokens, val_tags),
        (test_tokens, test_tags),
    )
)

datasets = DatasetDict(
    {"train": train_dataset, "validation": val_dataset, "test": test_dataset}
)

# 5. Tokenization and label alignment
model_name = "roberta-base"
# The sentences are passed to the tokenizer as pre-split word lists
# (is_split_into_words=True below); the RoBERTa tokenizer documentation
# requires add_prefix_space=True for that kind of input.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    add_prefix_space=True,
)

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align the word-level tags.

    Each sentence is truncated or padded to exactly 128 tokens. Only the
    first sub-word token of every word keeps that word's tag ID. All other
    positions (positions without a word index, and the remaining sub-word
    tokens of a word) are labelled -100, the value that ``compute_metrics``
    filters out and that PyTorch's cross-entropy loss ignores by default.

    Args:
        examples: Batch mapping with the keys ``"tokens"`` (list of word
            lists) and ``"ner_tags"`` (list of tag-ID lists).

    Returns:
        The tokenizer output extended by a ``"labels"`` entry holding one
        aligned label list per sentence.
    """
    encoding = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        max_length=128,
        padding="max_length",
    )

    aligned_labels = []
    for batch_idx, word_labels in enumerate(examples["ner_tags"]):
        token_word_ids = encoding.word_ids(batch_index=batch_idx)
        # Pair every position with the word index of the position before it,
        # so the start of a new word can be detected.
        preceding_ids = [None] + list(token_word_ids[:-1])
        aligned_labels.append(
            [
                word_labels[current]
                if current is not None and current != before
                else -100
                for current, before in zip(token_word_ids, preceding_ids)
            ]
        )

    encoding["labels"] = aligned_labels
    return encoding

# Tokenize all splits batch-wise; the raw "tokens" and "ner_tags" columns are
# removed so that only the tokenizer output and the aligned labels remain.
tokenized_datasets = datasets.map(
    tokenize_and_align_labels,
    remove_columns=["tokens", "ner_tags"],
    batched=True,
)

# 6. Training arguments (identical for all runs)
training_args = TrainingArguments(
    # Output and logging locations
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    # Optimisation settings
    learning_rate=3e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch, keep at most two checkpoints
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # Reload the checkpoint with the highest validation F1 after training
    load_best_model_at_end=True,
    metric_for_best_model="eval_overall_f1",
    greater_is_better=True,
    # Base seed; the experiment loop below builds separate arguments with an
    # individual seed for every run.
    seed=GLOBAL_SEED,
)

# Batch collator for token classification, built from the tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
# seqeval metric, used by compute_metrics for the overall scores.
metric = evaluate.load("seqeval")

def compute_metrics(p):
    """Compute the overall seqeval scores for one evaluation pass.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` is reduced with
            ``argmax`` over axis 2; ``labels`` holds tag IDs, with -100
            marking positions to skip.

    Returns:
        Dict with the keys ``eval_overall_precision``,
        ``eval_overall_recall``, ``eval_overall_f1`` and
        ``eval_overall_accuracy``.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept_predictions = []
        kept_labels = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            # Positions labelled -100 are excluded from scoring.
            if gold_id == -100:
                continue
            kept_predictions.append(id2tag[predicted_id])
            kept_labels.append(id2tag[gold_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    results = metric.compute(predictions=true_predictions, references=true_labels)

    flat_metrics = {}
    flat_metrics["eval_overall_precision"] = results["overall_precision"]
    flat_metrics["eval_overall_recall"] = results["overall_recall"]
    flat_metrics["eval_overall_f1"] = results["overall_f1"]
    flat_metrics["eval_overall_accuracy"] = results["overall_accuracy"]
    return flat_metrics

# 7. Experiment: train on growing fractions of the training data, with
# several runs per fraction and an individual seed for every run.
fractions = [i / 10 for i in range(1, 11)]  # 0.1, 0.2, ..., 1.0
results_dict = {}
num_runs = 10  # number of repetitions per data fraction

# The shuffle depends only on GLOBAL_SEED, so it is done once instead of once
# per run. Every fraction takes a prefix of this order, which means smaller
# subsets are contained in the larger ones.
shuffled_train_dataset = tokenized_datasets["train"].shuffle(seed=GLOBAL_SEED)

for frac in fractions:
    # round() rather than int(): frac * 100 is a float product and int()
    # would truncate a value that lands just below the intended integer.
    frac_str = f"{round(frac * 100)}%"
    f1_scores = []
    print(f"\n--- Training mit {frac_str} der Trainingsdaten ---")

    # Keep at least one example so that a very small corpus never yields an
    # empty training set for the smallest fractions.
    subset_size = max(1, int(len(shuffled_train_dataset) * frac))
    # The subset is the same for every run of this fraction.
    subset_train_dataset = shuffled_train_dataset.select(range(subset_size))

    # Several runs per data fraction
    for run in range(num_runs):
        # Individual seed for this run (GLOBAL_SEED + run)
        current_seed = GLOBAL_SEED + run
        set_seed(current_seed)

        # Training arguments with run-specific output paths and seed
        current_training_args = TrainingArguments(
            output_dir=f"./results/{frac_str}/run_{run+1}",
            eval_strategy="epoch",
            learning_rate=3e-5,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            num_train_epochs=3,
            weight_decay=0.01,
            save_strategy="epoch",
            save_total_limit=2,
            logging_dir=f'./logs/{frac_str}/run_{run+1}',
            logging_steps=10,
            load_best_model_at_end=True,
            metric_for_best_model="eval_overall_f1",
            greater_is_better=True,
            seed=current_seed
        )

        # Load a fresh model for every run. The label mappings are stored in
        # the model config so that saved checkpoints carry the real tag names
        # instead of generic LABEL_<n> placeholders.
        model = AutoModelForTokenClassification.from_pretrained(
            model_name,
            num_labels=len(unique_tags),
            id2label=id2tag,
            label2id=tag2id,
        )

        # Set up the trainer; the validation split drives model selection
        trainer = Trainer(
            model=model,
            args=current_training_args,
            train_dataset=subset_train_dataset,
            eval_dataset=tokenized_datasets["validation"],
            data_collator=data_collator,
            compute_metrics=compute_metrics
        )

        # Training
        trainer.train()

        # Evaluation on the test split
        test_results = trainer.evaluate(tokenized_datasets["test"])
        f1_score = test_results["eval_overall_f1"]
        f1_scores.append(f1_score)
        print(f"Run {run+1}: F1 Score = {f1_score:.4f}")

    # Mean and standard deviation of the F1 scores for this data fraction
    # (np.std with its default ddof=0, i.e. the population standard deviation)
    mean_f1 = np.mean(f1_scores)
    std_f1 = np.std(f1_scores)
    results_dict[frac_str] = (mean_f1, std_f1)
    print(f"Ergebnisse für {frac_str}: Mean F1 = {mean_f1:.4f}, Std = {std_f1:.4f}")

print("\n--- Zusammenfassung der Ergebnisse ---")
for data_percent, (mean_f1, std_f1) in results_dict.items():
    print(f"{data_percent}: Mean F1 Score = {mean_f1:.4f} ± {std_f1:.4f}")

