---
* Load data

---

In [None]:
import json

# Load the annotated training data.
# The encoding is passed explicitly: the corpus is German text (umlauts, ß) and
# open() would otherwise use the platform default encoding, which is not UTF-8
# on every system. JSON files are expected to be UTF-8.
with open("TRAINING_DATA.json", "r", encoding="utf-8") as f:
    training_data = json.load(f)

# Print the first sample to verify the structure
print(training_data[:1])

---

* Install relevant packages

---

In [None]:
# Install SpaCy
!pip install spacy

# Download the German model
!python -m spacy download de_core_news_lg

# Install lookups package (contains lemma_lookup table and other language resources)
!pip install spacy-lookups-data

---

* Split data into training and development/validation data

---

In [None]:
import json
import random

# Reload the data from disk so this cell always starts from the original order:
# random.shuffle() below reorders the list in place.
with open("TRAINING_DATA.json", "r", encoding="utf-8") as f:
    training_data = json.load(f)

# Shuffle the data for randomness
random.seed(42)  # Set seed for reproducibility
random.shuffle(training_data)

# Split into training (80%) and dev (20%)
split_index = int(0.8 * len(training_data))
train_data = training_data[:split_index]
dev_data = training_data[split_index:]

# Save the splits to separate files.
# ensure_ascii=False writes non-ASCII characters (e.g. umlauts) unescaped, so
# the files are opened as UTF-8 explicitly instead of relying on the platform
# default encoding.
with open("train_data.json", "w", encoding="utf-8") as f:
    json.dump(train_data, f, ensure_ascii=False, indent=4)

with open("dev_data.json", "w", encoding="utf-8") as f:
    json.dump(dev_data, f, ensure_ascii=False, indent=4)

print(f"Data split completed: {len(train_data)} training samples, {len(dev_data)} dev samples")


---

* Convert the training and dev data into DocBin files, so spaCy can process them

---

In [None]:
import spacy
from spacy.tokens import DocBin, MorphAnalysis
import json

# Load your base model (here only its tokenizer and vocab are used)
nlp = spacy.load("de_core_news_lg")


def build_doc_bin(samples, nlp):
    """Convert annotated samples into a DocBin of annotated docs.

    Args:
        samples: Iterable of ``(text, annotations)`` pairs. ``annotations`` is
            a dict whose optional ``"tokens"`` list holds one dict per token
            with the keys ``"lemma"`` and ``"pos"`` and, optionally, ``"tag"``
            and ``"morph"``.
        nlp: Pipeline whose tokenizer and vocab are used to create the docs.

    Returns:
        A DocBin containing one annotated Doc per sample.
    """
    doc_bin = DocBin()
    misaligned = 0

    for text, annotations in samples:
        doc = nlp.make_doc(text)
        spans = annotations.get("tokens", [])

        # Annotations are matched to tokens purely by position, so a differing
        # token count means some annotations may sit on the wrong token.
        if len(spans) != len(doc):
            misaligned += 1

        # zip() stops at the shorter sequence: surplus annotations are ignored
        # and surplus tokens stay unannotated.
        for token, token_data in zip(doc, spans):
            token.lemma_ = token_data["lemma"]
            token.pos_ = token_data["pos"]
            token.tag_ = token_data.get("tag", "")

            # Validate and assign morph
            morph_value = token_data.get("morph", "")
            if morph_value and isinstance(morph_value, str):  # Ensure morph is a valid string
                try:
                    # set_morph() accepts the UD FEATS string directly. Passing the
                    # hash from vocab.morphology.add() as MorphAnalysis features is
                    # not a supported input and would leave the analysis empty.
                    token.set_morph(morph_value)
                except Exception as e:
                    print(f"Skipping invalid morph for token: {token.text} - Error: {e}")
                    token.morph = MorphAnalysis(nlp.vocab)  # Assign empty MorphAnalysis for invalid morphs
            else:
                token.morph = MorphAnalysis(nlp.vocab)

        doc_bin.add(doc)

    if misaligned:
        print(f"Warning: {misaligned} sample(s) have a different number of annotations than tokens")

    return doc_bin


# Create DocBins to store the annotated docs of each split
train_doc_bin = build_doc_bin(train_data, nlp)
dev_doc_bin = build_doc_bin(dev_data, nlp)

# Save to disk
train_doc_bin.to_disk("training_data.spacy")
print("Training data saved to 'training_data.spacy'")

dev_doc_bin.to_disk("dev_data.spacy")
print("Development data saved to 'dev_data.spacy'")

---
* Setup training configuration

---

In [None]:
# Config file defines the pipeline components and training parameters
!python -m spacy init config config.cfg --lang de --pipeline tagger,lemmatizer,morphologizer

---

* Fine-tune model

---

In [None]:
# Apply config file to training and dev data
!python -m spacy train config.cfg --paths.train training_data.spacy --paths.dev dev_data.spacy --output ./output
# takes 4-6min

---

* Load and test new model

---

In [None]:
# The two pipelines under comparison: the newly trained one and the stock German model
fine_tuned_nlp = spacy.load("./output/model-last")
original_nlp = spacy.load("de_core_news_lg")

# Sample input
text = "Hallöchen, hallo, hi und moin, meine Nachbarin hat behauptet, die Erde sei eine Scheibe. Ich würde gerne wissen, was passieren würde, wenn herauskäme, dass die Erde wirklich eine Scheibe ist. Ich meine, was wäre dann? Ändert sich dann alles? Darf ich dann mit meinem Leben weitermachen wie bisher? Oder muss ich mein Leben umstellen? LG und MFG"

# Run the same text through each pipeline
doc_fine_tuned = fine_tuned_nlp(text)
doc_original = original_nlp(text)

# Table layout as (heading, column width) pairs - FT: fine-tuned; OG: original
columns = [
    ("Token", 15),
    ("FT Lemma", 15),
    ("OG Lemma", 15),
    ("FT POS", 15),
    ("OG POS", 15),
    ("FT Tag", 15),
    ("OG Tag", 15),
    ("FT Morph", 15),
    ("OG Morph", 25),
]


def format_row(cells):
    """Left-align each cell to the width of its column and join them into one line."""
    return "".join(f"{cell:<{width}}" for cell, (_, width) in zip(cells, columns))


print(format_row([heading for heading, _ in columns]))
print("=" * 150)

# Tokens of the two docs are paired by position
for ft_token, og_token in zip(doc_fine_tuned, doc_original):
    print(format_row([
        ft_token.text,
        ft_token.lemma_,
        og_token.lemma_,
        ft_token.pos_,
        og_token.pos_,
        ft_token.tag_,
        og_token.tag_,
        str(ft_token.morph),
        str(og_token.morph),
    ]))

In [None]:
# List the component names of each pipeline: first the base model loaded for
# the DocBin conversion, then the newly trained one
for pipeline in (nlp, fine_tuned_nlp):
    print(pipeline.pipe_names)