wejście: surowy tekst (np. fragment dokumentu),
wyjście: ustrukturyzowany wynik:

	•	tokeny,
	•	zdania,
	•	POS / tagi,
	•	morfologia (przypadek, liczba, rodzaj itd.),
	•	wstępne NER jako „hinty” dla kolejnych warstw.

In [2]:
!pip install spacy
!python -m spacy download pl_core_news_md

   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 772.3/772.3 kB 4.5 MB/s eta 0:00:00
Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Using cached typer_slim-0.20.0-py3-none-any.whl (47 kB)
Using cached wasabi-1.1.3-py3-none-any.whl (27 kB)
Using cached weasel-0.4.3-py3-none-any.whl (50 kB)
Using cached annotated_types-0.7.0-py3-none-any.whl (13 kB)
Downloading blis-1.3.3-cp310-cp310-macosx_11_0_arm64.whl (1.2 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.2/1.2 MB 4.5 MB/s eta 0:00:00
Using cached cloudpathlib-0.23.0-py3-none-any.whl (62 kB)
Using cached confection-0.1.5-py3-none-any.whl (35 kB)
Using cached smart_open-7.5.0-py3-none-any.whl (63 kB)
Using cached typing_inspection-0.4.2-py3-none-any.whl (14 kB)
Downloading wrapt-2.0.1-cp310-cp310-macosx_11_0_arm64.whl (61 kB)
Installing collected packages: wrapt, wasabi, typing-inspection, typer-slim, tqdm, spacy-loggers, spacy-

In [3]:
import spacy
from dataclasses import dataclass
from typing import List, Optional, Any, Dict

# Ładowanie spaCy

In [4]:

# Load the Polish pipeline once at module level for interactive inspection.
try:
    nlp = spacy.load("pl_core_news_md")
except OSError as load_error:
    # An OSError from spacy.load is treated as "model not installed"; re-raise
    # with an actionable hint while keeping the original error as the cause.
    raise RuntimeError(
        "Polish model 'pl_core_news_md' is not installed. "
        "Run: python -m spacy download pl_core_news_md"
    ) from load_error

# Bare expression: the notebook renders the loaded pipeline object.
nlp

<spacy.lang.pl.Polish at 0x12649d1c0>

In [5]:
@dataclass
class TokenInfo:
    """Record describing a single token of a processed document.

    Attributes:
        idx: Token index in the document.
        text: Token text.
        lemma: Lemma of the token.
        pos: Coarse part-of-speech tag.
        tag: Detailed tag.
        morph: Raw morphology string.
        dep: Dependency relation.
        head: Index of the head token.
        is_stop: Stop-word flag.
        is_punct: Punctuation flag.
        whitespace: Trailing whitespace following the token.
    """

    idx: int
    text: str
    lemma: str
    pos: str
    tag: str
    morph: str
    dep: str
    head: int
    is_stop: bool
    is_punct: bool
    whitespace: str


@dataclass
class SentenceInfo:
    """Record describing one sentence of a processed document.

    Attributes:
        sent_id: Sequential number of the sentence.
        text: Sentence text.
        start_char: Character offset where the sentence starts.
        end_char: Character offset where the sentence ends.
        token_indices: Indices of the tokens belonging to this sentence.
    """

    sent_id: int
    text: str
    start_char: int
    end_char: int
    token_indices: List[int]


@dataclass
class EntityHint:
    """Named-entity candidate passed on as a hint to later processing layers.

    Attributes:
        text: Entity text.
        label: Entity label.
        start_char: Character offset where the entity starts.
        end_char: Character offset where the entity ends.
    """

    text: str
    label: str
    start_char: int
    end_char: int


@dataclass
class PreprocessResult:
    """Complete structured output of one preprocessing run.

    Attributes:
        raw_text: The input text.
        tokens: Per-token records.
        sentences: Per-sentence records.
        entities: Named-entity hints.
        meta: Free-form metadata about the run, keyed by string.
    """

    raw_text: str
    tokens: List[TokenInfo]
    sentences: List[SentenceInfo]
    entities: List[EntityHint]
    meta: Dict[str, Any]

TokenInfo:

	•	idx=i – numer tokena w dokumencie (int).
	•	text=token.text – oryginalny tekst tokena.
	•	lemma=token.lemma_ – lemma (forma podstawowa) z pipeline’u spaCy.
	•	pos=token.pos_ – ogólny POS (część mowy, np. NOUN, VERB).
	•	tag=token.tag_ – szczegółowy tag morfosyntaktyczny (dla polskiego modelu są to tagi z tagsetu NKJP, np. SUBST, FIN).
	•	morph=str(token.morph) – cechy morfologiczne sklejone do jednego stringa, np. „Case=Nom|Number=Sing”.
	•	dep=token.dep_ – relacja składniowa (dependency, np. nsubj, obj).
	•	head=token.head.i – indeks tokena, który jest „headem” w drzewie zależności.
	•	is_stop=token.is_stop – czy jest słowem funkcyjnym/stopword.
	•	is_punct=token.is_punct – czy to znak interpunkcyjny.
	•	whitespace=token.whitespace_ – oryginalny trailing whitespace (np. " ", "\n").

In [6]:
class SpacyPreprocessor:
    """Run a spaCy pipeline over raw text and return plain dataclass records.

    Calling an instance yields a ``PreprocessResult`` holding per-token
    information, sentence spans, optional NER hints and summary counters.
    """

    def __init__(
        self,
        model_name: str = "pl_core_news_md",
        use_ner_hints: bool = True,
        disable: Optional[List[str]] = None,
    ) -> None:
        """
        Load the spaCy pipeline used by this preprocessor.

        Parameters
        ----------
        model_name : str
            Name of the spaCy model to load.
        use_ner_hints : bool
            Whether to extract NER hints from spaCy doc.ents.
        disable : list[str] | None
            Optional list of pipeline components to disable for speed.
            Example: ["tagger", "parser", "attribute_ruler", "lemmatizer"]
            If the resulting pipeline sets no sentence boundaries, the
            result contains an empty sentence list.

        Raises
        ------
        RuntimeError
            If ``spacy.load`` raises ``OSError`` for ``model_name``
            (reported as "model not installed").
        """
        self.model_name = model_name
        self.use_ner_hints = use_ner_hints
        # Copy so later changes to the caller's list do not alter our record.
        self.disable = list(disable) if disable else []

        # Every instance loads its own pipeline; nothing is shared with a
        # module-level ``nlp`` object.
        try:
            self.nlp = spacy.load(model_name, disable=self.disable)
        except OSError as e:
            raise RuntimeError(
                f"spaCy model '{model_name}' is not installed. "
                f"Install with: python -m spacy download {model_name}"
            ) from e

    def _tokens_to_info(self, doc: "spacy.tokens.Doc") -> List[TokenInfo]:
        """Convert every token of ``doc`` into a ``TokenInfo`` record."""
        return [
            TokenInfo(
                idx=token.i,
                text=token.text,
                lemma=token.lemma_,
                pos=token.pos_,
                tag=token.tag_,
                # TokenInfo.morph is annotated as str: store the string form
                # instead of the spaCy morphology object.
                morph=str(token.morph),
                dep=token.dep_,
                head=token.head.i,
                is_stop=token.is_stop,
                is_punct=token.is_punct,
                whitespace=token.whitespace_,
            )
            for token in doc
        ]

    def _sentences_to_info(self, doc: "spacy.tokens.Doc") -> List[SentenceInfo]:
        """Convert the sentences of ``doc`` into ``SentenceInfo`` records.

        Returns an empty list when the document carries no sentence
        boundary annotation.
        """
        # ``doc.sents`` raises ValueError when no component set sentence
        # boundaries (e.g. the parser was disabled and no sentence segmenter
        # is in the pipeline), so report "no sentences" instead of crashing.
        if not doc.has_annotation("SENT_START"):
            return []

        return [
            SentenceInfo(
                sent_id=sent_id,
                text=sent.text,
                start_char=sent.start_char,
                end_char=sent.end_char,
                # Span token bounds are half-open: [start, end).
                token_indices=list(range(sent.start, sent.end)),
            )
            for sent_id, sent in enumerate(doc.sents)
        ]

    def _entities_to_hints(self, doc: "spacy.tokens.Doc") -> List[EntityHint]:
        """Convert the entities in ``doc.ents`` into ``EntityHint`` records."""
        return [
            EntityHint(
                text=ent.text,
                label=ent.label_,
                start_char=ent.start_char,
                end_char=ent.end_char,
            )
            for ent in doc.ents
        ]

    def __call__(self, text: str) -> PreprocessResult:
        """
        Run full preprocessing pipeline on raw text.

        Returns a ``PreprocessResult`` with the original text, token and
        sentence records, entity hints (empty when ``use_ner_hints`` is
        False) and a ``meta`` dict with the model name, the NER flag and the
        number of tokens, sentences and entities.
        """
        doc = self.nlp(text)

        tokens = self._tokens_to_info(doc)
        sentences = self._sentences_to_info(doc)
        entities = self._entities_to_hints(doc) if self.use_ner_hints else []

        meta = {
            "model_name": self.model_name,
            "use_ner_hints": self.use_ner_hints,
            "num_tokens": len(tokens),
            "num_sentences": len(sentences),
            "num_entities": len(entities),
        }

        return PreprocessResult(
            raw_text=text,
            tokens=tokens,
            sentences=sentences,
            entities=entities,
            meta=meta,
        )

In [7]:
# Minimal smoke test: two Polish sentences containing a person name, a PESEL
# number and an address. The literal starts and ends with a newline.
example_text = """
Nazywam się Jan Kowalski, mój PESEL to 90010112345.
Mieszkam w Warszawie przy ulicy Długiej 5.
"""

# Default arguments: model "pl_core_news_md" with NER hints enabled.
preprocessor = SpacyPreprocessor()
result = preprocessor(example_text)

# Bare expression so the notebook displays the summary counters.
result.meta


{'model_name': 'pl_core_news_md',
 'use_ner_hints': True,
 'num_tokens': 21,
 'num_sentences': 3,
 'num_entities': 3}

In [8]:
# Print the first 20 tokens as aligned columns: index, text, coarse POS,
# detailed tag and morphology.
for t in result.tokens[:20]:
    print(
        f"{t.idx:>2}: {t.text:<15} POS={t.pos:<5} TAG={t.tag:<8} MORPH={t.morph}"
    )

 0: 
               POS=SPACE TAG=_SP      MORPH=
 1: Nazywam         POS=VERB  TAG=FIN      MORPH=Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin|Voice=Act
 2: się             POS=PRON  TAG=QUB      MORPH=PronType=Prs|Reflex=Yes
 3: Jan             POS=PROPN TAG=SUBST    MORPH=Animacy=Hum|Case=Nom|Gender=Masc|Number=Sing
 4: Kowalski        POS=PROPN TAG=SUBST    MORPH=Animacy=Hum|Case=Nom|Gender=Masc|Number=Sing
 5: ,               POS=PUNCT TAG=INTERP   MORPH=PunctType=Comm
 6: mój             POS=DET   TAG=ADJ      MORPH=Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing|Number[psor]=Sing|Person=1|Poss=Yes|PronType=Prs
 7: PESEL           POS=PROPN TAG=SUBST    MORPH=Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing
 8: to              POS=PRON  TAG=PRED     MORPH=Case=Nom|Gender=Neut|Number=Sing|PronType=Dem
 9: 90010112345     POS=NUM   TAG=NUM      MORPH=Animacy=Inan|Case=Nom|Gender=Masc|NumForm=Digit|NumType=Card|Number=Sing
10: .               POS=PUNCT TAG=SUBST    MO

In [9]:
# Print every detected sentence together with its sequential id.
for s in result.sentences:
    print(f"Sentence {s.sent_id}: {s.text}")

Sentence 0: 

Sentence 1: Nazywam się Jan Kowalski, mój PESEL to 90010112345.

Sentence 2: Mieszkam w Warszawie przy ulicy Długiej 5.



In [None]:
# Print every entity hint as "[label] text (start_char-end_char)".
for e in result.entities:
    print(f"[{e.label}] {e.text} ({e.start_char}-{e.end_char})")

[persName] Jan Kowalski (13-25)
[placeName] Warszawie (64-73)
[geogName] ulicy Długiej (79-92)


In [11]:
# Re-run preprocessing on the first example text with the existing instance.
result = preprocessor(example_text)

In [None]:
# Longer example mixing an organisation, addresses, person names, a phone
# number and a date. The sentences are plain lines inside the triple-quoted
# literal: no embedded quote characters or trailing spaces, which would
# otherwise become tokens of their own and shift all character offsets.
text = """
Reprezentujemy konsorcjum DataSafe, które zajmuje się anonimizacją dokumentów.
Siedziba firmy znajduje się we Wrocławiu przy ulicy Kościuszki 10.
Dane osobowe takich osób jak Jan Kowalski czy Anna Nowak muszą zostać zanonimizowane.
Mój nr telefonu to 123-456-789. Nazywam się Krawiec i urodziłem się 20-10-2024.
"""

preprocessor = SpacyPreprocessor()
result = preprocessor(text)

# Summary counters first, then one line per token and one line per entity.
print(result.meta)
print("--- TOKENS ---")
for t in result.tokens:
    print(t.idx, t.text, t.lemma, t.pos, t.morph)

print("--- ENTITIES ---")
for e in result.entities:
    print(e.label, "->", e.text)

{'model_name': 'pl_core_news_md', 'use_ner_hints': True, 'num_tokens': 61, 'num_sentences': 6, 'num_entities': 5}
--- TOKENS ---
0 
 
 SPACE 
1 Reprezentujemy Reprezentujemy VERB Aspect=Imp|Mood=Ind|Number=Plur|Person=1|Tense=Pres|VerbForm=Fin|Voice=Act
2 konsorcjum konsorcjum NOUN Case=Acc|Gender=Neut|Number=Sing
3 DataSafe DataSafe ADV 
4 , , PUNCT PunctType=Comm
5 które który DET Case=Nom|Gender=Neut|Number=Sing|PronType=Rel
6 zajmuje zajmować VERB Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act
7 się się PRON PronType=Prs|Reflex=Yes
8 anonimizacją anonimizacja NOUN Case=Ins|Gender=Fem|Number=Sing
9 dokumentów dokument NOUN Animacy=Inan|Case=Gen|Gender=Masc|Number=Plur
10 . . PUNCT PunctType=Peri
11 
 
 SPACE 
12 Siedziba Siedziba NOUN Case=Nom|Gender=Fem|Number=Sing
13 firmy firma NOUN Case=Gen|Gender=Fem|Number=Sing
14 znajduje znajdować VERB Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act
15 się się PRON PronType=Prs|Reflex=Ye

In [20]:
# Bare expression: display the raw EntityHint records of the last run.
result.entities

[EntityHint(text='Wrocławiu', label='placeName', start_char=111, end_char=120),
 EntityHint(text='ulicy', label='geogName', start_char=126, end_char=131),
 EntityHint(text='Jan Kowalski', label='persName', start_char=173, end_char=185),
 EntityHint(text='Anna Nowak', label='persName', start_char=190, end_char=200),
 EntityHint(text='Krawiec', label='persName', start_char=274, end_char=281)]