wejście: surowy tekst (np. fragment dokumentu),
wyjście: ustrukturyzowany wynik:

	•	tokeny,
	•	zdania,
	•	POS / tagi,
	•	morfologia (przypadek, liczba, rodzaj itd.),
	•	wstępne NER jako „hinty” dla kolejnych warstw.

In [1]:
!pip install spacy
!python -m spacy download pl_core_news_md

Collecting spacy
  Downloading spacy-3.8.11-cp312-cp312-macosx_11_0_arm64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.15-cp312-cp312-macosx_11_0_arm64.whl.metadata (2.3 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.13-cp312-cp312-macosx_11_0_arm64.whl.metadata (9.7 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.12-cp312-cp312-macosx_11_0_arm64.whl.metadata (2.5 kB)
Collecting thinc<8.4.0,>=8.3.4 (from spacy)
  Downloading thinc-8.3.10-cp312-cp312-macosx_11_0_arm64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Downloading wasabi-1.1.3-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)
  Do

In [2]:
import spacy
from dataclasses import dataclass
from typing import List, Optional, Any, Dict

# Ładowanie spaCy

In [3]:

# Load the Polish pipeline once at notebook level as an installation check.
# An OSError from spacy.load is re-raised as a RuntimeError whose message
# carries the download command, with the original error chained as the cause.
# NOTE: SpacyPreprocessor below calls spacy.load itself and does not reuse
# this module-level `nlp` object.
try:
    nlp = spacy.load("pl_core_news_md")
except OSError as e:
    raise RuntimeError(
        "Polish model 'pl_core_news_md' is not installed. "
        "Run: python -m spacy download pl_core_news_md"
    ) from e

# Bare expression: as the last statement of a notebook cell it displays the
# loaded pipeline object.
nlp

<spacy.lang.pl.Polish at 0x107932ba0>

In [4]:
@dataclass
class TokenInfo:
    """Per-token annotations copied out of a spaCy token into plain fields."""
    idx: int                 # position of the token within the document
    text: str                # verbatim token text
    lemma: str               # lemma (spaCy `lemma_`)
    pos: str                 # coarse POS tag (spaCy `pos_`, e.g. VERB, PROPN)
    tag: str                 # fine-grained tag (spaCy `tag_`, e.g. SUBST, FIN)
    morph: str               # morphological features, e.g. "Case=Nom|Number=Sing"
    dep: str                 # dependency relation label (spaCy `dep_`)
    head: int                # document index of the syntactic head token
    is_stop: bool            # spaCy stop-word flag
    is_punct: bool           # spaCy punctuation flag
    whitespace: str          # trailing whitespace after the token (spaCy `whitespace_`)


@dataclass
class SentenceInfo:
    """One sentence span: its text, character offsets and member tokens."""
    sent_id: int              # running sentence number, starting at 0
    text: str                 # sentence text as found in the document
    start_char: int           # character offset where the sentence starts
    end_char: int             # character offset where the sentence ends
    token_indices: List[int]  # document indices of tokens belonging to this sentence


@dataclass
class EntityHint:
    """A named-entity span reported by spaCy, kept as a hint for later layers."""
    text: str         # entity surface text
    label: str        # entity label (e.g. persName, placeName, geogName)
    start_char: int   # character offset where the entity starts
    end_char: int     # character offset where the entity ends


@dataclass
class PreprocessResult:
    """Structured output of one preprocessing run over a raw text."""
    raw_text: str                   # the input text, unchanged
    tokens: List[TokenInfo]         # one entry per token, in document order
    sentences: List[SentenceInfo]   # one entry per sentence, in document order
    entities: List[EntityHint]      # NER hints; empty when hints are switched off
    meta: Dict[str, Any]            # run metadata (model name, flags, counts)

In [8]:
class SpacyPreprocessor:
    """Run a spaCy pipeline over raw text and return plain dataclass results.

    The callable instance turns a string into a ``PreprocessResult`` holding
    tokens, sentences, optional NER hints and a small metadata dict.
    """

    def __init__(
        self,
        model_name: str = "pl_core_news_md",
        use_ner_hints: bool = True,
        disable: Optional[List[str]] = None,
    ) -> None:
        """
        Load the spaCy pipeline used by this preprocessor.

        Parameters
        ----------
        model_name : str
            Name of the spaCy model to load.
        use_ner_hints : bool
            Whether to extract NER hints from spaCy doc.ents.
        disable : list[str] | None
            Optional list of pipeline components to disable for speed.
            Example: ["tagger", "parser", "attribute_ruler", "lemmatizer"]
            If no remaining component sets sentence boundaries (e.g. the
            parser is disabled), the whole text is reported as one sentence.

        Raises
        ------
        RuntimeError
            If the model package cannot be loaded (wraps spaCy's OSError).
        """
        self.model_name = model_name
        self.use_ner_hints = use_ner_hints
        # Copy so that later mutation of the caller's list cannot affect us.
        self.disable = list(disable) if disable else []

        # Each instance loads its own pipeline; a missing model surfaces as
        # OSError, which is converted into an actionable RuntimeError.
        try:
            self.nlp = spacy.load(model_name, disable=self.disable)
        except OSError as e:
            raise RuntimeError(
                f"spaCy model '{model_name}' is not installed. "
                f"Install with: python -m spacy download {model_name}"
            ) from e

    def _tokens_to_info(self, doc: "spacy.tokens.Doc") -> List[TokenInfo]:
        """Convert every token of ``doc`` into a ``TokenInfo`` record."""
        return [
            TokenInfo(
                idx=i,
                text=token.text,
                lemma=token.lemma_,
                pos=token.pos_,
                tag=token.tag_,
                # token.morph is a MorphAnalysis object; TokenInfo.morph is
                # declared as str, so store its "Feat=Val|..." rendering.
                morph=str(token.morph),
                dep=token.dep_,
                head=token.head.i,
                is_stop=token.is_stop,
                is_punct=token.is_punct,
                whitespace=token.whitespace_,
            )
            for i, token in enumerate(doc)
        ]

    def _sentences_to_info(self, doc: "spacy.tokens.Doc") -> List[SentenceInfo]:
        """Convert the sentences of ``doc`` into ``SentenceInfo`` records.

        When the pipeline produced no sentence boundaries, the whole document
        is returned as a single sentence instead of raising.
        """
        # doc.sents raises ValueError when sentence boundaries are unset,
        # which happens if the boundary-setting components were disabled.
        if doc.has_annotation("SENT_START"):
            spans = doc.sents
        else:
            spans = [doc[:]]

        return [
            SentenceInfo(
                sent_id=sent_id,
                text=sent.text,
                start_char=sent.start_char,
                end_char=sent.end_char,
                token_indices=list(range(sent.start, sent.end)),
            )
            for sent_id, sent in enumerate(spans)
        ]

    def _entities_to_hints(self, doc: "spacy.tokens.Doc") -> List[EntityHint]:
        """Convert the entity spans in ``doc.ents`` into ``EntityHint`` records."""
        return [
            EntityHint(
                text=ent.text,
                label=ent.label_,
                start_char=ent.start_char,
                end_char=ent.end_char,
            )
            for ent in doc.ents
        ]

    def __call__(self, text: str) -> PreprocessResult:
        """
        Run full preprocessing pipeline on raw text.

        Returns a ``PreprocessResult`` with tokens, sentences, entity hints
        (empty when ``use_ner_hints`` is False) and a metadata dict holding
        the model name, the hint flag and the three counts.
        """
        doc = self.nlp(text)

        tokens = self._tokens_to_info(doc)
        sentences = self._sentences_to_info(doc)
        entities = self._entities_to_hints(doc) if self.use_ner_hints else []

        meta = {
            "model_name": self.model_name,
            "use_ner_hints": self.use_ner_hints,
            "num_tokens": len(tokens),
            "num_sentences": len(sentences),
            "num_entities": len(entities),
        }

        return PreprocessResult(
            raw_text=text,
            tokens=tokens,
            sentences=sentences,
            entities=entities,
            meta=meta,
        )

In [11]:
# Sample Polish input with a person name, a PESEL number and a street address.
# The literal starts with a newline; in the recorded outputs below it appears
# as a leading SPACE token and as a whitespace-only "Sentence 0".
example_text = """
Nazywam się Jan Kowalski, mój PESEL to 90010112345.
Mieszkam w Warszawie przy ulicy Długiej 5.
"""

# Build a preprocessor with the default settings and run it on the sample.
preprocessor = SpacyPreprocessor()
result = preprocessor(example_text)

# Bare expression: displays the metadata dict as the cell output.
result.meta


{'model_name': 'pl_core_news_md',
 'use_ner_hints': True,
 'num_tokens': 21,
 'num_sentences': 3,
 'num_entities': 3}

In [12]:
# Print the first 20 tokens as aligned columns: index, text, coarse POS,
# fine-grained tag and morphological features.
for t in result.tokens[:20]:
    print(
        f"{t.idx:>2}: {t.text:<15} POS={t.pos:<5} TAG={t.tag:<8} MORPH={t.morph}"
    )

 0: 
               POS=SPACE TAG=_SP      MORPH=
 1: Nazywam         POS=VERB  TAG=FIN      MORPH=Aspect=Imp|Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin|Voice=Act
 2: się             POS=PRON  TAG=QUB      MORPH=PronType=Prs|Reflex=Yes
 3: Jan             POS=PROPN TAG=SUBST    MORPH=Animacy=Hum|Case=Nom|Gender=Masc|Number=Sing
 4: Kowalski        POS=PROPN TAG=SUBST    MORPH=Animacy=Hum|Case=Nom|Gender=Masc|Number=Sing
 5: ,               POS=PUNCT TAG=INTERP   MORPH=PunctType=Comm
 6: mój             POS=DET   TAG=ADJ      MORPH=Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing|Number[psor]=Sing|Person=1|Poss=Yes|PronType=Prs
 7: PESEL           POS=PROPN TAG=SUBST    MORPH=Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing
 8: to              POS=PRON  TAG=PRED     MORPH=Case=Nom|Gender=Neut|Number=Sing|PronType=Dem
 9: 90010112345     POS=NUM   TAG=NUM      MORPH=Animacy=Inan|Case=Nom|Gender=Masc|NumForm=Digit|NumType=Card|Number=Sing
10: .               POS=PUNCT TAG=SUBST    MO

In [13]:
# Print every detected sentence with its running id.
for s in result.sentences:
    print(f"Sentence {s.sent_id}: {s.text}")

Sentence 0: 

Sentence 1: Nazywam się Jan Kowalski, mój PESEL to 90010112345.

Sentence 2: Mieszkam w Warszawie przy ulicy Długiej 5.



In [14]:
# Print every NER hint as "[label] text (start_char-end_char)".
for e in result.entities:
    print(f"[{e.label}] {e.text} ({e.start_char}-{e.end_char})")

[persName] Jan Kowalski (13-25)
[placeName] Warszawie (64-73)
[geogName] ulicy Długiej (79-92)
