In [1]:
# Source: https://github.com/explosion/spacy-stanza

In [1]:
import stanza
import spacy_stanza
from spacy import displacy
from spacy import Language

In [2]:
# Make sure the English Stanza models are available locally before loading.
stanza.download(lang="en")

# Build the spaCy-compatible `nlp` object backed by the English Stanza pipeline;
# the cells below call it to process text.
nlp = spacy_stanza.load_pipeline(name="en")

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2023-12-06 20:44:34 INFO: Downloading default packages for language: en (English) ...
2023-12-06 20:44:35 INFO: File exists: C:\Users\Franziska\stanza_resources\en\default.zip
2023-12-06 20:44:40 INFO: Finished downloading models and saved to C:\Users\Franziska\stanza_resources.
2023-12-06 20:44:40 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2023-12-06 20:44:41 INFO: Loading these models for language: en (English):
| Processor    | Package             |
--------------------------------------
| tokenize     | combined            |
| pos          | combined_charlm     |
| lemma        | combined_nocharlm   |
| constituency | ptb3-revised_charlm |
| depparse     | combined_charlm     |
| sentiment    | sstplus             |
| ner          | ontonotes_charlm    |

2023-12-06 20:44:41 INFO: Using device: cpu
2023-12-06 20:44:41 INFO: Loading: tokenize
2023-12-06 20:44:42 INFO: Loading: pos
2023-12-06 20:44:42 INFO: Loading: lemma
2023-12-06 20:44:42 INFO: Loading: constituency
2023-12-06 20:44:43 INFO: Loading: depparse
2023-12-06 20:44:43 INFO: Loading: sentiment
2023-12-06 20:44:43 INFO: Loading: ner
2023-12-06 20:44:44 INFO: Done loading processors!


In [10]:
# Parse a two-sentence example and print one line of annotations per token.
doc = nlp("Barack Obama was born in Hawaii. He was elected president in 2008.")
for token in doc:
    # Columns: text, lemma, coarse POS tag, dependency label, entity type and
    # the integer IOB code (token.ent_iob, not the string form token.ent_iob_).
    annotations = (
        token.text,
        token.lemma_,
        token.pos_,
        token.dep_,
        token.ent_type_,
        token.ent_iob,
    )
    print(*annotations)
print("-------------------------------------------------------")
# Entity spans recognised in the whole document.
print(doc.ents)

Barack Barack PROPN nsubj:pass PERSON 3
Obama Obama PROPN flat PERSON 1
was be AUX aux:pass  2
born bear VERB root  2
in in ADP case  2
Hawaii Hawaii PROPN obl GPE 3
. . PUNCT punct  2
He he PRON nsubj:pass  2
was be AUX aux:pass  2
elected elect VERB root  2
president president NOUN xcomp  2
in in ADP case  2
2008 2008 NUM obl DATE 3
. . PUNCT punct  2
-------------------------------------------------------
(Barack Obama, Hawaii, 2008)


In [11]:
# Access spaCy's lexical attributes
print([token.is_stop for token in doc])
print([token.like_num for token in doc])

# Visualize dependencies. displacy.render is the call intended for use inside a
# Jupyter notebook; displacy.serve is the alternative when running outside one.
displacy.render(doc)

# Process texts with nlp.pipe. A dedicated loop variable is used so that the
# parsed example held in `doc` is not overwritten while iterating.
for piped_doc in nlp.pipe(["Lots of texts", "Even more texts", "..."]):
    print(piped_doc.text)

# Combine with your own custom pipeline components
@Language.component("custom_component")
def custom_component(doc):
    """Pass-through pipeline component that reports each document it receives.

    Registered with spaCy under the name "custom_component".

    Args:
        doc: The document handed over by the pipeline.

    Returns:
        The same ``doc`` object, unmodified.
    """
    # Placeholder for real processing: only announce the call.
    message = f"Custom component called: {doc.text}"
    print(message)
    return doc

# Add the component to the pipeline only once: add_pipe raises a ValueError when
# a component with this name is already present, which would break re-running
# this cell against the same `nlp` object.
if "custom_component" not in nlp.pipe_names:
    nlp.add_pipe("custom_component")
doc = nlp("Some text")

# Serialize attributes to a numpy array (one row per token, one column per
# requested attribute)
np_array = doc.to_array(['ORTH', 'LEMMA', 'POS'])
np_array

[False, False, True, False, True, False, False, True, True, False, False, True, False, False]
[False, False, False, False, False, False, False, False, False, False, False, False, True, False]


Lots of texts
Even more texts
...
Custom component called: Some text


array([[14298532990736973729,  7000492816108906599,                   90],
       [15099781594404091470, 15099781594404091470,                   92]],
      dtype=uint64)