In [5]:
# Import Spacy
#!pip install spacy
import spacy



In [6]:
# spacy.blank("en") builds a blank English pipeline; by convention the
# resulting processing object is called nlp.
nlp = spacy.blank("en")

# Calling nlp on a string returns a Doc (short for "document"): a structured
# view of the text from which the original text can still be recovered.
doc = nlp("Hello world!")

# A Doc is iterable and yields its tokens in order; print each token's text.
for tok in doc:
    print(tok.text)

Hello
world
!


In [7]:
# Index into the Doc to fetch a single token (here: position 1),
# then read its string through the .text attribute.
second_token = doc[1]
print(second_token.text)

world


In [10]:
# Both a Token and the whole Doc expose their string via the .text attribute.
# First the token at position 0, then the full document text.
print(doc[0].text)
print(doc.text)

Hello
Hello world!


In [11]:
from IPython.display import Image

# Reference the Doc/Span diagram by URL (a path relative to the notebook);
# leaving the object as the cell's last expression renders it inline.
span_diagram = Image(url="doc_span.png")
span_diagram

A Span object is a slice of the document consisting of one or more tokens. It's only a view of the Doc and doesn't contain any data itself.

To create a span, you can use Python's slice notation. For example, 1:3 will create a slice starting from the token at position 1, up to – but not including! – the token at position 3.

In [13]:
# Slicing a Doc with start:stop returns a Span covering tokens start..stop-1.
span = doc[1:3]
print(span.text)

# Switch to a longer sentence for the remaining slicing examples.
doc = nlp("I like tree kangaroos and narwhals.")

# Tokens 2-3: "tree kangaroos"
tree_kangaroos = doc[2:4]
# Tokens 2-5: "tree kangaroos and narwhals" (the closing "." is excluded)
tree_kangaroos_and_narwhals = doc[2:6]

print(tree_kangaroos.text)
print(tree_kangaroos_and_narwhals.text)

world!


Lexical Attributes


Lexical attributes refer to the entry in the vocabulary and don't depend on the token's context.

Here you can see some of the available token attributes:

i is the index of the token within the parent document.

text returns the token text.

is_alpha, is_punct and like_num return boolean values indicating whether the token consists of alphabetic characters, whether it's punctuation or whether it resembles a number. For example, like_num is True both for the token "10" – one, zero – and for the word "ten" – T, E, N.

In [14]:
# Rebind doc to a short sentence containing a word, a currency symbol,
# a digit and punctuation, for the lexical-attribute examples that follow.
doc = nlp("It costs $5.")

In [17]:
# Each list holds one value per token of doc, in token order.
# Position of every token in the document, followed by the token strings.
print("Index: ", [token.i for token in doc])
print("Text: ", [token.text for token in doc])

# Boolean lexical attributes: alphabetic characters only, punctuation,
# and "resembles a number".
print("is_alpha : ", [token.is_alpha for token in doc])
print("is_punct: ", [token.is_punct for token in doc])
print("like_num: ", [token.like_num for token in doc])

Index:  [0, 1, 2, 3, 4]
Index:  ['It', 'costs', '$', '5', '.']
is_alpha :  [True, True, False, False, False]
is_punct:  [False, False, False, False, True]
like_num:  [False, False, False, True, False]


In [None]:
import spacy

nlp = spacy.blank("en")

# Process the text
doc = nlp(
    "In 1990, more than 60% of people in East Asia were in extreme poverty. "
    "Now less than 4% are."
)

# Report every number-like token that is immediately followed by "%".
for token in doc:
    # A number in the final position has no following token; checking the
    # bound first keeps doc[token.i + 1] from raising IndexError on texts
    # that end in a number.
    if token.like_num and token.i + 1 < len(doc):
        # Get the next token in the document
        next_token = doc[token.i + 1]
        # Check if the next token's text equals "%"
        if next_token.text == "%":
            print("Percentage found:", token.text)