In [1]:
import spacy

In [2]:
# Create a blank English pipeline and run it on one sentence to get a Doc.
nlp = spacy.blank('en')
doc = nlp("Dr. Strange loves pav bhaji of mumbai as it costs only 2$ per plate.")

# Iterating over the Doc yields its tokens; show each token's text on its own line.
for tok in doc:
    print(tok.text)

Dr.
Strange
loves
pav
bhaji
of
mumbai
as
it
costs
only
2
$
per
plate
.


In [3]:
# Index the Doc to get a single token (here the first one).
doc[0]

Dr.

In [4]:
# Take the second token and show its text as a plain Python string.
token = doc[1]
token.text

'Strange'

In [5]:
# List every attribute and method name available on the token object.
dir(token)

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 'ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_kb_id',
 'ent_kb_id_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_dep',
 'has_extension',
 'has_head',
 'has_morph',
 'has_vector',
 'head',
 'i',
 'idx',
 'iob_strings',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_end',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang',
 'lang_',
 'left_edge',
 ...]

#### Token attributes

In [6]:
# New example sentence; keep a handle on its first token.
doc = nlp("Tony gave two $ to Peter.")
token0 = doc[0]
token0

Tony

In [7]:
# Is the first token ("Tony") flagged as alphabetic?
token0.is_alpha

True

In [8]:
# Is the first token flagged as a digit?
token0.is_digit

False

In [9]:
# Does the first token look like a number?
token0.like_num

False

In [10]:
# Third token of the sentence ("two").
token2 = doc[2]
token2

two

In [11]:
# like_num is True for the spelled-out number "two".
token2.like_num

True

In [12]:
# is_alpha is also True for "two".
token2.is_alpha

True

In [13]:
# Fourth token of the sentence ("$").
token3 = doc[3]
token3

$

In [14]:
# Check whether "$" is flagged as a currency symbol.
token3.is_currency

True

In [15]:
# Print a one-line summary of selected attributes for every token in `doc`.
for tok in doc:
    summary = (
        "index: ", tok.i,
        "is_alpha:", tok.is_alpha,
        "is_punct:", tok.is_punct,
        "like_num:", tok.like_num,
        "is_currency:", tok.is_currency,
    )
    print(tok, "==>", *summary)

Tony ==> index:  0 is_alpha: True is_punct: False like_num: False is_currency: False
gave ==> index:  1 is_alpha: True is_punct: False like_num: False is_currency: False
two ==> index:  2 is_alpha: True is_punct: False like_num: True is_currency: False
$ ==> index:  3 is_alpha: False is_punct: False like_num: False is_currency: True
to ==> index:  4 is_alpha: True is_punct: False like_num: False is_currency: False
Peter ==> index:  5 is_alpha: True is_punct: False like_num: False is_currency: False
. ==> index:  6 is_alpha: False is_punct: True like_num: False is_currency: False


#### Collecting students' email IDs from a student information sheet

In [16]:
# Read the student sheet as a list of lines (tab-separated name / email columns).
# An explicit encoding keeps the result independent of the platform's default.
with open('student.txt', encoding='utf-8') as f:
    text = f.readlines()
text

['Name\t\tEmail\n',
 'Abi\t\tabi@abi.com\n',
 'Atif\t\tatif@mail.com\n',
 'Abdullah\tabdullah@mail.com']

In [17]:
# Join the list of lines into one string (a single space between lines) so it can be passed to `nlp`.
# Note: `text` is rebound from a list to a str here, so running this cell a second
# time would join individual characters instead of lines.
text = " ".join(text)
text

'Name\t\tEmail\n Abi\t\tabi@abi.com\n Atif\t\tatif@mail.com\n Abdullah\tabdullah@mail.com'

In [18]:
# Tokenize the joined sheet and keep the tokens that are flagged as email-like.
doc = nlp(text)
emails = [tok for tok in doc if tok.like_email]
emails

[abi@abi.com, atif@mail.com, abdullah@mail.com]

#### Support in other languages

In [19]:
# Switch to a blank Hindi pipeline and tokenize a Hindi sentence.
nlp = spacy.blank("hi")
doc = nlp("भैया जी! 5000 ₹ उधार थे वो वापस देदो")

# One token per line.
for tok in doc:
    print(tok.text)

भैया
जी
!
5000
₹
उधार
थे
वो
वापस
देदो


In [20]:
# Print the index and a few boolean flags for every token of the Hindi sentence.
# NOTE(review): the recorded output shows is_alpha False for the Hindi words —
# presumably due to combining vowel signs; verify before relying on is_alpha here.
flag_names = ("is_alpha", "is_punct", "like_num", "is_currency")
for tok in doc:
    fields = []
    for flag in flag_names:
        fields.extend((f"{flag}:", getattr(tok, flag)))
    print(tok, "==>", "index: ", tok.i, *fields)

भैया ==> index:  0 is_alpha: False is_punct: False like_num: False is_currency: False
जी ==> index:  1 is_alpha: False is_punct: False like_num: False is_currency: False
! ==> index:  2 is_alpha: False is_punct: True like_num: False is_currency: False
5000 ==> index:  3 is_alpha: False is_punct: False like_num: True is_currency: False
₹ ==> index:  4 is_alpha: False is_punct: False like_num: False is_currency: True
उधार ==> index:  5 is_alpha: False is_punct: False like_num: False is_currency: False
थे ==> index:  6 is_alpha: False is_punct: False like_num: False is_currency: False
वो ==> index:  7 is_alpha: False is_punct: False like_num: False is_currency: False
वापस ==> index:  8 is_alpha: False is_punct: False like_num: False is_currency: False
देदो ==> index:  9 is_alpha: False is_punct: False like_num: False is_currency: False


#### Customizing tokenizer

In [21]:
from spacy.symbols import ORTH

In [22]:
# NOTE(review): this builds a Hindi ('hi') pipeline but the text below is English —
# looks like a leftover from the previous section; confirm whether 'en' was intended.
# Later cells keep using this `nlp`, so changing it may change their outputs.
nlp = spacy.blank('hi')
doc = nlp('gimme double cheese extra large healthy pizza')

# Before any customization, "gimme" comes back as a single token.
tokens = []
for tok in doc:
    tokens.append(tok.text)
print(tokens)

['gimme', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']


In [23]:
# Register a tokenizer special case so that "gimme" is split into "gim" + "me".
gimme_pieces = [{ORTH: "gim"}, {ORTH: "me"}]
nlp.tokenizer.add_special_case("gimme", gimme_pieces)

# Re-tokenize the same sentence to see the effect of the new rule.
doc = nlp("gimme double cheese extra large healthy pizza")
tokens = [tok.text for tok in doc]
tokens

['gim', 'me', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

#### Sentence Tokenization

In [24]:
doc = nlp("Dr. Strange loves pav bhaji of mumbai. Hulk loves chat of delhi")

# The pipeline has no component that sets sentence boundaries yet, so iterating
# `doc.sents` raises ValueError (E030). The error is the point of this cell:
# catch it and show the message so "Restart & Run All" can continue past it.
try:
    for sentence in doc.sents:
        print(sentence)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: [E030] Sentence boundaries unset. You can add the 'sentencizer' component to the pipeline with: `nlp.add_pipe('sentencizer')`. Alternatively, add the dependency parser or sentence recognizer, or set sentence boundaries by setting `doc[i].is_sent_start`.

In [25]:
# Inspect the components of the current pipeline (the recorded output is an empty list).
nlp.pipeline

[]

In [26]:
# Add the 'sentencizer' component to the pipeline manually. The membership check
# makes the cell safe to re-run: adding a component name that is already in the
# pipeline raises an error.
if 'sentencizer' not in nlp.pipe_names:
    nlp.add_pipe('sentencizer')
nlp.pipeline

[('sentencizer', <spacy.pipeline.sentencizer.Sentencizer at 0x23a959f288>)]

In [27]:
# With the sentencizer in the pipeline, `doc.sents` can now be iterated.
# Note: in the recorded output "Dr." is split off as a sentence of its own.
doc = nlp("Dr. Strange loves pav bhaji of mumbai. Hulk loves chat of delhi")
for sent in doc.sents:
    print(sent.text)

Dr.
Strange loves pav bhaji of mumbai.
Hulk loves chat of delhi


### Practice

In [28]:
# Practice input: a paragraph containing several URLs.
# Note: one URL is wrapped across two lines ("http://www.science." / "gov/").
text='''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''

In [29]:
# Start again from a blank English pipeline for the practice exercises.
nlp = spacy.blank('en')

In [30]:
# Empty list that will receive the URL-like tokens.
link = []
# Tokenize the practice paragraph and display the resulting Doc.
doc = nlp(text)
doc


Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.

In [31]:
# Collect every token flagged as URL-like. The list is (re)created here so that
# re-running this cell does not append the same tokens a second time.
link = []
for token in doc:
    if token.like_url:
        link.append(token)
link

[http://www.data.gov/,
 http://www.science,
 http://data.gov.uk/.,
 http://www3.norc.org/gss+website/,
 http://www.europeansocialsurvey.org/.]

In [32]:
# Same filter written as a list comprehension (loop and condition on one line),
# this time keeping the text of each URL-like token instead of the token itself.
websit_link = [token.text for token in doc if token.like_url]
websit_link

['http://www.data.gov/',
 'http://www.science',
 'http://data.gov.uk/.',
 'http://www3.norc.org/gss+website/',
 'http://www.europeansocialsurvey.org/.']

In [33]:
transactions = "Tony gave two $ to Peter, Bruce gave 500 € to Steve"
trans = nlp(transactions)

# Keep the tokens that are either number-like or currency symbols.
result = [tok for tok in trans if tok.like_num or tok.is_currency]
result

[two, $, 500, €]