In [1]:
import spacy

In [2]:
nlp = spacy.blank("en")
doc = nlp("Dr. Strange loves pav bhaji of mumbai as it costs only 2$ per plate.")
for token in doc:
    print(token)

Dr.
Strange
loves
pav
bhaji
of
mumbai
as
it
costs
only
2
$
per
plate
.


In [3]:
doc[0]

Dr.

In [4]:
token = doc[1]
token.text

'Strange'

In [5]:
span = doc[0:5]
span

Dr. Strange loves pav bhaji

In [6]:
doc = nlp("Tony gave two $ to Peter.")

In [7]:
token0 = doc[0]
token0

Tony

In [8]:
token.is_alpha

True

In [9]:
token0.like_num

False

In [10]:
for token in doc:
    print(token, "==>", "index: ", token.i, "is_alpha:", token.is_alpha, 
          "is_punct:", token.is_punct, 
          "like_num:", token.like_num,
          "is_currency:", token.is_currency,
         )

Tony ==> index:  0 is_alpha: True is_punct: False like_num: False is_currency: False
gave ==> index:  1 is_alpha: True is_punct: False like_num: False is_currency: False
two ==> index:  2 is_alpha: True is_punct: False like_num: True is_currency: False
$ ==> index:  3 is_alpha: False is_punct: False like_num: False is_currency: True
to ==> index:  4 is_alpha: True is_punct: False like_num: False is_currency: False
Peter ==> index:  5 is_alpha: True is_punct: False like_num: False is_currency: False
. ==> index:  6 is_alpha: False is_punct: True like_num: False is_currency: False


In [11]:
with open("students.txt") as f:
    text = f.readlines()
text

['Dayton high school, 8th grade students information\n',
 '\n',
 'Name\tbirth day   \temail\n',
 '-----\t------------\t------\n',
 'Virat   5 June, 1882    virat@kohli.com\n',
 'Maria\t12 April, 2001  maria@sharapova.com\n',
 'Serena  24 June, 1998   serena@williams.com \n',
 'Joe      1 May, 1997    joe@root.com\n',
 '\n',
 '\n',
 '\n']

In [12]:
text = " ".join(text)
text



In [13]:
doc = nlp(text)
emails = []
for token in doc:
    if token.like_email:
        emails.append(token.text)
emails

['virat@kohli.com',
 'maria@sharapova.com',
 'serena@williams.com',
 'joe@root.com']

In [14]:
nlp = spacy.blank("hi")
doc = nlp("भैया जी! 5000 ₹ उधार थे वो वापस देदो")
for token in doc:
    print(token, token.is_currency)

भैया False
जी False
! False
5000 False
₹ True
उधार False
थे False
वो False
वापस False
देदो False


In [15]:
from spacy.symbols import ORTH

nlp = spacy.blank("en")
doc = nlp("gimme double cheese extra large healthy pizza")
tokens = [token.text for token in doc]
tokens

['gimme', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

In [16]:
nlp.tokenizer.add_special_case("gimme", [
    {ORTH: "gim"},
    {ORTH: "me"},
])
doc = nlp("gimme double cheese extra large healthy pizza")
tokens = [token.text for token in doc]
tokens

['gim', 'me', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

In [17]:
nlp.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x29960499dd0>

In [18]:
doc = nlp("Dr. Strange loves pav bhaji of mumbai. Hulk loves chat of delhi")
for sentence in doc.sents:
    print(sentence)

Dr. Strange loves pav bhaji of mumbai.
Hulk loves chat of delhi


In [19]:
text='''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''
text

'\nLook for data to help you address the question. Governments are good\nsources because data from public research is often freely available. Good\nplaces to start include http://www.data.gov/, and http://www.science.\ngov/, and in the United Kingdom, http://data.gov.uk/.\nTwo of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, \nand the European Social Survey at http://www.europeansocialsurvey.org/.\n'

In [20]:
text = " ".join(text)
text

'\n L o o k   f o r   d a t a   t o   h e l p   y o u   a d d r e s s   t h e   q u e s t i o n .   G o v e r n m e n t s   a r e   g o o d \n s o u r c e s   b e c a u s e   d a t a   f r o m   p u b l i c   r e s e a r c h   i s   o f t e n   f r e e l y   a v a i l a b l e .   G o o d \n p l a c e s   t o   s t a r t   i n c l u d e   h t t p : / / w w w . d a t a . g o v / ,   a n d   h t t p : / / w w w . s c i e n c e . \n g o v / ,   a n d   i n   t h e   U n i t e d   K i n g d o m ,   h t t p : / / d a t a . g o v . u k / . \n T w o   o f   m y   f a v o r i t e   d a t a   s e t s   a r e   t h e   G e n e r a l   S o c i a l   S u r v e y   a t   h t t p : / / w w w 3 . n o r c . o r g / g s s + w e b s i t e / ,   \n a n d   t h e   E u r o p e a n   S o c i a l   S u r v e y   a t   h t t p : / / w w w . e u r o p e a n s o c i a l s u r v e y . o r g / . \n'

In [21]:
doc = nlp(text)
urls = []
for token in doc:
    if token.like_url:
        urls.append(token.text)
urls

[]

In [22]:
text='''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''
text

'\nLook for data to help you address the question. Governments are good\nsources because data from public research is often freely available. Good\nplaces to start include http://www.data.gov/, and http://www.science.\ngov/, and in the United Kingdom, http://data.gov.uk/.\nTwo of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, \nand the European Social Survey at http://www.europeansocialsurvey.org/.\n'

In [23]:
doc = nlp(text)
urls = []
for token in doc:
    if token.like_url:
        urls.append(token.text)
urls

['http://www.data.gov/',
 'http://www.science',
 'http://data.gov.uk/.',
 'http://www3.norc.org/gss+website/',
 'http://www.europeansocialsurvey.org/.']

In [25]:
transactions = "Tony gave two $ to Peter, Bruce gave 500 € to Steve"
doc = nlp(transactions)
cur = []
for token in doc:
    if token.like_num and doc[token.i+1].is_currency:
        print(token.text, doc[token.i+1].text)  

two $
500 €
