In [1]:
import spacy

In [2]:
nlp = spacy.blank("en")

In [4]:
doc = nlp("Dr. is saying something. Pay attention.")

for token in doc:
  print(token)

Dr.
is
saying
something
.
Pay
attention
.


In [5]:
type(nlp)

spacy.lang.en.English

In [6]:
type(doc)

spacy.tokens.doc.Doc

In [9]:
doc = nlp("Tony gave two $ to Peter")

token0 = doc[0]

In [10]:
token0

Tony

In [11]:
dir(token0)

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 'ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_kb_id',
 'ent_kb_id_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_dep',
 'has_extension',
 'has_head',
 'has_morph',
 'has_vector',
 'head',
 'i',
 'idx',
 'iob_strings',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_end',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang',
 'lang_',
 'le

In [13]:
token0.is_alpha

True

In [14]:
token0.like_num

False

In [15]:
token2 = doc[2]

In [16]:
token2

two

In [17]:
token2.text

'two'

In [18]:
token2.like_num

True

In [20]:
token3 = doc[3]

In [21]:
token3

$

In [22]:
token3.is_currency

True

In [23]:
for token in doc:
  print(token, "-->", "index:", token.i, "is_alpha:", token.is_alpha, "is_num:", token.like_num, "is_currency:", token.is_currency)

Tony --> index: 0 is_alpha: True is_num: False is_currency: False
gave --> index: 1 is_alpha: True is_num: False is_currency: False
two --> index: 2 is_alpha: True is_num: True is_currency: False
$ --> index: 3 is_alpha: False is_num: False is_currency: True
to --> index: 4 is_alpha: True is_num: False is_currency: False
Peter --> index: 5 is_alpha: True is_num: False is_currency: False


In [25]:
text = 'Tomorrow will be a holiday. Email will be sent to abc213@akj.com because \
abc111@gmail.com is old'

doc = nlp(text)

for token in doc:
  if token.like_email:
    print("Email id:", token)

Email id: abc213@akj.com
Email id: abc111@gmail.com


In [26]:
doc = nlp("gimme double cheese extra large burger")

tokens = [token.text for token in doc]
tokens

['gimme', 'double', 'cheese', 'extra', 'large', 'burger']

In [28]:
from spacy.symbols import ORTH

nlp.tokenizer.add_special_case("gimme", [
    {ORTH: "give"},
    {ORTH: "me"}
])

ValueError: ignored

In [29]:
nlp.tokenizer.add_special_case("gimme", [
    {ORTH: "gim"},
    {ORTH: "me"}
])

In [33]:
doc = nlp("gimme burger yo")
tokens = [token.text for token in doc]
tokens

['gim', 'me', 'burger', 'yo']

In [34]:
doc = nlp("Dr. is coming to play. He will enjoy this game")

for sentence in doc.sents:
  print(sentence)

ValueError: ignored

In [35]:
nlp.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x7f3bd344d140>

In [38]:
doc = nlp("Dr. Strange is coming to play. He will enjoy this game")

for sentence in doc.sents:
  print(sentence)

Dr. Strange is coming to play.
He will enjoy this game


In [42]:
text='''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''

doc = nlp(text)

for token in doc:
  if token.like_url:
    print("URL:", token.text)

URL: http://www.data.gov/
URL: http://www.science
URL: http://data.gov.uk/.
URL: http://www3.norc.org/gss+website/
URL: http://www.europeansocialsurvey.org/.


In [45]:
transactions = "Tony gave two $ to Peter, Bruce gave 500 € to Steve"

doc = nlp(transactions)
currencies = [(token.text, token.i) for token in doc if token.is_currency]

In [46]:
currencies

[('$', 3), ('€', 10)]

In [48]:
for currency, index in currencies:
  print("Transaction :", doc[index - 1], currency)

Transaction : two $
Transaction : 500 €
