# Spacy

> Docs -> https://spacy.io/

In [1]:
!pip install spacy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import spacy

In [3]:
nlp= spacy.blank('en') # Component

doc=nlp('''"Let's go to N.Y.!"''')

for token in doc:
  print(token)

"
Let
's
go
to
N.Y.
!
"


In [4]:
nlp= spacy.blank('en') # Component

doc=nlp('Dr. Strange loves pav bhaji and etc. of mumbai as it')

for token in doc:
  print(token)

Dr.
Strange
loves
pav
bhaji
and
etc
.
of
mumbai
as
it


In [4]:
type(nlp)

In [5]:
type(doc)

spacy.tokens.doc.Doc

In [6]:
type(token)

spacy.tokens.token.Token

In [7]:
doc[1]

Strange

In [8]:
doc[1:5]

Strange loves pav bhaji

In [10]:
span=doc[1:5]
type(span)

spacy.tokens.span.Span

In [11]:
doc= nlp("Tony gave two $ to peter")

In [12]:
token0= doc[0]
token0

Tony

In [None]:
dir(token0)

In [14]:
type(token0)

spacy.tokens.token.Token

In [15]:
token0.is_alpha

True

In [18]:
token0.like_num

False

In [19]:
token0.is_digit

False

In [22]:
token2= doc[2]
token2, token2.text

(two, 'two')

In [24]:
token2.is_digit

False

In [25]:
token2.like_num

True

In [27]:
token3= doc[3]
token3, token3.text

($, '$')

In [28]:
token3.is_currency

True

In [29]:
for token in doc:
    print(token, "==>", "index: ", token.i, 
          "is_alpha:", token.is_alpha, 
          "is_punct:", token.is_punct, 
          "like_num:", token.like_num,
          "is_currency:", token.is_currency,
         )

Tony ==> index:  0 is_alpha: True is_punct: False like_num: False is_currency: False
gave ==> index:  1 is_alpha: True is_punct: False like_num: False is_currency: False
two ==> index:  2 is_alpha: True is_punct: False like_num: True is_currency: False
$ ==> index:  3 is_alpha: False is_punct: False like_num: False is_currency: True
to ==> index:  4 is_alpha: True is_punct: False like_num: False is_currency: False
peter ==> index:  5 is_alpha: True is_punct: False like_num: False is_currency: False


In [30]:
with open('/content/students.txt') as f:
  text= f.readlines()

text

['Dayton high school, 8th grade students information\n',
 '\n',
 'Name\tbirth day   \temail\n',
 '-----\t------------\t------\n',
 'Virat   5 June, 1882    virat@kohli.com\n',
 'Maria\t12 April, 2001  maria@sharapova.com\n',
 'Serena  24 June, 1998   serena@williams.com \n',
 'Joe      1 May, 1997    joe@root.com\n',
 '\n',
 '\n',
 '\n']

In [31]:
text= ' '.join(text)
text



In [32]:
doc= nlp(text)
doc

Dayton high school, 8th grade students information
 
 Name	birth day   	email
 -----	------------	------
 Virat   5 June, 1882    virat@kohli.com
 Maria	12 April, 2001  maria@sharapova.com
 Serena  24 June, 1998   serena@williams.com 
 Joe      1 May, 1997    joe@root.com
 
 
 

In [35]:
emails=[]
for token in doc:
  if token.like_email:
    emails.append(token)

emails

[virat@kohli.com, maria@sharapova.com, serena@williams.com, joe@root.com]

## In Hindi Language

In [37]:
nlp= spacy.blank('hi')

doc= nlp('भैया जी! 5000 ₹ उधार थे वो वापस देदो')

In [40]:
# words_hindi=[]

for token in doc:
  print(token, token.is_currency, token.like_num)


भैया False False
जी False False
! False False
5000 False True
₹ True False
उधार False False
थे False False
वो False False
वापस False False
देदो False False


## Customizing tokenization rule

In [42]:
nlp = spacy.blank("en")
doc = nlp("gimme double cheese extra large healthy pizza")
tokens = [token.text for token in doc]
tokens

['gimme', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

In [50]:
from spacy.symbols import ORTH

nlp.tokenizer.add_special_case("gimme",[
    {ORTH: "gim"},
    {ORTH: "me"}
]) # We cannot modift the actual word, otherwise it will give an error.

doc = nlp("gimme double cheese extra large healthy pizza")
tokens = [token.text for token in doc]
tokens

['gim', 'me', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

In [56]:
doc = nlp("Dr. Strange loves pav bhaji of mumbai. Hulk loves chat of delhi")
# nlp.add_pipe('sentencizer')
for sentence in doc.sents:
    print(sentence)

Dr. Strange loves pav bhaji of mumbai.
Hulk loves chat of delhi


In [52]:
nlp.pipe_names

[]

In [53]:
nlp.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x7efc9cce1870>

In [54]:
nlp.pipe_names

['sentencizer']


# Exercise



(1) Think stats is a free book to study statistics (https://greenteapress.com/thinkstats2/thinkstats2.pdf)

This book has references to many websites from where you can download free datasets. You are an NLP engineer working for some company and you want to collect all dataset websites from this book. To keep exercise simple you are given a paragraph from this book and you want to grab all urls from this paragraph using spacy


In [76]:
text='''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''

In [77]:
nlp= spacy.blank('en')
doc= nlp(text)

for token in doc:
  print(token)



Look
for
data
to
help
you
address
the
question
.
Governments
are
good


sources
because
data
from
public
research
is
often
freely
available
.
Good


places
to
start
include
http://www.data.gov/
,
and
http://www.science
.


gov/
,
and
in
the
United
Kingdom
,
http://data.gov.uk/.


Two
of
my
favorite
data
sets
are
the
General
Social
Survey
at
http://www3.norc.org/gss+website/
,


and
the
European
Social
Survey
at
http://www.europeansocialsurvey.org/.




In [78]:
urls=[]
for token in doc:
  if token.like_url:
    urls.append(token)

urls

[http://www.data.gov/,
 http://www.science,
 http://data.gov.uk/.,
 http://www3.norc.org/gss+website/,
 http://www.europeansocialsurvey.org/.]

(2) Extract all money transaction from below sentence along with currency. Output should be,

two $

500 €


In [79]:
transactions = "Tony gave two $ to Peter, Bruce gave 500 € to Steve"

nlp= spacy.blank('en')
doc= nlp(transactions)

for token in doc:
  print(token)

Tony
gave
two
$
to
Peter
,
Bruce
gave
500
€
to
Steve


In [83]:
currency=[]
for token in doc:
  if token.like_num and doc[token.i+1].is_currency:
    currency.append(token)
    print(doc[token.i+1], token)
currency

$ two
€ 500


[two, 500]