In [1]:
import re
import string
import spacy

In [2]:
# Split on runs of non-word characters (punctuation as well as whitespace)
# to pick out the individual words.
# The text ends with '!', so the split leaves a trailing empty string.

text = "I'have watched Harry.Potter!"
words = re.compile(r'\W+').split(text)
print(words)

['I', 'have', 'watched', 'Harry', 'Potter', '']


In [3]:
# Tokenize on whitespace only
words = text.split()

# Character class matching any single ASCII punctuation mark
re_punc = re.compile('[{}]'.format(re.escape(string.punctuation)))
# Delete punctuation from inside every token
stripped = [re_punc.sub('', token) for token in words]
print(stripped)

# Negated class: matches anything NOT in string.printable, so the
# substitution strips non-printable characters (this text contains none)
re_print = re.compile('[^{}]'.format(re.escape(string.printable)))
result = [re_print.sub('', token) for token in words]
print(result)

['Ihave', 'watched', 'HarryPotter']
["I'have", 'watched', 'Harry.Potter!']


In [4]:
# Case normalization

# Whitespace-tokenize the text, then lower-case every token
words = list(map(str.lower, text.split()))
print(words)


["i'have", 'watched', 'harry.potter!']


# Working with spaCy

In [5]:
# Load spaCy's small English pipeline (spaCy itself is imported in the first cell)
nlp = spacy.load('en_core_web_sm')

# Named `sample_text`, not `string`: rebinding `string` would shadow the stdlib
# module imported at the top and break `string.punctuation` / `string.printable`
# if the earlier regex cell is re-run.
sample_text = '"Harry.Potter! a fiction movie is very \'interesting"'
print(sample_text)
doc = nlp(sample_text)
# Show the token boundaries by printing ' | ' after every token
for token in doc:
    print(token.text, end=' | ')


"Harry.Potter! a fiction movie is very 'interesting"
" | Harry | . | Potter | ! | a | fiction | movie | is | very | ' | interesting | " | 

In [6]:
# The contraction "We're" is split into two tokens and each '!' is its own token
doc2 = nlp("We're here to help! you!")
for tok in doc2:
    print(tok.text)


We
're
here
to
help
!
you
!


In [7]:
# The currency symbol becomes its own token; the decimal amount stays in one piece
doc3 = nlp('One Software is of Cost $10.30')
for tok in doc3:
    print(tok.text)


One
Software
is
of
Cost
$
10.30


In [8]:
doc4 = nlp("Let us work on forest fire prediction.")
for tok in doc4:
    print(tok.text)

# Number of tokens in this Doc
print(len(doc4))
# Number of entries in the vocabulary object attached to the Doc
len(doc4.vocab)

Let
us
work
on
forest
fire
prediction
.
8


794

In [9]:
doc5 = nlp('It is better to give than to receive.')
# Indexing is zero-based, so position 2 is the third token
doc5[2]

better

In [10]:
# Retrieve three tokens from the middle (positions 2, 3 and 4; end index 5 is excluded):
doc5[2:5]

better to give

In [11]:
# Retrieve the last four tokens (the final '.' is a token of its own and counts as one):
doc5[-4:]

than to receive.

In [12]:
doc8 = nlp('Apple to build a factory in U.S for $6 million')

# Emit every token followed by ' | ', all on one line
print(''.join(tok.text + ' | ' for tok in doc8), end='')

print('\n----')

Apple | to | build | a | factory | in | U.S | for | $ | 6 | million | 
----


In [13]:
# One line per named entity: its text, its label, and spaCy's description of that label
for entity in doc8.ents:
    print(f'{entity.text} - {entity.label_} - {spacy.explain(entity.label_)}')

# Entity count, shown as the cell's value
len(doc8.ents)

Apple - ORG - Companies, agencies, institutions, etc.
U.S - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


3

In [14]:

doc9 = nlp("Apple to build a factory in U.S for $6 million.")

# Print the text of every noun chunk spaCy finds in the sentence
for noun_phrase in doc9.noun_chunks:
    print(noun_phrase.text)

Apple
a factory
U.S


In [15]:
doc10 = nlp("Karachi City of Light and Lahore whole hearted People.")

# Print the text of every noun chunk spaCy finds in the sentence
for noun_phrase in doc10.noun_chunks:
    print(noun_phrase.text)

Karachi City
Light
Lahore
People


In [16]:
doc11 = nlp("Microsoft work for $6 million.")

# Print the text of every noun chunk spaCy finds in the sentence
for noun_phrase in doc11.noun_chunks:
    print(noun_phrase.text)

Microsoft


In [17]:
from spacy import displacy

# Render the dependency parse inline in the notebook
doc = nlp('Apple is going to build a U.K. factory for $6 million.')
displacy.render(
    doc,
    style='dep',
    jupyter=True,
    options={'distance': 110},
)


In [18]:
# Render the sentence inline with its named entities highlighted
doc = nlp('Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.')
displacy.render(
    doc,
    style='ent',
    jupyter=True,
)
