In [1]:
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()


In [2]:
doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million')

In [3]:
for token in doc:
    print(f'{token.text:{10}},{token.pos_:{10}},{token.dep_:{12}},{token.tag_:{10}},{spacy.explain(token.tag_)}')

Tesla     ,PROPN     ,nsubj       ,NNP       ,noun, proper singular
is        ,AUX       ,aux         ,VBZ       ,verb, 3rd person singular present
looking   ,VERB      ,ROOT        ,VBG       ,verb, gerund or present participle
at        ,ADP       ,prep        ,IN        ,conjunction, subordinating or preposition
buying    ,VERB      ,pcomp       ,VBG       ,verb, gerund or present participle
U.S.      ,PROPN     ,compound    ,NNP       ,noun, proper singular
startup   ,NOUN      ,dobj        ,NN        ,noun, singular or mass
for       ,ADP       ,prep        ,IN        ,conjunction, subordinating or preposition
$         ,SYM       ,quantmod    ,$         ,symbol, currency
6         ,NUM       ,compound    ,CD        ,cardinal number
million   ,NUM       ,pobj        ,CD        ,cardinal number


In [4]:
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x7fe9d923f110>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x7fe9e89212f0>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7fe9f847bec0>)]

In [5]:
doc4 = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')

In [6]:
for sentence in doc4.sents:
    print(sentence)

This is the first sentence.
This is another sentence.
This is the last sentence.


# TOKENIZATION:
    Process of breaking up the original text in the component pieces

In [7]:
mystring = '"We\'re moving to L.A.!"'
print(mystring)

"We're moving to L.A.!"


In [8]:
doc = nlp(mystring)

In [9]:
for token in doc:
     print(f'{token.text:{10}},{token.pos_:{10}},{token.dep_:{12}},{token.tag_:{10}},{spacy.explain(token.tag_)}')

"         ,PUNCT     ,punct       ,``        ,opening quotation mark
We        ,PRON      ,nsubj       ,PRP       ,pronoun, personal
're       ,AUX       ,aux         ,VBP       ,verb, non-3rd person singular present
moving    ,VERB      ,ROOT        ,VBG       ,verb, gerund or present participle
to        ,ADP       ,prep        ,IN        ,conjunction, subordinating or preposition
L.A.      ,PROPN     ,pobj        ,NNP       ,noun, proper singular
!         ,PUNCT     ,punct       ,.         ,punctuation mark, sentence closer
"         ,PUNCT     ,punct       ,''        ,closing quotation mark


In [10]:
doc2 = nlp(u"We're here to help! Send snail-mail, email support@oursite.com or visit us at http://www.oursite.com!")


In [11]:
for t in doc2:
    print(f'{t.text:{10}},{t.pos_:{10}},{t.dep_:{12}},{t.tag_:{10}},{spacy.explain(t.tag_)}')

We        ,PRON      ,nsubj       ,PRP       ,pronoun, personal
're       ,AUX       ,ROOT        ,VBP       ,verb, non-3rd person singular present
here      ,ADV       ,advmod      ,RB        ,adverb
to        ,PART      ,aux         ,TO        ,infinitival "to"
help      ,VERB      ,advcl       ,VB        ,verb, base form
!         ,PUNCT     ,punct       ,.         ,punctuation mark, sentence closer
Send      ,VERB      ,ROOT        ,VB        ,verb, base form
snail     ,NOUN      ,compound    ,NN        ,noun, singular or mass
-         ,PUNCT     ,punct       ,HYPH      ,punctuation mark, hyphen
mail      ,NOUN      ,dobj        ,NN        ,noun, singular or mass
,         ,PUNCT     ,punct       ,,         ,punctuation mark, comma
email     ,NOUN      ,conj        ,NN        ,noun, singular or mass
support@oursite.com,X         ,nummod      ,ADD       ,email
or        ,CCONJ     ,cc          ,CC        ,conjunction, coordinating
visit     ,VERB      ,conj        ,VB        ,verb,

In [12]:
len(doc2.vocab)

512

# Named Entities
Going a step beyond tokens, named entities add another layer of context. The language model recognizes that certain words are organizational names while others are locations, and still other combinations relate to money, dates, etc. Named entities are accessible through the ents property of a Doc object.

In [13]:
doc8 = nlp(u'Apple to build a Hong Kong factory for $6 million')

In [14]:
for token in doc8:
    print(token.text,end=' | ')
print('\n----')
for ent in doc8.ents:
     print(ent.text+' - '+ent.label_+' -'+str(spacy.explain(ent.label_)))

Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | 
----
Apple - ORG -Companies, agencies, institutions, etc.
Hong Kong - GPE -Countries, cities, states
$6 million - MONEY -Monetary values, including unit


# Built-in Visualizers
spaCy includes a built-in visualization tool called displaCy. displaCy is able to detect whether you're working in a Jupyter notebook, and will return markup that can be rendered in a cell right away. When you export your notebook, the visualizations will be included as HTML.

For more info visit https://spacy.io/usage/visualizers

In [15]:
from spacy import displacy
doc = nlp(u'Apple is going to build a U.K. factory for $6 million.')
displacy.render(doc, style='ent', jupyter=True, options={'distance': 110})

# Creating Visualizations Outside of Jupyter
If you're using another Python IDE or writing a script, you can choose to have spaCy serve up html separately:

In [16]:
#doc = nlp(u'This is a sentence.')
#displacy.serve(doc, style='dep')

In [17]:
text = """In ancient Rome, some neighbors live in three adjacent houses. In the center is the house of Senex, who lives there with wife Domina, son Hero, and several slaves, including head slave Hysterium and the musical's main character Pseudolus. A slave belonging to Hero, Pseudolus wishes to buy, win, or steal his freedom. One of the neighboring houses is owned by Marcus Lycus, who is a buyer and seller of beautiful women; the other belongs to the ancient Erronius, who is abroad searching for his long-lost children (stolen in infancy by pirates). One day, Senex and Domina go on a trip and leave Pseudolus in charge of Hero. Hero confides in Pseudolus that he is in love with the lovely Philia, one of the courtesans in the House of Lycus (albeit still a virgin)."""

In [18]:
doc = nlp(text)


In [19]:
doc.user_data["title"] = "This is a title"

In [20]:
displacy.render(doc, style='ent')

# STEMMING

In [None]:
import nltk
from nltk.stem.porter import PorterStemmer
p_stemmer = PorterStemmer()

In [None]:
#pip install nltk

In [None]:
words = ['run','runner','running','ran','runs','easily','fairly']
for word in words:
    print(word + '------>' + p_stemmer.stem(word))

In [None]:
from nltk.stem.snowball import SnowballStemmer
p_snow = SnowballStemmer(language='english')
for word in words:
    print(word + '------>' + p_snow.stem(word))

In [None]:
phrase = 'I am meeting him tomorrow at the meeting'

for word in phrase.split():
    print(word + '------>' + p_snow.stem(word))

# Lemmatization
In contrast to stemming, lemmatization looks beyond word reduction, and considers a language's full vocabulary to apply a morphological analysis to words. The lemma of 'was' is 'be' and the lemma of 'mice' is 'mouse'. Further, the lemma of 'meeting' might be 'meet' or 'meeting' depending on its use in a sentence.

In [None]:
doc1 = nlp(u"I am a runner running in a race because I love to run since I ran today")
def show_lemmas(text):
    for token in text:
        print(f'{token.text:{10}},{token.pos_:{10}}, {token.lemma:<{22}}, {token.lemma_:{12}},  {token.tag_:{12}},{spacy.explain(token.tag_)}')

In [None]:
doc2 = nlp(u"I saw eighteen mice today!")
show_lemmas(doc2)

# Stop Words
Words like "a" and "the" appear so frequently that they don't require tagging as thoroughly as nouns, verbs and modifiers. We call these stop words, and they can be filtered from the text to be processed. spaCy holds a built-in list of some 305 English stop words.

In [None]:
print(nlp.Defaults.stop_words)

In [None]:
nlp.vocab['is'].is_stop

In [None]:
nlp.Defaults.stop_words.add('btw')
# Set the stop_word tag on the lexeme
nlp.vocab['btw'].is_stop = True

# PHRASE MATCHING AND VOCABULARY

Other token attributes
Besides lemmas, there are a variety of token attributes we can use to determine matching rules:

Attribute	Description
`ORTH`	The exact verbatim text of a token
`LOWER`	The lowercase form of the token text
`LENGTH`	The length of the token text
`IS_ALPHA`, `IS_ASCII`, `IS_DIGIT`	Token text consists of alphanumeric characters, ASCII characters, digits
`IS_LOWER`, `IS_UPPER`, `IS_TITLE`	Token text is in lowercase, uppercase, titlecase
`IS_PUNCT`, `IS_SPACE`, `IS_STOP`	Token is punctuation, whitespace, stop word
`LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`	Token text resembles a number, URL, email
`POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE`	The token's simple and extended part-of-speech tag, dependency label, lemma, shape
`ENT_TYPE`	The token's entity label


In [None]:
from spacy.matcher import Matcher

In [None]:
matcher = Matcher(nlp.vocab)

In [None]:
#SolarPower
#Solar-Power
#Solar Power
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER':'solar'}, {'IS_PUNCT': True, 'OP':'*'},{'LOWER':'power'}]
pattern3 = [{'LOWER':'solar'},{'LOWER':'power'}]

In [None]:
matcher.add('SolarPower', None,pattern1,pattern2,pattern3)

In [None]:
doc = nlp(u'The Solar Power industry continues to grow as demand \
for solarpower increases. Solar-power cars are gaining popularity.')

In [None]:
found_matches = matcher(doc)

In [None]:
print(found_matches)

In [None]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]  # get string representation
    span = doc[start:end]                    # get the matched span
    print(match_id, string_id, start, end, span.text)

In [None]:
doc2 = nlp(u'Solar-powered energy runs solar-powered cars.')

In [None]:
#found_matcher = matcher(doc2)

# PHRASE MATCHER

In [None]:
# Import the PhraseMatcher library
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

In [None]:
with open('../TextFiles/reaganomics.txt', encoding='utf8', errors='ignore') as f:
    doc3 = nlp(f.read())

In [None]:
doc3

In [None]:
# First, create a list of match phrases:
phrase_list = ['voodoo economics', 'supply-side economics', 'trickle-down economics', 'free-market economics']

# Next, convert each phrase to a Doc object:
phrase_patterns = [nlp(text) for text in phrase_list]

# Pass each Doc object into matcher (note the use of the asterisk!):
matcher.add('VoodooEconomics', None, *phrase_patterns)

# Build a list of matches:
matches = matcher(doc3)

print(matches)

In [None]:
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # get string representation
    span = doc3[start:end]                    # get the matched span
    print(match_id, string_id, start, end, span.text)

# ASSESSMENT

In [None]:
# Enter your code here:
with open('../TextFiles/owlcreek.txt') as f:
      doc = nlp(f.read())

In [None]:
doc[:36]

In [None]:
len(doc)

In [None]:
sentence = [sent for sent in doc.sents]
len(sentence)

In [None]:
sentence[2].text

In [None]:
for token in sentence[2]:
     print(f'{token.text:{15}},{token.pos_:{5}},{token.dep_:{12}},{token.lemma_:{10}},{spacy.explain(token.tag_)}')

In [None]:
# Import the Matcher library:

from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)


In [None]:
pattern = [{'LOWER':'swimming'},{'IS_SPACE': True, 'OP':'*'},{'LOWER':'vigorously'}]
matcher.add('Swimming',None,pattern)



In [None]:
found_match = matcher(doc)
print(found_match)

In [None]:
for match_id, start, end in found_match:
    string_id = nlp.vocab.strings[match_id]  # get string representation
    span = doc[start-5:end+5]                    # get the matched span
    print(match_id, string_id, start, end, span.text)

# PART OF SPEECH TAGGING

In [None]:
def show_lemmas(text):
    for token in text:
        print(f'{token.text:{10}},{token.pos_:{10}}, {token.lemma:<{22}}, {token.lemma_:{12}},  {token.tag_:{12}},{spacy.explain(token.tag_)}')

In [None]:
# Create a simple Doc object
doc = nlp(u"The quick brown fox jumped over the lazy dog's back.")
show_lemmas(doc)

In [None]:
doc1 = nlp(u'I read books on NLP.')
show_lemmas(doc1)

# Counting POS Tags
The Doc.count_by() method accepts a specific token attribute as its argument, and returns a frequency count of the given attribute as a dictionary object. Keys in the dictionary are the integer values of the given attribute ID, and values are the frequency. Counts of zero are not included.

In [None]:
POS_counts = doc.count_by(spacy.attrs.POS)
POS_counts

In [None]:
doc.vocab[83].text

In [None]:
def count(tex):
    POS_counts = tex.count_by(spacy.attrs.POS)
    for k,v in sorted(POS_counts.items()):
         print(f'{k}. {tex.vocab[k].text:{5}}: {v}')

In [None]:
count(doc)

In [None]:
#TAG Counts:
def TAG(tex):
    DEP_counts = tex.count_by(spacy.attrs.DEP)
    for k,v in sorted(DEP_counts.items()):
        print(f'{k}. {tex.vocab[k].text:{5}}: {v}')

In [None]:
TAG(doc)

In [None]:
len(doc.vocab)

# VISUALIZING:

In [None]:
def display(tec):
    for sent in tec.sents:
        docx = nlp(sent.text)
        if docx.ents:
            displacy.render(docx, style='ent', jupyter=True)
        else:
             print(docx.text)

In [None]:
doc = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million. '
         u'By contrast, Sony sold only 7 thousand Walkman music players.')

In [None]:
display(doc)

# NAMED ENTITY RECOGNITION

In [None]:
# Write a function to display basic entity info:
def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print(ent.text+' - '+ent.label_+' - '+str(spacy.explain(ent.label_)))
    else:
        print('No named entities found.')

In [None]:
doc = nlp(u'May I go to Washington, DC next May to see the Washington Monument?')
show_ents(doc)

# Adding a Named Entity to a Span
Normally we would have spaCy build a library of named entities by training it on several samples of text.
In this case, we only want to add one value:

In [None]:
doc = nlp(u'Tesla to build a U.K. factory for $6 million')
show_ents(doc)

In [None]:
from spacy.tokens import Span

# Get the hash value of the ORG entity label
ORG = doc.vocab.strings[u'ORG']  

# Create a Span for the new entity
new_ent = Span(doc, 0, 1, label=ORG)

# Add the entity to the existing Doc object
doc.ents = list(doc.ents) + [new_ent]


In [None]:
show_ents(doc)

In [None]:
doc = nlp(u'Our company plans to introduce a new vacuum cleaner. '
          u'If successful, the vacuum cleaner will be our first product.')

show_ents(doc)

In [None]:
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

In [None]:
# First, create a list of match phrases:
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']

# Next, convert each phrase to a Doc object:
phrase_patterns = [nlp(text) for text in phrase_list]

# Pass each Doc object into matcher (note the use of the asterisk!):
matcher.add('vacuum', None, *phrase_patterns)

# Build a list of matches:
matches = matcher(doc)

print(matches)

In [None]:
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # get string representation
    span = doc[start:end]                    # get the matched span
    print(match_id, string_id, start, end, span.text)

In [None]:
from spacy.tokens import Span

PROD = doc.vocab.strings[u'PRODUCT'] 

# Create a Span for the new entity
new_ent = [Span(doc, match[1], match[2], label=PROD) for match in matches]

# Add the entity to the existing Doc object
doc.ents = list(doc.ents) + new_ent

In [None]:
show_ents(doc)

In [None]:
doc = nlp(u'Originally priced at $29.50, the sweater was marked down to five dollars.')

[ent for ent in doc.ents if ent.label_=="MONEY"]

In [81]:
# Import the displaCy library
from spacy import displacy

In [83]:
doc = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million. '
         u'By contrast, Sony sold only 7 thousand Walkman music players.')
displacy.render(doc, style='ent', jupyter=True)

In [84]:
options = {'ents': ['ORG', 'PRODUCT']}

displacy.render(doc, style='ent', jupyter=True, options=options)

# SENTENCE SEGMENTATION

In [85]:
# From Spacy Basics:
doc = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')
for sent in doc.sents:
    print(sent.text)
nlp.pipe_names
['tagger', 'set_custom_boundaries', 'parser', 'ner']
# Re-run the Doc object creation:
doc4 = nlp(u'"Management is doing things right; leadership is doing the right things." -Peter Drucker')
​
for sent in doc4.sents:
    print(sent)
"Management is doing things right;
leadership is doing the right things."
-Peter
Drucker
ASSIGNMENT
with open('../TextFiles/peterrabbit.txt') as f:
    doc = nlp(f.read())
doc
doc
The Tale of Peter Rabbit, by Beatrix Potter (1902).


This is the first sentence.
This is another sentence.
This is the last sentence.


In [86]:
doc_sents = [sent for sent in doc.sents]
doc_sents

[This is the first sentence.,
 This is another sentence.,
 This is the last sentence.]

In [88]:
type(doc_sents[1])

spacy.tokens.span.Span

In [89]:
# SPACY'S DEFAULT BEHAVIOR
doc3 = nlp(u'"Management is doing things right; leadership is doing the right things." -Peter Drucker')

for sent in doc3.sents:
    print(sent)

"Management is doing things right; leadership is doing the right things."
-Peter
Drucker


In [90]:
# ADD A NEW RULE TO THE PIPELINE
def set_custom_boundaries(doc):
    for token in doc[:-1]:
        if token.text == ';':
            doc[token.i+1].is_sent_start = True
    return doc

nlp.add_pipe(set_custom_boundaries, before='parser')

nlp.pipe_names

['tagger', 'set_custom_boundaries', 'parser', 'ner']

In [91]:
# Re-run the Doc object creation:
doc4 = nlp(u'"Management is doing things right; leadership is doing the right things." -Peter Drucker')

for sent in doc4.sents:
    print(sent)

"Management is doing things right;
leadership is doing the right things."
-Peter
Drucker


# ASSIGNMENT

In [113]:
with open('../TextFiles/peterrabbit.txt') as f:
    doc = nlp(f.read())

In [114]:
doc

The Tale of Peter Rabbit, by Beatrix Potter (1902).

Once upon a time there were four little Rabbits, and their names
were--

          Flopsy,
       Mopsy,
   Cotton-tail,
and Peter.

They lived with their Mother in a sand-bank, underneath the root of a
very big fir-tree.

'Now my dears,' said old Mrs. Rabbit one morning, 'you may go into
the fields or down the lane, but don't go into Mr. McGregor's garden:
your Father had an accident there; he was put in a pie by Mrs.
McGregor.'

'Now run along, and don't get into mischief. I am going out.'

Then old Mrs. Rabbit took a basket and her umbrella, and went through
the wood to the baker's. She bought a loaf of brown bread and five
currant buns.

Flopsy, Mopsy, and Cottontail, who were good little bunnies, went
down the lane to gather blackberries:

But Peter, who was very naughty, ran straight away to Mr. McGregor's
garden, and squeezed under the gate!

First he ate some lettuces and some French beans; and then he ate
some radishes;

And

In [117]:
def show_lemmas(text):
    for token in list(text.sents)[4]:
        print(f'{token.text:{10}},{token.pos_:{10}}, {token.lemma:<{22}}, {token.lemma_:{12}},  {token.tag_:{12}},{spacy.explain(token.tag_)}')

In [118]:
show_lemmas(doc)

They      ,PRON      , 561228191312463089    , -PRON-      ,  PRP         ,pronoun, personal
lived     ,VERB      , 13874798850131827181  , live        ,  VBD         ,verb, past tense
with      ,ADP       , 12510949447758279278  , with        ,  IN          ,conjunction, subordinating or preposition
their     ,PRON      , 561228191312463089    , -PRON-      ,  PRP$        ,pronoun, possessive
Mother    ,PROPN     , 2563722269420572430   , Mother      ,  NNP         ,noun, proper singular
in        ,ADP       , 3002984154512732771   , in          ,  IN          ,conjunction, subordinating or preposition
a         ,DET       , 11901859001352538922  , a           ,  DT          ,determiner
sand      ,NOUN      , 8689067151372652667   , sand        ,  NN          ,noun, singular or mass
-         ,PUNCT     , 9153284864653046197   , -           ,  HYPH        ,punctuation mark, hyphen
bank      ,NOUN      , 579043611916083110    , bank        ,  NN          ,noun, singular or mass
,      

In [99]:
# Provide a frequency list of POS tags from the entire document
def count(tex):
    POS_counts = tex.count_by(spacy.attrs.POS)
    for k,v in sorted(POS_counts.items()):
         print(f'{k}. {tex.vocab[k].text:{5}}: {v}')

In [98]:
count(doc)

84. ADJ  : 49
85. ADP  : 123
86. ADV  : 66
87. AUX  : 55
89. CCONJ: 61
90. DET  : 91
92. NOUN : 172
93. NUM  : 8
94. PART : 29
95. PRON : 108
96. PROPN: 75
97. PUNCT: 173
98. SCONJ: 20
100. VERB : 129
103. SPACE: 99


# 4. CHALLENGE: What percentage of tokens are nouns?
HINT: the attribute ID for 'NOUN' is 91

In [122]:
100*POS_counts[92]/len(doc)



0.1589825119236884

In [100]:
doc_sent = [sent for sent in doc.sents]

In [103]:
doc_sent[4]

They lived with their Mother in a sand-bank, underneath the root of a
very big fir-tree.


In [110]:
displacy.render(list(doc.sents)[4], style='dep', jupyter=True, options={'distance': 110})

*6. Show the first two named entities from Beatrix Potter's *The Tale of Peter Rabbit **

In [126]:
for ent in doc.ents[:2]:
    print(ent.text+' - '+ent.label_+' - '+str(spacy.explain(ent.label_)))

Peter Rabbit - PERSON - People, including fictional
Beatrix Potter - PERSON - People, including fictional


In [127]:
len(list(doc.sents))

74

In [128]:

list_of_sents = [nlp(sent.text) for sent in doc.sents]
list_of_ners = [doc for doc in list_of_sents if doc.ents]
len(list_of_ners)


37

In [129]:
displacy.render(list_of_sents[0], style='ent', jupyter=True)

Processing Pipeline in Spacy
When you call nlp on a text, spaCy first tokenizes the text to produce a Doc object. The Doc is then processed in several different steps – this is also referred to as the processing pipeline.

The pipeline used by the default models consists of a tagger, a parser and an entity recognizer. Each pipeline component returns the processed Doc, which is then passed on to the next component.

image.png

image.png

Processing text
When you call nlp on a text, spaCy will tokenize it and then call each component on the Doc, in order. It then returns the processed Doc that you can work with.

Tips for efficient processing
Process the texts as a stream using nlp.pipe and buffer them in batches, instead of one-by-one. This is usually much more efficient.
Only apply the pipeline components you need. Getting predictions from the model that you don’t actually need adds up and becomes very inefficient at scale. To prevent this, use the disable keyword argument to disable components you don’t need
In this example, we’re using nlp.pipe to process a (potentially very large) iterable of texts as a stream. Because we’re only accessing the named entities in doc.ents (set by the ner component), we’ll disable all other statistical components (the tagger and parser) during processing. nlp.pipe yields Doc objects, so we can iterate over them and access the named entity predictions:



In [130]:
texts = ['net income was $9.4 million compared to the prior year of 2.7$ million',
        'revenue exceeds twelve billion dollars with a loss of $1b']

In [131]:
%%timeit
docs = nlp.pipe(texts, disable = ['tagger', 'parser'])

for doc in docs:
    for ent in doc.ents:
        print(ent.text, ent.label_)
    print()

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY

twelve billion dollars MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY

twelve billion dollars MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY

twelve billion dollars MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY

twelve billion dollars MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY

twelve billion dollars MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY

twelve billion dollars MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY

twelve billion dollars MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY

twelve billion dollars MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY

twelve billion dollars MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY

twelve billion dollars MONEY
1b MONEY

$9.4 million MONEY
t

In [132]:
%%timeit
docs = nlp.pipe(texts)

for doc in docs:
    for ent in doc.ents:
        print(ent.text, ent.label_)
    print()

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY

twelve billion dollars MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY

twelve billion dollars MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY

twelve billion dollars MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY

twelve billion dollars MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY

twelve billion dollars MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY

twelve billion dollars MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY

twelve billion dollars MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY

twelve billion dollars MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY

twelve billion dollars MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
2.7$ million MONEY

twelve billion dollars MONEY
1b MONEY

$9.4 million MONEY
t

Hashtags and Emoji Detection
Social media posts, especially tweets, can be difficult to work with. They’re very short and often contain various emoji and hashtags. By only looking at the plain text, you’ll lose a lot of valuable semantic information.

In [133]:
from spacy.matcher import Matcher
from spacy.tokens import Span

In [134]:
pos_emoji = ["😀", "😃", "😂", "🤣", "😊", "😍"]  # Positive emoji
neg_emoji = ["😞", "😠", "😩", "😢", "😭", "😒"]  # Negative emoji

In [135]:
pos = [[{'ORTH': emoji}] for emoji in pos_emoji]
neg = [[{'ORTH': emoji}] for emoji in neg_emoji]

In [136]:
matcher = Matcher(nlp.vocab)

In [137]:
def label_sentiment(matcher, doc, i , matches):
    match_id, start, end = matches[i]
    if doc.vocab.strings[match_id]== 'happy':
        doc.sentiment += 0.1
    elif doc.vocab.strings[match_id] == 'sad':
        doc.sentiment -= 0.1

In [138]:
matcher.add("happy", label_sentiment, *pos)

In [139]:
matcher.add("sad", label_sentiment, *neg)

In [140]:
matcher.add('HASHTAG', None, [{'TEXT': '#'}, {'IS_ASCII': True}])

In [141]:
doc = nlp('Hello guys 😀😂 #kgptalkie')

In [142]:
matches = matcher(doc)

In [143]:
for match_id, starts, end in matches:
    string_id = doc.vocab.strings[match_id]
    span = doc[start:end]
    print(string_id, span.text)

happy 
happy 
HASHTAG 
