# Analyzing and Processing Text with spaCy

## Tokenizing the Text

### Word Tokenization

In [1]:
# Word tokenization
from spacy.lang.en import English

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = English()

text = """omg! this is such an incredible product. perfect keyboard, perfect weight, perfect battery life, perfect trackpad, perfect performance, perfect touch id, perfect feel, perfect build, perfect look, perfect size, perfect stereo sound, perfect screen, perfect noise absence as not having a fan! i mean, it does not have a fan and it does not get hot most of the times. wtf?! it is the only laptop i have been able to use in my bed for x amount of hours and it is sooo comfortable to do it. it truly is one of the best products out there and it is totally worth the investment, because that is how i see it, not even as spending, that is just how good it is. i am enthusiastic because this is for sure the best device apple has created in terms of price/what you get, and it will probably stay in that position for a couple of years. this is game-changing and revolutionary. the computer feels as if it will last 10+ years, honestly. i highly recommend it. btw, it is so good my girlfriend decided to get one for herself because she just could not get over the first time she used it, and just how beautiful it is and pleasurable to look at from any perspective... this laptop is so ridiculously perfect that you can even open it with one hand and in just a couple of seconds, you will be ready to use it as it has an always-on feature that allows you to get down to the working right away. that is probably the best feat. true tone is also really good, as it is automatic brightness adjustment and dark mode, they sit together perfectly well. get it! you will not regret it at any moment. truly a high-end experience. oh, and... it looks elegant at any angle, and which is most important, classy it feels.
"""

#  "nlp" Object is used to create documents with linguistic annotations.
my_doc = nlp(text)

# Create list of word tokens
token_list = []
for token in my_doc:
    token_list.append(token.text)
print(token_list)

['omg', '!', 'this', 'is', 'such', 'an', 'incredible', 'product', '.', 'perfect', 'keyboard', ',', 'perfect', 'weight', ',', 'perfect', 'battery', 'life', ',', 'perfect', 'trackpad', ',', 'perfect', 'performance', ',', 'perfect', 'touch', 'i', 'd', ',', 'perfect', 'feel', ',', 'perfect', 'build', ',', 'perfect', 'look', ',', 'perfect', 'size', ',', 'perfect', 'stereo', 'sound', ',', 'perfect', 'screen', ',', 'perfect', 'noise', 'absence', 'as', 'not', 'having', 'a', 'fan', '!', 'i', 'mean', ',', 'it', 'does', 'not', 'have', 'a', 'fan', 'and', 'it', 'does', 'not', 'get', 'hot', 'most', 'of', 'the', 'times', '.', 'wtf', '?', '!', 'it', 'is', 'the', 'only', 'laptop', 'i', 'have', 'been', 'able', 'to', 'use', 'in', 'my', 'bed', 'for', 'x', 'amount', 'of', 'hours', 'and', 'it', 'is', 'sooo', 'comfortable', 'to', 'do', 'it', '.', 'it', 'truly', 'is', 'one', 'of', 'the', 'best', 'products', 'out', 'there', 'and', 'it', 'is', 'totally', 'worth', 'the', 'investment', ',', 'because', 'that', 'is

### Sentence Tokenization

In [2]:
# sentence tokenization

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = English()

# Add the component to the pipeline
nlp.add_pipe('sentencizer')

#  "nlp" Object is used to create documents with linguistic annotations.
doc = nlp(text)

# create list of sentence tokens
sents_list = []
for sent in doc.sents:
    sents_list.append(sent.text)
print(sents_list)


['omg!', 'this is such an incredible product.', 'perfect keyboard, perfect weight, perfect battery life, perfect trackpad, perfect performance, perfect touch id, perfect feel, perfect build, perfect look, perfect size, perfect stereo sound, perfect screen, perfect noise absence as not having a fan!', 'i mean, it does not have a fan and it does not get hot most of the times.', 'wtf?!', 'it is the only laptop i have been able to use in my bed for x amount of hours and it is sooo comfortable to do it.', 'it truly is one of the best products out there and it is totally worth the investment, because that is how i see it, not even as spending, that is just how good it is.', 'i am enthusiastic because this is for sure the best device apple has created in terms of price/what you get, and it will probably stay in that position for a couple of years.', 'this is game-changing and revolutionary.', 'the computer feels as if it will last 10+ years, honestly.', 'i highly recommend it.', 'btw, it is s

## Cleaning Text Data

### Removing Stopwords

In [3]:
# Identifying Stop words
#importing stop words from English language.
import spacy
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS

#Printing the total number of stop words:
print('Number of stop words: %d' % len(spacy_stopwords))

#Printing first ten stop words:
print('First ten stop words: %s' % list(spacy_stopwords)[:20])

Number of stop words: 326
First ten stop words: ["'s", 'therein', '‘s', 'less', 'yourself', 'various', 'off', 'full', 'but', 'fifteen', 'yet', 'moreover', 'last', 'first', 'if', 'everything', 'somewhere', 'as', 'would', 'one']


In [4]:
# Removing the stop words

from spacy.lang.en.stop_words import STOP_WORDS

#Implementation of stop words:
filtered_sent=[]

#  "nlp" Object is used to create documents with linguistic annotations.
doc = nlp(text)

# filtering stop words
for word in doc:
    if word.is_stop==False:
        filtered_sent.append(word)
print("Filtered Sentence:",filtered_sent)


Filtered Sentence: [omg, !, incredible, product, ., perfect, keyboard, ,, perfect, weight, ,, perfect, battery, life, ,, perfect, trackpad, ,, perfect, performance, ,, perfect, touch, d, ,, perfect, feel, ,, perfect, build, ,, perfect, look, ,, perfect, size, ,, perfect, stereo, sound, ,, perfect, screen, ,, perfect, noise, absence, having, fan, !, mean, ,, fan, hot, times, ., wtf, ?, !, laptop, able, use, bed, x, hours, sooo, comfortable, ., truly, best, products, totally, worth, investment, ,, ,, spending, ,, good, ., enthusiastic, sure, best, device, apple, created, terms, price, /, ,, probably, stay, position, couple, years, ., game, -, changing, revolutionary, ., computer, feels, 10, +, years, ,, honestly, ., highly, recommend, ., btw, ,, good, girlfriend, decided, time, ,, beautiful, pleasurable, look, perspective, ..., laptop, ridiculously, perfect, open, hand, couple, seconds, ,, ready, use, -, feature, allows, working, right, away, ., probably, best, feat, ., true, tone, good,

## Lexicon Normalization

### Lemmatization

In [5]:
# Implementing lemmatization
lem = nlp(text)
# finding lemma for each word
for word in lem:
    print(word.text,word.lemma_)

omg 
! 
this 
is 
such 
an 
incredible 
product 
. 
perfect 
keyboard 
, 
perfect 
weight 
, 
perfect 
battery 
life 
, 
perfect 
trackpad 
, 
perfect 
performance 
, 
perfect 
touch 
i 
d 
, 
perfect 
feel 
, 
perfect 
build 
, 
perfect 
look 
, 
perfect 
size 
, 
perfect 
stereo 
sound 
, 
perfect 
screen 
, 
perfect 
noise 
absence 
as 
not 
having 
a 
fan 
! 
i 
mean 
, 
it 
does 
not 
have 
a 
fan 
and 
it 
does 
not 
get 
hot 
most 
of 
the 
times 
. 
wtf 
? 
! 
it 
is 
the 
only 
laptop 
i 
have 
been 
able 
to 
use 
in 
my 
bed 
for 
x 
amount 
of 
hours 
and 
it 
is 
sooo 
comfortable 
to 
do 
it 
. 
it 
truly 
is 
one 
of 
the 
best 
products 
out 
there 
and 
it 
is 
totally 
worth 
the 
investment 
, 
because 
that 
is 
how 
i 
see 
it 
, 
not 
even 
as 
spending 
, 
that 
is 
just 
how 
good 
it 
is 
. 
i 
am 
enthusiastic 
because 
this 
is 
for 
sure 
the 
best 
device 
apple 
has 
created 
in 
terms 
of 
price 
/ 
what 
you 
get 
, 
and 
it 
will 
probably 
stay 
in 
th

### Part-of-Speech (POS) Tagging

In [6]:
# POS tagging

# importing the model en_core_web_sm of English for vocabluary, syntax & entities
import en_core_web_sm

# load en_core_web_sm of English for vocabluary, syntax & entities
nlp = en_core_web_sm.load()

#  "nlp" Objectis used to create documents with linguistic annotations.
docs = nlp(text)

for word in docs:
    print(word.text,word.pos_)



omg NOUN
! PUNCT
this PRON
is AUX
such DET
an DET
incredible ADJ
product NOUN
. PUNCT
perfect ADJ
keyboard NOUN
, PUNCT
perfect ADJ
weight NOUN
, PUNCT
perfect ADJ
battery NOUN
life NOUN
, PUNCT
perfect ADJ
trackpad NOUN
, PUNCT
perfect ADJ
performance NOUN
, PUNCT
perfect ADJ
touch NOUN
i NOUN
d PROPN
, PUNCT
perfect ADJ
feel NOUN
, PUNCT
perfect ADJ
build NOUN
, PUNCT
perfect ADJ
look NOUN
, PUNCT
perfect ADJ
size NOUN
, PUNCT
perfect ADJ
stereo NOUN
sound NOUN
, PUNCT
perfect ADJ
screen NOUN
, PUNCT
perfect ADJ
noise NOUN
absence NOUN
as ADP
not PART
having VERB
a DET
fan NOUN
! PUNCT
i PRON
mean VERB
, PUNCT
it PRON
does AUX
not PART
have VERB
a DET
fan NOUN
and CCONJ
it PRON
does AUX
not PART
get VERB
hot ADJ
most ADJ
of ADP
the DET
times NOUN
. PUNCT
wtf PROPN
? PUNCT
! PUNCT
it PRON
is AUX
the DET
only ADJ
laptop NOUN
i PRON
have AUX
been AUX
able ADJ
to PART
use VERB
in ADP
my PRON
bed NOUN
for ADP
x SYM
amount NOUN
of ADP
hours NOUN
and CCONJ
it PRON
is AUX
sooo ADJ
comfortabl

## Entity Detection

In [7]:
#for visualization of Entity detection importing displacy from spacy:

from spacy import displacy

text2 = nlp(text)

entities=[(i, i.label_, i.label) for i in text2.ents]
entities

[(hours, 'TIME', 392),
 (a couple of years, 'DATE', 391),
 (last 10+ years, 'DATE', 391),
 (first, 'ORDINAL', 396),
 (just a couple of seconds, 'TIME', 392)]

In [8]:
displacy.render(text2, style = "ent",jupyter = True)

## Dependency Parsing

In [9]:
docp = nlp(" In pursuit of a wall, President Trump ran into one.")

for chunk in docp.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_,
        chunk.root.head.text)


pursuit pursuit pobj In
a wall wall pobj of
President Trump Trump nsubj ran


In [10]:
displacy.render(docp, style="dep", jupyter= True)

## Word Vectors and Semantic Similarity

In [11]:
import en_core_web_sm
nlp = en_core_web_sm.load()
mango = nlp(u'mango')
print(mango.vector.shape)
print(mango.vector)

(96,)
[ 0.67121375  0.547875    1.1662204  -0.8862695  -0.7446909   1.4388278
 -0.0310275   0.14206484  1.4838524  -1.1957114   0.08386576 -0.911056
  0.05417725 -0.10735506 -1.0579314   0.1266487  -0.2706933   1.569655
  0.9489083  -0.14418542  0.3911935  -0.14170304 -0.7126999   1.6709299
  0.7659299   1.1870233   0.08993122  0.7558858   0.2850564   1.481607
 -0.04087245 -0.34370816 -0.64876366 -1.1423168  -0.4228799   1.2094326
 -0.03956398  0.2813167   0.54050326  0.3834293  -0.26618952  0.14527136
 -1.0663483  -0.21301082 -0.00739029 -2.210084    0.540864    1.0819434
  0.66898596 -1.1011103   0.07047433 -1.638876   -0.7076828  -1.7068787
  0.42300764 -0.5747217  -1.5162356  -1.0353892  -0.18434292 -0.32043636
 -0.588754   -0.7868452  -0.9719215  -0.7482023  -0.24716026 -0.25685
 -0.19451576  1.3188727  -0.08707494 -1.3416569   2.2506413  -0.02113014
  0.23767492  0.57176274  0.35061055 -0.598348    1.028291    0.1865943
 -0.03997174  1.0957274  -0.20389979 -0.5785228  -0.31933782

In [12]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
# Add match ID "HelloWorld" with no callback and one pattern
pattern = [{"LOWER": "battery"}]
matcher.add("aspect", [pattern])

doc = nlp("The battery life is excellent battery")
matches = matcher(doc)
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # Get string representation
    span = doc[start:end]  # The matched span
    print(match_id, string_id, start, end, span.text)

10648701098514877540 aspect 1 2 battery
10648701098514877540 aspect 5 6 battery


In [16]:
import spacy
from spacy import displacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
matched_sents = []  # Collect data of matched sentences to be visualized

def collect_sents(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    span = doc[start:end]  # Matched span
    sent = span.sent  # Sentence containing matched span
    # Append mock entity for match in displaCy style to matched_sents
    # get the match span by ofsetting the start and end of the span with the
    # start and end of the sentence in the doc
    match_ents = [{
        "start": span.start_char - sent.start_char,
        "end": span.end_char - sent.start_char,
        "label": "MATCH",
    }]
    matched_sents.append({"text": sent.text, "ents": match_ents})

pattern = [{"LOWER": "facebook"}, {"LEMMA": "be"}, {"POS": "ADV", "OP": "*"},
            {"POS": "ADJ"}]
matcher.add("FacebookIs", [pattern], on_match=collect_sents)  # add pattern
doc = nlp("""omg! this is such an incredible product. perfect keyboard, perfect weight, perfect battery life, perfect trackpad, perfect performance, perfect touch id, perfect feel, perfect build, perfect look, perfect size, perfect stereo sound, perfect screen, perfect noise absence as not having a fan! i mean, it does not have a fan and it does not get hot most of the times. wtf?! it is the only laptop i have been able to use in my bed for x amount of hours and it is sooo comfortable to do it. it truly is one of the best products out there and it is totally worth the investment, because that is how i see it, not even as spending, that is just how good it is. i am enthusiastic because this is for sure the best device apple has created in terms of price/what you get, and it will probably stay in that position for a couple of years. this is game-changing and revolutionary. the computer feels as if it will last 10+ years, honestly. i highly recommend it. btw, it is so good my girlfriend decided to get one for herself because she just could not get over the first time she used it, and just how beautiful it is and pleasurable to look at from any perspective... this laptop is so ridiculously perfect that you can even open it with one hand and in just a couple of seconds, you will be ready to use it as it has an always-on feature that allows you to get down to the working right away. that is probably the best feat. true tone is also really good, as it is automatic brightness adjustment and dark mode, they sit together perfectly well. get it! you will not regret it at any moment. truly a high-end experience. oh, and... it looks elegant at any angle, and which is most important, classy it feels.
""")
matches = matcher(doc)

# Serve visualization of sentences containing match with displaCy
# set manual=True to make displaCy render straight from a dictionary
# (if you're not running the code within a Jupyer environment, you can
# use displacy.serve instead)
displacy.render(matched_sents, style="ent", manual=True)