In [2]:
!pip install spacy



In [4]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.1.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.1.0/en_core_web_sm-3.1.0-py3-none-any.whl (13.6 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [5]:
import spacy

In [8]:
model = spacy.load('en_core_web_sm')

In [9]:
text = """ The Republican president is being challenged by Democratic 
Party nominee Joe Biden, who is best known as Barack Obama’s vice-president but has been in 
US politics since the 1970s.As election day approaches, pollingcompanies will be trying to gauge 
the mood of the nation by asking voters which candidate they prefer."""

In [10]:
doc = model(text)

In [12]:
type(doc)

spacy.tokens.doc.Doc

In [19]:
#Sentence tokenization
list(doc.sents)

[ The Republican president is being challenged by Democratic 
 Party nominee Joe Biden, who is best known as Barack Obama’s vice-president but has been in 
 US politics since the 1970s.,
 As election day approaches, pollingcompanies will be trying to gauge 
 the mood of the nation by asking voters which candidate they prefer.]

In [21]:
#Word tokenization
for token in doc :
    print(token)

 
The
Republican
president
is
being
challenged
by
Democratic


Party
nominee
Joe
Biden
,
who
is
best
known
as
Barack
Obama
’s
vice
-
president
but
has
been
in


US
politics
since
the
1970s
.
As
election
day
approaches
,
pollingcompanies
will
be
trying
to
gauge


the
mood
of
the
nation
by
asking
voters
which
candidate
they
prefer
.


In [36]:
#Stop words
stopWords = spacy.lang.en.stop_words.STOP_WORDS

In [37]:
print(stopWords)

{"'d", 'myself', 'he', 'another', 'thereafter', 'a', '‘ll', 'moreover', 'any', 'whether', 'make', 'now', 'although', 'take', 'quite', 'if', 'part', 'himself', 'its', "'m", 'yourselves', 'move', 'less', 'also', 'regarding', 'thus', 'anywhere', "'ve", 'few', 'done', "'ll", 'made', 'out', 'our', "n't", 'alone', 'themselves', 'afterwards', 'someone', 'or', 'five', 'everyone', 'somehow', 'put', 'there', 'ever', 'those', 'yet', 'top', 'upon', 'every', 'one', 'should', 'about', 'while', 'thence', 'when', 'hereafter', 'wherein', '’re', 'anyhow', 'others', 'amongst', 'beside', 'their', 'eight', 'used', 'two', 'beyond', 'something', 'your', 'will', 'i', 'seeming', 'herein', 'becoming', 'herself', 'do', 'all', 'still', 'within', 'by', 'than', 'through', 'whereupon', 'not', 'least', 'us', 'them', 'see', 'into', 'so', 'as', 'either', 'you', 'me', 'whereby', '’d', 'rather', 'full', 'various', '’s', 'can', 'throughout', 'nothing', 'above', 'against', 'well', '‘d', 'nine', 'does', 'serious', 'other', 

In [54]:
def removeStopWords(input_str) :
    outputArray = []
    doc = model(input_str)
    stopWords = list(spacy.lang.en.stop_words.STOP_WORDS)
    for token in doc :
        token = str(token)
        if token not in stopWords :
            outputArray.append(token)
    return ' '.join(outputArray)

In [55]:
print(removeStopWords(text))

  The Republican president challenged Democratic 
 Party nominee Joe Biden , best known Barack Obama vice - president 
 US politics 1970s . As election day approaches , pollingcompanies trying gauge 
 mood nation asking voters candidate prefer .


In [47]:
for token in doc : 
    if token.is_stop :
        print(token)

The
is
being
by
who
is
as
’s
but
has
been
in
US
since
the
As
will
be
to
the
of
the
by
which
they


In [50]:
for token in doc : 
    if token.is_punct :
        print(token)

,
-
.
,
.


In [64]:
def cleanText(input_str) :
    outputArray = []
    doc = model(input_str)
    for token in doc :
        if not token.is_stop and not token.is_punct :
            outputArray.append(str(token))
    return ' '.join(outputArray)

In [67]:
print(cleanText(text))

  Republican president challenged Democratic 
 Party nominee Joe Biden best known Barack Obama vice president 
 politics 1970s election day approaches pollingcompanies trying gauge 
 mood nation asking voters candidate prefer


In [71]:
for token in doc : 
    print(token, '---------->', token.lemma_)

  ---------->  
The ----------> the
Republican ----------> republican
president ----------> president
is ----------> be
being ----------> be
challenged ----------> challenge
by ----------> by
Democratic ----------> Democratic

 ----------> 

Party ----------> Party
nominee ----------> nominee
Joe ----------> Joe
Biden ----------> Biden
, ----------> ,
who ----------> who
is ----------> be
best ----------> well
known ----------> know
as ----------> as
Barack ----------> Barack
Obama ----------> Obama
’s ----------> ’s
vice ----------> vice
- ----------> -
president ----------> president
but ----------> but
has ----------> have
been ----------> be
in ----------> in

 ----------> 

US ----------> US
politics ----------> politic
since ----------> since
the ----------> the
1970s ----------> 1970
. ----------> .
As ----------> as
election ----------> election
day ----------> day
approaches ----------> approach
, ----------> ,
pollingcompanies ----------> pollingcompanie
will ----------> will

In [72]:
for token in doc :
    print(token, '------------>', token.tag_, '------------>', token.pos_, '------------>', spacy.explain(token.tag_))

  ------------> _SP ------------> SPACE ------------> whitespace
The ------------> DT ------------> DET ------------> determiner
Republican ------------> JJ ------------> ADJ ------------> adjective (English), other noun-modifier (Chinese)
president ------------> NN ------------> NOUN ------------> noun, singular or mass
is ------------> VBZ ------------> AUX ------------> verb, 3rd person singular present
being ------------> VBG ------------> AUX ------------> verb, gerund or present participle
challenged ------------> VBN ------------> VERB ------------> verb, past participle
by ------------> IN ------------> ADP ------------> conjunction, subordinating or preposition
Democratic ------------> NNP ------------> PROPN ------------> noun, proper singular

 ------------> _SP ------------> SPACE ------------> whitespace
Party ------------> NNP ------------> PROPN ------------> noun, proper singular
nominee ------------> NN ------------> NOUN ------------> noun, singular or mass
Joe ------