In [1]:
# Installing Libraries
!pip install spacy
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.2.0/en_core_web_md-3.2.0-py3-none-any.whl (45.7 MB)
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_md')


In [2]:
import spacy

In [3]:
nlp = spacy.load('en_core_web_md')
text = """ The Republican president is being challenged by Democratic Party nominee Joe Biden, who is best known as 
           Barack Obama’s vice-president but has been in US politics since the 1970s. As election day approaches, 
           pollingcompanies will be trying to gauge the mood of the nation by asking voters which candidate they prefer."""
doc = nlp(text)
print(type(doc))

<class 'spacy.tokens.doc.Doc'>


In [4]:
# sentence tokeinization
list(doc.sents)

[ The Republican president is being challenged by Democratic Party nominee Joe Biden, who is best known as 
            Barack Obama’s vice-president but has been in US politics since the 1970s.,
 As election day approaches, 
            pollingcompanies will be trying to gauge the mood of the nation by asking voters which candidate they prefer.]

In [5]:
len(doc)

61

In [6]:
# word tokenisation
for token in doc:
    print(token)

 
The
Republican
president
is
being
challenged
by
Democratic
Party
nominee
Joe
Biden
,
who
is
best
known
as

           
Barack
Obama
’s
vice
-
president
but
has
been
in
US
politics
since
the
1970s
.
As
election
day
approaches
,

           
pollingcompanies
will
be
trying
to
gauge
the
mood
of
the
nation
by
asking
voters
which
candidate
they
prefer
.


In [7]:
stopwords = spacy.lang.en.stop_words.STOP_WORDS
len(stopwords)

326

In [8]:
" ".join([token.text for token in doc if token.text not in stopwords])    

'  The Republican president challenged Democratic Party nominee Joe Biden , best known \n            Barack Obama vice - president US politics 1970s . As election day approaches , \n            pollingcompanies trying gauge mood nation asking voters candidate prefer .'

In [9]:
class Text_Preprocessing: 
    
    def __init__(self): # constructor
        self.nlp = spacy.load('en_core_web_md')
    
    def preprocessing(self, text):
        doc = self.nlp(text)
        clean_data = []
        for token in doc:
            if not token.is_stop:
                if not token.is_punct:
                    clean_data.append(token.lemma_)
                    
        return " ".join(clean_data).strip()

In [10]:
from spacy_preprocessing import Text_Preprocessing

In [11]:
pre = Text_Preprocessing()
text = """ The Republican president is being challenged by Democratic Party nominee Joe Biden, who is best known as Barack Obama’s vice-president but has been in US politics since the 1970s. As election day approaches, pollingcompanies will be trying to gauge the mood of the nation by asking voters which candidate they prefer."""
pre.preprocessing(text)

'republican president challenge Democratic Party nominee Joe Biden well know Barack Obama vice president politic 1970 election day approach pollingcompanie try gauge mood nation ask voter candidate prefer'

In [12]:
text = 'hi zeeshan@gmail.com has account on www.fb.com with phone number 789'
nlp = spacy.load('en_core_web_md')
doc = nlp(text)
for token in doc:
    if token.like_email:
        print(token)

zeeshan@gmail.com


In [13]:
# Visualtion
from spacy import displacy
displacy.render(doc , style = 'ent')

In [14]:
doc = nlp('good bad')
for token in doc:
    print(f'{token}---->{token.vector}')

good---->[-0.42625    0.4431    -0.34517   -0.1326    -0.05816    0.052598
  0.21575   -0.36721   -0.04519    2.2444    -0.29089    0.1667
 -0.052051   0.15964   -0.42759   -0.11147   -0.14951    1.18
 -0.19603    0.15592   -0.06112   -0.011576   0.26849   -0.30175
 -0.055796   0.12116    0.010542  -0.18065    0.23281   -0.26367
  0.11032    0.06216    0.015019  -0.10687    0.098486   0.048457
  0.33355   -0.16177   -0.28503   -0.28655   -0.11245    0.12417
 -0.24975   -0.2008     0.26034    0.25208   -0.17841    0.15395
 -0.19799   -0.22644   -0.074088   0.50289    0.32105   -0.034766
  0.16543    0.057095  -0.20973    0.098376   0.035058  -0.023057
 -0.11736   -0.51327   -0.020999   0.39962    0.30533   -0.38839
  0.0026097  0.29022    0.017045   0.063961   0.10789    0.29013
  0.061732   0.068231  -0.014031   0.048649  -0.011663  -0.26527
 -0.14494    0.45397    0.067191   0.14195    0.37302   -0.0076579
  0.018443  -0.10224   -0.0051647 -0.12233    0.25855   -0.11212
 -0.053952  -0

In [15]:
# Similirty between two words
text = 'lion'
text_1 = 'tiger';
doc = nlp(text)
doc_1 = nlp(text_1)
doc.similarity(doc_1)

0.7359829457249657