In [4]:
# spaCy splits text into tokens and assigns each one a part of speech. E.g. in the sentence below, spaCy tags "barry" and "allen" as proper nouns (PROPN).
# It also keeps "U.S." together as a single token.
# "$6" is split into two separate tokens ("$" and "6"); the parser then marks "$" as a quantifier modifier (quantmod).
# The pipeline object is obtained through the spacy package imported here.
import spacy

In [5]:
# Loading a model is the next step after the import; nlp is the object later called on raw text.
nlp = spacy.load('en_core_web_sm')

In [6]:
# doc holds the processed text; iterating over it yields the individual tokens.
doc = nlp(u'Hello barry allen. You=can=run at speed of 2.7 mach who lives in U.S. . Valuation of tesla is $6 billion')

In [7]:
# Print one row per token: text, numeric POS id, POS name and dependency label.
for token in doc:
    # .text  -> the token's own text (the splitting into tokens already happened in nlp)
    # .pos   -> numeric id of the part of speech (adverb, pronoun, ...)
    # .pos_  -> readable name of that part of speech, e.g. ADV or PRON
    # .dep_  -> syntactic dependency label of the token
    # {value:{10}} pads the value to a minimum width of 10; longer values are not cut.
    print(f"{token.text:{10}} {token.pos:{5}} {token.pos_:{10}} {token.dep_:{5}}")

# The second loop demonstrates alignment and fill in format specs:
# '>' right-aligns the value inside the field, e.g. {token.pos_:>{10}}.
# A character placed before '>' is the fill character, so {token.pos_:.>{10}}
# right-aligns and fills the unused width with '.'.
print("*"*100)    
for token in doc:    
    print(f"Text is {token.text:.>{10}} POS number is {token.pos:.>{5}} POS value is {token.pos_:.>{10}}")
   

Hello         91 INTJ       compound
barry         96 PROPN      compound
allen         96 PROPN      ROOT 
.             97 PUNCT      punct
You           95 PRON       nsubj
=             97 PUNCT      punct
can          100 VERB       dep  
=             97 PUNCT      punct
run          100 VERB       ROOT 
at            85 ADP        prep 
speed         92 NOUN       pobj 
of            85 ADP        prep 
2.7           93 NUM        nummod
mach          90 DET        pobj 
who           95 PRON       nsubj
lives        100 VERB       relcl
in            85 ADP        prep 
U.S.          96 PROPN      pobj 
.             97 PUNCT      punct
Valuation     92 NOUN       nsubj
of            85 ADP        prep 
tesla         96 PROPN      pobj 
is            87 AUX        ROOT 
$             99 SYM        quantmod
6             93 NUM        compound
billion       93 NUM        attr 
****************************************************************************************************
Te

In [8]:
# When nlp is called, the text is broken down and then a series of components runs on it:
# tagging, parsing and NER (named entity recognition).
# nlp.pipeline lists those components as (name, component) pairs.
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x2c6984fe3c8>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x2c6983ca5e8>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x2c6983ca6a8>)]

In [9]:
# A doc can be indexed like a list: doc[0] is the first token, and .pos is its numeric POS id.
doc[0].pos

91

In [10]:
# A longer multi-sentence example text.
# NOTE(review): the backslash line continuations keep the indentation of the following
# line inside the string, so the text contains runs of spaces (the token table printed
# from doc2 shows a SPACE token after "life") — confirm this is intended.
doc2 = nlp(u"Hello world. I didn't know what was the value of life, until i experience failure. I need to understand that life \
           may be a word as bitch, still we need to know the value of it. I have lived for 28 years, yet i need \
           to understand value of it. Valueation of a life depends on what you have made for urself. Or is it? \
           'Chose wisely live well'\
           it mainly is calculated by relations you've made and broken. so ")

In [11]:
# Same table as for doc: token text, numeric POS id, POS name and dependency label.
for token in doc2:
    print(f"{token.text:{10}} {token.pos:{5}} {token.pos_:{10}} {token.dep_:{5}}")

Hello         91 INTJ       intj 
world         92 NOUN       ROOT 
.             97 PUNCT      punct
I             95 PRON       nsubj
did           87 AUX        aux  
n't           94 PART       neg  
know         100 VERB       ROOT 
what          95 PRON       attr 
was           87 AUX        ccomp
the           90 DET        det  
value         92 NOUN       nsubj
of            85 ADP        prep 
life          92 NOUN       pobj 
,             97 PUNCT      punct
until         85 ADP        mark 
i             95 PRON       nsubj
experience   100 VERB       advcl
failure       92 NOUN       dobj 
.             97 PUNCT      punct
I             95 PRON       nsubj
need         100 VERB       ccomp
to            94 PART       aux  
understand   100 VERB       xcomp
that          90 DET        det  
life          92 NOUN       nsubj
              103 SPACE           
may          100 VERB       aux  
be            87 AUX        ccomp
a             90 DET        det  
word         

In [12]:
# Suppose we want to extract a quote, i.e. the part written inside the double quotes "...".
doc3 = nlp(u'WE are men of honor. As said "MEN DIE WITH HONOR"')
    

In [13]:
# Print every token of doc3 with its index so the positions needed for a slice are visible.
# enumerate() follows the document's actual length; a hard-coded range(14) would raise
# IndexError for a shorter text and silently skip tokens of a longer one.
for i, token in enumerate(doc3):
    print(f"Text \'{token}\' Index {i}")


Text 'WE' Index 0
Text 'are' Index 1
Text 'men' Index 2
Text 'of' Index 3
Text 'honor' Index 4
Text '.' Index 5
Text 'As' Index 6
Text 'said' Index 7
Text '"' Index 8
Text 'MEN' Index 9
Text 'DIE' Index 10
Text 'WITH' Index 11
Text 'HONOR' Index 12
Text '"' Index 13


In [14]:
# Slicing a document: tokens 8..13 are the quoted part, including both quote marks.
# The result of a slice is a Span, as the printed type shows.
lii = doc3[8:14]
print(lii)
print(type(lii))

"MEN DIE WITH HONOR"
<class 'spacy.tokens.span.Span'>


In [15]:
# Accessing an individual token by its index.
print(doc3[1])

are


In [16]:
# Demo of how spaCy separates the sentences of a paragraph.
# Iterating over doc4.sents yields the sentences one after another.
# NOTE(review): the pipeline listed by nlp.pipeline contains tagger, parser and ner only,
# so the boundaries presumably come from the parser rather than a plain
# "period followed by space" rule — confirm against the spaCy documentation.

doc4 = nlp(u"We are looking at first sentence. Followed by second sentence. And lastly the third.")
for sent in doc4.sents:
    print(sent)

We are looking at first sentence.
Followed by second sentence.
And lastly the third.


In [17]:
# .is_sent_start tells whether a token starts a sentence: it is True for a
# sentence-initial token; otherwise the cell displays nothing.

doc4[7].is_sent_start

True

In [18]:
# NOTE(review): this indexes `doc` (the first example text), not `doc4` from the
# previous cell — confirm which document was intended.
# The cell shows no output, i.e. the token is not reported as a sentence start.
doc[8].is_sent_start

In [19]:
# The named entities of a document are available through .ents.
# The contraction is written as a plain "We're": the former "We\ 're" was an invalid
# escape sequence (a warning on recent Python versions) and put a literal backslash
# into the analysed text.
doc5 = nlp(
    u"India is land opportunities. The Great Bengal tiger is also found mostly here. "
    u"India ranks second in population. We're ruler of ayurveda but humble to gift it. "
    u"130 crore is the population. $6 trillion is target."
)
# len() of a document object is its length in tokens.
print(len(doc5))
for entity in doc5.ents:
    print(entity)                               # entity text
    print(entity.label_)                        # entity type, e.g. GPE
    print(str(spacy.explain(entity.label_)))    # human-readable description of that type
    print("\n")

44
India
GPE
Countries, cities, states


Great Bengal tiger
PERSON
People, including fictional


India
GPE
Countries, cities, states


second
ORDINAL
"first", "second", etc.


130
CARDINAL
Numerals that do not fall under another type


$6 trillion
MONEY
Monetary values, including unit




In [20]:
# spaCy tells the '.' inside an e-mail address (or in "U.S.") apart from a sentence-ending
# full stop: the token list printed below keeps "hshsh@gmail.com" and "U.S." in one piece.
eml = "hshsh@gmail.com. we need this dude. Man has enough to lose. Eighteen Secret service. U.S."
h = nlp(eml)

# First the recognised entities with their labels ...
for ent in h.ents:
    print(ent)
    print(ent.label_)

print("*%*" * 40)

# ... then every token of the text.
for tok in h:
    print(tok)



Eighteen
CARDINAL
U.S.
GPE
*%**%**%**%**%**%**%**%**%**%**%**%**%**%**%**%**%**%**%**%**%**%**%**%**%**%**%**%**%**%**%**%**%**%**%**%**%**%**%**%*
hshsh@gmail.com
.
we
need
this
dude
.
Man
has
enough
to
lose
.
Eighteen
Secret
service
.
U.S.


In [21]:
# Noun chunks: iterate over the noun phrases spaCy finds in h.
for chunk in h.noun_chunks:
    print(chunk)

we
this dude
Man
Eighteen Secret service
U.S.


In [22]:
from spacy import displacy


In [23]:
# NOTE: this rebinds `doc`, replacing the first example document; the cells after this
# one that use `doc` work on this text.
doc = nlp(u"Indian airforce will get AMCA i.e. fifth generation fighter aircraft by 2027. Hypersonic misiles will also \
be ready by 2025. ")

In [24]:
# Show every token next to the name of its part of speech.
for tok in doc:
    print(tok.text, tok.pos_)

Indian ADJ
airforce NOUN
will VERB
get AUX
AMCA PROPN
i.e. X
fifth ADJ
generation NOUN
fighter NOUN
aircraft NOUN
by ADP
2027 NUM
. PUNCT
Hypersonic ADJ
misiles NOUN
will VERB
also ADV
be AUX
ready ADJ
by ADP
2025 NUM
. PUNCT


In [25]:
# For each named entity: its text, its label and a description of the label.
for ent in doc.ents:
    print(ent)
    print(ent.label_)
    print(spacy.explain(ent.label_))
    print("\n")

Indian
NORP
Nationalities or religious or political groups


fifth
ORDINAL
"first", "second", etc.


2027
DATE
Absolute or relative dates or periods


Hypersonic
GPE
Countries, cities, states


2025
DATE
Absolute or relative dates or periods




In [26]:
# displacy is spaCy's visualization helper.
# style='dep' selects the dependency view; jupyter=True asks for rendering inside the notebook.
# NOTE(review): 'distance' presumably controls the spacing between words — confirm in the displacy docs.


displacy.render(doc,style='dep',jupyter=True,options={'distance':110})

In [27]:
# Same document rendered with style='ent', the named-entity view.
displacy.render(doc,style='ent',jupyter=True)

In [None]:
# Serve the visualization from a web server instead of rendering it inline
# (the recorded output reports "Serving on http://0.0.0.0:5000").
# NOTE(review): this cell has no execution count; presumably the call keeps running
# until it is interrupted, which would hold up the cells below — confirm.
displacy.serve(doc,style='dep')




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...



In [None]:
# Stemming: e.g. "boat" is the stem, and "boating" and "boats" both convey "boat". Stemming is a word-reduction technique used to reach that stem.

import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer


In [None]:
# The Porter stemmer and the Snowball stemmer are two stemming techniques; Snowball is the one mostly used.
# SnowballStemmer is created for a specific language, here 'english'.
p_stemmer = PorterStemmer()
s_stemmer = SnowballStemmer('english')

In [None]:
# Sample words fed to both stemmers in the following cells.
words = ['cricket','crickets','cricketer','balling','ball','bowled','bowler','engine','fly']

In [None]:
# Porter stemmer: print each word followed by its stem.
for term in words:
    stemmed = p_stemmer.stem(term)
    print(term + '-------->' + stemmed)

In [None]:
# Snowball stemmer: print each word followed by its stem.
for term in words:
    stemmed = s_stemmer.stem(term)
    print(term + '--------->' + stemmed)

In [None]:
# Helper for displaying the lemma attributes in aligned columns.
#   .lemma   is an integer hash value identifying the lemma (the long numbers in the output)
#   .lemma_  is the lemmatized value itself, as text

def show_lemma(text):
    """Print one row per token: text, POS name, lemma hash and lemma string.

    text: an iterable of tokens (e.g. a document returned by nlp); every item must
          provide the attributes .text, .pos_, .lemma and .lemma_.
    Returns None; the rows are written to standard output.
    """
    for token in text:
        # Text and POS are padded to 10 columns; the hash (22) and the lemma (10)
        # are right-aligned and filled with '.'.
        row = f' {token.text:{10}} {token.pos_:{10}}  {token.lemma:.>{22}} {token.lemma_:.>{10}}'
        print(row)
                                                                       

In [22]:
# First example text for the lemma table.
doc1 = nlp(u'We will rock you is the best song sung by beatles. Footballers play football on the ground. ')

In [41]:
# Print the lemma table for doc1.
show_lemma(doc1)

 We         PRON        ....561228191312463089 ....-PRON-
 will       VERB        ..18307573501153647118 ......will
 rock       VERB        ..18229060090064749881 ......rock
 you        PRON        ....561228191312463089 ....-PRON-
 is         AUX         ..10382539506755952630 ........be
 the        DET         ...7425985699627899538 .......the
 best       ADJ         ...5711639017775284443 ......good
 song       NOUN        ...9688994450605453460 ......song
 sung       VERB        ..11916416841380526164 ......sing
 by         ADP         ..16764210730586636600 ........by
 beatles    NOUN        ..12651218547021316082 ....beatle
 .          PUNCT       ..12646065887601541794 ..........
 Footballers NOUN        ...4560292878538202511 footballer
 play       VERB        ...8228585124152053988 ......play
 football   NOUN        ...1941715343824527815 ..football
 on         ADP         ...5640369432778651323 ........on
 the        DET         ...7425985699627899538 .......the
 ground     N

In [34]:
# Second example text for the lemma table.
# NOTE: this rebinds `doc2`, which held a different text earlier in the notebook.
doc2 = nlp(u'She sells sea shells on the sea shore. cricketing is an art aced by cricketers playing the game of cricket.')

In [42]:
# Print the lemma table for doc2.
show_lemma(doc2)

 She        PRON        ....561228191312463089 ....-PRON-
 sells      VERB        ...8777643931089885836 ......sell
 sea        NOUN        ...3315219263181363146 .......sea
 shells     NOUN        ...9176892150852730065 .....shell
 on         ADP         ...5640369432778651323 ........on
 the        DET         ...7425985699627899538 .......the
 sea        NOUN        ...3315219263181363146 .......sea
 shore      NOUN        ..18010347307423029077 .....shore
 .          PUNCT       ..12646065887601541794 ..........
 cricketing NOUN        ...9786736738525211029 cricketing
 is         AUX         ..10382539506755952630 ........be
 an         DET         ..15099054000809333061 ........an
 art        NOUN        ...3460763187682215441 .......art
 aced       VERB        ..14181857182073839252 .......ace
 by         ADP         ..16764210730586636600 ........by
 cricketers NOUN        ...8572882414559619093 .cricketer
 playing    VERB        ...8228585124152053988 ......play
 the        DE

In [43]:
# Stop words: words that do not add much value to a text, e.g. "is", "a", "the".
# This demo shows the set of stop words; the following cells add a word to the set
# and remove a word from it.

print(nlp.Defaults.stop_words)

{'onto', 'hereafter', 'me', 'wherever', "'ll", 'they', 'move', 'could', 'fifteen', 'upon', 'twenty', 'however', 'which', 'empty', 'noone', 'per', '’s', 'should', 'perhaps', 'everywhere', 'anyhow', 'than', 'whole', 'three', 'i', 'whereupon', 'was', 'under', 'do', 'on', 'these', 'while', 'anyone', 'themselves', 'hereupon', 'beside', 'had', 'behind', 'never', 'there', 'up', 'mine', '’m', 'various', 'afterwards', 'anything', 'keep', 're', 'elsewhere', 'also', 'those', 'will', 'hundred', 'have', 'whom', 'from', 'among', 'without', 'becoming', 'most', 'go', 'out', 'any', 'when', 'then', 'own', 'where', 'whereafter', 'not', 'both', 'sixty', 'whence', '‘re', 'his', 'seems', 'please', 'top', 'herself', 'several', 'too', 'besides', 'another', 'due', 'wherein', 'becomes', 'being', 'really', 'somewhere', 'are', 'fifty', 'forty', 'she', "'ve", 'very', '’ll', 'latterly', 'myself', 'therein', 'thence', 'n’t', 'give', 'itself', 'seemed', 'is', 'same', 'my', 'our', 'moreover', 'side', 'thereafter', '‘m

In [44]:
# .is_stop gives a boolean telling whether the word counts as a stop word.
nlp.vocab['the'].is_stop

True

In [46]:
# Adding a new stop word is a two-step process:
#   1. add it to the set of default stop words
#   2. set the is_stop flag on its vocabulary entry

nlp.Defaults.stop_words.add('btw')
nlp.vocab['btw'].is_stop = True

In [48]:
# Check whether the word has been added.
nlp.vocab['btw'].is_stop

True

In [49]:
# Removing a stop word mirrors adding one: take it out of the default set and
# clear the is_stop flag on its vocabulary entry.
# discard() is used instead of remove() so the cell can be run again: remove()
# raises KeyError once 'beyond' is no longer in the set.

nlp.Defaults.stop_words.discard('beyond')
nlp.vocab['beyond'].is_stop = False

In [50]:
# Check whether the word has been removed.
nlp.vocab['beyond'].is_stop

False

In [51]:
#phrase and vocabulary matching

from spacy.matcher import Matcher
# The matcher is created on top of the pipeline's vocabulary.
m_tool = Matcher(nlp.vocab)

In [52]:
# Patterns we want to be matched. Each pattern is a list with one dict per token.
# p1: one token whose lowercase form is 'quickbrownfox'
# p2: 'quick', a punctuation token, 'brown', a punctuation token, 'fox'
# p3: the three tokens 'quick', 'brown', 'fox'
# p4: 'quick' followed by 'brownfox'
p1 = [{'LOWER':'quickbrownfox'}]
p2 = [{'LOWER': 'quick'}, {'IS_PUNCT': True}, {'LOWER': 'brown'}, {'IS_PUNCT': True}, {'LOWER': 'fox'}]
p3 = [{'LOWER': 'quick'}, {'LOWER': 'brown'}, {'LOWER': 'fox'}]
p4 = [{'LOWER': 'quick'}, {'LOWER': 'brownfox'}]

In [53]:
# Register the four patterns under one name.
# NOTE(review): the None argument is presumably the optional match callback of this
# add() signature — confirm against the installed spaCy version.
m_tool.add('QBF', None, p1, p2, p3, p4)
# 'QBF' is the name given to this set of patterns; any name can be chosen.

In [54]:
# Test text containing the phrase in four spellings: hyphenated, spaced, fused and half-fused.
sentence = nlp(u'The quick-brown-fox jumps over the lazy dog. The quick brown fox eats well. \
               the quickbrownfox is dead. the dog misses the quick brownfox')

In [55]:
# Run the matcher on the text. Each result is a tuple (match id, start token index, end token index).
phrase_matches = m_tool(sentence)
print(phrase_matches )

[(12825528024649263697, 1, 6), (12825528024649263697, 13, 16), (12825528024649263697, 21, 22), (12825528024649263697, 29, 31)]


In [112]:
# Show every match in readable form: id, pattern name, token range and matched text.
# nlp.vocab.strings turns the numeric match id back into the name ('QBF').
# NOTE(review): the stored output of this cell lists two matches although the
# four-pattern run printed four; the cell was presumably executed after
# phrase_matches had been recomputed — re-run the notebook in order to confirm.
for match_id, start, end in phrase_matches:
    print(match_id, nlp.vocab.strings[match_id], start, end, sentence[start:end].text)
    

12825528024649263697 QBF 1 6 quick-brown-fox
12825528024649263697 QBF 13 16 quick brown fox


In [69]:
# Replace the four patterns by a single one.
# 'OP':'*' makes the punctuation token optional and repeatable: the stored outputs
# show matches for "quick brown fox" as well as "Quick--brown-fox" and "quick---brown---fox".
m_tool.remove('QBF')
p5 = [{'LOWER':'quick'},{'IS_PUNCT':True, 'OP':'*'},{'LOWER':'brown'},{'IS_PUNCT':True,'OP':'*'},{'LOWER':'fox'}]
m_tool.add('QBF',None,p5)

In [67]:
# Match the same text again; this overwrites the earlier phrase_matches.
phrase_matches = m_tool(sentence)
print(phrase_matches )

[(12825528024649263697, 1, 6), (12825528024649263697, 13, 16)]


In [79]:
# A text with repeated punctuation between the words, to try the pattern on.
sent =nlp(u'The Quick--brown-fox jumped over quick---brown---fox')
p_m = m_tool(sent)
print(p_m)

[(12825528024649263697, 1, 6), (12825528024649263697, 8, 13)]


In [107]:
# Readable listing of the matches in `sent`: id, pattern name, token range and matched text.
for match_id, start, end in p_m:
    print(match_id, nlp.vocab.strings[match_id], start, end, sent[start:end].text)

12825528024649263697 QBF 1 6 Quick--brown-fox
12825528024649263697 QBF 8 13 quick---brown---fox


In [82]:
#using phrasematcher
from spacy.matcher import PhraseMatcher


In [113]:
# The phrase matcher is also built on the pipeline's vocabulary.
matcher = PhraseMatcher(nlp.vocab)

In [114]:
# Text in which the phrases are searched.
docs = nlp(u'Hello word is the first code that every new programmer learns. Pyhton has many datastructure. Lists are \
like arrays. Dictionaries are like linked-list having key-value pair. tuples contain values which are unique alomsot like lists.')

In [129]:
# Phrases to look for, given as plain strings.
phrase_list = ['linked-list','Dictionaries']

In [130]:
# Every list item is converted into a doc by passing it through nlp.
phrase_pattern = list(map(nlp, phrase_list))

In [131]:
# Register all phrase docs under the name 'Pds'; the list is unpacked into separate arguments.
# NOTE(review): None is presumably the optional match callback of this add() signature — confirm.
matcher.add('Pds',None,*phrase_pattern)

In [132]:
# Run the phrase matcher; k holds the (match id, start, end) tuples.
k = matcher(docs)


In [133]:
# Readable listing of the phrase matches: id, pattern name, token range and matched text.
# The loop variable is called match_id (as in the Matcher cells) instead of `id`:
# a top-level loop variable named `id` would shadow the builtin id() for every
# cell executed afterwards.
for match_id, start, end in k:
    string_id = nlp.vocab.strings[match_id]
    span = docs[start:end]
    print(match_id, string_id, start, end, span.text)

8369155576802232941 Pds 22 23 Dictionaries
8369155576802232941 Pds 25 28 linked-list


In [134]:
# Display the raw match tuples.
k


[(8369155576802232941, 22, 23), (8369155576802232941, 25, 28)]

In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors.
# NOTE(security): hostname checking and certificate verification are switched off on
# purpose so that pages with broken certificates can still be fetched. The response
# is therefore not authenticated; do not reuse this context for anything sensitive.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input('Enter - ')
# read() returns the document to parse as bytes; the with-block closes the
# connection even when reading fails.
with urlopen(url, context=ctx) as response:
    html = response.read()
soup = BeautifulSoup(html, "html.parser") # soup is an object returned by BeautifulSoup

# Retrieve all of the anchor tags
tags = soup('a')
for tag in tags:
    # Look at the parts of a tag
    print('TAG:', tag)
    print('URL:', tag.get('href', None))
    # An anchor without any content has an empty contents list; indexing [0]
    # unconditionally would raise IndexError and stop the loop.
    print('Contents:', tag.contents[0] if tag.contents else None)
    print('Attrs:', tag.attrs)


Enter - https://www.google.com/url?q=http://www.mca.gov.in/mcafoportal/viewCompanyMasterData.do&sa=D&ust=1605683930759000&usg=AFQjCNGAfOvLNDDLgwFfdGFP7MZU4QWrpQ


In [None]:
from sklearn. import 