# NLP Basics

# Stemming
with NLTK

In [1]:
# Import the toolkit and the full Porter Stemmer library
import nltk

In [2]:
from nltk.stem.porter import *
p_stemmer = PorterStemmer()
words = ['quickly', 'jumps', 'minimally', 'better']

for word in words:
    print(word + '  -->   ' + p_stemmer.stem(word))

quickly  -->   quickli
jumps  -->   jump
minimally  -->   minim
better  -->   better


In [3]:
from nltk.stem.snowball import SnowballStemmer

s_stemmer = SnowballStemmer(language='english')
for word in words:
    print(word + '  -->   ' + s_stemmer.stem(word))

quickly  -->   quick
jumps  -->   jump
minimally  -->   minim
better  -->   better


# spaCy Basics 
tokenisation, pos-tagging, dep-parsing, sentence segmentation, stop words

In [4]:
import spacy

# load english language model
nlp = spacy.load('en_core_web_sm')

# check pipeline
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x7fd3b2312860>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x7fd3b2168d08>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7fd3b2168d68>)]

In [5]:
# pipeline component names
nlp.pipe_names

['tagger', 'parser', 'ner']

In [6]:
# create a Doc object, pass it a unicode string
doc = nlp(u'Instant coffee with slightly sour cream\nin it, and a phone call to the beyond\nwhich doesn’t seem to be' + 
          'coming any nearer.\n“Ah daddy, I wanna stay drunk many days”\non the poetry of a new friend\n' + 
          'my life held precariously in the seeing\nhands of others, their and my impossibilities.' + 
          '\nIs this love, now that the first love\nhas finally died, where there were no impossibilities?')

In [7]:
type(doc)

spacy.tokens.doc.Doc

In [8]:
# tokenisation and dependency parsing 
# print each token with pos and dep type
for token in doc[0:7]:
    print(token.text, token.pos_, token.dep_)

Instant ADJ amod
coffee NOUN ROOT
with ADP prep
slightly ADV advmod
sour ADJ amod
cream NOUN pobj

 SPACE 


In [9]:
# number of tokens in the Doc object
len(doc)

86

In [10]:
len(doc.vocab)

529

In [11]:
# access tokens by index
doc[10], doc[10].text

(and, 'and')

In [12]:
# access some tokens
doc[10:13]

and a phone

In [13]:
# access last 5 tokens
doc[-5:]

there were no impossibilities?

In [14]:
# tokens cannot be reassigned -- TypeError: 'spacy.tokens.doc.Doc' object does not support item assignment
#doc[0] = "Ground"

In [15]:
# pos-tags have special ids
doc[1].text, doc[1].pos, doc[1].pos_

('coffee', 92, 'NOUN')

In [16]:
# as do dep-tags
doc[0].text, doc[0].dep, doc[0].dep_

('Instant', 402, 'amod')

In [17]:
# to see full name of a tag, use spacy.explain()
spacy.explain('amod')

'adjectival modifier'

In [18]:
spacy.explain('ADJ')

'adjective'

In [19]:
# other token attributes
# simple  and detailed pos-tags
doc[21].text, doc[21].pos_, doc[21].tag_, spacy.explain(doc[21].tag_)

('seem', 'VERB', 'VB', 'verb, base form')

In [20]:
# is the token a stop word
doc[21].is_stop 

True

In [21]:
# shape of the word form
doc[0].text, doc[0].shape_

('Instant', 'Xxxxx')

In [22]:
# lemmatisation
doc[25].text, doc[25].lemma_

('nearer', 'nearer')

In [23]:
# count POS frequencies
POS_counts = doc.count_by(spacy.attrs.POS)
POS_counts

{84: 6,
 92: 18,
 85: 8,
 86: 5,
 103: 8,
 95: 3,
 97: 10,
 89: 2,
 90: 13,
 87: 4,
 94: 1,
 100: 6,
 91: 1,
 98: 1}

In [24]:
# find out what pos belongs to an id
doc.vocab[90].text

'DET'

In [25]:
# create frequency list for POS-tags
for k,v in sorted(POS_counts.items()):
    print(f'{k:{5}}: {doc.vocab[k].text:{5}} -- {v}')

   84: ADJ   -- 6
   85: ADP   -- 8
   86: ADV   -- 5
   87: AUX   -- 4
   89: CCONJ -- 2
   90: DET   -- 13
   91: INTJ  -- 1
   92: NOUN  -- 18
   94: PART  -- 1
   95: PRON  -- 3
   97: PUNCT -- 10
   98: SCONJ -- 1
  100: VERB  -- 6
  103: SPACE -- 8


In [26]:
# create frequency list for FINE-GRAINED POS-tags
TAG_counts = doc.count_by(spacy.attrs.TAG)
TAG_counts

for k,v in sorted(TAG_counts.items()):
    print(f'{k:{20}}: {doc.vocab[k].text:{4}} ({spacy.explain(doc.vocab[k].text)}) {v:->{4}}')

  164681854541413346: RB   (adverb) ---5
  783433942507015291: NNS  (noun, plural) ---5
 1292078113972184607: IN   (conjunction, subordinating or preposition) ---9
 1534113631682161808: VBG  (verb, gerund or present participle) ---1
 2593208677638477497: ,    (punctuation mark, comma) ---5
 3252815442139690129: UH   (interjection) ---1
 3822385049556375858: VBN  (verb, past participle) ---2
 4062917326063685704: PRP$ (pronoun, possessive) ---3
 4969857429396651903: ``   (opening quotation mark) ---1
 6893682062797376370: _SP  (None) ---8
 9188597074677201817: VBP  (verb, non-3rd person singular present) ---1
10554686591937588953: JJ   (adjective) ---6
12646065887601541794: .    (punctuation mark, sentence closer) ---3
13656873538139661788: PRP  (pronoun, personal) ---2
13927759927860985106: VBZ  (verb, 3rd person singular present) ---3
14143520107006108953: ''   (closing quotation mark) ---1
14200088355797579614: VB   (verb, base form) ---1
15267657372422890137: DT   (determiner) ---9


In [27]:
# create freqiency list for dependencies
DEP_counts = doc.count_by(spacy.attrs.DEP)
DEP_counts

for k,v in sorted(DEP_counts.items()):
    print(f'{k:{9}}: {doc.vocab[k].text:{8}} ({spacy.explain(doc.vocab[k].text)}) {v:->{4}}')

        0:          (None) ---8
      398: acomp    (adjectival complement) ---1
      399: advcl    (adverbial clause modifier) ---1
      400: advmod   (adverbial modifier) ---5
      402: amod     (adjectival modifier) ---5
      404: attr     (attribute) ---3
      405: aux      (auxiliary) ---2
      407: cc       (coordinating conjunction) ---2
      408: ccomp    (clausal complement) ---1
      410: conj     (conjunct) ---3
      415: det      (determiner) ---9
      416: dobj     (direct object) ---1
      417: expl     (expletive) ---1
      421: intj     (interjection) ---2
      423: mark     (marker) ---1
      425: neg      (negation modifier) ---1
      428: npadvmod (noun phrase as adverbial modifier) ---1
      429: nsubj    (nominal subject) ---5
      438: pcomp    (complement of preposition) ---1
      439: pobj     (object of preposition) ---7
      440: poss     (possession modifier) ---2
      443: prep     (prepositional modifier) ---8
      445: punct    (punctu

In [28]:
# a span is a slice of a Doc object
quote = doc[28:39]
quote

“Ah daddy, I wanna stay drunk many days”

In [29]:
type(quote)

spacy.tokens.span.Span

In [30]:
# a function to show lemmas and pos-tags
def show_lemma_pos(text):
    for token in text:
        print(f'{token.text:{9}} LEMMA: {token.lemma_:{9}} POS: {token.tag_:{3}} -- {spacy.explain(token.tag_)}')
        
# call function on quote
show_lemma_pos(quote)

“         LEMMA: "         POS: ``  -- opening quotation mark
Ah        LEMMA: ah        POS: UH  -- interjection
daddy     LEMMA: daddy     POS: NN  -- noun, singular or mass
,         LEMMA: ,         POS: ,   -- punctuation mark, comma
I         LEMMA: -PRON-    POS: PRP -- pronoun, personal
wanna     LEMMA: wanna     POS: MD  -- verb, modal auxiliary
stay      LEMMA: stay      POS: VBP -- verb, non-3rd person singular present
drunk     LEMMA: drunk     POS: JJ  -- adjective
many      LEMMA: many      POS: JJ  -- adjective
days      LEMMA: day       POS: NNS -- noun, plural
”         LEMMA: "         POS: ''  -- closing quotation mark


In [31]:
# NER
doc2 = nlp(u'Having a coke with you is even more fun than going to'
           + ' San Sebastian, Irún, Hendaye, Biarritz, Bayonne' 
           + ' or being sick to my stomach on the Travesera de Gracia in Barcelona partly because' 
           + ' in your orange shirt you look like a better happier St. Sebastian... '
           + '\n...in the warm New York 4 o’clock light we are drifting back and forth...'
           + '\n...except possibly for the Polish Rider occasionally and anyway it’s in the Frick...'
           + '\n... and the fact that you move so beautifully more or less takes care of Futurism'
           + ' just as at home I never think of the Nude Descending a Staircase or at a rehearsal' 
           + ' a single drawing of Leonardo or Michelangelo that used to wow me')
              
# print out names entities and their labels   
for ent in doc2.ents:
    print(ent.text + ' : ' + ent.label_ + ' : ' + spacy.explain(ent.label_))

San Sebastian : GPE : Countries, cities, states
Irún : GPE : Countries, cities, states
Hendaye : GPE : Countries, cities, states
Biarritz : ORG : Companies, agencies, institutions, etc.
the Travesera de Gracia : FAC : Buildings, airports, highways, bridges, etc.
Barcelona : GPE : Countries, cities, states
St. Sebastian : GPE : Countries, cities, states
New York : GPE : Countries, cities, states
4 o’clock : TIME : Times smaller than a day
Polish : NORP : Nationalities or religious or political groups
Frick : GPE : Countries, cities, states
Futurism : NORP : Nationalities or religious or political groups
Leonardo : PERSON : People, including fictional
Michelangelo : PERSON : People, including fictional


In [32]:
# a bit more neatly:
# function to show named entities in a text
def show_ents(text):
    for ent in text.ents:
        print(f'{ent.text:{25}} {ent.label_:{7}} {spacy.explain(ent.label_):{10}} ')
        
        
# call function on doc2        
show_ents(doc2)        

San Sebastian             GPE     Countries, cities, states 
Irún                      GPE     Countries, cities, states 
Hendaye                   GPE     Countries, cities, states 
Biarritz                  ORG     Companies, agencies, institutions, etc. 
the Travesera de Gracia   FAC     Buildings, airports, highways, bridges, etc. 
Barcelona                 GPE     Countries, cities, states 
St. Sebastian             GPE     Countries, cities, states 
New York                  GPE     Countries, cities, states 
4 o’clock                 TIME    Times smaller than a day 
Polish                    NORP    Nationalities or religious or political groups 
Frick                     GPE     Countries, cities, states 
Futurism                  NORP    Nationalities or religious or political groups 
Leonardo                  PERSON  People, including fictional 
Michelangelo              PERSON  People, including fictional 


In [33]:
#access entity by index
doc2.ents[3]

Biarritz

In [34]:
# check position of ent
doc2.ents[3].start, doc2.ents[3].end

(19, 20)

In [35]:
# check label, realize it is wrong
doc2.ents[3].label_

# HOW TO CHANGE AN ENTITY LABEL?

'ORG'

In [36]:
doc2[21].text

'Bayonne'

In [37]:
#There's a known issue with spaCy v2.0.12 where some linebreaks are interpreted as `GPE` entities:
spacy.__version__

'2.2.4'

In [38]:
# add ent
from spacy.tokens import Span

# get hash value of GPE label
GPE = doc2.vocab.strings[u'GPE']
# create span for new ent
new_ent = Span(doc2, 21, 22, label=GPE)

# add ent to the Doc object
doc2.ents = list(doc2.ents) + [new_ent] 

In [39]:
show_ents(doc2)

San Sebastian             GPE     Countries, cities, states 
Irún                      GPE     Countries, cities, states 
Hendaye                   GPE     Countries, cities, states 
Biarritz                  ORG     Companies, agencies, institutions, etc. 
Bayonne                   GPE     Countries, cities, states 
the Travesera de Gracia   FAC     Buildings, airports, highways, bridges, etc. 
Barcelona                 GPE     Countries, cities, states 
St. Sebastian             GPE     Countries, cities, states 
New York                  GPE     Countries, cities, states 
4 o’clock                 TIME    Times smaller than a day 
Polish                    NORP    Nationalities or religious or political groups 
Frick                     GPE     Countries, cities, states 
Futurism                  NORP    Nationalities or religious or political groups 
Leonardo                  PERSON  People, including fictional 
Michelangelo              PERSON  People, including fictional 


In [40]:
# difference between word and word.text
print('word:', type(doc2.ents[3])) 
print('word.text:', type(doc2.ents[3]).text)

word: <class 'spacy.tokens.span.Span'>
word.text: <attribute 'text' of 'spacy.tokens.span.Span' objects>


In [41]:
#number of named entities
len(doc2.ents)

15

In [42]:
# noun chunks
for chunk in doc.noun_chunks:
    print(type(chunk))
    print(type(chunk.text))

<class 'spacy.tokens.span.Span'>
<class 'str'>
<class 'spacy.tokens.span.Span'>
<class 'str'>
<class 'spacy.tokens.span.Span'>
<class 'str'>
<class 'spacy.tokens.span.Span'>
<class 'str'>
<class 'spacy.tokens.span.Span'>
<class 'str'>
<class 'spacy.tokens.span.Span'>
<class 'str'>
<class 'spacy.tokens.span.Span'>
<class 'str'>
<class 'spacy.tokens.span.Span'>
<class 'str'>
<class 'spacy.tokens.span.Span'>
<class 'str'>
<class 'spacy.tokens.span.Span'>
<class 'str'>
<class 'spacy.tokens.span.Span'>
<class 'str'>
<class 'spacy.tokens.span.Span'>
<class 'str'>
<class 'spacy.tokens.span.Span'>
<class 'str'>
<class 'spacy.tokens.span.Span'>
<class 'str'>
<class 'spacy.tokens.span.Span'>
<class 'str'>
<class 'spacy.tokens.span.Span'>
<class 'str'>
<class 'spacy.tokens.span.Span'>
<class 'str'>


In [43]:
# noun chunks
for chunk in doc.noun_chunks:
    print(chunk.text,  'ROOT:', chunk.root.text)

Instant coffee ROOT: coffee
slightly sour cream ROOT: cream
it ROOT: it
a phone call ROOT: call
the beyond ROOT: beyond
any nearer ROOT: nearer
I ROOT: I
the poetry ROOT: poetry
a new friend ROOT: friend
my life ROOT: life
the seeing ROOT: seeing
hands ROOT: hands
others ROOT: others
my impossibilities ROOT: impossibilities
this love ROOT: love
the first love ROOT: love
no impossibilities ROOT: impossibilities


In [44]:
# noun chunks
for chunk in doc.noun_chunks:
    print(f'{chunk.text:{20}}  ROOT: {chunk.root.text:{10}}')

Instant coffee        ROOT: coffee    
slightly sour cream   ROOT: cream     
it                    ROOT: it        
a phone call          ROOT: call      
the beyond            ROOT: beyond    
any nearer            ROOT: nearer    
I                     ROOT: I         
the poetry            ROOT: poetry    
a new friend          ROOT: friend    
my life               ROOT: life      
the seeing            ROOT: seeing    
hands                 ROOT: hands     
others                ROOT: others    
my impossibilities    ROOT: impossibilities
this love             ROOT: love      
the first love        ROOT: love      
no impossibilities    ROOT: impossibilities


In [45]:
# number of noun chunks in doc
# len(doc.noun_chunks) -- does not work: doc.noun_chunks is a generator object, as is doc.sents
len(list(doc.noun_chunks))

17

In [46]:
# sentenciser
for sent in doc.sents:
    print(sent)

Instant coffee with slightly sour cream
in it, and a phone call to the beyond
which doesn’t seem to becoming any nearer.

“Ah daddy, I wanna stay drunk many days”
on the poetry of a new friend

my life held precariously in the seeing
hands of others, their and my impossibilities.

Is this love, now that the first love
has finally died, where there were no impossibilities?


In [47]:
# number of sents
len(list(doc.sents))

4

In [48]:
# check if token is start of a sentence
for token in doc[:30]:
    print(token.is_sent_start, ' : ' + token.text)

True  : Instant
None  : coffee
None  : with
None  : slightly
None  : sour
None  : cream
None  : 

None  : in
None  : it
None  : ,
None  : and
None  : a
None  : phone
None  : call
None  : to
None  : the
None  : beyond
None  : 

None  : which
None  : does
None  : n’t
None  : seem
None  : to
None  : becoming
None  : any
None  : nearer
None  : .
None  : 

True  : “
None  : Ah


In [49]:
#reset original
nlp = spacy.load('en_core_web_sm') 

"""# add a rule to the sentenciser
def set_new_sentenciser_rule(doc):
    for token in doc[:-1]:
        if token.text == '\n':
            doc[token.i+1].is_sent_start = True
    return doc

nlp.add_pipe(set_new_sentenciser_rule, before='parser')"""
nlp.pipe_names

# changing the rules requires other steps

['tagger', 'parser', 'ner']

In [50]:
for sent in doc.sents:
    print(sent)

Instant coffee with slightly sour cream
in it, and a phone call to the beyond
which doesn’t seem to becoming any nearer.

“Ah daddy, I wanna stay drunk many days”
on the poetry of a new friend

my life held precariously in the seeing
hands of others, their and my impossibilities.

Is this love, now that the first love
has finally died, where there were no impossibilities?


In [51]:
# number of sents
len(list(doc.sents))

4

In [52]:
# stop words
print(nlp.Defaults.stop_words)

{'just', 'during', 'down', 'would', 'everything', 'made', 'from', 'thru', 'i', 'still', 'side', 'they', 'perhaps', 'hence', 'becoming', 'doing', 'hereafter', 'hereupon', 'indeed', 'may', 'seems', 'various', 'her', 'become', 'my', 'have', 'besides', 'sometimes', 'under', 'mostly', 'whereas', "'s", 'around', 'much', 'third', 'hundred', 'someone', 'ten', 'whereupon', 'namely', 'thereupon', 'thus', "'re", 'except', 'themselves', 'at', 'you', 'via', 'get', 'nor', 'very', 'meanwhile', 'whether', 'full', 'along', 'of', '‘s', 'will', 'regarding', 'if', '‘m', 'about', 'done', 'further', 'to', 'which', 'than', 'both', 'everywhere', 'amongst', 'since', 'its', "n't", 'nine', 'first', '‘re', 'thereby', 'off', 'this', 'me', 'anywhere', 'each', 'other', 'he', 're', 'nobody', 'part', 'that', 'with', 'without', 'former', 'last', 'up', 'been', 'serious', '’ve', 'per', 'could', 'noone', 'once', 'when', 'their', 'onto', 'through', 'neither', 'we', 'whose', 'where', 'eleven', 'towards', 'twelve', 'therefor

In [53]:
# number of stop words
len(nlp.Defaults.stop_words)

326

In [54]:
# check whether a word is a stop word
nlp.vocab['cat'].is_stop

False

In [55]:
# add a stop word
nlp.Defaults.stop_words.add('cat')
nlp.vocab['cat'].is_stop = 'True'

In [56]:
nlp.vocab['cat'].is_stop

True

In [57]:
# remove a word from stop words
nlp.Defaults.stop_words.remove('cat')
nlp.vocab['cat'].is_stop = False

# VISUALISE!

In [58]:
from spacy import displacy 

In [59]:
span = doc[0:28]
span

Instant coffee with slightly sour cream
in it, and a phone call to the beyond
which doesn’t seem to becoming any nearer.

In [60]:
# display dependency parse tree
displacy.render(span, style='dep', jupyter="True", options={'distance': 120})

In [61]:
# display dependency parse tree with additional options set
options = {'distance':100, 'color':'white', 'bg':'black', 'compact':'True', 'font':'Arial', 'add_lemma':'False', 'fine_grained':'True'}
displacy.render(quote, style='dep', jupyter="True", options = options)

In [62]:
# display named entities
displacy.render(doc2, style='ent', jupyter='True')

In [63]:
# display specific named entities, line by line
for sent in doc2.sents:
    docx = nlp(sent.text)
    # if no ner is found, just print the text -- this is to avoid an error message appearing
    if docx.ents:
        displacy.render(docx, style='ent', jupyter='True', options={'ents':'TIME'})
    else:
        print(docx.text)

and the fact that you move


In [64]:
# visualisation outside jupyter : http://127.0.0.1:5000
# displacy.serve(quote, style='dep')

# Matching

In [65]:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

In [66]:
pattern1 = [{'LOWER': 'ferde', 'OP': '?'}, {'IS_SPACE': True, 'OP': '+'} ,{'LOWER': 'ferenc'}]
pattern2 = [{'LOWER': 'ferde'}]

matcher.add('név', None, pattern1, pattern2)

In [67]:
doc3 = nlp(u'A hivatalnok felírta Ferde Ferenc nevét a táblára, de a Ferde vezetéknevet elírta, Ferduként, a Ferenc pedig lemaradt.')

In [68]:
# return matchen in doc: id, start, end
matches = matcher(doc3)
print(matches)

[(16779574398397773062, 3, 4), (16779574398397773062, 11, 12)]


In [69]:
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # get string representation
    span = doc[start:end]                    # get the matched span
    print(match_id, string_id, start, end, span.text)

16779574398397773062 név 3 4 slightly
16779574398397773062 név 11 12 a
