## Library for NLP

In [23]:
!pip install spacy or pip install -U 'spacy[cuda-autodetect]'

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy
  Downloading spacy-3.5.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: Could not find a version that satisfies the requirement or (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for or[0m[31m
[0m

In [25]:
!python -m spacy download en_core_web_sm #trained using cnn

!python -m spacy download en_core_web_md #has word embedding (gloVe); trained using cnn

!python -m spacy download en_core_web_trf #everything is trained using transformer

2023-02-04 14:06:53.969391: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m46.8 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
2023-02-04 14:07:05.601212: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-md==3.4.1
  Downloading https://github.com/e

In [1]:
# Import spaCy and show the installed version (the last expression is the cell output).
import spacy
spacy.__version__



'3.4.4'

## 1. Basics

### 1.1 Intro

In [2]:
# Load the small English pipeline. The resulting `nlp` object is called on raw
# text and parses it based on the pipeline's trained model.

nlp = spacy.load('en_core_web_sm')

In [3]:
# Two-sentence sample text for the introductory examples.
text = 'Thailand really like to eat naan and masala.  He also likes to eat sushi.'

In [4]:
# Run the pipeline on the sample text.
doc = nlp(text)

In [5]:
# Inspect the type of the object returned by nlp(...).
type(doc)

spacy.tokens.doc.Doc

In [6]:
# The Doc returned by nlp(...) is already tokenized, so it can be indexed
# like a sequence. Bind the first token explicitly (rather than leaking a loop
# variable out of a `for ... break`), because the following cells inspect it.
tokens = doc[0]
print(tokens)

Thailand


In [7]:
# Display the token bound to `tokens` by the cell above (relies on kernel state).
tokens

Thailand

In [8]:
# The Doc also carries sentence boundaries: print every sentence on its own line.
print(*doc.sents, sep="\n")

Thailand really like to eat naan and masala.  
He also likes to eat sushi.


In [9]:
# `tokens` is still the first token of the doc; the cells below inspect its attributes.
tokens

Thailand

In [10]:
tokens.ent_type  # integer ID of the token's entity type

384

In [11]:
tokens.ent_type_  # string label of the entity type; GPE = geopolitical entity

'GPE'

In [12]:
# Ask spaCy for a human-readable description of the 'GPE' label.
spacy.explain('GPE')

'Countries, cities, states'

In [13]:
tokens.ent_iob_  # 'B' marks the beginning of an entity

'B'

In [14]:
tokens.pos_  # part-of-speech tag; PROPN = proper noun

'PROPN'

In [15]:
tokens.dep_  # dependency label of the token

'nsubj'

In [16]:
tokens.head  # head token of `tokens`

like

In [17]:
# Materialize the sentence iterator and keep the first sentence.
sentence1 = list(doc.sents)[0]

In [18]:
# Display the first sentence.
sentence1

Thailand really like to eat naan and masala.  

In [20]:
from spacy import displacy  # spaCy's visualizer

# Draw the dependency parse of the first sentence inline in the notebook,
# with punctuation collapsing switched off.
dep_options = {"collapse_punct": False}
displacy.render(
    sentence1,
    style="dep",
    options=dep_options,
    jupyter=True,
)

In [21]:
# Highlight the named entities of the first sentence inline in the notebook.
# `collapse_punct` is an option of the dependency visualizer only and has no
# effect on the entity visualizer, so no options are passed here.
displacy.render(sentence1, style="ent", jupyter=True)

### 1.2 Word Vectors

In [26]:
# Switch to the medium pipeline, which includes word vectors.
nlp = spacy.load("en_core_web_md")

In [27]:
# Short sample sentence for the word-vector examples.
text = "Chaky likes to eat sushi."

In [28]:
# Process the sample sentence with the medium pipeline.
doc = nlp(text)

In [29]:
# Keep the first (and only) sentence of the doc.
sentence = list(doc.sents)[0]

In [30]:
# Token at index 1 of the sentence.
sentence[1]

likes

In [31]:
# Size of the token's word vector --> 300.
# NOTE(review): the original note calls this a GloVe embedding — TODO confirm for the installed model version.
len(sentence[1].vector)

300

### 1.3 Similarity

In [32]:
# Before looking at similarity: a short detour through nlp.vocab.strings,
# which maps between strings and their hash values.
doc = nlp("I love coffee.")

In [33]:
nlp.vocab.strings['coffee']  # string --> hash value

3197928453018144401

In [34]:
nlp.vocab.strings[3197928453018144401]  # hash value --> string (reverse lookup)

'coffee'

In [35]:
# First convert the string 'dog' to its hash value; the next cell uses it
# as the key into the vector table.
integer = nlp.vocab.strings['dog']
integer

7562983679033046312

In [36]:
# Fetch the vector stored under that hash value.
vector = nlp.vocab.vectors[integer]
vector[:5]  # first 5 of the 300 components of the vector for 'dog'

array([  1.233 ,   4.2963,  -7.9738, -10.121 ,   1.8207], dtype=float32)

In [37]:
import numpy as np

# Wrap the single vector in a one-row array of queries and ask the vector
# table for its 10 nearest entries.
queries = np.asarray([vector])
close_words = nlp.vocab.vectors.most_similar(queries, n=10)
close_words

(array([[ 7918624946109788756,  4969328240109515165,  4560869431627726864,
         17429802345416193488,  6017664905485703127, 14534804554944721111,
           173986088034745168, 15668852121853073894, 11567120971096873637,
         15872191516786115817]], dtype=uint64),
 array([[ 1147,  2545,  3201,  9003,  3828, 18829,  5845, 11580,  7045,
         18612]], dtype=int32),
 array([[1.    , 0.8334, 0.8221, 0.8108, 0.7856, 0.7195, 0.685 , 0.6328,
         0.6148, 0.5966]], dtype=float32))

In [38]:
# The first element of the result holds the keys: one row per query, 10 columns.
close_words[0].shape

(1, 10)

In [39]:
# Turn the first returned key back into its string.
nlp.vocab.strings[close_words[0][0][0]]

'dogsbody'

### 1.4 Doc and Span Similarity

In [40]:
# Two short documents to compare.
doc1 = nlp("Chaky likes french fries")
doc2 = nlp("Tonson likes sweet potato nuggets")

In [41]:
doc1.similarity(doc2)  # similarity score; higher means more similar

0.681774068977061

In [44]:
# Hierarchy: doc ---> sents ---> span ---> tokens

# Span similarity: slice doc1 from token 2 up to (not including) token 4.
span1 = doc1[2:4]
span1

french fries

In [45]:
# doc2 has 5 tokens, so the last three ("sweet potato nuggets") are 2..4.
# The previous end index of 6 ran past the doc and only worked because
# slicing clamps out-of-range bounds.
span2 = doc2[2:5]
span2

sweet potato nuggets

In [46]:
# Similarity between the two spans.
span1.similarity(span2)

0.534758985042572

## 2. Entity Ruler

In [47]:
# Start this section from a freshly loaded small pipeline.
import spacy
nlp = spacy.load('en_core_web_sm')

In [48]:
# Pipes: every text passed to `nlp` must go through a sequential list of
# pipes, each of which does some processing.
# analysis = nlp.analyze_pipes(pretty=True)
# analysis

In [49]:
# Literal pattern: label the string "Rangsit" as a location.
patterns = [{"label": "LOC", "pattern": "Rangsit"}]

# Insert an entity_ruler pipe in front of the 'ner' pipe (via add_pipe),
# then register the pattern with it.
ruler = nlp.add_pipe('entity_ruler', before='ner')
ruler.add_patterns(patterns)

In [50]:
# nlp.analyze_pipes(pretty=True)

In [51]:
# Sample sentence containing the custom location, run through the extended pipeline.
text = "AIT is at Rangsit."
doc = nlp(text)

In [52]:
# List every entity found in the doc as "<text> <label>".
for entity in doc.ents:
    print(f"{entity.text} {entity.label_}")

AIT ORG
Rangsit LOC


### 2.1 More Patterns

In [53]:
import spacy

# Sample text containing a phone number.
text = "My phone number is (555) 666-5555"
nlp = spacy.blank("en")  # blank English pipeline (no pipes)

In [54]:
# Add an entity_ruler pipe to the blank pipeline.
ruler = nlp.add_pipe('entity_ruler')

In [55]:
# Token-level pattern for numbers written like "(555) 666-5555":
# "(" + three digits + ")" + three digits + optional "-" + four digits.
phone_tokens = [
    {"ORTH": "("},
    {"SHAPE": "ddd"},
    {"ORTH": ")"},
    {"SHAPE": "ddd"},
    {"ORTH": "-", "OP": "?"},
    {"SHAPE": "dddd"},
]
patterns = [{"label": "PHONE_NUMBER", "pattern": phone_tokens}]

ruler.add_patterns(patterns)

In [56]:
# Run the rule-only pipeline on the phone-number text.
doc = nlp(text)

In [57]:
# List every entity found in the doc as "<text> <label>".
for entity in doc.ents:
    print(f"{entity.text} {entity.label_}")

(555) 666-5555 PHONE_NUMBER


### 2.2 Matcher

In [58]:
from spacy.matcher import Matcher  # helps us recognize token patterns

In [59]:
# Email example: load the small pipeline and build a Matcher on its vocab.
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

In [60]:
# One-token pattern on the LIKE_EMAIL flag, registered under the key "EMAIL".
email_pattern = [{"LIKE_EMAIL": True}]
matcher.add("EMAIL", [email_pattern])

# Run the matcher over a sentence that contains an address.
doc = nlp("Chaky email is chaklam@ait.asia.")
matches = matcher(doc)

In [61]:
# Each match is a (match_id, start, end) tuple.
matches

[(17587345535198158200, 3, 4)]

In [62]:
# Map the match ID of the first match back to the key it was registered under.
nlp.vocab[matches[0][0]].text

'EMAIL'

In [64]:
# proper nouns and longer phrases
# Read the sample article. The encoding is given explicitly because the text
# contains non-ASCII characters (e.g. an en dash) and the platform default
# encoding is not guaranteed to be UTF-8.
with open("/content/wiki_king.txt", "r", encoding="utf-8") as f:
    text = f.read()

text

'Martin Luther King Jr. (born Michael King Jr.; January 15, 1929 – April 4, 1968) was an American Baptist minister and activist who became the most visible spokesman and leader in the American civil rights movement from 1955 until his assassination in 1968. King advanced civil rights through nonviolence and civil disobedience, inspired by his Christian beliefs and the nonviolent activism of Mahatma Gandhi. He was the son of early civil rights activist and minister Martin Luther King Sr.\n\nKing participated in and led marches for blacks\' right to vote, desegregation, labor rights, and other basic civil rights.[1] King led the 1955 Montgomery bus boycott and later became the first president of the Southern Christian Leadership Conference (SCLC). As president of the SCLC, he led the unsuccessful Albany Movement in Albany, Georgia, and helped organize some of the nonviolent 1963 protests in Birmingham, Alabama. King helped organize the 1963 March on Washington, where he delivered his fam

In [65]:
# Load the small pipeline used to process the article text.
nlp = spacy.load("en_core_web_sm")

In [66]:
# Match every single token whose part of speech (POS) is a proper noun.
propn_pattern = [{"POS": "PROPN"}]
matcher = Matcher(nlp.vocab)
matcher.add("PROPER_NOUN_CHAKY", [propn_pattern])

doc = nlp(text)
matches = matcher(doc)

# Show the first 10 matches; start/end delimit the matched span in the doc.
for match_id, start, end in matches[:10]:
    print((match_id, start, end), doc[start:end])

(2015442650195688329, 0, 1) Martin
(2015442650195688329, 1, 2) Luther
(2015442650195688329, 2, 3) King
(2015442650195688329, 3, 4) Jr.
(2015442650195688329, 6, 7) Michael
(2015442650195688329, 7, 8) King
(2015442650195688329, 8, 9) Jr.
(2015442650195688329, 10, 11) January
(2015442650195688329, 15, 16) April
(2015442650195688329, 49, 50) King


In [67]:
## multi-word token
# "OP": "+" lets the pattern cover one or more consecutive proper nouns
# (POS = part of speech).
propn_run = [{"POS": "PROPN", "OP": "+"}]
matcher = Matcher(nlp.vocab)
matcher.add("PROPER_NOUN_CHAKY", [propn_run])

doc = nlp(text)
matches = matcher(doc)

# Show the first 10 matches; start/end delimit the matched span in the doc.
for match_id, start, end in matches[:10]:
    print((match_id, start, end), doc[start:end])

(2015442650195688329, 0, 1) Martin
(2015442650195688329, 0, 2) Martin Luther
(2015442650195688329, 1, 2) Luther
(2015442650195688329, 0, 3) Martin Luther King
(2015442650195688329, 1, 3) Luther King
(2015442650195688329, 2, 3) King
(2015442650195688329, 0, 4) Martin Luther King Jr.
(2015442650195688329, 1, 4) Luther King Jr.
(2015442650195688329, 2, 4) King Jr.
(2015442650195688329, 3, 4) Jr.


In [68]:
## how do we get only one match per run of proper nouns?
## greedy="LONGEST" keeps the longest match
propn_run = [{"POS": "PROPN", "OP": "+"}]  # POS = part of speech; + = one or more
matcher = Matcher(nlp.vocab)
matcher.add("PROPER_NOUN_CHAKY", [propn_run], greedy="LONGEST")

doc = nlp(text)
matches = matcher(doc)

# Show the first 10 matches; start/end delimit the matched span in the doc.
for match_id, start, end in matches[:10]:
    print((match_id, start, end), doc[start:end])

(2015442650195688329, 83, 88) Martin Luther King Sr.
(2015442650195688329, 0, 4) Martin Luther King Jr.
(2015442650195688329, 128, 132) Southern Christian Leadership Conference
(2015442650195688329, 6, 9) Michael King Jr.
(2015442650195688329, 69, 71) Mahatma Gandhi
(2015442650195688329, 146, 148) Albany Movement
(2015442650195688329, 193, 195) Lincoln Memorial
(2015442650195688329, 10, 11) January
(2015442650195688329, 15, 16) April
(2015442650195688329, 49, 50) King


In [69]:
## sorting
propn_run = [{"POS": "PROPN", "OP": "+"}]  # POS = part of speech; + = one or more
matcher = Matcher(nlp.vocab)
matcher.add("PROPER_NOUN_CHAKY", [propn_run], greedy="LONGEST")

doc = nlp(text)

# Order the matches by their start position so they follow the text.
matches = sorted(matcher(doc), key=lambda m: m[1])

# Show the first 10 matches; start/end delimit the matched span in the doc.
for match_id, start, end in matches[:10]:
    print((match_id, start, end), doc[start:end])

(2015442650195688329, 0, 4) Martin Luther King Jr.
(2015442650195688329, 6, 9) Michael King Jr.
(2015442650195688329, 10, 11) January
(2015442650195688329, 15, 16) April
(2015442650195688329, 49, 50) King
(2015442650195688329, 69, 71) Mahatma Gandhi
(2015442650195688329, 83, 88) Martin Luther King Sr.
(2015442650195688329, 89, 90) King
(2015442650195688329, 113, 114) King
(2015442650195688329, 117, 118) Montgomery


In [70]:
## adding some verb
# One or more proper nouns (POS = part of speech) directly followed by a verb.
propn_then_verb = [
    {"POS": "PROPN", "OP": "+"},
    {"POS": "VERB"},
]
matcher = Matcher(nlp.vocab)
matcher.add("PROPER_NOUN_CHAKY", [propn_then_verb], greedy="LONGEST")

doc = nlp(text)

# Order the matches by their start position so they follow the text.
matches = sorted(matcher(doc), key=lambda m: m[1])

# Show the first 10 matches; start/end delimit the matched span in the doc.
for match_id, start, end in matches[:10]:
    print((match_id, start, end), doc[start:end])

(2015442650195688329, 49, 51) King advanced
(2015442650195688329, 89, 91) King participated
(2015442650195688329, 113, 115) King led
(2015442650195688329, 167, 169) King helped


### 2.3 Regex - Regular Expression

In [71]:
import spacy

# sample text
text = "This is a sample number 5555555."

# start from a blank English pipeline (no trained components)
nlp = spacy.blank("en")

# add an entity_ruler pipe to hold the rule-based patterns
ruler = nlp.add_pipe("entity_ruler")

# List of Entities and Patterns (source: https://spacy.io/usage/rule-based-matching)
# The regex is a raw string so that `\d` is not treated as an (invalid) string
# escape, and it is anchored with ^...$ so that only tokens consisting of
# exactly 7 digits match, rather than any token that merely contains 7 digits.
patterns = [
    {
        "label": "PHONE_NUMBER",
        "pattern": [{"TEXT": {"REGEX": r"^\d{7}$"}}],
    }
]
# adding patterns to ruler
ruler.add_patterns(patterns)


# creating the doc
doc = nlp(text)

# extracting entities
for ent in doc.ents:
    print(ent.text, ent.label_)

5555555 PHONE_NUMBER
