#Import spacy in english

In [1]:
# run the next line only once if needed 
import spacy
nlp = spacy.load("en_core_web_lg")


#trying for the small texts

In [2]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)
for token in doc:
    print(token, end=" | ")


My | best | friend | Ryan | Peters | likes | fancy | adventure | games | . | 

#Attributes that spacy add

In [3]:
import pandas as pd

def display_nlp(doc, include_punct=False):
    """Generate data frame for visualization of spaCy tokens."""
    rows = []
    for i, t in enumerate(doc):
        if not t.is_punct or include_punct:
            row = {'token': i,  'text': t.text, 'lemma_': t.lemma_, 
                   'is_stop': t.is_stop, 'is_alpha': t.is_alpha,
                   'pos_': t.pos_, 'dep_': t.dep_, 
                   'ent_type_': t.ent_type_, 'ent_iob_': t.ent_iob_}
            rows.append(row)
    
    df = pd.DataFrame(rows).set_index('token')
    df.index.name = None
    return df
display_nlp(doc)


Unnamed: 0,text,lemma_,is_stop,is_alpha,pos_,dep_,ent_type_,ent_iob_
0,My,my,True,True,PRON,poss,,O
1,best,good,False,True,ADJ,amod,,O
2,friend,friend,False,True,NOUN,nsubj,,O
3,Ryan,Ryan,False,True,PROPN,compound,PERSON,B
4,Peters,Peters,False,True,PROPN,appos,PERSON,I
5,likes,like,False,True,VERB,ROOT,,O
6,fancy,fancy,False,True,ADJ,amod,,O
7,adventure,adventure,False,True,NOUN,compound,,O
8,games,game,False,True,NOUN,dobj,,O


#removing stop word using spacy

In [4]:
text = "Dear Ryan, we need to sit down and talk. Regards, Pete"
doc = nlp(text)

non_stop = [t for t in doc if not t.is_stop and not t.is_punct]
print(non_stop)


[Dear, Ryan, need, sit, talk, Regards, Pete]


#finding all nouns using spacy

In [5]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)

nouns = [t for t in doc if t.pos_ in ['NOUN', 'PROPN']]
print(nouns)


[friend, Ryan, Peters, adventure, games]


#named entity recognition

In [6]:
text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)

for ent in doc.ents:
    print(f"({ent.text}, {ent.label_})", end=" ")


(Ryan Peters, PERSON) 

In [7]:
text = "James O'Neill, chairman of World Cargo Inc, lives in San Francisco." 
doc = nlp(text)

for ent in doc.ents:
    print(f"({ent.text}, {ent.label_})", end=" ")


(James O'Neill, PERSON) (World Cargo Inc, ORG) (San Francisco, GPE) 

#Visualize NERS

In [8]:
from spacy import displacy

displacy.render(doc, style='ent', jupyter=True)


#trying with real dataset

In [9]:

from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url):
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html, 'html5lib')
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))
ny_bb = url_to_string('https://www.reuters.com/world/europe/ukrainian-infrastructure-pounded-again-saturday-2022-10-22/')
article = nlp(ny_bb)
len(article.ents)


1

In [10]:
displacy.render(article, style='ent', jupyter=True)


In [13]:
from collections import Counter
labels = [x.label_ for x in article.ents]
Counter(labels)


Counter({'GPE': 1})

In [14]:
items = [x.text for x in article.ents]
Counter(items).most_common(5)


[('JS', 1)]

In [17]:
sentences = [x for x in article.sents]
print(sentences)


[reuters.comPlease enable JS and disable any ad blocker]


In [19]:
displacy.render(nlp(str(sentences)), jupyter=True, style='ent')


#popular NER types

In [21]:
[(x.orth_,x.pos_, x.lemma_) for x in [y 
                                      for y
                                      in nlp(str(sentences)) 
                                      if not y.is_stop and y.pos_ != 'PUNCT']]


[('[', 'X', '['),
 ('reuters.comPlease', 'NOUN', 'reuters.complease'),
 ('enable', 'VERB', 'enable'),
 ('JS', 'PROPN', 'JS'),
 ('disable', 'VERB', 'disable'),
 ('ad', 'NOUN', 'ad'),
 ('blocker', 'NOUN', 'blocker'),
 (']', 'X', ']')]

#Sentence dependency test

In [23]:
displacy.render(nlp(str(sentences)), style='dep', jupyter = True, options = {'distance': 120})
