## Text Mining using spaCy


In [2]:
# Import requirements
import random
from spacy.util import minibatch, compounding
from pathlib import Path
from spacy import displacy

# Load a spacy model
import spacy
from nltk.corpus import stopwords
nlp=spacy.load('en_core_web_sm')

from nltk.tokenize import word_tokenize, sent_tokenize
from bs4 import BeautifulSoup
import requests
import re

import pandas as pd 

In [3]:
# Check the default pipeline to ensure ner is in it
nlp.pipe_names

['tagger', 'parser', 'ner']

### Web scrapping

In [4]:
def html_to_text(url):  
    # Gettiing the page source & creating text object
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    
    # Extracting the article    
    text = soup.find_all('article', id="main-article")[0].get_text(strip=True)
    
    # Cleaning text data
    text = re.sub(f"[^a-zA-Z.,]", " ", text)
  
    return text

# Extracting web article
article_text = html_to_text('https://www.stearsng.com/article/accelerating-financial-inclusion-in-nigeria')
doc = nlp(article_text)

### Performing NER on article from Niigerian NEWS site

In [5]:
displacy.render(doc, jupyter=True, style='ent')

In [28]:
entities = [ent.text for ent in doc.ents if ent.label_ == 'ORG']
df = pd.DataFrame(entities, columns =['company_name']) 
df

Unnamed: 0,company_name
0,the World Bank
1,the Central Bank of Nigeria
2,Enhancing Financial Innovation
3,Fintech
4,the Nigeria Inter Bank Settlement System
5,the Payment Services Bank
6,fintechinvestmentin Nigeria
7,fintech start
8,PalmPay and
9,PiggyBank


### Customizing the Named Entity Recognizer

In [6]:
# Getting the pipeline component
ner=nlp.get_pipe("ner")

### Format of the training examples


In [7]:
# The default NER doesn't recognize companies like MTN Nigeria
# Enabling it to recognise such terms

In [32]:
# training data
TRAIN_DATA = [
        ("Iyinoluwa Aboyeji is the founder of several IT companies iin Nigeria", {"entities": [(0, 17, "PERSON")]}),
        ("Iyinoluwa Aboyeji is a Nigerian entrepreneur", {"entities": [(0, 17, "PERSON")]})
    ]


In [31]:
string = 'Iyinoluwa Aboyeji is a Nigerian entrepreneur'
substring = 'Iyinoluwa Aboyeji'
index = string.find(substring)
print((index, index+len(substring)))
print(string.lower())

(0, 17)
iyinoluwa aboyeji is a nigerian entrepreneur


In [33]:
# Adding labels to the `NER`

for _, annotations in TRAIN_DATA:
  for ent in annotations.get("entities"):
    ner.add_label(ent[2])

In [34]:
# Disable pipeline components you dont need to change
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

### Training the NER model

In [35]:

# TRAINING THE MODEL
with nlp.disable_pipes(*unaffected_pipes):

  # Training for 30 iterations
  for iteration in range(30):

    # shufling examples  before every iteration
    random.shuffle(TRAIN_DATA)
    losses = {}
    # batch up the examples using spaCy's minibatch
    batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        texts, annotations = zip(*batch)
        nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
        print("Losses", losses)

Losses {'ner': 6.513200657565903}
Losses {'ner': 3.833668207958226}
Losses {'ner': 3.4326680179756295}
Losses {'ner': 5.140065910600242}
Losses {'ner': 4.669312517587969}
Losses {'ner': 7.549347720895133}
Losses {'ner': 4.619660388626585}
Losses {'ner': 2.353303195531023}
Losses {'ner': 5.436141415135353}
Losses {'ner': 0.2408304731120552}
Losses {'ner': 3.2678208151408796}
Losses {'ner': 4.342915257922044}
Losses {'ner': 3.622928525200223}
Losses {'ner': 1.9038424087028156}
Losses {'ner': 5.94737703653321}
Losses {'ner': 0.2816273752711174}
Losses {'ner': 1.8862617805811788}
Losses {'ner': 0.02897552059937425}
Losses {'ner': 6.978812387652013}
Losses {'ner': 1.3909019787929253}
Losses {'ner': 1.8992856323857268}
Losses {'ner': 3.3254432192679553}
Losses {'ner': 0.0831508843298252}
Losses {'ner': 3.225878791559973}
Losses {'ner': 0.01767313754378498}
Losses {'ner': 0.14411759719102626}
Losses {'ner': 0.001388995851342667}
Losses {'ner': 0.00013121641368307735}
Losses {'ner': 0.00011141

### Test the model performance using the same text


In [36]:
doc = nlp(article_text)
displacy.render(doc, jupyter=True, style='ent')


In [37]:
entities = [ent.text for ent in doc.ents if ent.label_ == 'ORG']
df = pd.DataFrame(entities, columns =['company_name']) 
df

Unnamed: 0,company_name
0,the World Bank
1,the Central Bank of Nigeria
2,Fintech
3,the Nigeria Inter Bank
4,the Payment Services Bank
5,PiggyBank
6,Fintech
7,Paga
8,Bank Verification Number
9,KYC
