## Text Mining using spaCy


In [2]:
# Import requirements
import random
from spacy.util import minibatch, compounding
from pathlib import Path
from spacy import displacy

# Load a spacy model
import spacy
from nltk.corpus import stopwords
nlp=spacy.load('en_core_web_sm')

from nltk.tokenize import word_tokenize, sent_tokenize
from bs4 import BeautifulSoup
import requests
import re

In [3]:
# Check the default pipeline to ensure ner is in it
nlp.pipe_names

['tagger', 'parser', 'ner']

### Web scrapping

In [4]:
def html_to_text(url):  
    # Gettiing the page source & creating text object
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    
    # Extracting the article    
    text = soup.find_all('article', id="main-article")[0].get_text(strip=True)
    
    # Cleaning text data
    text = re.sub(f"[^a-zA-Z.,]", " ", text)
  
    return text

# Extracting web article
article_text = html_to_text('https://www.stearsng.com/article/accelerating-financial-inclusion-in-nigeria')
doc = nlp(article_text)

### Performing NER on article from Niigerian NEWS site

In [5]:
displacy.render(doc, jupyter=True, style='ent')

### Customizing the Named Entity Recognizer

In [6]:
# Getting the pipeline component
ner=nlp.get_pipe("ner")

### Format of the training examples


In [7]:
# The default NER doesn't recognize companies like MTN Nigeria
# Enabling it to recognise such terms

In [8]:
# training data
TRAIN_DATA = [
        ("MTN Nigeria is leading telecommunications company", {"entities": [(0, 11, "ORG")]}),
        ("TIDAL announces partnership with MTN Nigeria ", {"entities": [(33, 44, "ORG")]})
    ]



In [9]:
# Adding labels to the `NER`

for _, annotations in TRAIN_DATA:
  for ent in annotations.get("entities"):
    ner.add_label(ent[2])

In [10]:
# Disable pipeline components you dont need to change
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

### Training the NER model

In [11]:

# TRAINING THE MODEL
with nlp.disable_pipes(*unaffected_pipes):

  # Training for 30 iterations
  for iteration in range(30):

    # shufling examples  before every iteration
    random.shuffle(TRAIN_DATA)
    losses = {}
    # batch up the examples using spaCy's minibatch
    batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        texts, annotations = zip(*batch)
        nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
        print("Losses", losses)

Losses {'ner': 4.777584093803512}
Losses {'ner': 7.580789186060429}
Losses {'ner': 5.839726616803091}
Losses {'ner': 0.39938080607680604}
Losses {'ner': 3.692764957435429}
Losses {'ner': 4.025412744318601}
Losses {'ner': 4.829517441801727}
Losses {'ner': 0.0659974485015482}
Losses {'ner': 0.8095611122716946}
Losses {'ner': 1.7074044685141416}
Losses {'ner': 0.00036320815789814276}
Losses {'ner': 4.4601431063938435}
Losses {'ner': 2.415077605209717}
Losses {'ner': 0.03209291437772066}
Losses {'ner': 2.397695175126046}
Losses {'ner': 5.027688222393863}
Losses {'ner': 0.02976524812356729}
Losses {'ner': 3.8778797292907257}
Losses {'ner': 0.9037604680830913}
Losses {'ner': 0.01723403000197976}
Losses {'ner': 3.6303453429403865}
Losses {'ner': 0.1895110352015763}
Losses {'ner': 0.07464597338846701}
Losses {'ner': 0.051011385963647626}
Losses {'ner': 0.025360796981431122}
Losses {'ner': 0.39267894877730214}
Losses {'ner': 1.0684089824935654}
Losses {'ner': 4.172056087941723}
Losses {'ner': 4

### Test the model performance using the same text


In [17]:
doc = nlp(article_text)
displacy.render(doc, jupyter=True, style='ent')


In [22]:
entities = []
for ent in doc.ents:
    if (ent.label_ == 'ORG'):
        entities.append(ent.text)
        

In [23]:
entities

['the World Bank',
 'the Central Bank of Nigeria',
 'Enhancing Financial Innovation',
 'Fintech',
 'the Nigeria Inter Bank Settlement System',
 'the Payment Services Bank',
 'fintechinvestmentin Nigeria',
 'fintech start',
 'PalmPay and',
 'PiggyBank',
 'Fintech',
 'Paga, Nigeria',
 'Bank Verification Number',
 'KYC',
 'CBN s',
 'KYC',
 'Nigeria s Deposit Insurance Corporation',
 'the Nigeria Data Protection Regulation',
 'Iyinoluwa Aboyeji',
 'Flutterwave',
 'Fintech',
 'The World Bank']

In [25]:
import pandas as pd 

df = pd.DataFrame(entities, columns =['company_name']) 
df

Unnamed: 0,company_name
0,the World Bank
1,the Central Bank of Nigeria
2,Enhancing Financial Innovation
3,Fintech
4,the Nigeria Inter Bank Settlement System
5,the Payment Services Bank
6,fintechinvestmentin Nigeria
7,fintech start
8,PalmPay and
9,PiggyBank
