## Text Mining using spaCy


In [1]:
# Import libraries
import re
import requests
import random
from pathlib import Path

from spacy.util import minibatch, compounding
from spacy import displacy
import spacy

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from bs4 import BeautifulSoup

import pandas as pd 

In [2]:
# Loadiing the spaCy core
nlp=spacy.load('en_core_web_sm')

In [3]:
# Check the default pipeline to ensure ner is in it
nlp.pipe_names

['tagger', 'parser', 'ner']

### Web scrapping

In [4]:
# Function to extract web contents and cleanse the text
def html_to_text(url): 
    
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    
    text = soup.find_all('article', id="main-article")[0].get_text(strip=True)
    text = re.sub(f"[^a-zA-Z.,]", " ", text)
  
    return text


# Extracting the web data now
article_text = html_to_text('https://www.stearsng.com/article/accelerating-financial-inclusion-in-nigeria')
doc = nlp(article_text)

### Performing NER on article from Niigerian NEWS site

In [5]:
displacy.render(doc, jupyter=True, style='ent')

### Moving identified companies from spaCy to Pandas dataframe

In [10]:
entities = list(set([ent.text.lower() for ent in doc.ents if ent.label_ == 'ORG']))
df = pd.DataFrame(entities, columns =['company_name']) 
df

Unnamed: 0,company_name
0,the payment services bank
1,economist
2,nigeria s deposit insurance corporation
3,bank verification number
4,piggybank
5,fintech
6,access
7,paga
8,the world bank
9,interswitch


### Customizing the Named Entity Recognizer

In [7]:
# Getting the pipeline component
ner=nlp.get_pipe("ner")

### Format of the training examples


The default NER didn't recognize Andela amongs companies. This needs to be changed via additional training.

In [16]:
# training data
TRAIN_DATA = [
        ("Andela builds remote engineering teams that work on great infrastructure assets", {"entities": [(0,6, "ORG")]}),
        ("The average annual salary of developers workng at Andela is over $50,000", {"entities": [(50,56, "ORG")]}), 
        ("Andela is an American company with operational campuses in Africa including Nigeria", {"entities": [(0,6, "ORG")]}),
        ("My work at Andela is to advance human potential by powering teams and investing in future leaders", {"entities": [(11,17, "ORG")]})
  ]

In [15]:
string = 'My work at Andela is to advance human potential by powering teams and investing in future leaders'
substring = 'Andela'
index = string.find(substring)
print((index, index+len(substring)))
print(string.lower())

(11, 17)
my work at andela is to advance human potential by powering teams and investing in future leaders


In [17]:
# Adding labels to the `NER`

for _, annotations in TRAIN_DATA:
  for ent in annotations.get("entities"):
    ner.add_label(ent[2])

In [18]:
# Disable pipeline components you dont need to change
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

### Training the NER model

In [19]:

# TRAINING THE MODEL
with nlp.disable_pipes(*unaffected_pipes):

  # Training for 30 iterations
  for iteration in range(30):

    # shufling examples  before every iteration
    random.shuffle(TRAIN_DATA)
    losses = {}
    # batch up the examples using spaCy's minibatch
    batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        texts, annotations = zip(*batch)
        nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
        print("Losses", losses)

Losses {'ner': 9.3815022019553}
Losses {'ner': 13.07024092064239}
Losses {'ner': 14.462242584274804}
Losses {'ner': 11.730359533581577}
Losses {'ner': 18.72922779567307}
Losses {'ner': 9.052381832600076}
Losses {'ner': 5.9474256653692805}
Losses {'ner': 8.223686381473726}
Losses {'ner': 11.067577486623009}
Losses {'ner': 11.703737637270024}
Losses {'ner': 6.600276415168992}
Losses {'ner': 10.526700408023316}
Losses {'ner': 3.6194768708464835}
Losses {'ner': 2.668322118722699}
Losses {'ner': 6.6270564631558955}
Losses {'ner': 9.352068306179717}
Losses {'ner': 1.4582827758129042}
Losses {'ner': 5.760641878656054}
Losses {'ner': 6.887942242901772}
Losses {'ner': 3.7027375103447815}
Losses {'ner': 0.7835095682858082}
Losses {'ner': 3.4544778063783426}
Losses {'ner': 3.4376144435782408}
Losses {'ner': 0.5596459577809583}
Losses {'ner': 0.09287047060419643}
Losses {'ner': 1.8393281203887}
Losses {'ner': 0.7968520983661733}
Losses {'ner': 1.449709952658131}
Losses {'ner': 0.008261782281696162

### Test the model performance using the same text


In [20]:
doc = nlp(article_text)
displacy.render(doc, jupyter=True, style='ent')


In [22]:
entities = list(set([ent.text.lower() for ent in doc.ents if ent.label_ == 'ORG']))
df = pd.DataFrame(entities, columns =['company_name']) 
df

Unnamed: 0,company_name
0,nigeria s deposit insurance corporation
1,kenya
2,the world bank
3,andela
4,access
5,paga
6,kyc
7,flutterwave
8,the payment services bank
9,palmpay


Spacy recognizes the Andela as companies now. Likewise, the spacy NER needs to be retrainined on more data.

Note that remaining predictions are also changed due to “catastrophic forgetting problem”. The spacy NER model is learning about the new entity type but it’s “forgetting” what it has previously learned. 