## Information extraction with spaCy

In [1]:
# importing the necessary libraries

from spacy.lang.en import English
from spacy.pipeline import EntityRuler
from spacy.matcher import PhraseMatcher
from spacy import displacy
import spacy

from bs4 import BeautifulSoup
import pandas as pd 
import requests
import re
import json

import random 

In [2]:
# Load the spaCy `en_core_web_sm` model
nlp=spacy.load('en_core_web_sm')

In [3]:
# Check the default pipeline components to confirm that `ner` is among them
nlp.pipe_names

['tagger', 'parser', 'ner']

### A utility function to generate custom colors for displaCy

In [4]:
def get_entity_options():
    """
    Build the displaCy ``options`` dict for the custom entity labels.

    Each of the labels "COMPANY" and "SECTOR" is paired with a randomly
    generated "#RRGGBB" colour string, so the palette differs from call to
    call unless the ``random`` module has been seeded beforehand.

    Returns:
        dict: ``{"ents": [labels], "colors": {label: colour}}``.
    """
    hex_digits = '0123456789ABCDEF'

    def color_generator(number_of_colors):
        # One "#"-prefixed string of six random hex digits per requested colour.
        palette = []
        for _ in range(number_of_colors):
            digits = [random.choice(hex_digits) for _ in range(6)]
            palette.append("#" + ''.join(digits))
        return palette

    entities = ["COMPANY", "SECTOR"]

    # Pair each label with the colour generated at the same position.
    colors = dict(zip(entities, color_generator(len(entities))))

    return {"ents": entities, "colors": colors}

### Web scraping

In [5]:
def url_to_text(url, timeout=30):
    """
    Download an article page and return its text as a single string.

    The text is taken from the first ``<article id="main-article">`` element
    of the page. Runs of spaces are collapsed to one space and a space is
    inserted after any period that is not already followed by one, except
    for a period sitting between two digits (e.g. "$1.5 million"), which is
    left untouched so that decimal amounts survive.

    Args:
        url: address of the page to download.
        timeout: seconds to wait for the server before giving up
            (passed to ``requests.get``).

    Returns:
        str: the cleaned article text.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        IndexError: if the page has no ``<article id="main-article">``.
    """
    # Without a timeout a stalled server would hang the notebook indefinitely.
    response = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    articles = soup.find_all('article', id="main-article")
    if not articles:
        raise IndexError(
            "no <article id=\"main-article\"> element found at {}".format(url)
        )

    text = articles[0].get_text(strip=True)
    text = re.sub(r' +', ' ', text)
    # Add the missing space after a period, presumably because extraction
    # glues adjacent sentences together ("raised.PalmPay"). A period that is
    # both preceded and followed by a digit is treated as a decimal point.
    text = re.sub(r'(?<!\d)\.(?! )|\.(?![ \d])', '. ', text)

    return text


In [6]:
# Fetch the article page at `url` and extract its text with url_to_text()
url = 'https://www.stearsng.com/article/accelerating-financial-inclusion-in-nigeria'
text = url_to_text(url)

###  Customising spaCy NER model using custom rules  

In [7]:
# Utility function to load the rules from a JSON file
# The rules file shall be generated automatically. It holds the libraries used for job-title and company-name extraction
def load_ner_rules(path='rules.json'):
    """
    Read the entity-ruler patterns stored under the "rules" key of a JSON file.

    Args:
        path: location of the JSON rules file; defaults to 'rules.json' in
            the current working directory.

    Returns:
        The value stored under "rules", or ``None`` when the key is absent.
    """
    # JSON is read as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open(path, encoding='utf-8') as rules_file:
        data = json.load(rules_file)
    return dict(data).get("rules")

In [8]:
# Utility function that builds the customized pipeline and applies it to the raw text
def get_nlp(raw_text):
    """
    Process ``raw_text`` with a pipeline extended by the custom entity rules.

    A fresh "en_core_web_sm" pipeline is loaded on every call, an
    ``EntityRuler`` (created with ``overwrite_ents=True``) is filled with the
    patterns from ``load_ner_rules()`` and inserted before the "ner"
    component. Despite the name, the return value is the result of calling
    the pipeline on ``raw_text``, not the pipeline itself.
    """
    pipeline = spacy.load("en_core_web_sm")

    entity_ruler = EntityRuler(pipeline, overwrite_ents=True)
    entity_ruler.add_patterns(load_ner_rules())

    # The ruler is placed ahead of the "ner" component in the pipeline.
    pipeline.add_pipe(entity_ruler, before="ner")

    return pipeline(raw_text)

In [9]:
# Run the customised pipeline over the article text; get_nlp() returns the
# result of calling the pipeline on `text`, which is what `doc` holds
doc = get_nlp(text)

### Performing NER on news article

In [10]:
# Exploratory check: render the recognised entities inline to see how
# entity recognition performs with the custom rules
displacy.render(doc, jupyter=True, style='ent')

### Information Extraction

In [11]:
# Utility function to get the sector from the given sentence.
# Assumption: there is only a single sector in the sentence.
# Needs to be done properly using dependency parsing.

def get_sector(doc):
    """
    Collect the text of every entity in ``doc.ents`` labelled 'SECTOR'.

    Returns:
        list: the matching entity texts in document order (empty when
        there are none).
    """
    sectors = []
    for entity in doc.ents:
        if entity.label_ == 'SECTOR':
            sectors.append(entity.text)
    return sectors

In [12]:
# Utility function to get the year in which the funding was raised
# It does not take syntactic dependencies into account, which are key to this extraction.
# It returns as soon as a date in the sentence yields a year

def get_year(doc, ctx_year):
    """
    Return the year referred to by the first usable DATE entity in ``doc``.

    DATE entities are examined in order:

    * if the entity text contains a standalone four-digit number starting
      with 1 or 2, that number is returned as a string (no surrounding
      whitespace);
    * otherwise, "last year" / "next year" (optionally with one word in
      between, e.g. "last fiscal year") resolve to ``ctx_year - 1`` /
      ``ctx_year + 1``;
    * any other DATE entity (e.g. "last month") is skipped.

    Args:
        doc: object exposing ``ents``; each entity needs ``label_``,
            ``text`` and ``lower_``.
        ctx_year: integer year used to resolve relative expressions.

    Returns:
        str | int: the explicit year as a string, a relative year as an
        int, or ``''`` when no DATE entity yields a year. The mixed return
        types are kept for backward compatibility.
    """
    for ent in doc.ents:
        if ent.label_ != 'DATE':
            continue

        # Digit guards stop a match inside a longer number such as "120000".
        explicit = re.search(r'(?<!\d)[12]\d{3}(?!\d)', ent.text)
        if explicit is not None:
            return explicit.group()

        # Only shift the year for expressions that are actually about a year;
        # "last month" or "next week" say nothing reliable about the year.
        phrase = ent.lower_
        if re.search(r'\blast\s+(?:\w+\s+)?year\b', phrase) is not None:
            return ctx_year - 1
        if re.search(r'\bnext\s+(?:\w+\s+)?year\b', phrase) is not None:
            return ctx_year + 1
    return ''

In [13]:
# Utility function to get the funding amount from the sentence
# It has limitations; it needs to be implemented using relation extraction involving POS and DEP

def get_funding(company, doc):
    """
    Look up the MONEY entity paired with ``company`` in ``doc``.

    COMPANY and MONEY entities are paired purely by position: the first
    company with the first amount, the second with the second, and so on.
    Companies beyond the number of amounts get no pairing, and if a company
    name occurs more than once the last pairing wins.

    Returns:
        The text of the paired MONEY entity, or ``None`` if ``company`` has
        no pairing.
    """
    names = []
    amounts = []
    for entity in doc.ents:
        if entity.label_ == 'COMPANY':
            names.append(entity.text)
        elif entity.label_ == 'MONEY':
            amounts.append(entity.text)

    funding_by_company = {}
    for name, amount in zip(names, amounts):
        funding_by_company[name] = amount

    return funding_by_company.get(company)

### Populating a data frame with the relevant information

In [14]:
# One row per COMPANY entity; sector, funding and year are all derived from
# the sentence that contains the entity.
# NOTE(review): `ent.sent` is stored as-is rather than as a string, and 2020
# is the hard-coded reference year passed to get_year() — confirm both are intended.
data = [
    (
        ent.text,
        get_sector(ent.sent),
        get_funding(ent.text, ent.sent),
        get_year(ent.sent, 2020),
        url,
        ent.sent,
    )
    for ent in doc.ents
    if ent.label_ == 'COMPANY'
]

df = pd.DataFrame(
    data,
    columns=['company_name', 'sector', 'money_raised', 'year_money_raised', 'source', 'sentence'],
)
df.head()

Unnamed: 0,company_name,sector,money_raised,year_money_raised,source,sentence
0,PwC,[],,,https://www.stearsng.com/article/accelerating-...,"(The, consensus, from, the, World, Bank, ,, th..."
1,Enhancing Financial Innovation & Access (EFInA),[],,2016.0,https://www.stearsng.com/article/accelerating-...,"(The, result, is, an, increase, in, the, banke..."
2,Nigeria Inter-Bank Settlement System (NIBSS),[],,,https://www.stearsng.com/article/accelerating-...,"(For, example, ,, the, CBN, ,, in, collaborati..."
3,PalmPay,[fintech],$40 million,2019.0,https://www.stearsng.com/article/accelerating-...,"(By, last, year, ,, fintech, start, -, ups, ha..."
4,Opay,[fintech],$120 million,2019.0,https://www.stearsng.com/article/accelerating-...,"(By, last, year, ,, fintech, start, -, ups, ha..."
