In [43]:
import pandas as pd
import spacy
from spacy import displacy
import requests
import justext
from bs4 import BeautifulSoup
import json
import wikipedia
from sklearn.feature_extraction.text import TfidfVectorizer
from rake_nltk import Rake

# Load the small English pipeline. The model is downloaded only when it is
# missing, so re-running the notebook does not fetch it again, and the
# download runs inside the kernel's own interpreter rather than whatever
# `python` happens to be first on PATH.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spacy.load raises OSError when the model cannot be found.
    spacy.cli.download("en_core_web_sm")
    # NOTE(review): on some spaCy versions a freshly installed model is only
    # importable after a kernel restart - confirm on the target environment.
    nlp = spacy.load("en_core_web_sm")


[93m    Linking successful[0m
    /home/aa5118/anaconda3/envs/plural/lib/python3.7/site-packages/en_core_web_sm
    -->
    /home/aa5118/anaconda3/envs/plural/lib/python3.7/site-packages/spacy/data/en_core_web_sm

    You can now load the model via spacy.load('en_core_web_sm')



In [25]:
class NamedEntityRecognition:
    """Holds the keywords and named entities extracted from a single page.

    A new instance is empty: every entity list starts as ``[]`` and
    ``text``/``doc`` start as ``None``. The caller is expected to fill the
    attributes in after construction.
    """

    def __init__(self, url):
        """Create an empty result holder.

        Args:
            url: Address of the page these results belong to.
        """
        self.url = url
        self.companies = []
        self.people = []
        self.keywords = []
        self.locations = []
        self.events = []
        self.products = []
        self.text = None  # plain text of the page
        self.doc = None   # processed document handed to displacy

    def visualise(self):
        """Render the document's entities inline in the notebook.

        Returns:
            The result of ``displacy.render(..., style="ent", jupyter=True)``,
            or the string ``"No entities found."`` when no document has been
            attached yet or the document's text is empty.
        """
        # A missing doc stringifies to "None", not "", so it needs its own
        # check; otherwise None would be passed straight to displacy.
        if self.doc is None or str(self.doc) == "":
            return "No entities found."
        return displacy.render(self.doc, style="ent", jupyter=True)

    def jsonify(self):
        """Serialise all entity lists to a JSON string.

        Returns:
            A 4-space indented JSON object with the keys ``companies``,
            ``people``, ``keywords``, ``locations``, ``events`` and
            ``products`` (in that order). Non-ASCII characters are kept as-is.
        """
        entity_dict = {
            "companies": self.companies,
            "people": self.people,
            "keywords": self.keywords,
            "locations": self.locations,
            "events": self.events,
            "products": self.products,
        }
        return json.dumps(entity_dict, indent=4, ensure_ascii=False)

    def get_people_wiki_pages(self):
        """Look up every name in ``self.people`` on Wikipedia.

        The lookup is best-effort: a name that cannot be resolved maps to
        ``["", ""]`` instead of raising.

        Returns:
            dict mapping each distinct name to ``[page_url, summary]`` where
            ``summary`` is requested with ``sentences=1``.
        """
        wiki = {}
        for person in self.people:
            # The result is keyed by name, so a repeated name needs only one
            # pair of network requests.
            if person in wiki:
                continue
            try:
                result = wikipedia.page(person)
                summary = wikipedia.summary(person, sentences=1)
                wiki[person] = [result.url, summary]
            except Exception:
                # Deliberately broad: any failed lookup yields an empty entry.
                # Unlike a bare ``except:`` this still lets KeyboardInterrupt
                # and SystemExit stop a long-running loop.
                wiki[person] = ["", ""]
        return wiki

In [67]:
class Extract:
    """Fetch a list of web pages and extract keywords and named entities.

    On construction the URL list is read from a one-column CSV file and every
    page is downloaded immediately. ``extract_entities`` then fills
    ``ner_list`` with one ``NamedEntityRecognition`` object per URL.

    Attributes:
        sources: DataFrame with a single ``url`` column.
        bad_urls: URLs whose request raised an exception.
        extracted: One text per row of ``sources``; ``"<ERROR>"`` marks a
            page that could not be fetched or did not answer with HTTP 200.
        ner_list: ``None`` until ``extract_entities`` has been called.
    """

    # Seconds to wait for a server before the URL is treated as unreachable.
    # Without a timeout a single unresponsive host can block the loop forever.
    REQUEST_TIMEOUT = 30

    # Placeholder stored in ``extracted`` for pages that could not be read.
    ERROR_TEXT = "<ERROR>"

    # spaCy entity label -> name of the NamedEntityRecognition list it goes to.
    _LABEL_TO_ATTRIBUTE = {
        "ORG": "companies",
        "PERSON": "people",
        "GPE": "locations",
        "EVENT": "events",
        "PRODUCT": "products",
    }

    def __init__(self, filename, max_urls=10):
        """Read the URL list and download the pages.

        Args:
            filename: Path of a header-less file with one URL per line.
            max_urls: Only the first ``max_urls`` rows are processed;
                ``None`` processes every row. Defaults to 10.
        """
        sources = pd.read_csv(filename, header=None, names=["url"])

        self.sources = sources if max_urls is None else sources.head(max_urls)
        self.bad_urls = []
        self.extracted = self.__extract_text()
        self.ner_list = None

        # Register the clean-up component only once on the shared pipeline,
        # however many Extract objects are created.
        if "__remove_whitespace_entities" not in nlp.pipe_names:
            nlp.add_pipe(self.__remove_whitespace_entities, after='ner')

    @staticmethod
    def _normalise_url(url):
        """Return ``url`` with an ``http://`` scheme unless it already has one."""
        url = url.strip()
        if url.lower().startswith(("http://", "https://")):
            return url
        return "http://" + url

    @staticmethod
    def _top_keywords(scored_phrases, k):
        """Return the phrases of the first ``k`` ``(score, phrase)`` pairs.

        The pairs are unpacked directly; routing them through a dict keyed by
        score would silently drop every phrase that ties with another one.
        """
        return [phrase for _score, phrase in scored_phrases[:k]]

    def __extract_text(self):
        """Download every source URL and strip the boilerplate from its HTML.

        Returns:
            A list with exactly one entry per row of ``self.sources``: the
            non-boilerplate paragraphs joined by newlines, or ``"<ERROR>"``
            when the request failed or the status code was not 200.
        """
        extracted = []
        for raw_url in self.sources['url']:
            url = self._normalise_url(raw_url)

            try:
                response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            except requests.exceptions.RequestException as e:
                self.bad_urls.append(url)
                print(e)
                # Record the failure for THIS url and move on, so a failed
                # request never picks up the previous page's response.
                extracted.append(self.ERROR_TEXT)
                continue

            if response.status_code == 200:
                paragraphs = justext.justext(
                    response.text.encode('utf-8'),
                    justext.get_stoplist("English"),
                )
                text = "".join(
                    paragraph.text + "\n"
                    for paragraph in paragraphs
                    if not paragraph.is_boilerplate
                )
            else:
                text = self.ERROR_TEXT

            extracted.append(text)

        return extracted

    def __remove_whitespace_entities(self, doc):
        """Pipeline component: drop entities whose text is only whitespace."""
        doc.ents = [e for e in doc.ents if not e.text.isspace()]
        return doc

    def extract_entities(self, top_k=10):
        """Build ``self.ner_list`` from the downloaded texts.

        For every URL a ``NamedEntityRecognition`` object receives the page
        text, the ``top_k`` highest-ranked RAKE phrases, the processed
        document, and the entity texts sorted into lists by label (ORG,
        PERSON, GPE, EVENT, PRODUCT; other labels are ignored).

        Args:
            top_k: Maximum number of keywords kept per page. Defaults to 10.

        Returns:
            None. The result is stored in ``self.ner_list``.
        """
        rake = Rake()
        ner_list = []

        for url, text in zip(self.sources['url'], self.extracted):
            ner = NamedEntityRecognition(url)
            ner.text = text

            rake.extract_keywords_from_text(text)
            ner.keywords = self._top_keywords(
                rake.get_ranked_phrases_with_scores(), top_k
            )

            doc = nlp(text)
            ner.doc = doc

            for entity in doc.ents:
                attribute = self._LABEL_TO_ATTRIBUTE.get(entity.label_)
                if attribute is not None:
                    getattr(ner, attribute).append(entity.text)

            ner_list.append(ner)

        self.ner_list = ner_list
        return

    def get_number_of_urls(self):
        """Return the number of URLs that were loaded from the source file."""
        return len(self.sources)

In [68]:
# Read the URL list and download every page up front.
test = Extract("./sources.txt")

In [69]:
# URLs whose request raised an exception while the pages were being fetched.
test.bad_urls

[]

In [70]:
# Populate test.ner_list with keywords and named entities for every source URL.
test.extract_entities()

[]
['tank storage facilities harnessing', 'supplying high grade', 'high ffa products', 'argent energy', 'green fuel']
['today manual operations run alongside robotic', 'attract major clients', 'day one asg', 'latest manufacturing technology', 'asg group', 'truly world']
['commercial director experience matters aston martin works ltd aston martin works limited', 'principal firm allows aston martin works limited', 'automotive compliance ltd ’', 'automotive compliance ltd', 'financial conduct authority', 'finance providers']
['produce high quality corrugated cardboard packaging', 'independent corrugated cardboard packaging specialists products', 'leading independent uk cardboard box manufacturer', 'leading independent cardboard box manufacturer', 'uk cardboard box manufacturer making', 'distinctive corrugated cardboard packaging', 'making corrugated packaging across', 'ireland wide delivery fleet', 'house award winning structural', 'corrugated cardboard packaging']
['ddudjuilp atos though

In [30]:
# Entity texts labelled PERSON for the source at index 9.
test.ner_list[9].people

['Barrie Beard',
 'Accredited Contractor',
 'Worksafe Contractor Membership',
 'Rob Davenport',
 'Rob',
 'Ann Brazier – Worcestershire',
 'Lynn',
 'Chris Smith',
 'Lynn Messenger',
 'Helen Chambers',
 'Rob Davenport (BB Electrician',
 'Rob',
 'Helen Chambers – Bromsgrove – March',
 'Mark Warman',
 'Dan Par',
 'Dan',
 'Dan Cattell']

In [31]:
# Highlight the entities of the first source inline; an empty page text yields a message instead.
test.ner_list[0].visualise()

'No entities found.'

In [32]:
# jsonify() returns a multi-line string, so print it to show the indentation.
print(test.ner_list[4].jsonify())

{
    "companies": [
        "The Independent Corrugated Cardboard Packaging\nSpecialists\nProducts\n",
        "Innovation, Leadership, Team work and Excellence"
    ],
    "people": [
        "Atlas Packaging",
        "Vimeo"
    ],
    "keywords": [],
    "locations": [
        "UK",
        "Devon",
        "UK",
        "UK",
        "Ireland",
        "UK",
        "UK",
        "Scroll"
    ],
    "events": [],
    "products": []
}
