In [73]:
import pandas as pd
import spacy
from spacy import displacy
import requests
import justext
from bs4 import BeautifulSoup
import json

# Download the en_core_web_sm model through the spaCy CLI, then load it as the
# module-level `nlp` pipeline that the classes defined below use.
!python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")


    Linking successful
    /home/aa5118/anaconda3/envs/plural/lib/python3.7/site-packages/en_core_web_sm
    -->
    /home/aa5118/anaconda3/envs/plural/lib/python3.7/site-packages/spacy/data/en_core_web_sm

    You can now load the model via spacy.load('en_core_web_sm')



In [116]:
class NamedEntityRecognition:
    """Holds the named entities recognised in the text of one source URL.

    The instance only stores, renders and serialises results; the entity lists,
    ``text`` and ``doc`` are filled in by whoever runs the NLP pipeline.
    """

    def __init__(self, url):
        """Create an empty result holder.

        Args:
            url: The source URL the entities belong to.
        """
        self.url = url
        # One list per entity category, filled in encounter order.
        self.companies = []
        self.people = []
        self.keywords = []
        self.locations = []
        self.events = []
        self.products = []
        self.text = None  # raw text that was analysed
        self.doc = None   # parsed document; stays None until NER has been run

    def visualise(self):
        """Render the document's entities inline with displaCy.

        Returns:
            The result of ``displacy.render(..., style="ent", jupyter=True)``
            when the document has at least one entity, otherwise the string
            ``"No entities found."``.
        """
        # `doc` is None until NER has run. str(None) is "None", so comparing the
        # string form against "" would let None through to displacy and crash.
        # Checking `ents` also makes the fallback message mean what it says.
        if self.doc is None or not self.doc.ents:
            return "No entities found."
        return displacy.render(self.doc, style="ent", jupyter=True)

    def jsonify(self):
        """Serialise all entity lists to a JSON string.

        Returns:
            A JSON object (keys sorted, 4-space indent) mapping each category
            name to its list of entity texts.
        """
        entity_dict = {
            "companies": self.companies,
            "people": self.people,
            "keywords": self.keywords,
            "locations": self.locations,
            "events": self.events,
            "products": self.products,
        }
        return json.dumps(entity_dict, sort_keys=True, indent=4)

In [117]:
class Extract:
    """Download the pages listed in a URL file and run NER over their text.

    Construction reads the URL list and immediately downloads every page
    (network I/O). Call ``extract_entities()`` afterwards to populate
    ``ner_list`` with one ``NamedEntityRecognition`` per URL.
    """

    # Seconds to wait for a server before the request is treated as failed;
    # without a timeout a single unresponsive host blocks the loop indefinitely.
    REQUEST_TIMEOUT = 30

    # spaCy entity label -> name of the NamedEntityRecognition list it feeds.
    _LABEL_TO_ATTR = {
        "ORG": "companies",
        "PERSON": "people",
        "GPE": "locations",
        "EVENT": "events",
        "PRODUCT": "products",
    }

    def __init__(self, filename, max_urls=5):
        """Load the URL list and fetch the text of each page.

        Args:
            filename: Path to a CSV file read with ``header=None`` into a
                single ``url`` column (one URL per row).
            max_urls: Only the first ``max_urls`` rows are processed. The
                default of 5 matches the previous hard-coded ``head()``;
                pass ``None`` to process every row.
        """
        sources = pd.read_csv(filename, header=None, names=["url"])

        self.sources = sources if max_urls is None else sources.head(max_urls)
        self.bad_urls = []
        self.extracted = self.__extract_text()
        self.ner_list = None

        # Register the clean-up component only once so that creating several
        # Extract objects (or re-running the cell) does not add it again.
        if "__remove_whitespace_entities" not in nlp.pipe_names:
            nlp.add_pipe(self.__remove_whitespace_entities, after='ner')

    def __extract_text(self):
        """Download every source URL and return its non-boilerplate text.

        Returns:
            A list with exactly one entry per row of ``self.sources`` and in
            the same order: the page's main text (one paragraph per line), or
            the placeholder ``"<ERROR>"`` when the request raised or the
            status code was not 200. URLs whose request raised are also
            appended to ``self.bad_urls``.
        """
        extracted = []
        stoplist = justext.get_stoplist("English")

        for url in self.sources['url']:
            # Only add a scheme when the list entry does not already carry one.
            if not url.lower().startswith(("http://", "https://")):
                url = "http://" + url

            try:
                response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            except requests.exceptions.RequestException as e:
                self.bad_urls.append(url)
                print(e)
                # Keep `extracted` aligned with `sources`; falling through here
                # would reuse the previous response (or hit an unbound name).
                extracted.append("<ERROR>")
                continue

            if response.status_code == 200:
                paragraphs = justext.justext(response.text.encode('utf-8'), stoplist)
                text = "".join(
                    paragraph.text + "\n"
                    for paragraph in paragraphs
                    if not paragraph.is_boilerplate
                )
            else:
                text = "<ERROR>"

            extracted.append(text)

        return extracted

    def __remove_whitespace_entities(self, doc):
        """Pipeline component that drops entities consisting only of whitespace.

        Args:
            doc: The document produced by the preceding pipeline components.

        Returns:
            The same ``doc`` with whitespace-only entities removed from ``ents``.
        """
        doc.ents = [e for e in doc.ents if not e.text.isspace()]
        return doc

    def extract_entities(self):
        """Run the ``nlp`` pipeline over each extracted text and sort the entities.

        Stores a list of ``NamedEntityRecognition`` objects (one per source URL,
        in source order) in ``self.ner_list``. Entities labelled ORG, PERSON,
        GPE, EVENT or PRODUCT are collected; all other labels are ignored.
        Returns ``None``.
        """
        ner_list = []

        # NOTE(review): pages that failed to download carry the "<ERROR>"
        # placeholder, which is still parsed here - confirm whether such
        # entries should be skipped instead.
        for url, text in zip(self.sources['url'], self.extracted):
            result = NamedEntityRecognition(url)
            doc = nlp(text)
            result.text = text
            result.doc = doc

            for entity in doc.ents:
                attr = self._LABEL_TO_ATTR.get(entity.label_)
                if attr is not None:
                    getattr(result, attr).append(entity.text)

            ner_list.append(result)

        self.ner_list = ner_list
        return

    def get_number_of_urls(self):
        """Return the number of source URLs being processed."""
        return len(self.sources)

In [118]:
# The cells below all read `test`, so the extractor must be bound to that name.
test = Extract("./sources.txt")

In [119]:
# URLs whose HTTP request raised an exception while the pages were being fetched.
test.bad_urls

[]

In [120]:
# Run the spaCy pipeline over every extracted text; results are stored in test.ner_list.
test.extract_entities()

In [121]:
# GPE-labelled entities collected for the fifth source (index 4).
test.ner_list[4].locations

['UK', 'Devon', 'UK', 'UK', 'Ireland', 'UK', 'UK', 'Scroll']

In [122]:
# Inline displaCy rendering for the first source; falls back to the string
# "No entities found." when there is nothing to render.
test.ner_list[0].visualise()

'No entities found.'

In [124]:
# jsonify() returns a multi-line JSON string, so print it rather than showing its repr.
print(test.ner_list[4].jsonify())

{
    "companies": [
        "The Independent Corrugated Cardboard Packaging\nSpecialists\nProducts\n",
        "Innovation, Leadership, Team work and Excellence"
    ],
    "events": [],
    "keywords": [],
    "locations": [
        "UK",
        "Devon",
        "UK",
        "UK",
        "Ireland",
        "UK",
        "UK",
        "Scroll"
    ],
    "people": [
        "Atlas Packaging",
        "Vimeo"
    ],
    "products": []
}
