# Website Text Mining

In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import requests
import justext
from bs4 import BeautifulSoup, SoupStrainer
import json
import wikipedia
from sklearn.feature_extraction.text import TfidfVectorizer
from rake_nltk import Rake
import os
import subprocess
from neo4j import GraphDatabase
from pathlib import Path

!python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_lg")


[93m    Linking successful[0m
    /home/aa5118/anaconda3/envs/plural/lib/python3.7/site-packages/en_core_web_sm
    -->
    /home/aa5118/anaconda3/envs/plural/lib/python3.7/site-packages/spacy/data/en_core_web_sm

    You can now load the model via spacy.load('en_core_web_sm')



Firstly, we create a NamedEntityRecognition class. Each object of this class will represent a given webpage and specifically hold the information regarding the named entities present within the webpage

In [2]:
class NamedEntityRecognition:
    
    def __init__(self, url):
        
        """
        Args: url of the webpage as a string
        
        Just instantiates all the lists of named entities
        
        """
        
        self.url = url
        self.companies = []
        self.people = []
        self.keywords = []
        self.locations = []
        self.events = []
        self.products = []
        self.text = None
        self.doc = None
        
    def visualise(self):
        
        """
        Args: None
        
        Returns a html rendered visualisation of the named entities within the homepage of the url using displacy
        
        """
        
        if str(self.doc) != "":
            return displacy.render(self.doc, style="ent", jupyter=True)
        else:
            return "No entities found."
    
    def jsonify(self):
        
        """
        Args: None
        
        Returns a json version of all the named entities within the webpage
        """
        
        list_of_entities = [self.companies, self.people, self.keywords, self.locations, self.events, self.products]
        list_of_entity_types = ["companies", "people", "keywords", "locations", "events", "products"]
        zip_object = zip(list_of_entity_types, list_of_entities)
        entity_dict = dict(zip_object)
        
        return json.dumps(entity_dict, indent = 4, ensure_ascii=False)
    
    def get_people_wiki_pages(self):
        
        """
        Args: None
        
        Returns a dictionary consisting of the people identified within the webpage as the key, whilst the value attempts
        to return the url and first line of the article for the closest match to the person on wikipedia
        
        *** STILL NEEDS WORK, RETURNS UNEXPECTED RESULTS IF THE PERSON IS NOT ON WIKIPEDIA ***
        
        """
        
        wiki = []
        
        for person in self.people:
            try:
                result = wikipedia.page(person)
                summary = wikipedia.summary(person, sentences=1)
                wiki.append([result.url, summary])
            except:
                wiki.append(["",""])
        zip_object = zip(self.people, wiki)
        
        return dict(zip_object)

    def __get_links(self):
        
        """
        Args: None
        
        Private function which attempts to extract all the relevant links from the original webpage that point to other
        pages on the same website and returns the urls in a list
        
        """
    
        new_url = "http://" + self.url
        html=requests.get(new_url)
        links = []

        for link in BeautifulSoup(html.text, parse_only=SoupStrainer('a')):
            try:
                new_link = (link['href'])

                short_url = self.url[4:]
                if (new_link.find(short_url) >= 0) and (new_link.count('@') == 0):
                    idx=new_link.find(short_url) + len(short_url)
                    end=new_link.find('/',idx+1)
                    if end == -1:
                        end = len(new_link)
                    links.append(new_link[:end])

                elif (new_link.count('/', 1) <= 1) and (str(new_link)[:4] != "http") and (new_link.count('#') == 0) and (new_link.count('@') == 0):

                    if (new_link.count('/', 1) == 1) and (new_link[:-1] != "/"):
                        continue
                    links.append(self.url + new_link)
            except:
                continue

        return list(set(links))
    
    def get_salience(self):
        
        """
        Args: None
        
        Returns a measure of the relative importance of all the named entities. It attempts to do this by simply counting the
        frequency of all the terms both on the webpage and on all the other relevant pages to which the original page links 
        using the __get_links() method. The function then returns a dictionary in descending order of count.
        
        """
        
        corpus=[self.text]
        links = self.__get_links()

        for link in links:

            if (link[:4] != "http"):
                link = "http://" + link

            try:
                html = requests.get(link)
            except:
                html=""
            
            text = ""
            if str(html) == "<Response [200]>":
                
                try:
                    paragraphs = justext.justext(html.text.encode('utf-8'), justext.get_stoplist("English"))
                    for paragraph in paragraphs:
                        if not paragraph.is_boilerplate:
                            text+= paragraph.text + "\n"
                except:
                    print("Could not parse text")
            corpus.append(text)

        all_entities = self.companies + self.people + self.keywords + self.locations + self.events + self.products
        counts = []
        
        for entity in all_entities:
            counts.append(sum(entity in s for s in corpus))
        
        entity_dict = dict(zip(all_entities, counts))
        
        return {k: v for k, v in sorted(entity_dict.items(), key=lambda x: x[1],reverse=True)}

Now, we create an object which parses a file containing a list of urls and extracts content from the webpages, including named entities, storing them in instances of the NamedEntityRecognition class.

In [3]:
class Extract:
    
    def __init__(self, csv_or_url):
        
        """
        Args: filename containing a list of url strings (with 'url' header) or an individual url string
        
        Takes a list of urls or single url and initialises some variables. 
        Adds a function removing whitespace to the spacy pipeline if it has not already been added
        
        """
        
        my_file = Path(csv_or_url)
        
        if my_file.is_file():
            sources=pd.read_csv(csv_or_url,header=None,names=["url"])
        else:
            sources = pd.DataFrame({'url':csv_or_url}, index=[0])
        
        self.sources = sources
        self.bad_urls = []
        self.extracted = self.__extract_text()
        self.ner_list = None
        
        if "__remove_whitespace_entities" not in (dict(nlp.pipeline).keys()):
            nlp.add_pipe(self.__remove_whitespace_entities, after='ner')

    def __extract_text(self):
        
        """
        Args: None
        
        Private function which is called on initiliatisation of the object. Loops through every url in the list and extracts
        non-boilerplate content before saving it as list. Keeps track of urls which are either broken or timeout before
        returning the webpage and saves it in the data member "bad_urls"
        
        """
        
        extracted=[]
        for url in self.sources['url']:

            url = "http://"+url
            
            try:
                html = requests.get(url)
            except requests.exceptions.RequestException as e:
                self.bad_urls.append(url)
                print (e)
            
            if str(html) == "<Response [200]>":
                
                text = ""
                paragraphs = justext.justext(html.text.encode('utf-8'), justext.get_stoplist("English"))
                for paragraph in paragraphs:
                    if not paragraph.is_boilerplate:
                        text+= paragraph.text + "\n"
            else:
                text="<ERROR>"

            extracted.append(text)

        return extracted
    
    def __remove_whitespace_entities(self, doc):
        
        """
        Args: string representing an individual document
        
        A function added to the spacy pipeline due to a bug in spacy which results in certain whitespace characters being 
        recognised as entities: https://github.com/explosion/spaCy/issues/2870
        
        """
        
        doc.ents = [e for e in doc.ents if not e.text.isspace()]
        return doc
        
    def extract_entities(self):
        
        """
        Args: None
        
        Uses the spacy namer entity recogniser to extract named entities from the non-boilerplate content of each webpage.
        Additionally uses the Rake algorithm to extract keywords from the same non-boilerplate content. Creates an instance of
        the NamedEntityRecognition class for each webpage and stores the addresses to the objects in a list called "ner_list"
        
        """
        
        ner_list=[]
        r = Rake()
        k = 10 #top number of key words to extract

        for i, url in enumerate(self.sources['url']):
            ner_list.append(NamedEntityRecognition(url))
            
            ner_list[i].text = self.extracted[i]
            r.extract_keywords_from_text(ner_list[i].text)
            top_k_keywords = [value for value in dict(r.get_ranked_phrases_with_scores()[:k]).values()]
            ner_list[i].keywords = top_k_keywords
            
            doc = nlp(self.extracted[i])
            obj = ner_list[i]
            obj.doc = doc
            
            for entity in doc.ents:
                if (entity.label_ == "ORG"):
                    if entity.text not in  obj.companies: obj.companies.append(entity.text)
                elif (entity.label_ == "PERSON"):
                    if entity.text not in  obj.people: obj.people.append(entity.text)
                elif (entity.label_ == "GPE"):
                    if entity.text not in  obj.locations: obj.locations.append(entity.text)            
                elif (entity.label_ == "EVENT"):
                    if entity.text not in  obj.events: obj.events.append(entity.text)        
                elif (entity.label_ == "PRODUCT"):
                    if entity.text not in  obj.products: obj.products.append(entity.text)
        
        self.ner_list = ner_list
        return
        
    def get_number_of_urls(self):
        
        """
        Args: None
        
        Simple function to just return the number of urls in the input file
        
        """
        return len(self.sources)

Finally, we create a class for adding our extracted entities and keywords to a neo4j graph:

In [4]:
class Neo4j:
    
    def __init__(self, NER_object):
        """
        Args: an object of type NamedEntityRecognition
        
        Takes an object of type NamedEntityRecognition and extracts the url and JSON from the object. 
        Also connects to a neo4j database.
        
        """
        self.uri = "bolt://localhost:7687"
        self.driver = GraphDatabase.driver(self.uri, auth=("neo4j", "4jNeo"))
        self.json = json.loads(NER_object.jsonify())
        self.url = NER_object.url
        self.salience = NER_object.get_salience()
        
    def __create_query(self, tx, test):
        """
        Args: a neo4j driver session
        
        Internal function which creates and runs the query that extracts the information from the JSON and
        inserts them as nodes into the neo4j graph.
        """
        temp_id = 1
        query = ("CREATE (name:website {url:'"+self.url+"'})")
        for i in self.json:
            for j in self.json[i]:
                query += " CREATE (node" + str(temp_id) + ":" + i + " {name:'"+j+"', salience:" + str(self.salience[j]) +"})"
                query += " CREATE (node" + str(temp_id) + ")-[:EXTRACTED_FROM]->(name)"
                temp_id += 1
        if test is False: tx.run(query)
        
        return ("Nodes added to graph:" + str(temp_id))
                
    def add_nodes(self, test=False):
        """
        Args: none
        
        External function which calls __create_function from a neo4j driver session
        """
        with self.driver.session() as session:
            msg = session.read_transaction(self.__create_query, test)
            
        return msg

### Usage

Let's now instantiate our class and see what our webpages have to offer

In [5]:
webpages = Extract("www.clear.ai")

In [6]:
webpages.bad_urls

[]

In [7]:
webpages.extract_entities()

In [8]:
webpages.ner_list[0].url

'www.clear.ai'

In [9]:
webpages.ner_list[0].visualise()

In [10]:
print(webpages.ner_list[0].jsonify())

{
    "companies": [
        "AI"
    ],
    "people": [],
    "keywords": [
        "leave behind disjointed trails",
        "something bigger clear provides",
        "global economy ever made",
        "independent trade data network",
        "reaching network effects",
        "actively encourages diversity",
        "global trade network",
        "processing personal data"
    ],
    "locations": [
        "London"
    ],
    "events": [],
    "products": []
}


In [11]:
salience = webpages.ner_list[0].get_salience()
salience

{'AI': 3,
 'London': 3,
 'leave behind disjointed trails': 2,
 'global economy ever made': 2,
 'independent trade data network': 2,
 'reaching network effects': 2,
 'actively encourages diversity': 2,
 'global trade network': 2,
 'processing personal data': 2,
 'something bigger clear provides': 0}

In [12]:
vals = [i for i in salience.values()]
norm = [format(float(i)/max(vals),'1.2f') for i in vals]
pd.DataFrame({'Extracted': [i for i in salience.keys()],'Relative Importance': norm})

Unnamed: 0,Extracted,Relative Importance
0,AI,1.0
1,London,1.0
2,leave behind disjointed trails,0.67
3,global economy ever made,0.67
4,independent trade data network,0.67
5,reaching network effects,0.67
6,actively encourages diversity,0.67
7,global trade network,0.67
8,processing personal data,0.67
9,something bigger clear provides,0.0


In [20]:
n4j=Neo4j(webpages.ner_list[0])

In [21]:
n4j.add_nodes()

('Nodes added to graph:', 11)

### Output

In [15]:
if os.path.isdir("json") == False:
    !mkdir json

for i in range(webpages.get_number_of_urls()):
    json_text = webpages.ner_list[i].jsonify()
    json_file = open("json/" + webpages.ner_list[i].url,'w')
    json_file.write(json_text)
    json_file.close()

### Testing

In [16]:
import unittest

In [17]:
test_file = ["www.google.com","www.balsejfnweo.cofm"]
np.savetxt("test_file.txt", test_file, delimiter="\n", fmt="%s")

In [18]:
class Test_Everything(unittest.TestCase):
 
    def test_url(self):
        
        obj = Extract("www.google.com")
        obj = Extract("test_file.txt")
        self.assertEqual(obj.get_number_of_urls(), 2)
        self.assertEqual(str(obj.bad_urls[0]), "http://www.balsejfnweo.cofm")
        
    def test_ner_extraction(self):
        
        obj = Extract("test_file.txt")
        self.assertIsNone(obj.ner_list)
        obj.extract_entities()
        self.assertIsNotNone(obj.ner_list)
        
    def test_ner_class(self):
        
        obj = Extract("test_file.txt")
        obj.extract_entities()
        
        with self.assertRaises(IndexError):
            obj.ner_list[2].url
            
        self.assertIsNotNone(obj.ner_list[0].jsonify())
        self.assertIsNotNone(obj.ner_list[0].get_salience())
    
    def test_neo4j(self):
        obj = Extract("test_file.txt")
        obj.extract_entities()
        
        self.assertIsNotNone(obj.ner_list)
        for i in webpages.ner_list:
            n4j=Neo4j(i)
            n4j.add_nodes(test=True)

In [19]:
unittest.main(argv=[''], verbosity=2, exit=False)

test_neo4j (__main__.Test_Everything) ... 

HTTPConnectionPool(host='www.balsejfnweo.cofm', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f13b9b8aa20>: Failed to establish a new connection: [Errno -2] Name or service not known'))


ok
test_ner_class (__main__.Test_Everything) ... 

HTTPConnectionPool(host='www.balsejfnweo.cofm', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f135d9355c0>: Failed to establish a new connection: [Errno -2] Name or service not known'))


ok
test_ner_extraction (__main__.Test_Everything) ... ok
test_url (__main__.Test_Everything) ... 

HTTPConnectionPool(host='www.balsejfnweo.cofm', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f13b9ae29e8>: Failed to establish a new connection: [Errno -2] Name or service not known'))
HTTPConnectionPool(host='www.balsejfnweo.cofm', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f135e1fd198>: Failed to establish a new connection: [Errno -2] Name or service not known'))


ok

----------------------------------------------------------------------
Ran 4 tests in 4.021s

OK


<unittest.main.TestProgram at 0x7f13b0addcf8>