In [7]:
from pathlib import Path
import sys, os
libs_path = (Path(os.path.abspath(os.path.join('..'))).parent)
sys.path.append(str(libs_path))

from libs.corpus import get_corpus
import re
import requests
from http.client import RemoteDisconnected
from urllib.error import HTTPError, URLError
from urllib.request import urlopen
from bs4 import BeautifulSoup
from libs.storage import get_crawled_dataset, save_crawled_dataset
from urllib.parse import urlparse
import uuid


In [9]:
def crawl_dataset(dataset_name: str):
    corpus = dict(list(get_corpus(dataset_name).items())[:500])

    new_docs = []
    for doc in corpus.values():
        new_docs.extend(_expand_document_with_crawled_data(doc))

    for doc in new_docs:
        corpus[str(uuid.uuid4())] = doc

    save_crawled_dataset(corpus, dataset_name)


def __extractURLs(content):
    urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', content)
    return urls


def _expand_document_with_crawled_data(doc_content: str) -> list:
    document_included_urls = __extractURLs(doc_content)
    new_docs = []
    if len(document_included_urls) > 0:
        for url in document_included_urls:
            crawled_text = __crawl(url)
            new_docs += crawled_text
    return new_docs


def __is_text_url(url):
    # send a HEAD request to the URL to retrieve the headers
    response = requests.head(url)

    # check the Content-Type and Content-Disposition headers
    content_type = response.headers['Content-Type']
    content_disposition = response.headers.get('Content-Disposition', '')
    if 'text' in content_type and 'attachment' not in content_disposition:
        # check the file extension of the URL
        resource = urlparse(url).path
        file_extension = resource.split('.')[-1]
        # array of common text files extensions
        text_file_extensions = ['txt', 'html', 'htm', 'xml', 'csv', 'json', 'md', 'rst', 'php', 'asp', 'aspx', 'css',
                                'js', 'py', 'rb', 'java', 'c', 'cpp', 'h', 'sh', 'bat', 'log', 'ini', 'conf', 'yml',
                                'yaml']
        if file_extension in text_file_extensions:
            return True

        # download a small portion of the response and check its contents
        response = requests.get(url, stream=True)
        content = response.raw.read(1024)
        if all(32 <= c < 127 or c in (9, 10, 13) for c in content):
            return True

    return False


In [10]:
def __crawl(url):
    try:
        if __is_text_url(url):
            print(f"crawling {url}")
            html = urlopen(url).read()
            soup = BeautifulSoup(html, features="html.parser")

            # kill all script and style elements
            for script in soup(["script", "style"]):
                script.extract()  # rip it out

            # get text
            text = soup.get_text()

            # ###### some text processing #######
            # break into lines and remove leading and trailing space on each
            lines = (line.strip() for line in text.splitlines())
            # break multi-headlines into a line each
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            # drop blank lines
            text = '\n'.join(chunk for chunk in chunks if chunk)
            return text
        else:
            return ''
    except (HTTPError, URLError, RemoteDisconnected) as e:
        return ''
    except Exception as e:
        return ''

In [11]:


crawl_dataset("lifestyle")

corpus = get_corpus("lifestyle")
print(len(corpus))

crawled_corpus = get_crawled_dataset("lifestyle")
print(len(crawled_corpus))


# crawl_dataset("antique")

# corpus = get_corpus("antique")
# print(len(corpus))

# crawled_corpus = get_crawled_dataset("antique")
# print(len(crawled_corpus))


Loading lifestyle dataset 268893
crawling http://movies.netflix.com/WiMovie/The_Very_Best_of_Dog_Whisperer_with_Cesar_Millan/70270440.
crawling http://www.declawing.com/:
crawling http://freshaquarium.about.com
crawling http://avetsguidetolife.blogspot.com/2013/04/corn-in-foodno-its-not-bad.html
crawling http://www.kumpi.com/corn.php
crawling http://healthypets.mercola.com/sites/healthypets/archive/2013/01/02/veterinary-nutritionists-favor-commercial-food.aspx
crawling http://www.rottweiler.net/forums/general-info/91525-never-let-your-rottie-win-tug-war.html
crawling http://dogcare.dailypuppy.com/dog-stop-nipping-during-tugofwar-1709.html
crawling http://www.netplaces.com/dog-obedience/games-dogs-play/tug-of-war.htm
Loading lifestyle dataset 268893
268893
25994
