Defer URL management to courlan.UrlStore (experimental) #232

Merged · 16 commits · Feb 6, 2023
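For orientation, a minimal sketch (not part of the diff) of the API shift these test changes exercise: `add_to_compressed_dict` used to return a plain dict mapping hosts to deques of paths and now hands back a courlan `UrlStore`. Only calls visible in the diff are used below; the values shown in comments are inferred from the test assertions and should be treated as assumptions.

```python
# Hedged sketch of the new URL handling, based only on calls visible in this diff.
from courlan import UrlStore

from trafilatura.downloads import add_to_compressed_dict

# add_to_compressed_dict() now returns a courlan UrlStore instead of a
# dict of hosts mapping to deques of paths.
url_store = add_to_compressed_dict(
    ['https://example.org/1', 'https://example.org/2', 'https://example.org/2']
)
assert isinstance(url_store, UrlStore)

# Known URLs are queried per host and come back as full, deduplicated URLs.
print(url_store.find_known_urls('https://example.org'))
# expected (per the test assertions): ['https://example.org/1', 'https://example.org/2']

# The number of tracked hosts is read from the store's dictionary.
print(len(url_store.urldict))  # 1
```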
97 changes: 61 additions & 36 deletions tests/cli_tests.py
@@ -16,7 +16,9 @@

import pytest

from trafilatura import cli, cli_utils
from courlan import UrlStore

from trafilatura import cli, cli_utils, spider
from trafilatura.downloads import add_to_compressed_dict, fetch_url
from trafilatura.settings import DEFAULT_CONFIG

@@ -120,12 +122,6 @@ def test_climain():
# Force encoding to utf-8 for Windows (seems to be a problem only in GitHub Actions)
env['PYTHONIOENCODING'] = 'utf-8'
assert subprocess.run([trafilatura_bin, '--input-dir', RESOURCES_DIR], env=env).returncode == 0
# dump urls
inputdict = add_to_compressed_dict(['https://www.example.org'])
f = io.StringIO()
with redirect_stdout(f):
cli.dump_on_exit(inputdict)
assert f.getvalue() == 'todo: https://www.example.org/\n'


def test_input_type():
@@ -242,15 +238,41 @@ def test_cli_pipeline():
#with redirect_stdout(f):
# cli.process_args(args)
#assert len(f.getvalue()) == 0
# test URL listing

# Force encoding to utf-8 for Windows in future processes spawned by multiprocessing.Pool
os.environ['PYTHONIOENCODING'] = "utf-8"

# Crawling
testargs = ['', '--crawl', 'https://httpbin.org/html']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args)
assert f.getvalue() == 'https://httpbin.org/html\n'
spider.URL_STORE = UrlStore(compressed=False, strict=False)
# links permitted
testargs = ['', '--crawl', 'https://httpbin.org/links/1/1', '--list', '--parallel', '1']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args)
assert f.getvalue() == 'https://httpbin.org/links/1/1\nhttps://httpbin.org/links/1/0\n'
spider.URL_STORE = UrlStore(compressed=False, strict=False)
# 0 links permitted
args.crawl = 'https://httpbin.org/links/4/4'
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args, n=0)
assert len(f.getvalue().split('\n')) == 6
spider.URL_STORE = UrlStore(compressed=False, strict=False)

# test URL listing
testargs = ['', '--list']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
assert cli_utils.url_processing_pipeline(args, {}) is False
assert cli_utils.url_processing_pipeline(args, UrlStore()) is False
# test inputlist + blacklist
testargs = ['', '-i', os.path.join(RESOURCES_DIR, 'list-process.txt')]
with patch.object(sys, 'argv', testargs):
@@ -262,16 +284,16 @@ def test_cli_pipeline():
args = cli.parse_args(testargs)
assert args.blacklist is not None
# test backoff between domain requests
inputdict = add_to_compressed_dict(my_urls, args.blacklist, None, None)
url_store = add_to_compressed_dict(my_urls, args.blacklist, None, None)
reftime = datetime.now()
cli_utils.url_processing_pipeline(args, inputdict)
cli_utils.url_processing_pipeline(args, url_store)
delta = (datetime.now() - reftime).total_seconds()
assert delta > 2
# test blacklist and empty dict
args.blacklist = cli_utils.load_blacklist(args.blacklist)
assert len(args.blacklist) == 2
inputdict = add_to_compressed_dict(my_urls, args.blacklist, None, None)
cli_utils.url_processing_pipeline(args, inputdict)
url_store = add_to_compressed_dict(my_urls, args.blacklist, None, None)
cli_utils.url_processing_pipeline(args, url_store)
# test backup
testargs = ['', '--backup-dir', '/tmp/']
with patch.object(sys, 'argv', testargs):
@@ -312,7 +334,7 @@ def test_cli_pipeline():
f = io.StringIO()
with redirect_stdout(f):
cli.process_args(args)
assert len(f.getvalue()) == 0
assert len(f.getvalue().strip()) == 0
# config file
testargs = ['', '--input-dir', '/dev/null', '--config-file', 'newsettings.cfg']
with patch.object(sys, 'argv', testargs):
@@ -326,10 +348,10 @@ def test_cli_pipeline():
testargs = ['', '--links', '--images']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
#with open(os.path.join(RESOURCES_DIR, 'http_sample.html'), 'r') as f:
# teststring = f.read()
#result = cli.examine(teststring, args)
#assert '[link](testlink.html)' in result # and 'test.jpg' in result
with open(os.path.join(RESOURCES_DIR, 'http_sample.html'), 'r') as f:
teststring = f.read()
result = cli.examine(teststring, args)
assert '[link](testlink.html)' in result and 'test.jpg' in result

# Crawling
testargs = ['', '--crawl', 'https://httpbin.org/html']
@@ -338,21 +360,20 @@ def test_cli_pipeline():
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args)
assert len(f.getvalue()) == 0
assert f.getvalue() == 'https://httpbin.org/html\n'
# links permitted
testargs = ['', '--crawl', 'https://httpbin.org/links/1/1', '--list', '--parallel', '1']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args)
assert f.getvalue() == 'https://httpbin.org/links/1/0\n'
assert f.getvalue().endswith('https://httpbin.org/links/1/0\n')
# 0 links permitted
args.crawl = 'https://httpbin.org/links/4/4'
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args, n=0)
# print(f.getvalue())
assert len(f.getvalue().split('\n')) == 5

# Exploration (Sitemap + Crawl)
@@ -362,7 +383,7 @@ def test_cli_pipeline():
f = io.StringIO()
with redirect_stdout(f):
cli.process_args(args)
assert len(f.getvalue()) == 0
assert f.getvalue() == 'https://httpbin.org/html\n'


def test_input_filtering():
@@ -372,36 +393,40 @@ def test_input_filtering():
args = cli.parse_args(testargs)
# load dictionary
args.input_file = os.path.join(RESOURCES_DIR, 'list-process.txt')
inputdict = cli.load_input_dict(args)
assert inputdict['https://httpbin.org'] == deque(['/status/200', '/status/404'])
url_store = cli.load_input_dict(args)
assert len(url_store.find_known_urls('https://httpbin.org')) == 2
args.input_file = os.path.join(RESOURCES_DIR, 'list-process.txt')
args.blacklist = {'httpbin.org/status/404'}
inputdict = cli.load_input_dict(args)
assert inputdict['https://httpbin.org'] == deque(['/status/200'])
url_store = cli.load_input_dict(args)
assert len(url_store.find_known_urls('https://httpbin.org')) == 1
# deduplication and filtering
myinput = ['https://example.org/1', 'https://example.org/2', 'https://example.org/2', 'https://example.org/3', 'https://example.org/4', 'https://example.org/5', 'https://example.org/6']
myblacklist = {'example.org/1', 'example.org/3', 'example.org/5'}
inputdict = add_to_compressed_dict(myinput, myblacklist)
assert inputdict['https://example.org'] == deque(['/2', '/4', '/6'])
url_store = add_to_compressed_dict(myinput, myblacklist)
assert url_store.find_known_urls('https://example.org') == ['https://example.org/2', 'https://example.org/4', 'https://example.org/6']
# URL in blacklist
args.input_file = os.path.join(RESOURCES_DIR, 'list-process.txt')
my_urls = cli_utils.load_input_urls(args)
my_blacklist = cli_utils.load_blacklist(os.path.join(RESOURCES_DIR, 'list-discard.txt'))
inputdict = add_to_compressed_dict(my_urls, my_blacklist)
assert len(inputdict) == 0
url_store = add_to_compressed_dict(my_urls, my_blacklist)
assert len(url_store.urldict) == 0
# URL filter
args.input_file = os.path.join(RESOURCES_DIR, 'list-process.txt')
my_urls = cli_utils.load_input_urls(args)
assert len(add_to_compressed_dict(my_urls, None, ['status'], None)) == 1
assert len(add_to_compressed_dict(my_urls, None, ['teststring'], None)) == 0
assert len(add_to_compressed_dict(my_urls, None, ['status', 'teststring'], None)) == 1
url_store = add_to_compressed_dict(my_urls, None, ['status'], None)
assert len(url_store.urldict) == 1
url_store = add_to_compressed_dict(my_urls, None, ['teststring'], None)
assert len(url_store.urldict) == 0
url_store = add_to_compressed_dict(my_urls, None, ['status', 'teststring'], None)
assert len(url_store.urldict) == 1
# malformed URLs
inputdict = add_to_compressed_dict(['123345', 'https://www.example.org/1'], {}, None, None)
assert len(inputdict) == 1
url_store = add_to_compressed_dict(['123345', 'https://www.example.org/1'], {}, None, None)
assert len(url_store.urldict) == 1
# double URLs
args.input_file = os.path.join(RESOURCES_DIR, 'redundant-urls.txt')
my_urls = cli_utils.load_input_urls(args)
assert len(my_urls) == 5
url_store = add_to_compressed_dict(my_urls)
assert len(url_store.find_known_urls('https://example.org')) == 1


if __name__ == '__main__':
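One pattern worth calling out from the crawl tests above: the spider now keeps its frontier in a module-level `URL_STORE`, so the tests reset it between runs. A minimal sketch of that reset, using only the names that appear in the diff:

```python
# Sketch of the reset used between crawl assertions in cli_tests.py.
from courlan import UrlStore

from trafilatura import spider


def reset_spider_store():
    """Give the spider a fresh, uncompressed, non-strict URL store."""
    # Without this reset, URLs discovered by one crawl test would already be
    # marked as known when the next test runs against the shared module state.
    spider.URL_STORE = UrlStore(compressed=False, strict=False)
```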
70 changes: 20 additions & 50 deletions tests/downloads_tests.py
@@ -21,12 +21,15 @@

from collections import deque
from datetime import datetime
from time import sleep
from unittest.mock import Mock, patch

from courlan import UrlStore

from trafilatura.cli import parse_args
from trafilatura.cli_utils import download_queue_processing, url_processing_pipeline
from trafilatura.core import extract
from trafilatura.downloads import DEFAULT_HEADERS, USER_AGENT, add_to_compressed_dict, fetch_url, draw_backoff_url, load_download_buffer, _determine_headers, _handle_response, _parse_config, _send_request, _send_pycurl_request
from trafilatura.downloads import DEFAULT_HEADERS, USER_AGENT, add_to_compressed_dict, fetch_url, load_download_buffer, _determine_headers, _handle_response, _parse_config, _send_request, _send_pycurl_request
from trafilatura.settings import DEFAULT_CONFIG, use_config
from trafilatura.utils import decode_response, load_html

@@ -123,68 +126,35 @@ def test_decode():
def test_queue():
'Test creation, modification and download of URL queues.'
# test conversion and storage
inputdict = add_to_compressed_dict(['ftps://www.example.org/', 'http://'])
assert inputdict == {}
inputdict = add_to_compressed_dict(['https://www.example.org/'])
# CLI args
url_store = add_to_compressed_dict(['ftps://www.example.org/', 'http://'])
assert isinstance(url_store, UrlStore)
# download buffer
inputurls = ['https://test.org/1', 'https://test.org/2', 'https://test.org/3', 'https://test2.org/1', 'https://test2.org/2', 'https://test2.org/3', 'https://test3.org/1', 'https://test3.org/2', 'https://test3.org/3', 'https://test4.org/1', 'https://test4.org/2', 'https://test4.org/3', 'https://test5.org/1', 'https://test5.org/2', 'https://test5.org/3', 'https://test6.org/1', 'https://test6.org/2', 'https://test6.org/3']
url_store = add_to_compressed_dict(inputurls)
bufferlist, _, _ = load_download_buffer(url_store, sleep_time=5, threads=1)
assert len(bufferlist) == 6
sleep(0.25)
bufferlist, _, _ = load_download_buffer(url_store, sleep_time=0.1, threads=2)
assert len(bufferlist) == 6
# CLI args
url_store = add_to_compressed_dict(['https://www.example.org/'])
testargs = ['', '--list']
with patch.object(sys, 'argv', testargs):
args = parse_args(testargs)
assert url_processing_pipeline(args, inputdict) is False
assert url_processing_pipeline(args, url_store) is False
# single/multiprocessing
testargs = ['', '-v']
with patch.object(sys, 'argv', testargs):
args = parse_args(testargs)
domain_dict = {
'https://httpbin.org': deque(
[
'/status/301',
'/status/304',
'/status/200',
'/status/300',
'/status/400',
'/status/505',
]
)
}
inputurls = ['https://httpbin.org/status/301', 'https://httpbin.org/status/304', 'https://httpbin.org/status/200', 'https://httpbin.org/status/300', 'https://httpbin.org/status/400', 'https://httpbin.org/status/505']
url_store = add_to_compressed_dict(inputurls)
args.archived = True
args.config_file = os.path.join(RESOURCES_DIR, 'newsettings.cfg')
config = use_config(filename=args.config_file)
config['DEFAULT']['SLEEP_TIME'] = '0.2'
results = download_queue_processing(domain_dict, args, None, config)
results = download_queue_processing(url_store, args, None, config)
## fixed: /301 missing, probably for a good reason...
assert len(results[0]) == 5 and results[1] is None
# test backoff algorithm
backoffdict = {}
testdict = {'http://test.org': deque(['/1'])}
assert draw_backoff_url(testdict, backoffdict, 0) == ('http://test.org/1', dict(), dict())
testdict['http://test.org'] = deque(['/1'])
backoffdict['http://test.org'] = datetime(2019, 5, 18, 15, 17, 8, 132263)
assert draw_backoff_url(testdict, backoffdict, 0) == ('http://test.org/1', dict(), dict())
# concurrent domains
testdict = {}
backoffdict = {}
testdict['http://test.org'] = deque(['/1'])
testdict['http://example.org'] = deque(['/1'])
# simulate recent request
backoffdict['http://test.org'] = datetime.now()
# must return the other domain
test = draw_backoff_url(testdict, backoffdict, 5)
assert test[0], test[1] == ('http://example.org/1', {'http://test.org': deque(['/1'])})
print(test)
assert test[2] != {}
# sleeps and returns the rest
assert draw_backoff_url(testdict, backoffdict, 1) == ('http://test.org/1', {}, {})
# code hangs, logical:
#testdict['http://test.org'] = deque(['/1'])
#backoffdict['http://test.org'] = datetime(2030, 5, 18, 15, 17, 8, 132263)
#assert draw_backoff_url(testdict, backoffdict, 0) == ('http://test.org/1', dict(), dict())
# download buffer
domain_dict = {'https://test.org': deque(['/1', '/2', '/3']), 'https://test2.org': deque(['/1', '/2', '/3']), 'https://test3.org': deque(['/1', '/2', '/3']), 'https://test4.org': deque(['/1', '/2', '/3']), 'https://test5.org': deque(['/1', '/2', '/3']), 'https://test6.org': deque(['/1', '/2', '/3'])}
bufferlist, _, _, _ = load_download_buffer(domain_dict, dict(), sleep_time=5, threads=1)
assert len(bufferlist) == 6
bufferlist, _, _, _ = load_download_buffer(domain_dict, dict(), sleep_time=5, threads=2)
assert len(bufferlist) == 6


if __name__ == '__main__':
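On the downloads side, the removed `draw_backoff_url` tests show that per-host backoff is now exercised through `load_download_buffer` alone, which takes the `UrlStore` directly and no longer needs a separate backoff dict. A hedged sketch mirroring the calls in the new test (the one-URL-per-eligible-host reading of `bufferlist` is an assumption drawn from the assertions):

```python
# Sketch of the new buffering calls, mirroring test_queue() above.
from time import sleep

from trafilatura.downloads import add_to_compressed_dict, load_download_buffer

# 18 URLs spread over 6 hosts, similar to the test fixture.
urls = [f'https://test{i}.org/{j}' for i in range(1, 7) for j in range(1, 4)]
url_store = add_to_compressed_dict(urls)

# The buffer call now takes the store itself; the tests expect six buffered
# URLs here, i.e. presumably one per eligible host per pass.
bufferlist, _, _ = load_download_buffer(url_store, sleep_time=5, threads=1)
assert len(bufferlist) == 6

# After a short pause and with a small sleep_time, the hosts are eligible again.
sleep(0.25)
bufferlist, _, _ = load_download_buffer(url_store, sleep_time=0.1, threads=2)
assert len(bufferlist) == 6
```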