Defer URL management to courlan.UrlStore (experimental) #232

Merged · 16 commits · Feb 6, 2023
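For orientation, a minimal sketch (not part of the diff) of the API shift these test changes exercise: `add_to_compressed_dict` used to return a plain dict mapping hosts to deques of paths and now hands back a courlan `UrlStore`. Only calls visible in the diff are used below; the values shown in comments are inferred from the test assertions and should be treated as assumptions.

```python
# Hedged sketch of the new URL handling, based only on calls visible in this diff.
from courlan import UrlStore

from trafilatura.downloads import add_to_compressed_dict

# add_to_compressed_dict() now returns a courlan UrlStore instead of a
# dict of hosts mapping to deques of paths.
url_store = add_to_compressed_dict(
    ['https://example.org/1', 'https://example.org/2', 'https://example.org/2']
)
assert isinstance(url_store, UrlStore)

# Known URLs are queried per host and come back as full, deduplicated URLs.
print(url_store.find_known_urls('https://example.org'))
# expected (per the test assertions): ['https://example.org/1', 'https://example.org/2']

# The number of tracked hosts is read from the store's dictionary.
print(len(url_store.urldict))  # 1
```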
97 changes: 61 additions & 36 deletions tests/cli_tests.py
@@ -16,7 +16,9 @@

import pytest

from trafilatura import cli, cli_utils
from courlan import UrlStore

from trafilatura import cli, cli_utils, spider
from trafilatura.downloads import add_to_compressed_dict, fetch_url
from trafilatura.settings import DEFAULT_CONFIG

@@ -120,12 +122,6 @@ def test_climain():
# Force encoding to utf-8 for Windows (seems to be a problem only in GitHub Actions)
env['PYTHONIOENCODING'] = 'utf-8'
assert subprocess.run([trafilatura_bin, '--input-dir', RESOURCES_DIR], env=env).returncode == 0
# dump urls
inputdict = add_to_compressed_dict(['https://www.example.org'])
f = io.StringIO()
with redirect_stdout(f):
cli.dump_on_exit(inputdict)
assert f.getvalue() == 'todo: https://www.example.org/\n'


def test_input_type():
@@ -242,15 +238,41 @@ def test_cli_pipeline():
#with redirect_stdout(f):
# cli.process_args(args)
#assert len(f.getvalue()) == 0
# test URL listing

# Force encoding to utf-8 for Windows in future processes spawned by multiprocessing.Pool
os.environ['PYTHONIOENCODING'] = "utf-8"

# Crawling
testargs = ['', '--crawl', 'https://httpbin.org/html']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args)
assert f.getvalue() == 'https://httpbin.org/html\n'
spider.URL_STORE = UrlStore(compressed=False, strict=False)
# links permitted
testargs = ['', '--crawl', 'https://httpbin.org/links/1/1', '--list', '--parallel', '1']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args)
assert f.getvalue() == 'https://httpbin.org/links/1/1\nhttps://httpbin.org/links/1/0\n'
spider.URL_STORE = UrlStore(compressed=False, strict=False)
# 0 links permitted
args.crawl = 'https://httpbin.org/links/4/4'
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args, n=0)
assert len(f.getvalue().split('\n')) == 6
spider.URL_STORE = UrlStore(compressed=False, strict=False)

# test URL listing
testargs = ['', '--list']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
assert cli_utils.url_processing_pipeline(args, {}) is False
assert cli_utils.url_processing_pipeline(args, UrlStore()) is False
# test inputlist + blacklist
testargs = ['', '-i', os.path.join(RESOURCES_DIR, 'list-process.txt')]
with patch.object(sys, 'argv', testargs):
@@ -262,16 +284,16 @@ def test_cli_pipeline():
args = cli.parse_args(testargs)
assert args.blacklist is not None
# test backoff between domain requests
inputdict = add_to_compressed_dict(my_urls, args.blacklist, None, None)
url_store = add_to_compressed_dict(my_urls, args.blacklist, None, None)
reftime = datetime.now()
cli_utils.url_processing_pipeline(args, inputdict)
cli_utils.url_processing_pipeline(args, url_store)
delta = (datetime.now() - reftime).total_seconds()
assert delta > 2
# test blacklist and empty dict
args.blacklist = cli_utils.load_blacklist(args.blacklist)
assert len(args.blacklist) == 2
inputdict = add_to_compressed_dict(my_urls, args.blacklist, None, None)
cli_utils.url_processing_pipeline(args, inputdict)
url_store = add_to_compressed_dict(my_urls, args.blacklist, None, None)
cli_utils.url_processing_pipeline(args, url_store)
# test backup
testargs = ['', '--backup-dir', '/tmp/']
with patch.object(sys, 'argv', testargs):
@@ -312,7 +334,7 @@ def test_cli_pipeline():
f = io.StringIO()
with redirect_stdout(f):
cli.process_args(args)
assert len(f.getvalue()) == 0
assert len(f.getvalue().strip()) == 0
# config file
testargs = ['', '--input-dir', '/dev/null', '--config-file', 'newsettings.cfg']
with patch.object(sys, 'argv', testargs):
@@ -326,10 +348,10 @@ def test_cli_pipeline():
testargs = ['', '--links', '--images']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
#with open(os.path.join(RESOURCES_DIR, 'http_sample.html'), 'r') as f:
# teststring = f.read()
#result = cli.examine(teststring, args)
#assert '[link](testlink.html)' in result # and 'test.jpg' in result
with open(os.path.join(RESOURCES_DIR, 'http_sample.html'), 'r') as f:
teststring = f.read()
result = cli.examine(teststring, args)
assert '[link](testlink.html)' in result and 'test.jpg' in result

# Crawling
testargs = ['', '--crawl', 'https://httpbin.org/html']
@@ -338,21 +360,20 @@ def test_cli_pipeline():
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args)
assert len(f.getvalue()) == 0
assert f.getvalue() == 'https://httpbin.org/html\n'
# links permitted
testargs = ['', '--crawl', 'https://httpbin.org/links/1/1', '--list', '--parallel', '1']
with patch.object(sys, 'argv', testargs):
args = cli.parse_args(testargs)
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args)
assert f.getvalue() == 'https://httpbin.org/links/1/0\n'
assert f.getvalue().endswith('https://httpbin.org/links/1/0\n')
# 0 links permitted
args.crawl = 'https://httpbin.org/links/4/4'
f = io.StringIO()
with redirect_stdout(f):
cli_utils.cli_crawler(args, n=0)
# print(f.getvalue())
assert len(f.getvalue().split('\n')) == 5

# Exploration (Sitemap + Crawl)
@@ -362,7 +383,7 @@ def test_cli_pipeline():
f = io.StringIO()
with redirect_stdout(f):
cli.process_args(args)
assert len(f.getvalue()) == 0
assert f.getvalue() == 'https://httpbin.org/html\n'


def test_input_filtering():
@@ -372,36 +393,40 @@ def test_input_filtering():
args = cli.parse_args(testargs)
# load dictionary
args.input_file = os.path.join(RESOURCES_DIR, 'list-process.txt')
inputdict = cli.load_input_dict(args)
assert inputdict['https://httpbin.org'] == deque(['/status/200', '/status/404'])
url_store = cli.load_input_dict(args)
assert len(url_store.find_known_urls('https://httpbin.org')) == 2
args.input_file = os.path.join(RESOURCES_DIR, 'list-process.txt')
args.blacklist = {'httpbin.org/status/404'}
inputdict = cli.load_input_dict(args)
assert inputdict['https://httpbin.org'] == deque(['/status/200'])
url_store = cli.load_input_dict(args)
assert len(url_store.find_known_urls('https://httpbin.org')) == 1
# deduplication and filtering
myinput = ['https://example.org/1', 'https://example.org/2', 'https://example.org/2', 'https://example.org/3', 'https://example.org/4', 'https://example.org/5', 'https://example.org/6']
myblacklist = {'example.org/1', 'example.org/3', 'example.org/5'}
inputdict = add_to_compressed_dict(myinput, myblacklist)
assert inputdict['https://example.org'] == deque(['/2', '/4', '/6'])
url_store = add_to_compressed_dict(myinput, myblacklist)
assert url_store.find_known_urls('https://example.org') == ['https://example.org/2', 'https://example.org/4', 'https://example.org/6']
# URL in blacklist
args.input_file = os.path.join(RESOURCES_DIR, 'list-process.txt')
my_urls = cli_utils.load_input_urls(args)
my_blacklist = cli_utils.load_blacklist(os.path.join(RESOURCES_DIR, 'list-discard.txt'))
inputdict = add_to_compressed_dict(my_urls, my_blacklist)
assert len(inputdict) == 0
url_store = add_to_compressed_dict(my_urls, my_blacklist)
assert len(url_store.urldict) == 0
# URL filter
args.input_file = os.path.join(RESOURCES_DIR, 'list-process.txt')
my_urls = cli_utils.load_input_urls(args)
assert len(add_to_compressed_dict(my_urls, None, ['status'], None)) == 1
assert len(add_to_compressed_dict(my_urls, None, ['teststring'], None)) == 0
assert len(add_to_compressed_dict(my_urls, None, ['status', 'teststring'], None)) == 1
url_store = add_to_compressed_dict(my_urls, None, ['status'], None)
assert len(url_store.urldict) == 1
url_store = add_to_compressed_dict(my_urls, None, ['teststring'], None)
assert len(url_store.urldict) == 0
url_store = add_to_compressed_dict(my_urls, None, ['status', 'teststring'], None)
assert len(url_store.urldict) == 1
# malformed URLs
inputdict = add_to_compressed_dict(['123345', 'https://www.example.org/1'], {}, None, None)
assert len(inputdict) == 1
url_store = add_to_compressed_dict(['123345', 'https://www.example.org/1'], {}, None, None)
assert len(url_store.urldict) == 1
# double URLs
args.input_file = os.path.join(RESOURCES_DIR, 'redundant-urls.txt')
my_urls = cli_utils.load_input_urls(args)
assert len(my_urls) == 5
url_store = add_to_compressed_dict(my_urls)
assert len(url_store.find_known_urls('https://example.org')) == 1


if __name__ == '__main__':
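One pattern worth calling out from the crawl tests above: the spider now keeps its frontier in a module-level `URL_STORE`, so the tests reset it between runs. A minimal sketch of that reset, using only the names that appear in the diff:

```python
# Sketch of the reset used between crawl assertions in cli_tests.py.
from courlan import UrlStore

from trafilatura import spider


def reset_spider_store():
    """Give the spider a fresh, uncompressed, non-strict URL store."""
    # Without this reset, URLs discovered by one crawl test would already be
    # marked as known when the next test runs against the shared module state.
    spider.URL_STORE = UrlStore(compressed=False, strict=False)
```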
70 changes: 20 additions & 50 deletions tests/downloads_tests.py
@@ -21,12 +21,15 @@

from collections import deque
from datetime import datetime
from time import sleep
from unittest.mock import Mock, patch

from courlan import UrlStore

from trafilatura.cli import parse_args
from trafilatura.cli_utils import download_queue_processing, url_processing_pipeline
from trafilatura.core import extract
from trafilatura.downloads import DEFAULT_HEADERS, USER_AGENT, add_to_compressed_dict, fetch_url, draw_backoff_url, load_download_buffer, _determine_headers, _handle_response, _parse_config, _send_request, _send_pycurl_request
from trafilatura.downloads import DEFAULT_HEADERS, USER_AGENT, add_to_compressed_dict, fetch_url, load_download_buffer, _determine_headers, _handle_response, _parse_config, _send_request, _send_pycurl_request
from trafilatura.settings import DEFAULT_CONFIG, use_config
from trafilatura.utils import decode_response, load_html

@@ -123,68 +126,35 @@ def test_decode():
def test_queue():
'Test creation, modification and download of URL queues.'
# test conversion and storage
inputdict = add_to_compressed_dict(['ftps://www.example.org/', 'http://'])
assert inputdict == {}
inputdict = add_to_compressed_dict(['https://www.example.org/'])
# CLI args
url_store = add_to_compressed_dict(['ftps://www.example.org/', 'http://'])
assert isinstance(url_store, UrlStore)
# download buffer
inputurls = ['https://test.org/1', 'https://test.org/2', 'https://test.org/3', 'https://test2.org/1', 'https://test2.org/2', 'https://test2.org/3', 'https://test3.org/1', 'https://test3.org/2', 'https://test3.org/3', 'https://test4.org/1', 'https://test4.org/2', 'https://test4.org/3', 'https://test5.org/1', 'https://test5.org/2', 'https://test5.org/3', 'https://test6.org/1', 'https://test6.org/2', 'https://test6.org/3']
url_store = add_to_compressed_dict(inputurls)
bufferlist, _, _ = load_download_buffer(url_store, sleep_time=5, threads=1)
assert len(bufferlist) == 6
sleep(0.25)
bufferlist, _, _ = load_download_buffer(url_store, sleep_time=0.1, threads=2)
assert len(bufferlist) == 6
# CLI args
url_store = add_to_compressed_dict(['https://www.example.org/'])
testargs = ['', '--list']
with patch.object(sys, 'argv', testargs):
args = parse_args(testargs)
assert url_processing_pipeline(args, inputdict) is False
assert url_processing_pipeline(args, url_store) is False
# single/multiprocessing
testargs = ['', '-v']
with patch.object(sys, 'argv', testargs):
args = parse_args(testargs)
domain_dict = {
'https://httpbin.org': deque(
[
'/status/301',
'/status/304',
'/status/200',
'/status/300',
'/status/400',
'/status/505',
]
)
}
inputurls = ['https://httpbin.org/status/301', 'https://httpbin.org/status/304', 'https://httpbin.org/status/200', 'https://httpbin.org/status/300', 'https://httpbin.org/status/400', 'https://httpbin.org/status/505']
url_store = add_to_compressed_dict(inputurls)
args.archived = True
args.config_file = os.path.join(RESOURCES_DIR, 'newsettings.cfg')
config = use_config(filename=args.config_file)
config['DEFAULT']['SLEEP_TIME'] = '0.2'
results = download_queue_processing(domain_dict, args, None, config)
results = download_queue_processing(url_store, args, None, config)
## fixed: /301 missing, probably for a good reason...
assert len(results[0]) == 5 and results[1] is None
# test backoff algorithm
backoffdict = {}
testdict = {'http://test.org': deque(['/1'])}
assert draw_backoff_url(testdict, backoffdict, 0) == ('http://test.org/1', dict(), dict())
testdict['http://test.org'] = deque(['/1'])
backoffdict['http://test.org'] = datetime(2019, 5, 18, 15, 17, 8, 132263)
assert draw_backoff_url(testdict, backoffdict, 0) == ('http://test.org/1', dict(), dict())
# concurrent domains
testdict = {}
backoffdict = {}
testdict['http://test.org'] = deque(['/1'])
testdict['http://example.org'] = deque(['/1'])
# simulate recent request
backoffdict['http://test.org'] = datetime.now()
# must return the other domain
test = draw_backoff_url(testdict, backoffdict, 5)
assert test[0], test[1] == ('http://example.org/1', {'http://test.org': deque(['/1'])})
print(test)
assert test[2] != {}
# sleeps and returns the rest
assert draw_backoff_url(testdict, backoffdict, 1) == ('http://test.org/1', {}, {})
# code hangs, logical:
#testdict['http://test.org'] = deque(['/1'])
#backoffdict['http://test.org'] = datetime(2030, 5, 18, 15, 17, 8, 132263)
#assert draw_backoff_url(testdict, backoffdict, 0) == ('http://test.org/1', dict(), dict())
# download buffer
domain_dict = {'https://test.org': deque(['/1', '/2', '/3']), 'https://test2.org': deque(['/1', '/2', '/3']), 'https://test3.org': deque(['/1', '/2', '/3']), 'https://test4.org': deque(['/1', '/2', '/3']), 'https://test5.org': deque(['/1', '/2', '/3']), 'https://test6.org': deque(['/1', '/2', '/3'])}
bufferlist, _, _, _ = load_download_buffer(domain_dict, dict(), sleep_time=5, threads=1)
assert len(bufferlist) == 6
bufferlist, _, _, _ = load_download_buffer(domain_dict, dict(), sleep_time=5, threads=2)
assert len(bufferlist) == 6


if __name__ == '__main__':
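On the downloads side, the removed `draw_backoff_url` tests show that per-host backoff is now exercised through `load_download_buffer` alone, which takes the `UrlStore` directly and no longer needs a separate backoff dict. A hedged sketch mirroring the calls in the new test (the one-URL-per-eligible-host reading of `bufferlist` is an assumption drawn from the assertions):

```python
# Sketch of the new buffering calls, mirroring test_queue() above.
from time import sleep

from trafilatura.downloads import add_to_compressed_dict, load_download_buffer

# 18 URLs spread over 6 hosts, similar to the test fixture.
urls = [f'https://test{i}.org/{j}' for i in range(1, 7) for j in range(1, 4)]
url_store = add_to_compressed_dict(urls)

# The buffer call now takes the store itself; the tests expect six buffered
# URLs here, i.e. presumably one per eligible host per pass.
bufferlist, _, _ = load_download_buffer(url_store, sleep_time=5, threads=1)
assert len(bufferlist) == 6

# After a short pause and with a small sleep_time, the hosts are eligible again.
sleep(0.25)
bufferlist, _, _ = load_download_buffer(url_store, sleep_time=0.1, threads=2)
assert len(bufferlist) == 6
```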