From b35f4f6054a07dfeb9e0eb7e692a56c2ba66f6f8 Mon Sep 17 00:00:00 2001
From: Ghost_chu
Date: Sat, 8 Jun 2024 15:33:10 +0800
Subject: [PATCH 1/2] Refactor downloads to a shared worker queue with early
 skip checks

---
 pywaybackup/Verbosity.py |   4 +-
 pywaybackup/archive.py   | 106 ++++++++++++++++++++++++++--------------
 pywaybackup/main.py      |   2 +-
 3 files changed, 71 insertions(+), 41 deletions(-)

diff --git a/pywaybackup/Verbosity.py b/pywaybackup/Verbosity.py
index df7b11e..6f39fea 100644
--- a/pywaybackup/Verbosity.py
+++ b/pywaybackup/Verbosity.py
@@ -41,8 +41,8 @@ def write(cls, message: str = None, progress: int = None):
                 print("")
                 maxval = sc.count_list()
                 cls.pbar = tqdm.tqdm(total=maxval, desc="Downloading", unit=" snapshot", ascii="░▒█")
-            elif cls.pbar is not None and progress == 1:
-                cls.pbar.update(1)
+            elif cls.pbar is not None and progress is not None and progress >= 1:  # allow fast-forwarding the bar by a given amount
+                cls.pbar.update(progress)
                 cls.pbar.refresh()
         elif cls.mode == "json":
             pass
diff --git a/pywaybackup/archive.py b/pywaybackup/archive.py
index b8e016e..a69899e 100644
--- a/pywaybackup/archive.py
+++ b/pywaybackup/archive.py
@@ -6,6 +6,7 @@
 import json
 import urllib.parse
 import http.client
+import queue
 from urllib.parse import urljoin
 from datetime import datetime, timezone
@@ -144,7 +145,7 @@ def query_list(url: str, range: int, start: int, end: int, explicit: bool, mode:
 
 
 # example download: http://web.archive.org/web/20190815104545id_/https://www.google.com/
-def download_list(output, retry, no_redirect, workers, skipset: set = None):
+def download_list(output, retry, no_redirect, workers, skipset: set = None, skipfile=None):
     """
     Download a list of urls in format: [{"timestamp": "20190815104545", "url": "https://www.google.com/"}]
     """
@@ -152,20 +153,30 @@
     if sc.count_list() == 0:
         vb.write("\nNothing to download"); return
     vb.write("\nDownloading snapshots...", progress=0)
-    if workers > 1:
-        vb.write(f"\n-----> Simultaneous downloads: {workers}")
-        batch_size = sc.count_list() // workers + 1
-    else:
-        batch_size = sc.count_list()
     sc.create_collection()
     vb.write("\n-----> Snapshots prepared")
-    batch_list = [sc.SNAPSHOT_COLLECTION[i:i + batch_size] for i in range(0, len(sc.SNAPSHOT_COLLECTION), batch_size)]
+    fifo_queue = queue.Queue()
     threads = []
     worker = 0
-    for batch in batch_list:
+
+    # filter out snapshots that are already in the skip file or already on disk, so they never reach a worker
+    skipped = 0
+    for snapshot in sc.SNAPSHOT_COLLECTION:
+        if should_skip(snapshot, skipset, output):
+            if skipset is not None:
+                skip_write(skipset, snapshot["url_archive"])
+            skipped += 1
+        else:
+            fifo_queue.put(snapshot)
+    for _ in range(workers):
+        fifo_queue.put(None)  # one sentinel per worker so every download loop terminates
+    vb.write(f"\n-----> Skipped: {skipped} snapshots (skip file or already on disk)")
+    if skipped:
+        vb.write(progress=skipped)
+    for _ in range(workers):
         worker += 1
         vb.write(f"\n-----> Starting worker: {worker}")
-        thread = threading.Thread(target=download_loop, args=(batch, output, worker, retry, no_redirect, skipset))
+        thread = threading.Thread(target=download_loop, args=(fifo_queue, output, worker, retry, no_redirect, skipset, skipfile))
         threads.append(thread)
         thread.start()
     for thread in threads:
@@ -175,11 +186,16 @@
         vb.write(f"\nFiles downloaded: {successed}")
         vb.write(f"Files missing: {failed}\n")
 
+def should_skip(snapshot, skipset, output):
+    """
+    True if the snapshot is already in the skip set or its output file already exists on disk.
+    """
+    if skipset is not None and snapshot["url_archive"] in skipset:
+        return True
+    output_file = sc.create_output(snapshot["url_archive"], snapshot["timestamp"], output)
+    return os.path.isfile(output_file)
 
-
-
-
-def download_loop(snapshot_batch, output, worker, retry, no_redirect, skipset=None, attempt=1, connection=None):
+def download_loop(snapshot_queue, output, worker, retry, no_redirect, skipset=None, skipfile=None, attempt=1, connection=None):
     """
     Download a list of URLs in a recursive loop. If a download fails, the function will retry the download.
     The "snapshot_collection" dictionary will be updated with the download status and file information.
@@ -189,13 +205,22 @@ def download_loop(snapshot_batch, output, worker, retry, no_redirect, skipset=N
     failed_urls = []
     if not connection:
         connection = http.client.HTTPSConnection("web.archive.org")
-    if attempt > max_attempt:
-        connection.close()
-        vb.write(f"\n-----> Worker: {worker} - Failed downloads: {len(snapshot_batch)}")
-        return
-    for snapshot in snapshot_batch:
-        status = f"\n-----> Attempt: [{attempt}/{max_attempt}] Snapshot [{snapshot_batch.index(snapshot)+1}/{len(snapshot_batch)}] - Worker: {worker}"
+    if attempt > max_attempt:
+        connection.close()
+        return
+
+    count = 0
+    # pull snapshots from the shared queue so a crashing worker can no longer
+    # strand the unprocessed remainder of a pre-assigned batch
+    while True:
+        snapshot = snapshot_queue.get()
+        if snapshot is None:  # sentinel from download_list: the queue is drained
+            break
+        status = f"\n-----> Attempt: [{attempt}/{max_attempt}] - Worker: {worker}"
         download_status = download(output, snapshot, connection, status, no_redirect, skipset)
+        count += 1
+        if skipfile is not None and count % 300 == 0:
+            skip_save(skipfile, skipset)
         if not download_status:
             failed_urls.append(snapshot)
         if download_status:
@@ -205,10 +230,11 @@ def download_loop(snapshot_batch, output, worker, retry, no_redirect, skipset=N
     if not attempt > max_attempt:
         vb.write(f"\n-----> Worker: {worker} - Retry Timeout: 15 seconds")
         time.sleep(15)
-        download_loop(failed_urls, output, worker, retry, no_redirect, skipset, attempt, connection)
-
-
-
+        retry_queue = queue.Queue()
+        for snapshot in failed_urls:
+            retry_queue.put(snapshot)
+        retry_queue.put(None)  # sentinel so the retry pass terminates as well
+        download_loop(retry_queue, output, worker, retry, no_redirect, skipset, skipfile, attempt, connection)
 
 
 def download(output, snapshot_entry, connection, status_message, no_redirect=False, skipset=None):
@@ -220,9 +246,7 @@ def download(output, snapshot_entry, connection, status_message, no_redirect=Fa
     """
     download_url = snapshot_entry["url_archive"]
     encoded_download_url = urllib.parse.quote(download_url, safe=':/') # used for GET - otherwise always download_url
-    if skipset and skip_read(skipset, download_url):
-        vb.write(f"\nSKIPPING -> URL: {download_url}")
-        return True
+    # skipset check removed: should_skip() already filters these snapshots in download_list before queueing
     max_retries = 2
     sleep_time = 45
     headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'}
@@ -314,18 +338,11 @@ def download(output, snapshot_entry, connection, status_message, no_redirect=Fa
             f" -> {e}"
         vb.write(status_message)
         return False
-        # connection timeout waits and retries
-        except requests.exceptions.Timeout as e:
-            status_message = f"{status_message}\n" + \
-                f"TIMEOUT -> ({i+1}/{max_retries}), reconnect in {sleep_time} seconds...\n" + \
-                f" -> {e}"
-            vb.write(status_message)
-            time.sleep(sleep_time)
-        # connection refused waits and retries
-        except ConnectionRefusedError as e:
-            status_message = f"{status_message}\n" + \
-                f"REFUSED -> ({i+1}/{max_retries}), reconnect in {sleep_time} seconds...\n" + \
-                f" -> {e}"
+        # catch the remaining connection errors in one place so no exception type slips through
+        except Exception as e:
+            status_message = f"{status_message}\n" + \
+                f"RETRY -> ({i+1}/{max_retries}), reconnect in {sleep_time} seconds...\n" + \
+                f" -> {e}"
             vb.write(status_message)
             time.sleep(sleep_time)
     vb.write(f"FAILED -> download, append to failed_urls: {download_url}")
@@ -413,6 +430,19 @@ def skip_read(skipset: set, archive_url: str) -> bool:
     """
     return archive_url in skipset
 
+def skip_save(skipfile: object, skipset: set):
+    """
+    Periodically overwrite the skip file with the current set content so
+    progress survives a crash or hang mid-run.
+    """
+    try:
+        skipfile.seek(0)
+        skipfile.truncate()
+        skipfile.write('\n'.join(skipset))
+        skipfile.flush()  # push the buffer to disk; a crash would otherwise discard unwritten saves
+    except Exception as e:
+        ex.exception("Could not save skip-file", e)
+
 def skip_close(skipfile: object, skipset: set):
     """
     Overwrite existing skip file with the new set content.
diff --git a/pywaybackup/main.py b/pywaybackup/main.py
index bb45c5b..42ac5de 100644
--- a/pywaybackup/main.py
+++ b/pywaybackup/main.py
@@ -39,7 +39,7 @@ def main():
         if args.list:
             archive.print_list()
         else:
-            archive.download_list(args.output, args.retry, args.no_redirect, args.workers, skipset)
+            archive.download_list(args.output, args.retry, args.no_redirect, args.workers, skipset, skipfile)
     finally:
         archive.skip_close(skipfile, skipset) if args.skip else None
         archive.csv_close(args.csv, args.url) if args.csv else None

From d3ee2ebdd95b1e7acaa95a0a2e878b77beaf3eb0 Mon Sep 17 00:00:00 2001
From: Ghost_chu
Date: Sat, 8 Jun 2024 15:35:43 +0800
Subject: [PATCH 2/2] Add a comment explaining the periodic skip-file save

---
 pywaybackup/archive.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pywaybackup/archive.py b/pywaybackup/archive.py
index a69899e..26fad48 100644
--- a/pywaybackup/archive.py
+++ b/pywaybackup/archive.py
@@ -220,7 +220,7 @@ def download_loop(snapshot_queue, output, worker, retry, no_redirect, skipset=N
         download_status = download(output, snapshot, connection, status, no_redirect, skipset)
         count += 1
         if skipfile is not None and count % 300 == 0:
-            skip_save(skipfile, skipset)
+            skip_save(skipfile, skipset)  # save periodically so progress is not lost if the program hangs
         if not download_status:
             failed_urls.append(snapshot)
         if download_status:
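
---

A note on the worker pattern in PATCH 1/2: a single shared queue.Queue with one None
sentinel per worker is the standard thread-safe producer/consumer idiom in Python, and
it is why download_list enqueues exactly "workers" sentinels. A minimal, self-contained
sketch; process() here is a placeholder for the per-snapshot work, not pywaybackup code:

    import queue
    import threading

    def process(item):
        print(f"downloading {item}")  # stand-in for the real download step

    def worker(q):
        while True:
            item = q.get()
            if item is None:  # sentinel: no more work, stop this worker
                break
            process(item)

    q = queue.Queue()
    for item in range(10):
        q.put(item)           # enqueue all work up front

    workers = 3
    for _ in range(workers):
        q.put(None)           # one sentinel per worker, or some threads block forever in q.get()

    threads = [threading.Thread(target=worker, args=(q,)) for _ in range(workers)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

Because every worker pulls from the same queue, a slow or crashed worker affects only the
item it currently holds, whereas the old per-worker batches could strand a whole batch.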
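
A related design note on skip_save: the seek/truncate/rewrite approach leaves a small
window in which a crash mid-write truncates the skip file. If that matters, the usual
alternative is an atomic replace: write a temp file, then rename it over the original.
A short sketch under that assumption (save_skipset and its path argument are illustrative,
not the patch's API):

    import os
    import tempfile

    def save_skipset(path, skipset):
        # write the whole set to a temp file in the same directory,
        # then atomically swap it into place
        fd, tmp = tempfile.mkstemp(dir=os.path.dirname(path) or ".")
        with os.fdopen(fd, "w") as f:
            f.write("\n".join(skipset))
        os.replace(tmp, path)  # atomic on both POSIX and Windows

This keeps the old skip file intact until the new one is fully on disk.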