In [1]:
from tqdm import tqdm
import requests
import zipfile
import os
import time
import hashlib

# Example list of presigned URLs for multipart ZIP files.
# Each entry points at one part (part1 .. part5) of the same multipart archive
# and carries its own AWSAccessKeyId / Signature / Expires query parameters.
# NOTE(review): the Expires value is presumably a Unix timestamp after which
# the links stop working -- confirm and regenerate the URLs before re-running.
presigned_urls = ['http://129.241.241.254/mhbucket/CMIC/raw_data/PDAC_organoids_caspase_multipart/PDAC_organoids_caspase_part1.zip?AWSAccessKeyId=AKIA8E210FD3550BED0D&Signature=On4qa9RB0Wxiqx8sbhEJHJ7QjAg%3D&Expires=1711060841',
                  'http://129.241.241.254/mhbucket/CMIC/raw_data/PDAC_organoids_caspase_multipart/PDAC_organoids_caspase_part2.zip?AWSAccessKeyId=AKIA8E210FD3550BED0D&Signature=8zABBummdjSBuhSEb%2BZ9MBCaRxE%3D&Expires=1711060841',
                  'http://129.241.241.254/mhbucket/CMIC/raw_data/PDAC_organoids_caspase_multipart/PDAC_organoids_caspase_part3.zip?AWSAccessKeyId=AKIA8E210FD3550BED0D&Signature=2DgwLPtMnfsICaZyz8Fdd4jFnCk%3D&Expires=1711060841',
                  'http://129.241.241.254/mhbucket/CMIC/raw_data/PDAC_organoids_caspase_multipart/PDAC_organoids_caspase_part4.zip?AWSAccessKeyId=AKIA8E210FD3550BED0D&Signature=VLiW4gSb6W2y2EcYhg7bN%2BRW%2Fi8%3D&Expires=1711060841',
                  'http://129.241.241.254/mhbucket/CMIC/raw_data/PDAC_organoids_caspase_multipart/PDAC_organoids_caspase_part5.zip?AWSAccessKeyId=AKIA8E210FD3550BED0D&Signature=q7Ok3gK%2FyySAmDE05MCQd1M5mq0%3D&Expires=1711060841'
                  ]

def download_and_extract_file(url, filename, retries=5, backoff_factor=1.0):
    """Download a ZIP archive, print its MD5, extract it, and delete the archive.

    The response body is streamed to ``filename`` while the MD5 digest is
    computed on the fly. After a complete download the archive is extracted
    into the current working directory and the archive file is removed.

    A failed attempt (HTTP/network error, timeout, or an archive that cannot
    be opened as a ZIP file) discards any partial file and is retried with
    exponential backoff: ``backoff_factor * 2 ** attempt`` seconds.

    Args:
        url: URL of the ZIP archive to fetch.
        filename: Path where the archive is stored temporarily.
        retries: Maximum number of attempts.
        backoff_factor: Base delay in seconds for the exponential backoff.

    Returns:
        True if the archive was downloaded and extracted, False if every
        attempt failed.
    """
    # 1 MiB chunks: far fewer Python-level iterations than 1 KiB chunks for
    # multi-gigabyte archives, while still keeping memory use small.
    block_size = 1024 * 1024

    for attempt in range(retries):
        try:
            # The context manager releases the connection even if the
            # transfer is interrupted part-way through.
            with requests.get(url, stream=True, timeout=(10, 60)) as response:  # (connect, read) timeouts
                response.raise_for_status()  # Turn HTTP error statuses into exceptions

                # A missing content-length header yields a total of 0.
                total_size = int(response.headers.get('content-length', 0))
                md5_hash = hashlib.md5()

                # Both the file and the progress bar are closed on any exit path.
                with open(filename, 'wb') as file:
                    with tqdm(total=total_size, unit='iB', unit_scale=True,
                              desc=f"Downloading {filename}") as progress_bar:
                        for data in response.iter_content(block_size):
                            file.write(data)
                            md5_hash.update(data)
                            progress_bar.update(len(data))

            # Display the calculated MD5 hash
            file_md5 = md5_hash.hexdigest()
            print(f"MD5 hash of {filename}: {file_md5}")

            # Extract the ZIP file into the current working directory
            print(f"Extracting {filename}...")
            with zipfile.ZipFile(filename, 'r') as zip_ref:
                zip_ref.extractall("./")
            print(f"{filename} extracted.")

            # Remove the ZIP file after extraction
            os.remove(filename)
            return True

        except (requests.exceptions.RequestException, zipfile.BadZipFile) as e:
            # Timeout is a RequestException subclass, so it is covered here.
            # BadZipFile means the archive on disk is unusable (e.g. truncated),
            # so it is treated like a failed download and fetched again.
            if os.path.exists(filename):
                os.remove(filename)  # Never leave a partial/corrupt archive behind

            if attempt + 1 < retries:
                delay = backoff_factor * (2 ** attempt)
                print(f"Download error: {e}, retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                # Last attempt: no point in sleeping when nothing follows.
                print(f"Download error: {e}")

    print(f"Failed to download and extract the file after {retries} attempts.")
    return False

# Create the temporary directory that holds each ZIP part while it is processed
temp_dir = "temp_zip_parts"
os.makedirs(temp_dir, exist_ok=True)

# Download, extract, and clean up each ZIP file, one part at a time
for index, url in enumerate(presigned_urls, start=1):
    part_filename = os.path.join(temp_dir, f"part_{index}.zip")
    download_and_extract_file(url, part_filename)

# Final cleanup: remove the temporary directory. os.rmdir only removes empty
# directories, so a leftover archive from a failed part raises OSError here;
# report that instead of crashing and claiming success.
try:
    os.rmdir(temp_dir)
except OSError as e:
    print(f"Could not remove {temp_dir}: {e}")
else:
    print("All files downloaded, extracted, and cleaned up.")



Downloading temp_zip_parts\part_1.zip: 100%|██████████| 3.82G/3.82G [03:31<00:00, 18.1MiB/s]


MD5 hash of temp_zip_parts\part_1.zip: fff68b9e41ddb39eadaedb6d13d274a4
Extracting temp_zip_parts\part_1.zip...
temp_zip_parts\part_1.zip extracted.


Downloading temp_zip_parts\part_2.zip: 100%|██████████| 4.19G/4.19G [03:49<00:00, 18.2MiB/s]


MD5 hash of temp_zip_parts\part_2.zip: 9d0a9c68fb4c99a0146e83c230ada991
Extracting temp_zip_parts\part_2.zip...
temp_zip_parts\part_2.zip extracted.


Downloading temp_zip_parts\part_3.zip: 100%|██████████| 5.38G/5.38G [04:56<00:00, 18.1MiB/s]


MD5 hash of temp_zip_parts\part_3.zip: f2bb53c23a816683e79861849a378982
Extracting temp_zip_parts\part_3.zip...
temp_zip_parts\part_3.zip extracted.


Downloading temp_zip_parts\part_4.zip: 100%|██████████| 4.48G/4.48G [04:07<00:00, 18.1MiB/s]


MD5 hash of temp_zip_parts\part_4.zip: e318ef2d28b0b47843b4c3b01e3b430b
Extracting temp_zip_parts\part_4.zip...
temp_zip_parts\part_4.zip extracted.


Downloading temp_zip_parts\part_5.zip: 100%|██████████| 4.62G/4.62G [04:19<00:00, 17.8MiB/s]


MD5 hash of temp_zip_parts\part_5.zip: 30f31867a593620ef70cfbf1e37fed91
Extracting temp_zip_parts\part_5.zip...
temp_zip_parts\part_5.zip extracted.
All files downloaded, extracted, and cleaned up.
