In [1]:
from tqdm import tqdm
import requests
import zipfile
import os
import time
import hashlib

# Presigned download links for the ZIP archive(s) to fetch, one entry per archive.
# NOTE(review): each link carries an access key id, a signature, and an `Expires`
# value in its query string — confirm it is acceptable to keep it in this file.
presigned_urls = [
    (
        "https://brage.it.ntnu.no/mhbucket/CMIC/raw_data/bf_intorg_YOLOv8_dev.zip"
        "?AWSAccessKeyId=AKIA8E210FD3550BED0D"
        "&Signature=6Rxt%2FIYIAB%2BE2RaZvx86b93CO6A%3D"
        "&Expires=1740584919"
    ),
]

def _download_to_disk(url, filename, block_size=64 * 1024):
    """Stream ``url`` into ``filename`` and return the MD5 hex digest of the bytes written.

    Raises ``requests.exceptions.RequestException`` for connection, timeout, and
    HTTP-status errors. When the transfer fails after the output file has been
    opened, the truncated file is deleted before the exception propagates.
    """
    md5_hash = hashlib.md5()

    # Using the response as a context manager releases the connection even
    # when the transfer is interrupted.
    with requests.get(url, stream=True, timeout=(10, 60)) as response:  # (connect, read) timeouts
        response.raise_for_status()  # Turn 4xx/5xx responses into exceptions

        # Falls back to 0 when the server sends no Content-Length header
        total_size = int(response.headers.get('content-length', 0))

        try:
            # Both the file and the progress bar are closed on success and on error.
            with open(filename, 'wb') as file, tqdm(
                total=total_size,
                unit='iB',
                unit_scale=True,
                unit_divisor=1024,  # 'iB' denotes binary prefixes, so scale by 1024
                desc=f"Downloading {filename}",
            ) as progress_bar:
                for data in response.iter_content(block_size):
                    file.write(data)
                    md5_hash.update(data)  # Hash exactly the bytes written to disk
                    progress_bar.update(len(data))
        except requests.exceptions.RequestException:
            # Do not leave a truncated archive behind after a failed transfer.
            if os.path.exists(filename):
                os.remove(filename)
            raise

    return md5_hash.hexdigest()


def download_and_extract_file(url, filename, retries=5, backoff_factor=1.0):
    """Download a ZIP archive, print its MD5, extract it into "./", and delete the archive.

    Args:
        url: Address of the ZIP archive to fetch.
        filename: Path where the archive is stored while it is processed.
        retries: Maximum number of download attempts.
        backoff_factor: Base delay in seconds; the wait before retry number
            ``k`` (counting from 0) is ``backoff_factor * 2 ** k``.

    Returns:
        None. Progress, the MD5 digest, and failures are reported on stdout.

    Raises:
        Errors that are not ``requests`` exceptions (for example
        ``zipfile.BadZipFile`` or ``OSError``) are not retried and propagate
        to the caller.
    """
    for attempt in range(retries):
        try:
            file_md5 = _download_to_disk(url, filename)
        except requests.exceptions.RequestException as e:
            if attempt + 1 < retries:
                delay = backoff_factor * (2 ** attempt)
                print(f"Download error: {e}, retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                # No attempt follows, so neither announce a retry nor wait.
                print(f"Download error: {e}")
            continue

        # Display the calculated MD5 hash
        print(f"MD5 hash of {filename}: {file_md5}")

        # Extract the ZIP file into the current working directory
        print(f"Extracting {filename}...")
        with zipfile.ZipFile(filename, 'r') as zip_ref:
            zip_ref.extractall("./")
        print(f"{filename} extracted.")

        # Remove the ZIP file after extraction
        os.remove(filename)
        return  # Success: download, hash calculation, and extraction all completed

    print(f"Failed to download and extract the file after {retries} attempts.")

# Create a scratch directory that holds each ZIP archive while it is processed
temp_dir = "temp_zip_parts"
os.makedirs(temp_dir, exist_ok=True)

# Download, extract, and clean up each ZIP file
for index, url in enumerate(presigned_urls, start=1):
    part_filename = os.path.join(temp_dir, f"part_{index}.zip")
    download_and_extract_file(url, part_filename)

# Final cleanup: os.rmdir only works on an empty directory, so check first
# instead of crashing with OSError when something was left behind.
leftovers = sorted(os.listdir(temp_dir))
if leftovers:
    print(f"{temp_dir} was not removed because it still contains: {leftovers}")
else:
    os.rmdir(temp_dir)
    print("All files downloaded, extracted, and cleaned up.")



Downloading temp_zip_parts\part_1.zip: 100%|██████████| 168M/168M [00:01<00:00, 107MiB/s] 


MD5 hash of temp_zip_parts\part_1.zip: dde7538f693682c84d4d2916069cdfcf
Extracting temp_zip_parts\part_1.zip...
temp_zip_parts\part_1.zip extracted.
All files downloaded, extracted, and cleaned up.
