In [1]:
import os
from multiprocessing import Pool, Manager
from urllib.parse import urlparse
from urllib.request import urlretrieve
import requests
import numpy as np
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

In [2]:
# Function to download a single file (supports FTP and HTTP)
def download_file(url, save_path):
    """Download a single URL into the directory ``save_path``.

    The local file name is the text after the last ``/`` of ``url``. HTTP and
    HTTPS URLs are streamed with ``requests`` (up to five retries on
    429/500/502/503/504 responses); FTP URLs are fetched with ``urlretrieve``.
    Any other scheme is reported and skipped.

    A file that already exists at the target path is not downloaded again.
    Data is written to a temporary ``.part`` file first and renamed to the
    final name only after the transfer finished, so an interrupted download
    never leaves a truncated file that a later run would mistake for a
    complete one.

    Problems are reported with ``print`` instead of being raised, so one bad
    URL does not abort a whole batch.

    Args:
        url: HTTP, HTTPS or FTP address of the file.
        save_path: Directory the file is stored in; it must already exist.

    Returns:
        None.
    """
    filename = url.split("/")[-1]
    if not filename:
        # A URL ending in "/" would make the target path the directory itself.
        print(f"Failed to download {url}: URL does not end with a file name")
        return

    file_path = os.path.join(save_path, filename)

    # Check the target file, not the directory, and actually skip the download.
    if os.path.exists(file_path):
        print(f"{filename} exists to {save_path}")
        return

    scheme = urlparse(url).scheme.lower()
    if scheme not in ('http', 'https', 'ftp'):
        print(f"Unsupported URL scheme for: {url}")
        return

    # The process id keeps temporary names distinct between pool workers.
    partial_path = f"{file_path}.{os.getpid()}.part"

    try:
        if scheme == 'ftp':
            urlretrieve(url, partial_path)
            protocol = "FTP"
        else:
            retries = Retry(
                total=5,
                backoff_factor=1,
                status_forcelist=[429, 500, 502, 503, 504],
                # Hand back the last response once retries are exhausted;
                # raise_for_status() below turns it into an error.
                raise_on_status=False
            )
            adapter = HTTPAdapter(max_retries=retries)

            # Context managers release the connection and the session even
            # when the transfer fails half-way through.
            with requests.Session() as session:
                session.mount("http://", adapter)
                session.mount("https://", adapter)

                with session.get(url, stream=True, timeout=10) as response:
                    response.raise_for_status()

                    with open(partial_path, 'wb') as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            if chunk:
                                f.write(chunk)
            protocol = "HTTP"

        # Publish the file under its final name only once it is complete.
        os.replace(partial_path, file_path)
        print(f"Downloaded {filename} via {protocol} to {save_path}")

    except Exception as e:
        print(f"Failed to download {url}: {e}")

    finally:
        # After a successful rename there is nothing left to remove.
        try:
            os.remove(partial_path)
        except OSError:
            pass

# Function to download multiple files using multiprocessing
def download_files(urls, save_path, num_processes=4):
    """Download every URL in ``urls`` into ``save_path`` using a process pool.

    The target directory is created when missing. Each URL is handed to
    ``download_file`` in a worker process; the call returns once all workers
    have finished.

    Args:
        urls: Iterable of URLs to download.
        save_path: Directory the files are stored in.
        num_processes: Upper bound on the number of worker processes. ``None``
            is passed through to ``Pool`` unchanged.

    Returns:
        None.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(save_path, exist_ok=True)

    jobs = [(url, save_path) for url in urls]
    if not jobs:
        # Nothing to download: do not start any worker processes.
        return

    # Never start more workers than there are files to fetch.
    workers = num_processes if num_processes is None else min(num_processes, len(jobs))

    with Pool(processes=workers) as pool:
        pool.starmap(download_file, jobs)

In [5]:
# Build one RCSB download URL per distinct value in the first column of the
# table stored in 'prot_rep.npy' (presumably PDB entry IDs -- confirm).
file_list = np.load('prot_rep.npy')
download_list = [
    f'https://files.rcsb.org/download/{i}.cif'
    for i in np.unique(file_list[:, 0])
]
# The download itself is disabled; uncomment to fetch the files into ./PDB/.
#download_files(download_list, './PDB/', num_processes=50)

In [4]:
# Compare the distinct first-column values of the two tables. The printed
# numbers are: count in 'mem_prot_rep.npy', count in 'prot_rep.npy', their
# sum, and the count of distinct values across both. Equal last two numbers
# mean the two sets do not overlap.
mpun, pun = (
    np.unique(np.load('mem_prot_rep.npy')[:, 0]),
    np.unique(np.load('prot_rep.npy')[:, 0]),
)
print(
    mpun.size,
    pun.size,
    mpun.size + pun.size,
    np.unique(list(mpun) + list(pun)).size,
)

565 16683 17248 17248
