In [None]:

import os
import json

import numpy as np
import faiss
import pickle

import torch
from tqdm import tqdm
import pickle
import json
import numpy as np
import os
from cuvs.neighbors import hnsw,cagra
import cupy as cp
import aiohttp
import asyncio
from tqdm import tqdm
import time

In [None]:

async def fetch_submitter_id(session, uuid, pbar, max_retries=3):
    url = f"https://api.gdc.cancer.gov/files/{uuid}?expand=cases"
    retry_delay = 0.1  # 初始延迟 1 秒
    
    for attempt in range(max_retries):
        try:
            async with session.get(url) as response:
                if response.status == 429:
                    # 如果被限流，等待并增加延迟
                    wait_time = retry_delay * (2 ** attempt)  # 指数退避
                    print(f"Rate limited, retrying in {wait_time} seconds...")
                    await asyncio.sleep(wait_time)
                    continue
                
                data = await response.json()
                submitter_id = data['data']['cases'][0]['submitter_id']
                pbar.update(1)
                return submitter_id
        except Exception as e:
            print(f"Error fetching {uuid} (attempt {attempt + 1}): {e}")
            if attempt == max_retries - 1:
                return None
            await asyncio.sleep(retry_delay * (2 ** attempt))  # 指数退避

async def main(total_wsis, max_concurrent=10):
    sub_ids = []
    connector = aiohttp.TCPConnector(limit=max_concurrent)  # 限制并发数
    async with aiohttp.ClientSession(connector=connector) as session:
        with tqdm(total=len(total_wsis)) as pbar:
            tasks = [fetch_submitter_id(session, uuid, pbar) for uuid in total_wsis]
            sub_ids = await asyncio.gather(*tasks)
    return sub_ids

# total_wsis = ["3864f9ba-92b6-4d7d-b7df-1ba7c20cdc65", "0dda6c6c-d840-4ac2-9f9f-182c5779bac7", ...]  


In [None]:

def get_all_folders(path):
    all_folders = []
    for root, dirs, files in os.walk(path):
        if len(dirs) == 0:
            all_folders.append(root)
    return all_folders

root_path = '/hpc2hdd/home/ysi538/my_cuda_code/TCGA_slide_retrieval/embed_cache'
sites = os.listdir(root_path)
for site in sites:
    site_path = os.path.join(root_path, site)
    site_wsis = os.listdir(site_path)
    sub_ids = await main(site_wsis)
    # 将sub_ids保存到文件中
    with open(os.path.join(site_path, 'sub_ids.json'), 'w') as f:
        json.dump(sub_ids, f)
    if len(sub_ids) == len(site_wsis):
        print(f"成功获取 {site} 的所有 UUID")
        for i, sub_id in enumerate(sub_ids):
            wsi_path = os.path.join(site_path, site_wsis[i])
            patch_info_path = os.path.join(wsi_path, 'patch_info.json')
            with open(patch_info_path, 'r') as f:
                patch_info = json.load(f)
            for info in patch_info:
                info['sub_id'] = sub_id
            with open(patch_info_path, 'w') as f:
                json.dump(patch_info, f)
    else:
        print(f"获取 {site} 的 UUID 失败，可能是由于网络问题或其他原因。")



100%|██████████| 3624/3624 [02:52<00:00, 21.06it/s]


成功获取 brain 的所有 UUID


100%|██████████| 1446/1446 [01:02<00:00, 23.06it/s]


成功获取 liver 的所有 UUID


100%|██████████| 1865/1865 [01:34<00:00, 19.64it/s]


成功获取 endocrine 的所有 UUID


100%|█████████▉| 3562/3565 [03:19<00:00, 22.74it/s]

Error fetching 20136936-1d81-4bd7-9cd9-54c4cf6142a8 (attempt 1): 


100%|██████████| 3565/3565 [05:02<00:00, 11.80it/s]


成功获取 gastrointestinal 的所有 UUID


100%|██████████| 4195/4195 [03:50<00:00, 18.20it/s]


成功获取 urinary 的所有 UUID


100%|██████████| 421/421 [00:23<00:00, 17.89it/s]


成功获取 hematopoietic 的所有 UUID


100%|██████████| 3610/3610 [03:19<00:00, 18.06it/s]


成功获取 gynecologic 的所有 UUID


100%|██████████| 1100/1100 [00:59<00:00, 18.38it/s]


成功获取 melanocytic 的所有 UUID


100%|██████████| 1585/1585 [01:25<00:00, 18.52it/s]


成功获取 prostate 的所有 UUID


100%|██████████| 3391/3391 [03:13<00:00, 17.53it/s]


成功获取 pulmonary 的所有 UUID


总计 24802 个 UUID


In [5]:
split_wsis = [wsi.split('-') for wsi in total_wsis]

In [6]:
len(split_wsis)

3624

In [20]:
total_wsis

['4dfaab01-a22b-4c48-b131-f97e51a89ac4',
 'ed08265b-1bc6-4ec9-baf0-07d6e8674f0a',
 'f34f02f7-a6ab-4bcd-95a7-ea19039986a0',
 'cfd4d28a-364d-4fb4-9e37-59a61cb5b5c9',
 '62f22a6f-b980-4ad5-b785-4e15bd7a2b1b',
 '5f94e55a-5ec9-4428-a2f3-c9d38ca803e1',
 'ae84b4b0-55f6-43ca-99ff-550c170b9f61',
 '0e885689-255e-45b2-af36-5a9995e8b84a',
 '8a1193b5-fe23-4191-b5cb-a34b596ff6fe',
 '1cc1f6ac-c504-42be-8c7d-484846937283',
 '440e126f-b246-482d-96e7-4454b49430b0',
 '1cc4634e-4f68-4ae8-bff1-eab006889b95',
 'dad17992-2584-426b-ab54-d6e09aba6f2f',
 'ed2ccbe3-457e-4dcc-826b-fe76cfc09e02',
 '0ebc1ea7-8641-4951-b9ef-c337798b52f6',
 '17b4691b-6225-4e47-9d0b-2e10de3c0777',
 'bae07b70-bf4c-4ccb-86d3-cdf841272f43',
 '1356b876-18b5-434e-a347-bc9374120a0e',
 '252902f6-0131-4fc4-b324-7cc65dee2c12',
 'aee61c17-a57f-4011-b4a2-7e8a07616fdb',
 '69ed07d0-dfc0-489c-b073-bc21dbe52222',
 '224e1a41-cb81-44fa-89b2-52d07207ce8f',
 'b35f190b-0f9a-46c7-955a-ce8e256a5311',
 '58102436-b379-4e04-b985-e740ff7a0676',
 '3ec10eb3-c8cd-

  0%|          | 0/3624 [00:00<?, ?it/s]

100%|██████████| 3624/3624 [02:42<00:00, 22.32it/s]

['TCGA-HT-7690', 'TCGA-06-5413', 'TCGA-HW-7495', 'TCGA-06-0241', 'TCGA-12-0688', 'TCGA-VV-A829', 'TCGA-FG-5962', 'TCGA-HT-A74O', 'TCGA-06-0137', 'TCGA-12-0703', 'TCGA-26-5136', 'TCGA-26-1442', 'TCGA-S9-A7J0', 'TCGA-12-0691', 'TCGA-08-0245', 'TCGA-06-0151', 'TCGA-27-2519', 'TCGA-02-0025', 'TCGA-06-0397', 'TCGA-DU-7294', 'TCGA-P5-A5ET', 'TCGA-HT-8108', 'TCGA-19-2624', 'TCGA-08-0351', 'TCGA-32-1991', 'TCGA-12-1094', 'TCGA-HT-7620', 'TCGA-06-0394', 'TCGA-HT-7692', 'TCGA-FG-A713', 'TCGA-02-0290', 'TCGA-14-1821', 'TCGA-28-1753', 'TCGA-P5-A5F4', 'TCGA-06-0178', 'TCGA-06-2557', 'TCGA-HT-8104', 'TCGA-HT-A617', 'TCGA-06-6697', 'TCGA-F6-A8O3', 'TCGA-E1-5311', 'TCGA-12-1598', 'TCGA-06-0130', 'TCGA-15-1449', 'TCGA-06-A7TL', 'TCGA-27-2526', 'TCGA-02-0034', 'TCGA-06-0189', 'TCGA-28-1755', 'TCGA-VM-A8CH', 'TCGA-06-5410', 'TCGA-12-3653', 'TCGA-12-0707', 'TCGA-14-1795', 'TCGA-14-0871', 'TCGA-14-0740', 'TCGA-TM-A84L', 'TCGA-HT-7880', 'TCGA-W9-A837', 'TCGA-27-2518', 'TCGA-06-0178', 'TCGA-HT-7686', 'TCGA-0




In [None]:
os.join(root_path, 'root_path')

3624

In [18]:
len(set(sub_ids))

1123