In [17]:
import os.path as osp
from io import BytesIO, StringIO

import pandas as pd
import requests as rq
from bs4 import BeautifulSoup
from PIL import Image
from tqdm import tqdm


OUTPUT_DIR = "../input/extra/hpa/images/"


def url(ens, gene, organ):
    """Return the proteinatlas.org tissue page URL for one gene and organ.

    Args:
        ens: Ensembl gene identifier, e.g. ``"ENSG00000121410"``.
        gene: Gene name, e.g. ``"A1BG"``.
        organ: Organ path segment, e.g. ``"lung"``.

    Returns:
        A string of the form
        ``https://www.proteinatlas.org/<ens>-<gene>/tissue/<organ>``.
    """
    gene_page = f"{ens}-{gene}"
    segments = ["https://www.proteinatlas.org", gene_page, "tissue", f"{organ}"]
    return "/".join(segments)

In [18]:
# Download the gene table from the proteinatlas.org search API as TSV.
# A timeout keeps a stalled connection from hanging the notebook forever.
r = rq.get(
    "https://www.proteinatlas.org/api/search_download.php?search=&columns=g&compress=no&format=tsv",
    timeout=60,
)
# Fail loudly on an HTTP error instead of parsing an error page as TSV.
r.raise_for_status()
string = StringIO(r.text)
df = pd.read_csv(string, sep="\t")
# The download loop further down reads exactly these two columns.
assert {"Ensembl", "Gene"} <= set(df.columns), f"unexpected columns: {list(df.columns)}"
df.head()

Unnamed: 0,Ensembl,Gene
0,ENSG00000121410,A1BG
1,ENSG00000148584,A1CF
2,ENSG00000175899,A2M
3,ENSG00000166535,A2ML1
4,ENSG00000184389,A3GALT2


In [28]:
from pathlib import Path

# Build the set of (ens, gene) pairs that already have a `*MANUAL.json`
# file somewhere under the lung output folder; the download loop skips these.
exist_ens_gene = set()
lung_dir = Path(OUTPUT_DIR) / "lung"
for json_path in lung_dir.rglob("*MANUAL.json"):
    # Keep the part of the file name before the first "|", split it on "_",
    # and take fields 1 and 2 as the Ensembl ID and gene name.
    fields = json_path.name.partition("|")[0].split("_")
    ens, gene = fields[1:3]
    exist_ens_gene.add((ens, gene))

In [30]:
organs = ['lung']  #, 'spleen', 'kidney', 'colon']

# Tunables for this cell.
SAMPLE_SIZE = 200        # number of genes drawn at random per run
REQUEST_TIMEOUT = 30     # seconds; without it `requests` may block forever
IMAGE_HOST_PREFIX = '//images.proteinatlas.org'

# [img_name, organ] rows for the images written by this run.
metadata = []

# Create the destination folders up front so saving cannot fail on a
# missing directory.
for organ in organs:
    Path(OUTPUT_DIR, organ).mkdir(parents=True, exist_ok=True)

# Cap the sample at the table size: `DataFrame.sample` raises when asked
# for more rows than exist (without replacement).
pbar = tqdm(df.sample(min(SAMPLE_SIZE, len(df))).index)

# Counts every candidate image link seen (saved or skipped).
img_idx = 0
for idx in pbar:
    ens = df.loc[idx, "Ensembl"]
    gene = df.loc[idx, "Gene"]
    # Genes that already have a MANUAL.json on disk are not fetched again.
    if (ens, gene) in exist_ens_gene:
        continue

    for organ in organs:
        response = rq.get(url(ens, gene, organ), timeout=REQUEST_TIMEOUT)
        if not response.ok:
            # Error page for this gene/organ: nothing to scrape, move on.
            continue

        images = BeautifulSoup(response.text, 'html.parser').find_all('img')
        # `.get` tolerates <img> tags without a `src` attribute, which
        # `img['src']` would turn into a KeyError. Dropping "_medium"
        # switches from the medium-size file to the full-size one.
        sources = ((img.get('src') or '') for img in images)
        links = [
            'https:' + src.replace('_medium', '')
            for src in sources
            if src.startswith(IMAGE_HOST_PREFIX)
        ]

        for link in links:
            # NOTE(review): the file name does not depend on `link` or
            # `img_idx`, so only the first image of each gene/organ pair is
            # ever written; later links hit the `exists` check below.
            # Confirm whether one image per gene is intended before changing
            # the naming, since existing files and the MANUAL.json parsing
            # rely on the "<organ>_<ens>_<gene>" prefix.
            img_name = f"{organ}_{ens}_{gene}.jpg"
            img_file = osp.join(OUTPUT_DIR, organ, img_name)
            img_idx += 1
            if osp.exists(img_file):
                continue

            # Separate name so the gene-list response `r` from the earlier
            # cell is not overwritten.
            img_response = rq.get(link, timeout=REQUEST_TIMEOUT)
            # Surface HTTP failures directly rather than as a PIL decode error.
            img_response.raise_for_status()
            Image.open(BytesIO(img_response.content)).save(img_file)
            metadata.append([img_name, organ])

            pbar.set_description(f"{len(metadata):>5d} imgs saved")
            pbar.refresh()


  154 imgs saved: 100%|███████████████████████| 200/200 [10:33<00:00,  3.17s/it]
