In [1]:
from utils.parse_pdb import align_pdb, open_pdb, PDBError, get_pdb_file
import os
import boto3
import pickle
from tqdm import tqdm
from p_tqdm import p_map
import sidechainnet as scn
import numpy as np
from rcsbsearch import TextQuery, Attr
import subprocess

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
TMP_FOLDER = "data/tmp_pdb"
OUTPUT_FOLDER = "data/pdb"
PDB_PREFIX = "20220103/pub/pdb/data/biounit/PDB/all/"

i = 0
while os.path.exists(f"./log_{i}.txt"):
    i += 1
LOG_FILE = f"./log_{i}.txt"

if not os.path.exists(TMP_FOLDER):
    os.mkdir(TMP_FOLDER)
if not os.path.exists(OUTPUT_FOLDER):
    os.mkdir(OUTPUT_FOLDER)

In [3]:
MIN_LENGTH = 30
MAX_LENGTH = 10000
RESOLUTION_THR = 3.5
MISSING_THR = 0.1

In [4]:
def visualize(id):
    with open(f"./data/pdb/{id}.pickle", "rb") as f:
        data = pickle.load(f)

    crd = np.concatenate([data["A"]["crd_bb"], data["A"]["crd_sc"]], axis=1).reshape((-1, 3))
    seq = data["A"]["seq"]
    sb2 = scn.StructureBuilder(seq, crd)
    return sb2.to_3Dmol()

In [5]:
def clean(pdb_id):
    for file in os.listdir(TMP_FOLDER):
        if file.startswith(pdb_id):
            subprocess.run(["rm", os.path.join(TMP_FOLDER, file)], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

In [6]:
def get_log_stats(log_file):
    stats = defaultdict(lambda: 0)
    with open(log_file, "r") as f:
        for line in f.readlines():
            if line.startswith("<<<"):
                stats[line.split(':')[0]] += 1
    return

In [7]:
def log_exception(exception, log_file, pdb_id):
    # raise exception
    clean(pdb_id)
    if isinstance(exception, PDBError):
        with open(log_file, "a") as f:
            f.write(f'<<< {str(exception)}: {pdb_id} \n')
    else:
        with open(log_file, "a") as f:
            f.write(f'<<< Unknown: {pdb_id} \n')
            f.write(str(exception))
            f.write("\n")

In [8]:
s3 = boto3.resource('s3')
bucket = s3.Bucket("pdbsnapshots")

pdb_ids = Attr('rcsb_entry_info.selected_polymer_entity_types').__eq__("Protein (only)") \
    .and_("exptl.method").in_(["X-RAY DIFFRACTION", "ELECTRON MICROSCOPY"]) \
    .and_("rcsb_entry_info.resolution_combined").__le__(RESOLUTION_THR) \
    .exec("assembly")

def process_f(pdb_id, show_error=False, force=True):
    pdb_id = pdb_id.lower()
    id, biounit = pdb_id.split('-')
    target_file = os.path.join(OUTPUT_FOLDER, pdb_id + '.pickle')
    if not force and os.path.exists(target_file):
        return
    pdb_file = PDB_PREFIX + f'{id}.pdb{biounit}.gz'
    local_path = get_pdb_file(pdb_file, s3.Bucket("pdbsnapshots"), tmp_folder=TMP_FOLDER)
    try:
        pdb_dict = open_pdb(
            local_path, 
            tmp_folder=TMP_FOLDER,
        )
        pdb_dict = align_pdb(pdb_dict, min_length=MIN_LENGTH, max_length=MAX_LENGTH, max_missing=MISSING_THR)
    except Exception as e:
        if show_error:
            raise e
        else:
            log_exception(e, LOG_FILE, pdb_id)
            pdb_dict = None
    
    if pdb_dict is not None:
        with open(target_file, "wb") as f:
            pickle.dump(pdb_dict, f)


In [9]:
# process_f("101m-1", show_error=True)

In [10]:
N = 300

pdbs = []
for i, pdb in enumerate(pdb_ids):
    pdbs.append(pdb)
    if i == N:
        break

In [11]:
_ = p_map(process_f, pdbs)

100%|██████████| 301/301 [00:57<00:00,  5.22it/s]


In [12]:
from multiprocessing import Pool

with Pool(10) as pool:
    pool.map(process_f, pdbs)