In [1]:
import os
import pickle
from collections import defaultdict

import boto3
import numpy as np
import sidechainnet as scn
from p_tqdm import p_map
from pathos.multiprocessing import ProcessingPool as Pool
from tqdm import tqdm

from utils.parse_pdb import align_pdb, open_pdb, PDBError, get_pdb_file

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Working directories and the S3 key prefix used to list the biounit files.
TMP_FOLDER = "data/tmp_pdb"
OUTPUT_FOLDER = "data/pdb"
PDB_PREFIX = "20220103/pub/pdb/data/biounit/PDB/all/"

# Pick the first ./log_<i>.txt that does not exist yet, so every run of this
# cell writes to a fresh log instead of appending to an older one.
i = 0
while os.path.exists(f"./log_{i}.txt"):
    i += 1
LOG_FILE = f"./log_{i}.txt"

# makedirs also creates the missing parent ("data") and is a no-op when the
# folder already exists; plain os.mkdir fails on a fresh checkout.
os.makedirs(TMP_FOLDER, exist_ok=True)
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

In [3]:
# Filtering thresholds; f() forwards them to process(), which hands them to
# open_pdb / align_pdb.
MIN_LENGTH: int = 30  # forwarded as align_pdb(min_length=...)
MAX_LENGTH = None  # forwarded as align_pdb(max_length=...); presumably None means "no upper bound" - TODO confirm
RESOLUTION_THR: float = 3.5  # forwarded as open_pdb(thr_resolution=...); units presumably Angstrom - TODO confirm
MISSING_THR: float = 0.1  # forwarded as align_pdb(max_missing=...); presumably a fraction - TODO confirm

In [4]:
def clean(pdb_id):
    """Delete the temporary files that belong to one PDB entry.

    Every file in ``TMP_FOLDER`` whose name starts with the PDB code (the part
    of ``pdb_id`` before the first ``'_'``) is removed.

    Parameters
    ----------
    pdb_id : str
        Identifier of the form ``"<pdb code>_<biounit>"``. An id without an
        underscore is treated as a bare PDB code.

    Notes
    -----
    This runs from the error-logging path, so it is deliberately best-effort:
    it must not raise on an unexpected id format or on a file that has already
    disappeared.
    """
    pdb = pdb_id.split('_')[0]
    if not pdb:
        # An empty prefix would match (and delete) every file in TMP_FOLDER.
        return
    for file in os.listdir(TMP_FOLDER):
        if file.startswith(pdb):
            try:
                os.remove(os.path.join(TMP_FOLDER, file))
            except FileNotFoundError:
                # Matching is by PDB code only, so a parallel worker handling
                # another biounit of the same entry may have removed it first.
                pass

In [5]:
def get_log_stats(log_file):
    """Count the logged failures in ``log_file`` per error label.

    Only lines starting with ``"<<<"`` are counted; these are the header lines
    that ``log_exception`` writes. The label is the text before the first
    ``':'`` on the line, marker included (e.g. ``"<<< Unknown"``).

    Parameters
    ----------
    log_file : str
        Path of the log file to read.

    Returns
    -------
    collections.defaultdict
        Maps each label to the number of header lines carrying it; labels that
        never occurred read as 0.
    """
    stats = defaultdict(int)
    with open(log_file, "r") as f:
        for line in f:
            if line.startswith("<<<"):
                stats[line.split(':')[0]] += 1
    return stats

In [11]:
def log_exception(exception, log_file, pdb_id):
    """Remove the entry's temporary files and append the failure to the log.

    ``clean(pdb_id)`` runs first. A ``PDBError`` is then recorded as a single
    header line carrying its message; any other exception is recorded under
    the ``Unknown`` label, followed by its message on a line of its own.

    Parameters
    ----------
    exception : Exception
        The error raised while processing the entry.
    log_file : str
        Path of the log file; opened in append mode.
    pdb_id : str
        Identifier of the entry that failed.
    """
    clean(pdb_id)
    is_known = isinstance(exception, PDBError)
    with open(log_file, "a") as log:
        if is_known:
            log.write(f'<<< {str(exception)}: {pdb_id} \n')
            return
        for chunk in (f'<<< Unknown: {pdb_id} \n', str(exception), "\n"):
            log.write(chunk)

In [21]:
# S3 handles for the notebook (driver) process and the listing of every object
# stored under PDB_PREFIX in the "pdbsnapshots" bucket.
s3 = boto3.resource('s3')
bucket = s3.Bucket("pdbsnapshots")
all_pdbs = bucket.objects.filter(Prefix=PDB_PREFIX)
N = 20  # size of the trial sample

# limit(N) yields at most N objects. A hand-written loop that appends before
# testing `i == N` collects N + 1 items.
sample_pdbs = list(all_pdbs.limit(N))

def process(pdb_file, tmp_folder, min_length, max_length, max_missing, resolution_thr, log_file, output_folder):
    """Fetch one biounit file, parse and filter it, and pickle the result.

    Parameters
    ----------
    pdb_file : str or object with a ``key`` attribute
        S3 key of the file, or an S3 object summary whose ``.key`` is used.
    tmp_folder : str
        Forwarded to ``get_pdb_file`` and ``open_pdb`` as ``tmp_folder``.
    min_length, max_length, max_missing
        Forwarded unchanged to ``align_pdb``.
    resolution_thr
        Forwarded to ``open_pdb`` as ``thr_resolution``.
    log_file : str
        Log that receives an entry when ``open_pdb`` or ``align_pdb`` raises.
    output_folder : str
        Folder that receives ``<pdb code>_<biounit>.pickle``.

    Returns
    -------
    int
        0 when ``open_pdb`` or ``align_pdb`` raised (the error is logged via
        ``log_exception``); 1 otherwise, including the case where
        ``align_pdb`` returned ``None`` and nothing was written.

    Notes
    -----
    ``get_pdb_file`` runs outside the ``try`` block, so an error raised while
    fetching the file propagates to the caller and is not logged.
    """
    if not isinstance(pdb_file, str):
        pdb_file = pdb_file.key
    # Build the S3 handle inside the call instead of reading a module-level
    # resource: this function is executed in worker processes, and boto3
    # sessions/resources must not be shared across processes. This costs one
    # session per processed file.
    bucket = boto3.Session().resource("s3").Bucket("pdbsnapshots")
    local_path = get_pdb_file(pdb_file, bucket, tmp_folder=tmp_folder)
    name_parts = os.path.basename(local_path).split('.')
    # "<code>_<last character of the second dot-separated field>";
    # presumably that character is the biounit number - TODO confirm.
    pdb_id = f"{name_parts[0]}_{name_parts[1][-1]}"
    try:
        pdb_dict = open_pdb(
            local_path,
            thr_resolution=resolution_thr,
            tmp_folder=tmp_folder,
            bucket=bucket,
        )
        pdb_dict = align_pdb(pdb_dict, min_length=min_length, max_length=max_length, max_missing=max_missing)
    except Exception as e:
        # Best-effort batch processing: record the failure and move on.
        log_exception(e, log_file, pdb_id)
        return 0
    if pdb_dict is not None:
        with open(os.path.join(output_folder, pdb_id + '.pickle'), "wb") as f:
            pickle.dump(pdb_dict, f)
    return 1

def f(x):
    """Run ``process`` on one item with the notebook-level settings.

    A single-argument wrapper so the call can be handed to a pool ``map``.
    Returns whatever ``process`` returns.
    """
    return process(
        x,
        tmp_folder=TMP_FOLDER,
        min_length=MIN_LENGTH,
        max_length=MAX_LENGTH,
        max_missing=MISSING_THR,
        resolution_thr=RESOLUTION_THR,
        log_file=LOG_FILE,
        output_folder=OUTPUT_FOLDER,
    )

# for pdb_i, pdb_file in tqdm(enumerate(all_pdbs), total=N):
#     if pdb_i == N:
#         break
#     process(pdb_file)

# for x in tqdm(sample_pdbs):
#     process(x)

# Hand the workers plain S3 key strings rather than boto3 object summaries:
# boto3 resource objects are not meant to cross process boundaries, and
# process() accepts a key string directly.
sample_keys = [obj.key for obj in sample_pdbs]

with Pool(5) as p:
    p.map(f, sample_keys)
# Alternative with a progress bar: p_map(f, sample_keys)

# to_rerun = []
# with open("./log_0.txt", "r") as f:
#     for line in f.readlines():
#         if line.startswith("<<< Unknown"):
#             to_rerun.append(line.split(": ")[-1].strip())

TypeError: no default __reduce__ due to non-trivial __cinit__

In [16]:
def visualize(file, chain="A"):
    """Build a 3Dmol view of one chain stored in a processed-entry pickle.

    Parameters
    ----------
    file : str
        Path of the pickle to load (e.g. ``"data/pdb/101m_1.pickle"``).
    chain : str, optional
        Key of the chain inside the pickled dictionary. Defaults to ``"A"``.

    Returns
    -------
    The object returned by ``StructureBuilder.to_3Dmol()``; return it as the
    last expression of a cell to have the notebook display it.

    Raises
    ------
    KeyError
        If ``chain`` (or one of ``"crd_bb"``, ``"crd_sc"``, ``"seq"``) is
        missing from the loaded dictionary.
    """
    # NOTE: pickle.load can execute arbitrary code; only open files that this
    # pipeline wrote itself.
    with open(file, "rb") as fh:
        data = pickle.load(fh)

    entry = data[chain]
    # Join backbone and side-chain coordinates along axis 1, then flatten to
    # one xyz triple per row, which is what is handed to StructureBuilder.
    crd = np.concatenate([entry["crd_bb"], entry["crd_sc"]], axis=1).reshape((-1, 3))
    builder = scn.StructureBuilder(entry["seq"], crd)
    return builder.to_3Dmol()

<py3Dmol.view at 0x7fb43cb5fbe0>