In [1]:
from utils.parse_pdb import align_pdb, open_pdb, PDBError, get_pdb_file
import os
import boto3
import pickle
from tqdm import tqdm
from p_tqdm import p_map
import sidechainnet as scn
import numpy as np
from rcsbsearch import TextQuery, Attr
import subprocess

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def visualize(id):
    """Build a 3Dmol view of the structure stored in ``./data/pdb/{id}.pickle``.

    The pickle is expected to map chain identifiers to entries holding
    ``"crd_bb"``, ``"crd_sc"`` and ``"seq"``. For every chain the two
    coordinate arrays are joined along axis 1 and flattened to rows of three
    values; the per-chain rows and sequences are then concatenated in
    iteration order and handed to ``scn.StructureBuilder``.

    Args:
        id: file stem of the pickle under ``./data/pdb/``.

    Returns:
        Whatever ``StructureBuilder.to_3Dmol()`` returns for the merged chains.
    """
    pickle_path = f"./data/pdb/{id}.pickle"
    # NOTE(review): pickle.load executes arbitrary code; only open trusted files.
    with open(pickle_path, "rb") as handle:
        chains = pickle.load(handle)

    coords_per_chain = []
    sequence_parts = []
    for chain_id in chains:
        entry = chains[chain_id]
        # presumably crd_bb / crd_sc are per-residue backbone and side-chain
        # atom coordinates -- TODO confirm against the code writing the pickle
        atoms = np.concatenate([entry["crd_bb"], entry["crd_sc"]], axis=1)
        coords_per_chain.append(atoms.reshape((-1, 3)))
        sequence_parts.append(entry["seq"])

    all_coords = np.concatenate(coords_per_chain, 0)
    full_sequence = "".join(sequence_parts)
    builder = scn.StructureBuilder(full_sequence, all_coords)
    return builder.to_3Dmol()

In [3]:
from collections import defaultdict

def get_log_stats(log_file):
    """Print how often each ``<<<`` tag occurs in a log file.

    Only lines starting with ``<<<`` are counted. The tag of such a line is
    everything before its first ``:``. Tags are printed as ``tag: count``,
    most frequent first; tags with equal counts keep their order of first
    appearance.

    Args:
        log_file: path of the log file to read.
    """
    counts = defaultdict(int)
    with open(log_file, "r") as log:
        for entry in log:
            if not entry.startswith("<<<"):
                continue
            tag = entry.split(':')[0]
            counts[tag] += 1

    # sorted() is stable, so ties stay in insertion order even with reverse=True
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    for tag, count in ranked:
        print(f'{tag}: {count}')

In [4]:
def get_unknown_stats(log_file):
    """Group the ``<<< Unknown`` entries of a log file by error text and print them.

    An entry starts at a line beginning with ``<<< Unknown``; the text after
    the last ``:`` on that line (stripped) is the entry's id. Every following
    line, up to the next line starting with ``<<<`` or the end of the file,
    is appended verbatim to the entry's error text. Error texts starting with
    ``Could not download`` are collapsed into ``Could not download PDB``.

    The result is printed as ``error text: [ids]``, the error with the most
    ids first.

    Args:
        log_file: path of the log file to read.
    """
    stats = defaultdict(list)

    def record(error_text, entry_id):
        # Download failures embed the file name; fold them into one bucket.
        if error_text.startswith("Could not download"):
            error_text = "Could not download PDB"
        stats[error_text].append(entry_id)

    error = None
    entry_id = None
    with open(log_file, "r") as f:
        for line in f:
            if line.startswith("<<<"):
                # Any "<<<" line ends the entry being collected -- including
                # another "<<< Unknown" line, which previously overwrote the
                # pending entry without recording it.
                if error is not None:
                    record(error, entry_id)
                    error = None
                if line.startswith("<<< Unknown"):
                    error = ""
                    entry_id = line.split(":")[-1].strip()
            elif error is not None:
                error += line
    # An entry that runs to the end of the file has no closing "<<<" line.
    if error is not None:
        record(error, entry_id)

    for key in sorted(stats, key=lambda k: len(stats[k]), reverse=True):
        print(f'{key}: {stats[key]}')

In [5]:
from utils.parse_pdb import get_pdb_file
# Bucket resource for the S3 bucket named "pdbsnapshots", created from the
# default boto3 session.
# NOTE(review): cell In [6] rebinds `bucket` to the same bucket name, so this
# assignment is redundant once that cell has run.
bucket = boto3.resource('s3').Bucket("pdbsnapshots")

In [6]:
# S3 service resource and the "pdbsnapshots" bucket that the listing and
# download cells below operate on.
# NOTE(review): boto3 is used here before this cell's own `import boto3`
# further down; it relies on the import in the first cell.
s3 = boto3.resource('s3')
bucket = s3.Bucket('pdbsnapshots')

import os
import boto3
from collections import namedtuple
from operator import attrgetter


# One listed item: `key` is the object key (or the common prefix for a
# "directory"); mtime/size/ETag are None for directories.
S3Obj = namedtuple('S3Obj', ['key', 'mtime', 'size', 'ETag'])


def _prev_str(s):
    """Return a string that sorts just before ``s``, for use as a list marker.

    The last character is replaced by the preceding code point (dropped when
    it is already U+0000) and ten U+7FFF characters are appended. An empty
    string is returned unchanged.
    """
    if len(s) == 0:
        return s
    head, last = s[:-1], ord(s[-1])
    if last > 0:
        head += chr(last - 1)
    return head + '\u7FFF' * 10


def s3list(bucket, path, start=None, end=None, recursive=True, list_dirs=True,
           list_objs=True, limit=None):
    """
    Iterator that lists a bucket's objects under path, (optionally) starting with
    start and ending before end.

    If recursive is False, then list only the "depth=0" items (dirs and objects).

    If recursive is True, then list recursively all objects (no dirs).

    Args:
        bucket:
            a boto3.resource('s3').Bucket().
        path:
            a directory in the bucket.
        start:
            optional: start key, inclusive (may be a relative path under path, or
            absolute in the bucket)
        end:
            optional: stop key, exclusive (may be a relative path under path, or
            absolute in the bucket)
        recursive:
            optional, default True. If True, lists only objects. If False, lists
            only depth 0 "directories" and objects.
        list_dirs:
            optional, default True. Has no effect in recursive listing. On
            non-recursive listing, if False, then directories are omitted.
        list_objs:
            optional, default True. If False, then objects are omitted.
        limit:
            optional. If specified, then lists at most this many items.

    Returns:
        an iterator of S3Obj.

    Examples:
        # set up
        >>> s3 = boto3.resource('s3')
        ... bucket = s3.Bucket('bucket-name')

        # iterate through all S3 objects under some dir
        >>> for p in s3list(bucket, 'some/dir'):
        ...     print(p)

        # iterate through up to 20 S3 objects under some dir, starting with foo_0010
        >>> for p in s3list(bucket, 'some/dir', limit=20, start='foo_0010'):
        ...     print(p)

        # non-recursive listing under some dir:
        >>> for p in s3list(bucket, 'some/dir', recursive=False):
        ...     print(p)

        # non-recursive listing under some dir, listing only dirs:
        >>> for p in s3list(bucket, 'some/dir', recursive=False, list_objs=False):
        ...     print(p)
    """
    # S3 keys always use '/' as separator, so join with posixpath rather than
    # os.path (which would insert '\\' on Windows).
    import posixpath

    kwargs = dict()
    if start is not None:
        if not start.startswith(path):
            start = posixpath.join(path, start)
        # note: need to use a string just smaller than start, because
        # the list_object API specifies that start is excluded (the first
        # result is *after* start).
        kwargs.update(Marker=_prev_str(start))
    if end is not None:
        if not end.startswith(path):
            end = posixpath.join(path, end)
    if not recursive:
        # With a delimiter the API groups deeper keys into CommonPrefixes.
        kwargs.update(Delimiter='/')
        if not path.endswith('/') and len(path) > 0:
            path += '/'
    kwargs.update(Prefix=path)
    if limit is not None:
        kwargs.update(PaginationConfig={'MaxItems': limit})

    paginator = bucket.meta.client.get_paginator('list_objects')
    for resp in paginator.paginate(Bucket=bucket.name, **kwargs):
        q = []
        if 'CommonPrefixes' in resp and list_dirs:
            q = [S3Obj(f['Prefix'], None, None, None) for f in resp['CommonPrefixes']]
        if 'Contents' in resp and list_objs:
            q += [S3Obj(f['Key'], f['LastModified'], f['Size'], f['ETag']) for f in resp['Contents']]
        # note: even with sorted lists, it is faster to sort(a+b)
        # than heapq.merge(a, b) at least up to 10K elements in each list
        q = sorted(q, key=attrgetter('key'))
        if limit is not None:
            q = q[:limit]
            limit -= len(q)
        for p in q:
            if end is not None and p.key >= end:
                return
            yield p
        # Nothing more can be yielded once the limit is used up, so do not
        # fetch further pages.
        if limit is not None and limit <= 0:
            return

# Keys of the depth-0 "directories" (common prefixes) of the bucket; plain
# objects are skipped via list_objs=False.
folders = [
    entry.key
    for entry in s3list(bucket, "", recursive=False, list_objs=False)
]

In [13]:
# Key prefix appended to every top-level folder of the bucket.
PDB_PREFIX = "pub/pdb/data/biounit/PDB/all/"

# One candidate prefix per top-level folder, in reverse lexicographic order.
# NOTE(review): presumably the top-level folders are date-named snapshots, so
# this puts the newest first -- confirm against the bucket layout.
ordered_folders = sorted(
    (
        entry.key + PDB_PREFIX
        for entry in s3list(
            boto3.resource('s3').Bucket("pdbsnapshots"),
            "",
            recursive=False,
            list_objs=False,
        )
    ),
    reverse=True,
)

# Last expression of the cell, so its return value is displayed.
get_pdb_file("2btj.pdb1.gz", bucket, "data/tmp_pdb", folders=ordered_folders)

file='20220103/pub/pdb/data/biounit/PDB/all/2btj.pdb1.gz'
file='20210105/pub/pdb/data/biounit/PDB/all/2btj.pdb1.gz'


'data/tmp_pdb/2btj-1.pdb.gz'

In [12]:
# Download a single biounit file directly by its object key. S3 keys are
# matched literally, so the folder and the rest of the path are joined by
# exactly one '/' (a doubled slash would name a different object).
bucket.download_file("20210105/pub/pdb/data/biounit/PDB/all/2btj.pdb1.gz", "data/tmp_pdb/file.pdb.gz")