In [1]:
from pathlib import Path

import mdtraj
import nglview
import numpy as np
import openmm
import openmm.app
import openmm.unit
from openff.units import ensure_quantity
import polars as pl

def nglview_show_openmm(
    topology: openmm.app.Topology, positions, image_molecules=False
):
    top = mdtraj.Topology.from_openmm(topology)

    if isinstance(positions, str) or isinstance(positions, Path):
        traj = mdtraj.load(positions, top=top)
        if image_molecules:
            traj.image_molecules(inplace=True)
    else:
        positions = ensure_quantity(positions, "openmm").value_in_unit(
            openmm.unit.nanometer
        )
        xyz = np.asarray([positions])
        box_vectors = topology.getPeriodicBoxVectors()
        if box_vectors is not None:
            (
                l1,
                l2,
                l3,
                alpha,
                beta,
                gamma,
            ) = mdtraj.utils.box_vectors_to_lengths_and_angles(
                *np.asarray(box_vectors.value_in_unit(openmm.unit.nanometer))
            )
            unitcell_angles, unitcell_lengths = [alpha, beta, gamma], [l1, l2, l3]
        else:
            unitcell_angles, unitcell_lengths = None, None
        traj = mdtraj.Trajectory(
            xyz, top, unitcell_lengths=unitcell_lengths, unitcell_angles=unitcell_angles
        )
    return nglview.show_mdtraj(traj)



In [2]:
import gzip
import string
from collections import defaultdict
from itertools import batched
from multiprocessing import Pool
from pathlib import Path

import polars as pl
from pdbfixer import PDBFixer

datadir = Path("/home/joshmitchell/Downloads/pdb")

%load_ext snakeviz

In [3]:
metals_atomic_numbers = [
    3,
    4,
    11,
    12,
    13,
    19,
    20,
    21,
    22,
    23,
    24,
    25,
    26,
    27,
    28,
    29,
    30,
    31,
    37,
    38,
    39,
    40,
    41,
    42,
    43,
    44,
    45,
    46,
    47,
    48,
    49,
    50,
    55,
    56,
    57,
    58,
    59,
    60,
    61,
    62,
    63,
    64,
    65,
    66,
    67,
    68,
    69,
    70,
    71,
    72,
    73,
    74,
    75,
    76,
    77,
    78,
    79,
    80,
    81,
    82,
    83,
    84,
    87,
    88,
    89,
    90,
    91,
    92,
    93,
    94,
    95,
    96,
    97,
    98,
    99,
    100,
    101,
    102,
    103,
    104,
    105,
    106,
    107,
    108,
    109,
    110,
    111,
    112,
    113,
    114,
    115,
    116,
]

In [4]:
schema = {
    "id": pl.datatypes.String,
    "err": pl.datatypes.String,
    "n_chains": pl.datatypes.UInt64,
    "n_atoms": pl.datatypes.UInt64,
    "n_hydrogens": pl.datatypes.UInt64,
    "n_carbon": pl.datatypes.UInt64,
    "n_oxygen": pl.datatypes.UInt64,
    "n_nitrogen": pl.datatypes.UInt64,
    "n_phosphorus": pl.datatypes.UInt64,
    "n_sulfur": pl.datatypes.UInt64,
    "n_metals": pl.datatypes.UInt64,
    "n_no_element": pl.datatypes.UInt64,
    **{
        f"chain_{c}_seq": pl.datatypes.List(pl.datatypes.String)
        for c in string.printable
    },
    **{
        f"chain_{c}_res": pl.datatypes.List(pl.datatypes.String)
        for c in string.printable
    },
}

In [5]:
# %%snakeviz

chunk_size = 100
threads = 24

df = pl.DataFrame(schema=schema)


def proc_batch(batch):
    data = {k: [] for k in schema}
    for path in batch:
        for column in data.values():
            column.append(None)
        data["id"][-1] = path.stem[3:-4]
        try:
            with gzip.open(path, "r") as f:
                fixer = PDBFixer(pdbfile=f)
        except Exception as e:
            data["err"][-1] = repr(e)
            continue

        data["n_chains"][-1] = fixer.topology.getNumChains()
        data["n_atoms"][-1] = fixer.topology.getNumAtoms()
        elements = [
            atom.element for atom in fixer.topology.atoms() if atom.element is not None
        ]
        
        data["n_no_element"][-1] = fixer.topology.getNumAtoms() - len(elements)
        data["n_hydrogens"][-1] = sum(
            1 for element in elements if element.symbol == "H"
        )
        data["n_carbon"][-1] = sum(1 for element in elements if element.symbol == "C")
        data["n_oxygen"][-1] = sum(1 for element in elements if element.symbol == "O")
        data["n_nitrogen"][-1] = sum(1 for element in elements if element.symbol == "N")
        data["n_phosphorus"][-1] = sum(
            1 for element in elements if element.symbol == "P"
        )
        data["n_sulfur"][-1] = sum(1 for element in elements if element.symbol == "S")
        data["n_metals"][-1] = sum(
            1 for element in elements if element.atomic_number in metals_atomic_numbers
        )
        for sequence in fixer.sequences:
            data[f"chain_{sequence.chainId}_seq"][-1] = sequence.residues
        for chain in fixer.topology.chains():
            data[f"chain_{chain.id}_res"][-1] = [res.name for res in chain.residues()]

    return pl.DataFrame(data, schema=schema)


for i, batch in enumerate(batched(datadir.glob("*/*.ent.gz"), n=chunk_size * threads)):
    print(i)
    with Pool() as pool:
        df = pl.concat(
            [
                df,
                *pool.imap_unordered(
                    proc_batch,
                    batched(batch, chunk_size),
                ),
            ]
        )
    df.rechunk()
    if i >= 2:
        break

# df.write_parquet(datadir / "dataframe.parquet")

0
1
2


In [8]:
proc_batch(list(datadir.glob("*/*.ent.gz"))[0:10])

id,err,n_chains,n_atoms,n_hydrogens,n_carbon,n_oxygen,n_nitrogen,n_phosphorus,n_sulfur,n_metals,n_no_element,chain_0_seq,chain_1_seq,chain_2_seq,chain_3_seq,chain_4_seq,chain_5_seq,chain_6_seq,chain_7_seq,chain_8_seq,chain_9_seq,chain_a_seq,chain_b_seq,chain_c_seq,chain_d_seq,chain_e_seq,chain_f_seq,chain_g_seq,chain_h_seq,chain_i_seq,chain_j_seq,chain_k_seq,chain_l_seq,chain_m_seq,chain_n_seq,chain_o_seq,…,"chain_""_res",chain_#_res,chain_$_res,chain_%_res,chain_&_res,chain_'_res,chain_(_res,chain_)_res,chain_*_res,chain_+_res,"chain_,_res",chain_-_res,chain_._res,chain_/_res,chain_:_res,chain_;_res,chain_<_res,chain_=_res,chain_>_res,chain_?_res,chain_@_res,chain_[_res,chain_\_res,chain_]_res,chain_^_res,chain___res,chain_`_res,chain_{_res,chain_|_res,chain_}_res,chain_~_res,chain_ _res,chain_	_res,chain_ _res,chain_ _res,chain__res,chain__res
str,str,u64,u64,u64,u64,u64,u64,u64,u64,u64,u64,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],…,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str].1,list[str].2,list[str],list[str]
"""6xcb""",,2,3663,0,2174,929,550,0,9,1,0,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""7xca""",,4,3314,0,2050,708,540,0,14,1,0,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""7xcd""",,2,1592,0,931,419,234,0,2,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""7xc8""",,2,1818,0,1100,400,305,0,13,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""2xck""",,2,2468,0,1500,577,380,1,10,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""6xcv""",,4,8036,0,4741,1987,1286,0,22,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""1xcg""",,8,8750,0,5467,1709,1524,0,50,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""4xcj""",,2,1801,0,1074,441,276,2,7,1,0,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""1xcb""",,14,11140,0,7125,2022,1959,14,7,5,0,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""6xcr""",,1,617,314,193,58,48,0,4,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [75]:
df.filter(pl.col("err").is_not_null())

id,err,n_chains,n_atoms,n_hydrogens,n_carbon,n_oxygen,n_nitrogen,n_phosphorus,n_sulfur,n_metals,n_no_element,chain_0_seq,chain_1_seq,chain_2_seq,chain_3_seq,chain_4_seq,chain_5_seq,chain_6_seq,chain_7_seq,chain_8_seq,chain_9_seq,chain_a_seq,chain_b_seq,chain_c_seq,chain_d_seq,chain_e_seq,chain_f_seq,chain_g_seq,chain_h_seq,chain_i_seq,chain_j_seq,chain_k_seq,chain_l_seq,chain_m_seq,chain_n_seq,chain_o_seq,…,"chain_""_res",chain_#_res,chain_$_res,chain_%_res,chain_&_res,chain_'_res,chain_(_res,chain_)_res,chain_*_res,chain_+_res,"chain_,_res",chain_-_res,chain_._res,chain_/_res,chain_:_res,chain_;_res,chain_<_res,chain_=_res,chain_>_res,chain_?_res,chain_@_res,chain_[_res,chain_\_res,chain_]_res,chain_^_res,chain___res,chain_`_res,chain_{_res,chain_|_res,chain_}_res,chain_~_res,chain_ _res,chain_	_res,chain_ _res,chain_ _res,chain__res,chain__res
str,str,u64,u64,u64,u64,u64,u64,u64,u64,u64,u64,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],…,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str].1,list[str].2,list[str],list[str]
"""1joq""","""ValueError(""could not convert …",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""1jox""","""ValueError(""could not convert …",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""1jo1""","""ValueError(""could not convert …",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""1jor""","""ValueError(""could not convert …",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""1jo5""","""ValueError(""could not convert …",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""1ik1""","""ValueError(""could not convert …",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""1j46""","""ValueError(""could not convert …",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""1k0v""","""ValueError(""could not convert …",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""1k0h""","""ValueError(""could not convert …",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [78]:
df[[s.name for s in df if not (s.null_count() == df.height)]]

id,err,n_chains,n_atoms,n_hydrogens,n_carbon,n_oxygen,n_nitrogen,n_phosphorus,n_sulfur,n_metals,n_no_element,chain_0_seq,chain_1_seq,chain_2_seq,chain_3_seq,chain_4_seq,chain_5_seq,chain_6_seq,chain_7_seq,chain_8_seq,chain_9_seq,chain_a_seq,chain_b_seq,chain_c_seq,chain_d_seq,chain_e_seq,chain_f_seq,chain_g_seq,chain_h_seq,chain_i_seq,chain_j_seq,chain_k_seq,chain_l_seq,chain_m_seq,chain_n_seq,chain_o_seq,…,chain_p_res,chain_q_res,chain_r_res,chain_s_res,chain_t_res,chain_u_res,chain_v_res,chain_w_res,chain_x_res,chain_y_res,chain_z_res,chain_A_res,chain_B_res,chain_C_res,chain_D_res,chain_E_res,chain_F_res,chain_G_res,chain_H_res,chain_I_res,chain_J_res,chain_K_res,chain_L_res,chain_M_res,chain_N_res,chain_O_res,chain_P_res,chain_Q_res,chain_R_res,chain_S_res,chain_T_res,chain_U_res,chain_V_res,chain_W_res,chain_X_res,chain_Y_res,chain_Z_res
str,str,u64,u64,u64,u64,u64,u64,u64,u64,u64,u64,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],…,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str]
"""1jo8""",,2,611,0,302,238,69,0,2,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,"[""SO4"", ""SO4"", … ""HOH""]",,,,,,,,,,,,,,,,,,,,,,,,,
"""5jo0""",,6,6995,0,4268,1606,1077,2,40,2,0,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,"[""HOH"", ""HOH"", … ""HOH""]","[""HOH"", ""HOH"", … ""HOH""]",,,,,,,,,,,,,,,,,,,,,,,,
"""4jox""",,2,991,0,565,264,161,0,1,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,"[""HOH"", ""HOH"", … ""HOH""]",,,,,,,,,,,,,,,,,,,,,,,,,
"""7jog""",,5,1309,453,409,248,158,40,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,"[""DG"", ""DA"", … ""DA""]","[""DT"", ""DC"", … ""DA""]","[""DT"", ""DC"", … ""DC""]","[""CAC""]",,,,,,,,,,,,,,,,,,,,,,
"""5jox""",,6,8853,0,5012,2495,1316,0,30,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,"[""HOH"", ""HOH"", … ""HOH""]","[""HOH"", ""HOH"", … ""HOH""]",,,,,,,,,,,,,,,,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""3j32""",,2,6606,0,6606,0,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,"[""ASP"", ""ASN"", … ""ALA""]","[""ASP"", ""ASN"", … ""ALA""]",,,,,,,,,,,,,,,,,,,,,,,,
"""4j3l""",,2,1886,0,1076,487,308,0,13,1,0,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,"[""ZN"", ""SO4"", … ""HOH""]",,,,,,,,,,,,,,,,,,,,,,,,,
"""4j3g""",,12,5647,0,3206,1515,910,0,14,2,0,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,"[""HOH"", ""HOH"", … ""HOH""]","[""HOH"", ""HOH"", … ""HOH""]","[""HOH"", ""HOH"", … ""HOH""]","[""HOH"", ""HOH"", … ""HOH""]",,,,,,,,,,,,,,,,,,,,,,
"""5j3v""",,4,13357,0,8558,2455,2242,0,102,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,"[""ASP"", ""GLU"", … ""TYR""]","[""ASP"", ""THR"", … ""GLY""]","[""THR"", ""GLY"", … ""LYS""]","[""THR"", ""GLY"", … ""LYS""]",,,,,,,,,,,,,,,,,,,,,,


In [87]:
df.filter(pl.col("n_hydrogens").eq(0).is_not())

  df.filter(pl.col("n_hydrogens").eq(0).is_not())


id,err,n_chains,n_atoms,n_hydrogens,n_carbon,n_oxygen,n_nitrogen,n_phosphorus,n_sulfur,n_metals,n_no_element,chain_0_seq,chain_1_seq,chain_2_seq,chain_3_seq,chain_4_seq,chain_5_seq,chain_6_seq,chain_7_seq,chain_8_seq,chain_9_seq,chain_a_seq,chain_b_seq,chain_c_seq,chain_d_seq,chain_e_seq,chain_f_seq,chain_g_seq,chain_h_seq,chain_i_seq,chain_j_seq,chain_k_seq,chain_l_seq,chain_m_seq,chain_n_seq,chain_o_seq,…,"chain_""_res",chain_#_res,chain_$_res,chain_%_res,chain_&_res,chain_'_res,chain_(_res,chain_)_res,chain_*_res,chain_+_res,"chain_,_res",chain_-_res,chain_._res,chain_/_res,chain_:_res,chain_;_res,chain_<_res,chain_=_res,chain_>_res,chain_?_res,chain_@_res,chain_[_res,chain_\_res,chain_]_res,chain_^_res,chain___res,chain_`_res,chain_{_res,chain_|_res,chain_}_res,chain_~_res,chain_ _res,chain_	_res,chain_ _res,chain_ _res,chain__res,chain__res
str,str,u64,u64,u64,u64,u64,u64,u64,u64,u64,u64,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],…,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str].1,list[str].2,list[str],list[str]
"""7jog""",,5,1309,453,409,248,158,40,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""7job""",,2,2280,6,1318,595,355,0,5,1,0,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""4jom""",,2,14275,7094,4545,1373,1218,3,40,2,0,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""2joz""",,1,2126,1043,672,218,187,0,6,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""2jon""",,1,1414,673,458,148,128,0,7,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""5j36""",,15,16883,8221,5325,1857,1465,5,10,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""6j3x""",,3,6658,3043,2074,940,589,0,10,2,0,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""8j3v""",,1,1016,525,322,84,85,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
"""8j3q""",,1,871,430,266,82,82,0,11,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
