Skip to content
316 lines (280 sloc) 13.6 KB
# Copyright (C) 2002, Thomas Hamelryck (
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
"""Parser for PDB files."""
from __future__ import print_function
import warnings
import numpy
except ImportError:
from Bio import MissingPythonDependencyError
raise MissingPythonDependencyError(
"Install NumPy if you want to use the PDB parser.")
from Bio.File import as_handle
from Bio.PDB.PDBExceptions import PDBConstructionException
from Bio.PDB.PDBExceptions import PDBConstructionWarning
from Bio.PDB.StructureBuilder import StructureBuilder
from Bio.PDB.parse_pdb_header import _parse_pdb_header_list
# If PDB spec says "COLUMNS 18-20" this means line[17:20]
class PDBParser(object):
"""Parse a PDB file and return a Structure object."""
def __init__(self, PERMISSIVE=True, get_header=False,
structure_builder=None, QUIET=False):
"""Create a PDBParser object.
The PDB parser call a number of standard methods in an aggregated
StructureBuilder object. Normally this object is instanciated by the
PDBParser object itself, but if the user provides his/her own
StructureBuilder object, the latter is used instead.
- PERMISSIVE - Evaluated as a Boolean. If false, exceptions in
constructing the SMCRA data structure are fatal. If true (DEFAULT),
the exceptions are caught, but some residues or atoms will be missing.
- structure_builder - an optional user implemented StructureBuilder class.
- QUIET - Evaluated as a Boolean. If true, warnings issued in constructing
the SMCRA data will be suppressed. If false (DEFAULT), they will be shown.
These warnings might be indicative of problems in the PDB file!
if structure_builder is not None:
self.structure_builder = structure_builder
self.structure_builder = StructureBuilder()
self.header = None
self.trailer = None
self.line_counter = 0
self.QUIET = bool(QUIET)
# Public methods
def get_structure(self, id, file):
"""Return the structure.
- id - string, the id that will be used for the structure
- file - name of the PDB file OR an open filehandle
with warnings.catch_warnings():
if self.QUIET:
warnings.filterwarnings("ignore", category=PDBConstructionWarning)
self.header = None
self.trailer = None
# Make a StructureBuilder instance (pass id of structure as parameter)
with as_handle(file, mode='rU') as handle:
# Return the Structure instance
structure = self.structure_builder.get_structure()
return structure
def get_header(self):
"""Return the header."""
return self.header
def get_trailer(self):
"""Return the trailer."""
return self.trailer
# Private methods
def _parse(self, header_coords_trailer):
"""Parse the PDB file (PRIVATE)."""
# Extract the header; return the rest of the file
self.header, coords_trailer = self._get_header(header_coords_trailer)
# Parse the atomic data; return the PDB file trailer
self.trailer = self._parse_coordinates(coords_trailer)
def _get_header(self, header_coords_trailer):
"""Get the header of the PDB file, return the rest (PRIVATE)."""
structure_builder = self.structure_builder
i = 0
for i in range(0, len(header_coords_trailer)):
structure_builder.set_line_counter(i + 1)
line = header_coords_trailer[i]
record_type = line[0:6]
if record_type == "ATOM " or record_type == "HETATM" or record_type == "MODEL ":
header = header_coords_trailer[0:i]
# Return the rest of the coords+trailer for further processing
self.line_counter = i
coords_trailer = header_coords_trailer[i:]
header_dict = _parse_pdb_header_list(header)
return header_dict, coords_trailer
def _parse_coordinates(self, coords_trailer):
"""Parse the atomic data in the PDB file (PRIVATE)."""
local_line_counter = 0
structure_builder = self.structure_builder
current_model_id = 0
# Flag we have an open model
model_open = 0
current_chain_id = None
current_segid = None
current_residue_id = None
current_resname = None
for i in range(0, len(coords_trailer)):
line = coords_trailer[i].rstrip('\n')
record_type = line[0:6]
global_line_counter = self.line_counter + local_line_counter + 1
if record_type == "ATOM " or record_type == "HETATM":
# Initialize the Model - there was no explicit MODEL record
if not model_open:
current_model_id += 1
model_open = 1
fullname = line[12:16]
# get rid of whitespace in atom names
split_list = fullname.split()
if len(split_list) != 1:
# atom name has internal spaces, e.g. " N B ", so
# we do not strip spaces
name = fullname
# atom name is like " CA ", so we can strip spaces
name = split_list[0]
altloc = line[16]
resname = line[17:20]
chainid = line[21]
serial_number = int(line[6:11])
except Exception:
serial_number = 0
resseq = int(line[22:26].split()[0]) # sequence identifier
icode = line[26] # insertion code
if record_type == "HETATM": # hetero atom flag
if resname == "HOH" or resname == "WAT":
hetero_flag = "W"
hetero_flag = "H"
hetero_flag = " "
residue_id = (hetero_flag, resseq, icode)
# atomic coordinates
x = float(line[30:38])
y = float(line[38:46])
z = float(line[46:54])
except Exception:
# Should we allow parsing to continue in permissive mode?
# If so, what coordinates should we default to? Easier to abort!
raise PDBConstructionException("Invalid or missing coordinate(s) at line %i."
% global_line_counter)
coord = numpy.array((x, y, z), "f")
# occupancy & B factor
occupancy = float(line[54:60])
except Exception:
self._handle_PDB_exception("Invalid or missing occupancy",
occupancy = None # Rather than arbitrary zero or one
if occupancy is not None and occupancy < 0:
# TODO - Should this be an error in strict mode?
# self._handle_PDB_exception("Negative occupancy",
# global_line_counter)
# This uses fixed text so the warning occurs once only:
warnings.warn("Negative occupancy in one or more atoms", PDBConstructionWarning)
bfactor = float(line[60:66])
except Exception:
self._handle_PDB_exception("Invalid or missing B factor",
bfactor = 0.0 # The PDB use a default of zero if the data is missing
segid = line[72:76]
element = line[76:78].strip().upper()
if current_segid != segid:
current_segid = segid
if current_chain_id != chainid:
current_chain_id = chainid
current_residue_id = residue_id
current_resname = resname
structure_builder.init_residue(resname, hetero_flag, resseq, icode)
except PDBConstructionException as message:
self._handle_PDB_exception(message, global_line_counter)
elif current_residue_id != residue_id or current_resname != resname:
current_residue_id = residue_id
current_resname = resname
structure_builder.init_residue(resname, hetero_flag, resseq, icode)
except PDBConstructionException as message:
self._handle_PDB_exception(message, global_line_counter)
# init atom
structure_builder.init_atom(name, coord, bfactor, occupancy, altloc,
fullname, serial_number, element)
except PDBConstructionException as message:
self._handle_PDB_exception(message, global_line_counter)
elif record_type == "ANISOU":
anisou = [float(x) for x in (line[28:35], line[35:42], line[43:49],
line[49:56], line[56:63], line[63:70])]
# U's are scaled by 10^4
anisou_array = (numpy.array(anisou, "f") / 10000.0).astype("f")
elif record_type == "MODEL ":
serial_num = int(line[10:14])
except Exception:
self._handle_PDB_exception("Invalid or missing model serial number",
serial_num = 0
structure_builder.init_model(current_model_id, serial_num)
current_model_id += 1
model_open = 1
current_chain_id = None
current_residue_id = None
elif record_type == "END " or record_type == "CONECT":
# End of atomic data, return the trailer
self.line_counter += local_line_counter
return coords_trailer[local_line_counter:]
elif record_type == "ENDMDL":
model_open = 0
current_chain_id = None
current_residue_id = None
elif record_type == "SIGUIJ":
# standard deviation of anisotropic B factor
siguij = [float(x) for x in (line[28:35], line[35:42], line[42:49],
line[49:56], line[56:63], line[63:70])]
# U sigma's are scaled by 10^4
siguij_array = (numpy.array(siguij, "f") / 10000.0).astype("f")
elif record_type == "SIGATM":
# standard deviation of atomic positions
sigatm = [float(x) for x in (line[30:38], line[38:45], line[46:54],
line[54:60], line[60:66])]
sigatm_array = numpy.array(sigatm, "f")
local_line_counter += 1
# EOF (does not end in END or CONECT)
self.line_counter = self.line_counter + local_line_counter
return []
def _handle_PDB_exception(self, message, line_counter):
"""Handle exception (PRIVATE).
This method catches an exception that occurs in the StructureBuilder
object (if PERMISSIVE), or raises it again, this time adding the
PDB line number to the error message.
message = "%s at line %i." % (message, line_counter)
# just print a warning - some residues/atoms may be missing
warnings.warn("PDBConstructionException: %s\n"
"Exception ignored.\n"
"Some atoms or residues may be missing in the data structure."
% message, PDBConstructionWarning)
# exceptions are fatal - raise again with new message (including line nr)
raise PDBConstructionException(message)
if __name__ == "__main__":
import sys
p = PDBParser(PERMISSIVE=True)
filename = sys.argv[1]
s = p.get_structure("scr", filename)
for m in s:
p = m.get_parent()
assert(p is s)
for c in m:
p = c.get_parent()
assert(p is m)
for r in c:
p = r.get_parent()
assert(p is c)
for a in r:
p = a.get_parent()
if p is not r:
print("%s %s" % (p, r))
Something went wrong with that request. Please try again.