In [8]:
from mmCif import mmcifIO
#from mmcifIO import mmcifIO
# Create the mmCIF reader from the mmcifIO module.
# NOTE(review): input='dictionary' presumably selects the reader's parsing mode -- confirm in the mmCif docs.
cfr = mmcifIO.CifFileReader(input='dictionary')
# Read the local file 1a3n.cif; with output='cif_file' the result is a CifFile object
# (see the printed repr), which later cells navigate via getDataBlocks().
cif_file = cfr.read("1a3n.cif", output='cif_file')


CifFile: <CifFile "1a3n.cif">


mmCIF files are organised into a hierarchy:
DataBlocks --> Categories --> Items
The data we want in PDB mmCIF files is in the first (and, in most cases, the only) DataBlock.

In [9]:
# Keep only the first DataBlock of the parsed file; every later lookup goes through it.
mmCIF_data = cif_file.getDataBlocks()[0]

Entity information is stored within the mmCIF category entity.

In [13]:
# Look up the 'entity' category in the data block by name.
entity_category = mmCIF_data.getCategory('entity')

# Printing the category shows its repr, which lists the item names it contains.
print("Entity information:", entity_category)

Entity information: <Category "_entity" with items ['id', 'type', 'src_method', 'pdbx_description', 'formula_weight', 'pdbx_number_of_molecules', 'pdbx_ec', 'pdbx_mutation', 'pdbx_fragment', 'details']>


In [None]:
The items within a category can be listed with getItemNames().

In [18]:
# getItemNames() gives the names of the items in this category (printed here as a list of strings).
entity_items = entity_category.getItemNames()
print(entity_items)

['id', 'type', 'src_method', 'pdbx_description', 'formula_weight', 'pdbx_number_of_molecules', 'pdbx_ec', 'pdbx_mutation', 'pdbx_fragment', 'details']


Entity names are stored in the _entity.pdbx_description item.

In [16]:
# .value holds the item's data; for this file it prints as a list with one description per entity.
entity_names = entity_category.getItem('pdbx_description').value
print(entity_names)

['HEMOGLOBIN (ALPHA CHAIN)', 'HEMOGLOBIN (BETA CHAIN)', 'PROTOPORPHYRIN IX CONTAINING FE', 'water']


Only some of these entities are polymers; the type of each entity is stored in the _entity.type item.

In [17]:
# Values of the 'type' item; the printed list has one type string per entity.
entity_types = entity_category.getItem('type').value
print(entity_types)

['polymer', 'polymer', 'non-polymer', 'water']


Their IDs are available in the _entity.id item.

In [37]:
# Values of the 'id' item; the printed list has one identifier string per entity.
entity_ids = entity_category.getItem('id').value
print(entity_ids)

['1', '2', '3', '4']


We can get data for all items with a simple function

In [29]:
def get_cat_dict_values(category_object):
    """Collect every item of an mmCIF category into a dict of lists.

    Args:
        category_object: object exposing ``getItemNames()`` and
            ``getItem(name).value``, such as the category objects used in
            this notebook.

    Returns:
        dict mapping each item name to a list of its values. A value that is
        not already a list is wrapped in a one-element list so that every
        entry has the same shape. An empty dict is returned when the category
        reports no item names.
    """
    def as_list(raw):
        # The mmCIF parser returns a bare value instead of a list when an
        # item holds a single value; normalise that case to a list.
        return raw if isinstance(raw, list) else [raw]

    names = category_object.getItemNames()
    return {
        name: as_list(category_object.getItem(name).value)
        for name in (names or ())
    }

In [28]:
# Gather every item of the entity category into a dict of item name -> list of values.
category_data = get_cat_dict_values(entity_category)
print(category_data)

{'id': ['1', '2', '3', '4'], 'type': ['polymer', 'polymer', 'non-polymer', 'water'], 'src_method': ['nat', 'nat', 'syn', 'nat'], 'pdbx_description': ['HEMOGLOBIN (ALPHA CHAIN)', 'HEMOGLOBIN (BETA CHAIN)', 'PROTOPORPHYRIN IX CONTAINING FE', 'water'], 'formula_weight': ['15150.353', '15890.198', '616.487', '18.015'], 'pdbx_number_of_molecules': ['2', '2', '4', '451'], 'pdbx_ec': ['?', '?', '?', '?'], 'pdbx_mutation': ['?', '?', '?', '?'], 'pdbx_fragment': ['?', '?', '?', '?'], 'details': ['?', '?', '?', '?']}


We can change this to a list of dictionaries if this is more convenient.

In [34]:
def get_cat_list_of_dict(category_object):
    """Return the rows of an mmCIF category as a list of dictionaries.

    Args:
        category_object: object exposing ``getItemNames()`` and
            ``getItem(name).value``; it is passed on to
            ``get_cat_dict_values`` to obtain the per-item value lists.

    Returns:
        list with one dict per row, each mapping item name to that row's
        value. The number of rows is taken from the length of the first
        item's value list. An empty list is returned when the category has
        no items.
    """
    item_names = category_object.getItemNames()
    columns = get_cat_dict_values(category_object)
    if not (item_names and columns):
        return []

    # The first item's value list determines how many rows are produced.
    row_count = len(columns[item_names[0]])
    return [
        {name: columns[name][index] for name in columns}
        for index in range(row_count)
    ]

In [36]:
# Convert the entity category into a list with one dict per row (item name -> value).
category_list_of_dicts = get_cat_list_of_dict(entity_category)
print(category_list_of_dicts)

[{'id': '1', 'type': 'polymer', 'src_method': 'nat', 'pdbx_description': 'HEMOGLOBIN (ALPHA CHAIN)', 'formula_weight': '15150.353', 'pdbx_number_of_molecules': '2', 'pdbx_ec': '?', 'pdbx_mutation': '?', 'pdbx_fragment': '?', 'details': '?'}, {'id': '2', 'type': 'polymer', 'src_method': 'nat', 'pdbx_description': 'HEMOGLOBIN (BETA CHAIN)', 'formula_weight': '15890.198', 'pdbx_number_of_molecules': '2', 'pdbx_ec': '?', 'pdbx_mutation': '?', 'pdbx_fragment': '?', 'details': '?'}, {'id': '3', 'type': 'non-polymer', 'src_method': 'syn', 'pdbx_description': 'PROTOPORPHYRIN IX CONTAINING FE', 'formula_weight': '616.487', 'pdbx_number_of_molecules': '4', 'pdbx_ec': '?', 'pdbx_mutation': '?', 'pdbx_fragment': '?', 'details': '?'}, {'id': '4', 'type': 'water', 'src_method': 'nat', 'pdbx_description': 'water', 'formula_weight': '18.015', 'pdbx_number_of_molecules': '451', 'pdbx_ec': '?', 'pdbx_mutation': '?', 'pdbx_fragment': '?', 'details': '?'}]
