# pypdb demos

This is a set of basic examples of the usage and outputs of the various individual functions included in. There are generally three types of functions.

### Preamble

In [1]:
%pylab inline
from IPython.display import HTML

# Import from local directory
# import sys
# sys.path.insert(0, '../pypdb')
# from pypdb import *

# Import from installed package
from pypdb import *

%load_ext autoreload
%autoreload 2

Populating the interactive namespace from numpy and matplotlib


# Search functions that return lists of PDB IDs

#### Get a list of PDBs for a specific search term

In [2]:
found_pdbs = Query("ribosome").search()
print(found_pdbs[:10])

  and should_run_async(code)


['486D', '6C0F', '6CB1', '1T1O', '1T1M', '7OHS', '1QI7', '7OHW', '6EM5', '6EM1']


#### Search by PubMed ID Number

In [3]:
found_pdbs = Query(27499440, "PubmedIdQuery").search()
print(found_pdbs[:10])

  and should_run_async(code)


['5IMT', '5IMW', '5IMY']


#### Search by source organism using NCBI TaxId

In [3]:
found_pdbs = Query('6239', 'TreeEntityQuery').search() #TaxID for C elegans
print(found_pdbs[:5])

  and should_run_async(code)


['1D4X', '1DYW', '1E3B', '1E8K', '1EMS']


#### Search by a specific experimental method

In [4]:
found_pdbs = Query('SOLID-STATE NMR', query_type='ExpTypeQuery').search()
print(found_pdbs[:10])

['1CEK', '1EQ8', '1M8M', '1MAG', '1MP6', '1MZT', '1NH4', '1NYJ', '1PI7', '1PI8']


  and should_run_async(code)


#### Search by protein structure similarity

In [5]:
found_pdbs = Query('2E8D', query_type="structure").search()
print(found_pdbs[:10])

  and should_run_async(code)


['2E8D', '4OBA', '4OGV', '4JVR', '3LBL', '4QO4', '4JWR', '2WS4', '4ERE', '2CEU']


#### Search by Author

In [6]:
found_pdbs = Query('Perutz, M.F.', query_type='AdvancedAuthorQuery').search()
print(found_pdbs)

['1CQ4', '1FDH', '1GDJ', '1HDA', '1PBX', '2DHB', '2GDM', '2HHB', '2MHB', '3HHB', '4HHB']


  and should_run_async(code)


#### Search by organism

In [7]:
q = Query("Dictyostelium", query_type="OrganismQuery")
print(q.search()[:10])

['2H84', '3MNQ', '4AE3', '5AN9', '6QKL', '2VM9', '2VMC', '2VMD', '2VME', '2W94']


  and should_run_async(code)


# Information Search functions
While the basic functions described in the previous section are useful for looking up and manipulating individual unique entries, these functions are intended to be more user-facing: they take search keywords and return lists of authors or dates

#### Find papers for a given keyword

In [8]:
matching_papers = find_papers('crispr', max_results=10)
print(list(matching_papers)[:10])

  and should_run_async(code)


['Structures of the Cmr-beta Complex Reveal the Regulation of the Immunity Mechanism of Type III-B CRISPR-Cas.', 'Structures of the Cmr-beta Complex Reveal the Regulation of the Immunity Mechanism of Type III-B CRISPR-Cas', 'Cas1-Cas2 complex formation mediates spacer acquisition during CRISPR-Cas adaptive immunity.', 'An RNA-Induced Conformational Change Required for Crispr RNA Cleavage by the Endoribonuclease Cse3.', 'Structural plasticity and in vivo activity of Cas1 from the type I-F CRISPR-Cas system.']


# Functions that return information about single PDB IDs

#### Get the full PDB file

In [9]:
pdb_file = get_pdb_file('4lza', filetype='cif', compression=False)
print(pdb_file[:400])

data_4LZA
# 
_entry.id   4LZA 
# 
_audit_conform.dict_name       mmcif_pdbx.dic 
_audit_conform.dict_version    5.281 
_audit_conform.dict_location   http://mmcif.pdb.org/dictionaries/ascii/mmcif_pdbx.dic 
# 
loop_
_database_2.database_id 
_database_2.database_code 
PDB   4LZA         
RCSB  RCSB081269   
WWPDB D_1000081269 
# 
_pdbx_database_related.db_name        TargetTrack 
_pdbx_database_rela


#### Get a general description of the entry's metadata

In [11]:
all_info = get_info('4LZA')
print(list(all_info.keys()))

['audit_author', 'cell', 'citation', 'diffrn', 'diffrn_detector', 'diffrn_radiation', 'diffrn_source', 'entry', 'exptl', 'exptl_crystal', 'exptl_crystal_grow', 'pdbx_sgproject', 'pdbx_audit_revision_details', 'pdbx_audit_revision_history', 'pdbx_database_related', 'pdbx_database_status', 'pdbx_vrpt_summary', 'rcsb_accession_info', 'rcsb_entry_container_identifiers', 'rcsb_entry_info', 'rcsb_primary_citation', 'refine', 'refine_hist', 'refine_ls_restr', 'reflns', 'reflns_shell', 'software', 'struct', 'struct_keywords', 'symmetry', 'rcsb_id']


#### Run a Sequence search

Formerly using BLAST, this method now uses MMseqs2

In [20]:
q = Query("VLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHGKKVADALTAVAHVDDMPNAL", 
          query_type="sequence", 
          return_type="polymer_entity")

print(q.search())

{'query_id': '90344865-6c1d-448f-ba5b-64a2d727ade6', 'result_type': 'polymer_entity', 'total_count': 782, 'explain_meta_data': {'total_timing': 71, 'sort_timing': 0, 'terminal_node_timings': {'6469': 70}}, 'result_set': [{'identifier': '1C7D_1', 'score': 1.0, 'services': [{'service_type': 'sequence', 'nodes': [{'node_id': 6469, 'original_score': 164.0, 'norm_score': 1.0, 'match_context': [{'sequence_identity': 0.987, 'evalue': 2.053e-47, 'bitscore': 164, 'alignment_length': 80, 'mismatches': 0, 'gaps_opened': 1, 'query_beg': 1, 'query_end': 79, 'subject_beg': 144, 'subject_end': 223, 'query_length': 79, 'subject_length': 284, 'query_aligned_seq': 'VLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHGKKVADALT-AVAHVDDMPNAL', 'subject_aligned_seq': 'VLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHGKKVADALTNAVAHVDDMPNAL'}]}]}]}, {'identifier': '3OO5_1', 'score': 1.0, 'services': [{'service_type': 'sequence', 'nodes': [{'node_id': 6469, 'original_score': 164.0, 'norm_score':

#### Search by PFAM number

In [5]:
pfam_info = Query("PF00008", query_type="pfam").search()
print(pfam_info[:5])

['2F8Y', '2F8X', '2EC9', '2EI8', '2EI7']


# New API for advanced search

The old API will gradually migrate to use these functions

In [3]:
from pypdb.clients.search.search_client import perform_search
from pypdb.clients.search.search_client import ReturnType
from pypdb.clients.search.operators import text_operators

  and should_run_async(code)


## Search for all entries that mention the word 'ribosome'

In [11]:
search_operator = text_operators.DefaultOperator(value="ribosome")
return_type = ReturnType.ENTRY

results = perform_search(search_operator, return_type)

print(results[:10])

  and should_run_async(code)


Querying RCSB Search using the following parameters:
 {"query": {"type": "terminal", "service": "text", "parameters": {"value": "ribosome"}}, "request_options": {"return_all_hits": true}, "return_type": "entry"} 

['1JGQ', '1JGP', '1JGO', '1IBM', '1ML5', '1IBL', '1IBK', '1SM1', '1NKW', '1NWY']


## Search for polymers from 'Mus musculus'

In [4]:
search_operator = text_operators.ExactMatchOperator(value="Mus musculus",
                                                    attribute="rcsb_entity_source_organism.taxonomy_lineage.name")
return_type = ReturnType.POLYMER_ENTITY

results = perform_search(search_operator, return_type)

print(results[:5])

  and should_run_async(code)


Querying RCSB Search using the following parameters:
 {"query": {"type": "terminal", "service": "text", "parameters": {"attribute": "rcsb_entity_source_organism.taxonomy_lineage.name", "operator": "exact_match", "value": "Mus musculus"}}, "request_options": {"return_all_hits": true}, "return_type": "polymer_entity"} 

['5KD7_1', '5KD7_2', '6OIJ_3', '3ARF_2', '3ARF_1']


## Search for non-polymers from 'Mus musculus' or 'Homo sapiens'

In [13]:
search_operator = text_operators.InOperator(values=["Mus musculus", "Homo sapiens"],
                                            attribute="rcsb_entity_source_organism.taxonomy_lineage.name")
return_type = ReturnType.NON_POLYMER_ENTITY

results = perform_search(search_operator, return_type)
print(results[:5])

  and should_run_async(code)


Querying RCSB Search using the following parameters:
 {"query": {"type": "terminal", "service": "text", "parameters": {"attribute": "rcsb_entity_source_organism.taxonomy_lineage.name", "operator": "in", "value": ["Mus musculus", "Homo sapiens"]}}, "request_options": {"return_all_hits": true}, "return_type": "non_polymer_entity"} 

['6WV6_3', '6WV6_2', '3TWQ_3', '3TWQ_2', '6RRI_7']


## Search for polymer instances whose titles contain "actin" or "binding" or "protein"

In [14]:
search_operator = text_operators.ContainsWordsOperator(value="actin-binding protein",
                                            attribute="struct.title")
return_type = ReturnType.POLYMER_INSTANCE

results = perform_search(search_operator, return_type)

print(results[:5])

  and should_run_async(code)


Querying RCSB Search using the following parameters:
 {"query": {"type": "terminal", "service": "text", "parameters": {"attribute": "struct.title", "operator": "contains_words", "value": "actin-binding protein"}}, "request_options": {"return_all_hits": true}, "return_type": "polymer_instance"} 

['1WM4.A', '1HQZ.H', '1HQZ.I', '1HQZ.F', '1HQZ.G']


## Search for assemblies that contain the words "actin binding protein"
(must be in that order).

For example, "actin-binding protein" and "actin binding protein" will match,
but "protein binding actin" will not.

In [15]:
search_operator = text_operators.ContainsPhraseOperator(value="actin-binding protein",
                                            attribute="struct.title")
return_type = ReturnType.ASSEMBLY

results = perform_search(search_operator, return_type)

print(results[:5])

Querying RCSB Search using the following parameters:
 {"query": {"type": "terminal", "service": "text", "parameters": {"attribute": "struct.title", "operator": "contains_phrase", "value": "actin-binding protein"}}, "request_options": {"return_all_hits": true}, "return_type": "assembly"} 

['1HQZ-7', '1HQZ-8', '1HQZ-5', '1HQZ-6', '1HQZ-9']


  and should_run_async(code)


## Search for entries released in 2019 or later

In [16]:
search_operator = text_operators.ComparisonOperator(
       value="2019-01-01T00:00:00Z",
       attribute="rcsb_accession_info.initial_release_date",
       comparison_type=text_operators.ComparisonType.GREATER)
return_type = ReturnType.ENTRY

results = perform_search(search_operator, return_type)

print(results[:5])

  and should_run_async(code)


Querying RCSB Search using the following parameters:
 {"query": {"type": "terminal", "service": "text", "parameters": {"operator": "greater", "attribute": "rcsb_accession_info.initial_release_date", "value": "2019-01-01T00:00:00Z"}}, "request_options": {"return_all_hits": true}, "return_type": "entry"} 

['3QOB', '6TZU', '6TZT', '6TZW', '6TZV']


## Search for entries released only in 2019

In [11]:
search_operator = text_operators.RangeOperator(
    from_value="2019-01-01T00:00:00Z",
    to_value="2020-01-01T00:00:00Z",
    include_lower=True,
    include_upper=False,
    attribute="rcsb_accession_info.initial_release_date")
return_type = ReturnType.ENTRY

results = perform_search(search_operator, return_type)

print(results[:5])

  and should_run_async(code)


Querying RCSB Search using the following parameters:
 {"query": {"type": "terminal", "service": "text", "parameters": {"operator": "range", "attribute": "rcsb_accession_info.initial_release_date", "negation": false, "value": ["2019-01-01T00:00:00Z", "2020-01-01T00:00:00Z"]}}, "request_options": {"return_all_hits": true}, "return_type": "entry"} 

['6CJJ', '6CJI', '6CJL', '6CJC', '6CKY']


## Search by cell length

In [10]:
from pypdb.clients.search.search_client import perform_search_with_graph, SearchService, ReturnType
from pypdb.clients.search.operators import text_operators

cell_a_operator = text_operators.RangeOperator(
    attribute='cell.length_a',
    from_value=80,
    to_value=84,
    include_upper=True
)

results = perform_search_with_graph(
    query_object=cell_a_operator,
    return_type=ReturnType.ENTRY
)

print(results[:5])

  and should_run_async(code)


Querying RCSB Search using the following parameters:
 {"query": {"type": "terminal", "service": "text", "parameters": {"operator": "range", "attribute": "cell.length_a", "negation": false, "value": [80, 84]}}, "request_options": {"return_all_hits": true}, "return_type": "entry"} 

['6CMZ', '6D0X', '6D2C', '6CPD', '6CPF']


## Search for structures under 4 angstroms of resolution

In [18]:
search_operator = text_operators.ComparisonOperator(
           value=4,
           attribute="rcsb_entry_info.resolution_combined",
           comparison_type=text_operators.ComparisonType.LESS)
return_type = ReturnType.ENTRY

results = perform_search(search_operator, return_type)

print(results[:5])

  and should_run_async(code)


Querying RCSB Search using the following parameters:
 {"query": {"type": "terminal", "service": "text", "parameters": {"operator": "less", "attribute": "rcsb_entry_info.resolution_combined", "value": 4}}, "request_options": {"return_all_hits": true}, "return_type": "entry"} 

['3F2M', '3EQN', '3F2N', '3EQA', '3F2A']


## Search for structures with a given attribute.

(Admittedly every structure has a release date, but the same logic would
 apply for a more sparse RCSB attribute).


In [19]:
search_operator = text_operators.ExistsOperator(
    attribute="rcsb_accession_info.initial_release_date")
return_type = ReturnType.ENTRY

results = perform_search(search_operator, return_type)

print(results[:5])

  and should_run_async(code)


Querying RCSB Search using the following parameters:
 {"query": {"type": "terminal", "service": "text", "parameters": {"operator": "exists", "attribute": "rcsb_accession_info.initial_release_date"}}, "request_options": {"return_all_hits": true}, "return_type": "entry"} 

['3F2M', '3EQN', '3F2N', '3EQA', '3F2A']


## Search for 'Mus musculus' or 'Homo sapiens' structures after 2019 using graph search


In [20]:
from pypdb.clients.search.search_client import perform_search_with_graph
from pypdb.clients.search.search_client import ReturnType
from pypdb.clients.search.search_client import QueryGroup, LogicalOperator
from pypdb.clients.search.operators import text_operators

# SearchOperator associated with structures with under 4 Angstroms of resolution
under_4A_resolution_operator = text_operators.ComparisonOperator(
       value=4,
       attribute="rcsb_entry_info.resolution_combined",
       comparison_type=text_operators.ComparisonType.GREATER)

# SearchOperator associated with entities containing 'Mus musculus' lineage
is_mus_operator = text_operators.ExactMatchOperator(
            value="Mus musculus",
            attribute="rcsb_entity_source_organism.taxonomy_lineage.name")

# SearchOperator associated with entities containing 'Homo sapiens' lineage
is_human_operator = text_operators.ExactMatchOperator(
            value="Homo sapiens",
            attribute="rcsb_entity_source_organism.taxonomy_lineage.name")

# QueryGroup associated with being either human or `Mus musculus`
is_human_or_mus_group = QueryGroup(
    queries = [is_mus_operator, is_human_operator],
    logical_operator = LogicalOperator.OR
)

# QueryGroup associated with being ((Human OR Mus) AND (Under 4 Angstroms))
is_under_4A_and_human_or_mus_group = QueryGroup(
    queries = [is_human_or_mus_group, under_4A_resolution_operator],
    logical_operator = LogicalOperator.AND
)

return_type = ReturnType.ENTRY

results = perform_search_with_graph(
  query_object=is_under_4A_and_human_or_mus_group,
  return_type=return_type)
print("\n", results[:10]) # Huzzah

Querying RCSB Search using the following parameters:
 {"query": {"type": "group", "logical_operator": "and", "nodes": [{"type": "group", "logical_operator": "or", "nodes": [{"type": "terminal", "service": "text", "parameters": {"attribute": "rcsb_entity_source_organism.taxonomy_lineage.name", "operator": "exact_match", "value": "Mus musculus"}}, {"type": "terminal", "service": "text", "parameters": {"attribute": "rcsb_entity_source_organism.taxonomy_lineage.name", "operator": "exact_match", "value": "Homo sapiens"}}]}, {"type": "terminal", "service": "text", "parameters": {"attribute": "rcsb_entry_info.resolution_combined", "operator": "greater", "value": 4}}]}, "request_options": {"return_all_hits": true}, "return_type": "entry"}

 ['6PYH', '1JL4', '6WOV', '6X5A', '6YAF', '6YAH', '6XJD', '6OE5', '6OEN', '4K24']
