In [4]:
# Widen the notebook container so wide outputs use the full browser width.
# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import HTML, display

display(HTML("<style>.container { width:100% !important; }</style>"))

In [34]:
import warnings
# Ignore every warning for the rest of the session. No category or module is
# given, so this also hides deprecation warnings raised by imported libraries.
warnings.filterwarnings('ignore')

In [6]:
# data import funcs
from BioPlexPy.data_import_funcs import getCorum

## [0] Environment Setup

Steps used to set up the environment for running this notebook:
- created virtual environment in conda 
> conda create -n CCB_BioPlexPy python=3.7
- installed **pandas**
> conda install -c anaconda pandas
- installed **requests**
> conda install -c anaconda requests
- installed **anndata**
> conda install anndata -c bioconda
- installed **jupyter notebook**
> conda install -c conda-forge notebook
- installed **networkx**
> pip install networkx[default]
- installed **biopython**
> conda install -c conda-forge biopython
- installed **nglview**

> conda install nglview -c conda-forge

> jupyter-nbextension enable nglview --py --sys-prefix

> jupyter-nbextension enable --py --sys-prefix widgetsnbextension

- installed **pypdb**
> pip install pypdb

In [28]:
import pandas as pd
from pypdb import *
import itertools
from collections import Counter

def CORUM_to_PDB(Corum_DF, Complex_ID):
    '''
    Retrieve a PDB ID for the protein structure corresponding to a CORUM complex.

    This function takes a CORUM complex ID and maps the UniProt IDs of the
    complex subunits to a PDB ID using the SIFTS UniProt-to-PDB summary file.
    A PDB ID that maps to every subunit is preferred. When several PDB IDs map
    to every subunit, or when none does (in which case the mapped PDB IDs are
    used as search terms for a PDB query), the candidates are ranked by how
    closely their protein entity count matches the number of CORUM subunits,
    then by most recent deposit date.

    Parameters
    ----------
    Corum_DF : Pandas DataFrame
        DataFrame of CORUM complexes. The columns 'ComplexID' and
        'subunits(UniProt IDs)' (semicolon-separated UniProt IDs) are read.
    Complex_ID : int
        CORUM complex ID to look up in Corum_DF.

    Returns
    -------
    str or None
        PDB ID chosen for the CORUM complex, or None (after printing a
        WARNING) if no PDB ID could be mapped.

    Raises
    ------
    IndexError
        If Complex_ID is not present in Corum_DF.

    Notes
    -----
    The SIFTS mapping file is downloaded on every call. Problems are reported
    with print() rather than the warnings module.

    Examples
    --------
    >>> Corum_DF = getCorum('core', 'Human') # (1) Obtain CORUM complexes
    >>> PDB_ID_Arp_2_3 = CORUM_to_PDB(Corum_DF, 27) # (2) Get PDB ID for specified protein complex (Arp 2/3 complex ID: 27)
    >>> PDB_ID_ING2 = CORUM_to_PDB(Corum_DF, 2851) # (3) Get PDB ID for specified protein complex (ING2 complex ID: 2851), demonstrates WARNINGS
    '''
    # get UniProt IDs for each protein in the CORUM complex
    uniprot_IDs_complex_i = Corum_DF[Corum_DF.ComplexID == Complex_ID].loc[:,'subunits(UniProt IDs)'].values[0].split(';')
    num_proteins_CORUM_complex_i = len(uniprot_IDs_complex_i)

    # Map from CORUM complex subunits given as UniProt IDs
    # via [SIFTS](https://www.ebi.ac.uk/pdbe/docs/sifts/quick.html) to PDB structures:
    # "A summary of the UniProt to PDB mappings showing the UniProt accession followed by a semicolon-separated list of PDB four letter codes."
    # header = 1: the column names are taken from the second line of the file
    uniprot_pdb_mapping_df = pd.read_csv("ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/csv/uniprot_pdb.csv.gz", header = 1, sep = ',', compression = 'gzip')

    # UniProt ID -> list of PDB codes (semicolon-separated column split into Python lists)
    uniprot_to_PDB_IDs = dict(zip(uniprot_pdb_mapping_df['SP_PRIMARY'],
                                  uniprot_pdb_mapping_df['PDB'].str.split(';')))

    # collect (uppercased) PDB IDs that map to each UniProt ID, taking ALL PDB IDs in each mapped list
    PDB_IDs_complex_i = []
    for uniprot_ID_i_complex_i in uniprot_IDs_complex_i:
        mapped_PDB_IDs_i = uniprot_to_PDB_IDs.get(uniprot_ID_i_complex_i)

        # a missing key (or a missing PDB field, which does not split into a list) means no mapping
        if not isinstance(mapped_PDB_IDs_i, list):
            print(f'WARNING: {uniprot_ID_i_complex_i} does not have any corresponding PDB IDs mapped.')
            continue

        PDB_IDs_complex_i.extend(PDB_ID.upper() for PDB_ID in mapped_PDB_IDs_i)

    # PDB IDs that were mapped once for every subunit of the complex
    PDB_IDs_complex_i_count = Counter(PDB_IDs_complex_i)
    PDB_IDs_shared_by_all = [PDB_ID for PDB_ID, count in PDB_IDs_complex_i_count.items()
                             if count == num_proteins_CORUM_complex_i]

    # exactly one PDB ID maps to all UniProt IDs: use it for the CORUM complex
    if len(PDB_IDs_shared_by_all) == 1:
        return PDB_IDs_shared_by_all[0]

    if len(PDB_IDs_shared_by_all) > 1:
        # multiple PDB IDs map to all UniProt IDs, rank those below
        PDB_IDs_mapped_to_complex_i = PDB_IDs_shared_by_all

    elif PDB_IDs_complex_i:
        # no PDB ID maps to all UniProt IDs: use the unique mapped PDB IDs as search terms
        # to find an associated complex in the PDB database (sorted so the query is deterministic)
        search_terms = ' '.join(sorted(set(PDB_IDs_complex_i)))

        # NOTE(review): the search is expected to return None when nothing is retrieved;
        # normalise that to an empty list so the check below does not fail on len(None)
        PDB_IDs_mapped_to_complex_i = Query(search_terms).search() or []

    else:
        # nothing was mapped at all, so there are no search terms to query with
        PDB_IDs_mapped_to_complex_i = []

    # retrieve metadata for each candidate structure from PDB
    PDB_records = []
    for PDB_ID in PDB_IDs_mapped_to_complex_i:
        PDB_structure_all_info = get_info(PDB_ID)

        # skip candidates whose metadata could not be retrieved
        if not PDB_structure_all_info:
            continue

        PDB_records.append({
            'PDB_ID': PDB_ID,
            'num_proteins': PDB_structure_all_info['rcsb_entry_info']['polymer_entity_count_protein'],
            'deposit_date': pd.to_datetime(PDB_structure_all_info['rcsb_accession_info']['deposit_date']),
        })

    # if there is nothing to rank, print warning
    if len(PDB_records) == 0:
        print('WARNING: Could not map PDB ID to this CORUM complex ID.')
        return None

    # convert CORUM complex i - associated PDB IDs into DataFrame w/ # proteins & deposit date
    complex_i_PDBs_df = pd.DataFrame(PDB_records).set_index('PDB_ID')

    # column for number of proteins in PDB structure different from num proteins listed in CORUM complex
    complex_i_PDBs_df['num_proteins_PDB_CORUM_diff'] = abs(complex_i_PDBs_df['num_proteins'] - num_proteins_CORUM_complex_i)

    # pick PDB structure that has same number of proteins/chains as CORUM complex (or matches closest), then rank by most recent deposit date
    complex_i_PDBs_df = complex_i_PDBs_df.sort_values(by = ['num_proteins_PDB_CORUM_diff','deposit_date'], ascending = [True, False])
    PDB_ID_structure_for_CORUM_complex_i = complex_i_PDBs_df.index[0] # take PDB ID corresponding to top row after ranking

    # print a warning if the number of proteins for PDB ID complex differs from number of subunits in CORUM complex
    num_proteins_diff = complex_i_PDBs_df['num_proteins_PDB_CORUM_diff'].iloc[0]
    if num_proteins_diff != 0:
        print(f'WARNING: The number of proteins in PDB ID {PDB_ID_structure_for_CORUM_complex_i} and number of subunits in CORUM complex {Complex_ID} differs by {num_proteins_diff}.')

    return PDB_ID_structure_for_CORUM_complex_i

  and should_run_async(code)


In [41]:
# (1) Obtain the CORUM complexes ('core' set, 'Human')
Corum_DF = getCorum('core', 'Human')
# (2) Get the PDB ID for the specified protein complex (ING2 complex ID: 2851)
PDB_ID_ING2 = CORUM_to_PDB(Corum_DF, 2851)
print(PDB_ID_ING2)

6WKR


In [42]:
# (3) Get the PDB ID for the specified protein complex (Arp 2/3 complex ID: 27);
# reuses Corum_DF loaded in the previous cell
PDB_ID_Arp_2_3 = CORUM_to_PDB(Corum_DF, 27)
print(PDB_ID_Arp_2_3)

6UHC


In [None]:
from Bio.PDB import *

In [None]:
# Directory that downloaded structure files are written to.
# NOTE(review): hard-coded absolute path; adjust it when running elsewhere.
protein_structure_dir = '/n/data1/hms/ccb/lab/projects/bioplex/BioPlexPy/protein_function_testing'

In [88]:
PDB_ID_structure_for_CORUM_complex_i

'6YW6'

In [83]:
# download structure from PDB into protein_structure_dir
# NOTE(review): PDB_ID_structure_for_CORUM_complex_i is only assigned inside
# CORUM_to_PDB in the visible cells, so it must already exist at notebook
# scope before this cell runs -- confirm where it is defined.
pdbl = PDBList()
PBD_file_path = pdbl.retrieve_pdb_file(PDB_ID_structure_for_CORUM_complex_i, pdir=protein_structure_dir, file_format='pdb', overwrite=True)

# create a structure object by parsing the downloaded file
parser = PDBParser()
structure = parser.get_structure(PDB_ID_structure_for_CORUM_complex_i, PBD_file_path)

Downloading PDB structure '6YW6'...


In [176]:
# work with the first model of the parsed structure and list its chain IDs
model = structure[0]
chain_IDs = [chain.get_id() for chain in model]

# every unordered pair of chains is a candidate for direct interaction
possible_chain_pairs = list(itertools.combinations(chain_IDs, 2))

# a pair of chains is recorded as soon as one atom from each chain
# is found to be < 6 angstroms apart
chain_pairs_direct_interaction = []
for chain_i_id, chain_j_id in possible_chain_pairs:

    # look up both chain objects in the model
    chain_i = model[chain_i_id]
    chain_j = model[chain_j_id]

    # flatten each chain down to its atoms ('A' stands for ATOM)
    atom_list_i = Selection.unfold_entities(chain_i, "A")
    atom_list_j = Selection.unfold_entities(chain_j, "A")

    # any() stops at the first atom pair under the cutoff, so the pair of chains is appended at most once
    if any(atom_i - atom_j < 6 for atom_i in atom_list_i for atom_j in atom_list_j):
        chain_pairs_direct_interaction.append([chain_i_id, chain_j_id])

In [168]:
possible_chain_pairs

[('A', 'C'),
 ('A', 'D'),
 ('A', 'E'),
 ('A', 'B'),
 ('A', 'G'),
 ('A', 'F'),
 ('C', 'D'),
 ('C', 'E'),
 ('C', 'B'),
 ('C', 'G'),
 ('C', 'F'),
 ('D', 'E'),
 ('D', 'B'),
 ('D', 'G'),
 ('D', 'F'),
 ('E', 'B'),
 ('E', 'G'),
 ('E', 'F'),
 ('B', 'G'),
 ('B', 'F'),
 ('G', 'F')]

In [177]:
chain_pairs_direct_interaction

[['A', 'D'],
 ['A', 'E'],
 ['A', 'B'],
 ['A', 'F'],
 ['C', 'D'],
 ['C', 'B'],
 ['C', 'G'],
 ['C', 'F'],
 ['D', 'F'],
 ['B', 'G'],
 ['B', 'F'],
 ['G', 'F']]

## TEST CODE

In [None]:
model = structure[0]

# visit every ordered pair of distinct chains in the first model
for chain_i, chain_j in itertools.product(model, repeat=2):
    if chain_i != chain_j:

        # every residue of chain_i against every residue of chain_j
        for residue_i, residue_j in itertools.product(chain_i, chain_j):

            # every atom pair between the two residues
            for atom_i, atom_j in itertools.product(residue_i, residue_j):
                distance = atom_i - atom_j

                # report each atom pair that is < 6 angstroms apart
                if distance < 6:
                    print(chain_i, residue_i, atom_i)
                    print(chain_j, residue_j, atom_j)
                    print(distance)
                    print('')

In [51]:
UniProt_to_PDB_map_dict

{'O15143': ['6yw6', '6uhc'],
 'O15144': ['6yw7', '6yw6', '6uhc'],
 'O15145': ['6yw6', '6uhc', '6yw7'],
 'O15511': ['6uhc', '6yw7'],
 'P59998': ['6uhc', '6yw6', '6yw7'],
 'P61158': ['6yw7', '6uhc', '6yw6'],
 'P61160': ['6uhc', '6yw7', '6yw6']}

In [52]:
complex_i_found_pdbs

['6YW7', '6YW6', '6UHC']

In [91]:
model = structure[0]
chain = model['A']

# For each residue1 in chain A, compare its CA atom with the CA atom of a
# residue2 from the same chain and print the pair when they are < 6 apart.
# The `break` below sits at the inner loop's body level, so normally only the
# first residue2 of the chain is examined for each residue1.
for residue1 in chain:
    for residue2 in chain:
        if residue1 != residue2:
            # compute distance between CA atoms
            try:
                distance = residue1['CA'] - residue2['CA']
            except KeyError:
                ## no CA atom, e.g. for H_NAG
                ## `continue` skips the `break` below, so the inner loop
                ## moves on to the next residue2 in this case
                continue
            if distance < 6:
                print(residue1, residue2, distance)
        # stop the inner loop after the first residue2
        break

<Residue ARG het=  resseq=4 icode= > <Residue GLY het=  resseq=3 icode= > 3.7895255
<Residue LEU het=  resseq=5 icode= > <Residue GLY het=  resseq=3 icode= > 5.899156


In [47]:
# Fetch the entry metadata for PDB ID 6YW7 and list its top-level keys
all_info = get_info('6YW7')
print(list(all_info.keys()))

['audit_author', 'citation', 'em3d_fitting', 'em3d_reconstruction', 'em_ctf_correction', 'em_entity_assembly', 'em_experiment', 'em_image_recording', 'em_imaging', 'em_single_particle_entity', 'em_software', 'em_specimen', 'em_vitrification', 'entry', 'exptl', 'pdbx_audit_revision_category', 'pdbx_audit_revision_details', 'pdbx_audit_revision_group', 'pdbx_audit_revision_history', 'pdbx_audit_revision_item', 'pdbx_audit_support', 'pdbx_database_related', 'pdbx_database_status', 'pdbx_vrpt_summary', 'rcsb_accession_info', 'rcsb_entry_container_identifiers', 'rcsb_entry_info', 'rcsb_primary_citation', 'struct', 'struct_keywords', 'rcsb_id', 'rcsb_external_references']


In [48]:
all_info['pdbx_database_related']

[{'content_type': 'associated EM volume',
  'db_id': 'EMD-10960',
  'db_name': 'EMDB',
  'details': 'Cryo-EM structure of the ARP2/3 1A5C isoform complex.'}]

In [49]:
all_info['rcsb_entry_info']['polymer_entity_count_protein']

7

In [50]:
all_info['rcsb_entry_info']['resolution_combined'][0]

4.5

In [103]:
all_info['rcsb_accession_info']['revision_date']

'2020-07-01T00:00:00+0000'

In [86]:
# Fetch the cif file for 6UHC as a string and preview its first 400 characters
pdb_file = get_pdb_file('6UHC', filetype='cif', compression=False)
print(pdb_file[:400])

Sending GET request to https://files.rcsb.org/download/6UHC.cif to fetch 6UHC's cif file as a string.
data_6UHC
# 
_entry.id   6UHC 
# 
_audit_conform.dict_name       mmcif_pdbx.dic 
_audit_conform.dict_version    5.328 
_audit_conform.dict_location   http://mmcif.pdb.org/dictionaries/ascii/mmcif_pdbx.dic 
# 
loop_
_database_2.database_id 
_database_2.database_code 
PDB   6UHC         
WWPDB D_1000244501 
EMDB  EMD-20770    
# 
_pdbx_database_related.db_name        EMDB 
_pdbx_database_related.det


In [88]:
import requests
import re

# Root URL of the PDBe site
base_url = "https://www.ebi.ac.uk/pdbe/"

# Root URL of the PDBe API, built from base_url
api_base = base_url + "api/"

# URL prefix for the UniProt mappings call; make_request() appends the PDB id for GET requests
uniprot_mapping_url = api_base + 'mappings/uniprot/'

In [89]:
def make_request(url, mode, pdb_id, timeout=None):
    """
    This function can make GET and POST requests to
    the PDBe API

    For "get" the PDB id is appended to the URL; for "post" it is sent
    as the request body. A non-200 response is reported with print().

    :param url: String, URL (prefix) of the API call
    :param mode: String, either "get" or "post"
    :param pdb_id: String
    :param timeout: optional timeout passed through to requests;
        the default None keeps the previous behaviour (no timeout)
    :return: JSON or None (None when the status code is not 200)
    :raises ValueError: if mode is neither "get" nor "post"
    """
    if mode == "get":
        response = requests.get(url=url+pdb_id, timeout=timeout)
    elif mode == "post":
        response = requests.post(url, data=pdb_id, timeout=timeout)
    else:
        # fail with a clear message instead of an UnboundLocalError on `response` below
        raise ValueError('mode must be "get" or "post", got %r' % (mode,))

    if response.status_code == 200:
        return response.json()

    print("[No data retrieved - %s] %s" % (response.status_code, response.text))
    return None

In [90]:
def get_mappings_data(pdb_id):
    """
    This function will GET the mappings data from
    the PDBe API using the make_request() function

    :param pdb_id: String, four-character PDB id
    :return: JSON, or None if the PDB id is invalid or no data was found
    """
    # Check if the provided PDB id is valid
    # There is no point in making an API call
    # with bad PDB ids
    # A PDB id is a digit followed by three alphanumeric characters; fullmatch
    # rejects trailing characters, and a digit is allowed in the second
    # position so ids such as "101m" are accepted.
    if not isinstance(pdb_id, str) or not re.fullmatch(r"[0-9][A-Za-z0-9]{3}", pdb_id):
        print("Invalid PDB id")
        return None

    # GET the mappings data
    mappings_data = make_request(uniprot_mapping_url, "get", pdb_id)

    # Check if there is data
    if not mappings_data:
        print("No data found")
        return None

    return mappings_data

def list_uniprot_pdb_mappings(pdb_id):
    """
    This function prints the PDB > UniProt mappings of a PDB entry,
    one line per mapped chain segment, using the mappings data
    obtained from the get_mappings_data() function

    :param pdb_id: String,
    :return: None
    """

    # Getting the mappings data
    mappings_data = get_mappings_data(pdb_id)

    # If there is no data, return None
    if not mappings_data:
        return None

    # Find the entry for this PDB id ignoring case, so that an id given in a
    # different case than the key of the returned data (e.g. '6UHC' vs '6uhc')
    # does not raise a KeyError
    pdb_entry = next(
        (entry for key, entry in mappings_data.items() if key.lower() == pdb_id.lower()),
        None,
    )
    if pdb_entry is None:
        return None

    # An entry without a "UniProt" section simply prints nothing
    uniprot = pdb_entry.get("UniProt", {})
    for uniprot_id, uniprot_record in uniprot.items():

        for mapping in uniprot_record["mappings"]:
            print("entity %i in chain %s is indexed from %i to %i in PDB, and from %i to %i in UniProt %s" % (
                mapping["entity_id"],
                mapping["chain_id"],
                mapping["start"]["residue_number"],
                mapping["end"]["residue_number"],
                mapping["unp_start"],
                mapping["unp_end"],
                uniprot_id
                )
            )

    return None

In [92]:
list_uniprot_pdb_mappings('6uhc')

entity 6 in chain F is indexed from 1 to 168 in PDB, and from 1 to 168 in UniProt P59998
entity 1 in chain A is indexed from 1 to 418 in PDB, and from 1 to 418 in UniProt P61158
entity 2 in chain B is indexed from 1 to 394 in PDB, and from 1 to 394 in UniProt P61160
entity 7 in chain G is indexed from 1 to 151 in PDB, and from 1 to 151 in UniProt O15511
entity 3 in chain C is indexed from 1 to 372 in PDB, and from 1 to 372 in UniProt O15143
entity 4 in chain D is indexed from 1 to 300 in PDB, and from 1 to 300 in UniProt O15144
entity 5 in chain E is indexed from 1 to 178 in PDB, and from 1 to 178 in UniProt O15145
entity 8 in chain H is indexed from 1 to 501 in PDB, and from 1 to 501 in UniProt Q91YD9
entity 8 in chain I is indexed from 1 to 501 in PDB, and from 1 to 501 in UniProt Q91YD9


In [93]:
mappings_data = get_mappings_data('6uhc')

In [94]:
mappings_data

{'6uhc': {'UniProt': {'P59998': {'identifier': 'ARPC4_HUMAN',
    'name': 'ARPC4_HUMAN',
    'mappings': [{'entity_id': 6,
      'end': {'author_residue_number': 168,
       'author_insertion_code': '',
       'residue_number': 168},
      'chain_id': 'F',
      'start': {'author_residue_number': None,
       'author_insertion_code': '',
       'residue_number': 1},
      'unp_end': 168,
      'unp_start': 1,
      'struct_asym_id': 'F'}]},
   'P61158': {'identifier': 'ARP3_HUMAN',
    'name': 'ARP3_HUMAN',
    'mappings': [{'entity_id': 1,
      'end': {'author_residue_number': None,
       'author_insertion_code': '',
       'residue_number': 418},
      'chain_id': 'A',
      'start': {'author_residue_number': None,
       'author_insertion_code': '',
       'residue_number': 1},
      'unp_end': 418,
      'unp_start': 1,
      'struct_asym_id': 'A'}]},
   'P61160': {'identifier': 'ARP2_HUMAN',
    'name': 'ARP2_HUMAN',
    'mappings': [{'entity_id': 2,
      'end': {'author_residu

## Viewing sandbox

In [3]:
from Bio.PDB import *
import nglview as nv
import ipywidgets

# Directory that downloaded structure files are written to.
# NOTE(review): hard-coded absolute path; adjust it when running elsewhere.
protein_structure_dir = '/n/data1/hms/ccb/lab/projects/bioplex/BioPlexPy/protein_function_testing'
# PDB entry that is downloaded and parsed in the next cell
PDB_id = '6PGR'



In [4]:
# download structure from PDB into protein_structure_dir
pdbl = PDBList()
PBD_file_path = pdbl.retrieve_pdb_file(PDB_id, pdir=protein_structure_dir, file_format='pdb', overwrite=True)

# create a structure object by parsing the downloaded file
parser = PDBParser()
structure = parser.get_structure(PDB_id, PBD_file_path)

model = structure[0]
chain = model['A']

# For each residue1 in chain A, compare its CA atom with the CA atom of a
# residue2 from the same chain and print the pair when they are < 6 apart.
# The `break` below sits at the inner loop's body level, so normally only the
# first residue2 of the chain is examined for each residue1.
for residue1 in chain:
    for residue2 in chain:
        if residue1 != residue2:
            # compute distance between CA atoms
            try:
                distance = residue1['CA'] - residue2['CA']
            except KeyError:
                ## no CA atom, e.g. for H_NAG
                ## `continue` skips the `break` below, so the inner loop
                ## moves on to the next residue2 in this case
                continue
            if distance < 6:
                print(residue1, residue2, distance)
        # stop the inner loop after the first residue2
        break

Downloading PDB structure '6PGR'...
<Residue LEU het=  resseq=14 icode= > <Residue ALA het=  resseq=13 icode= > 3.8256369
<Residue ALA het=  resseq=15 icode= > <Residue ALA het=  resseq=13 icode= > 5.2555428
<Residue ALA het=  resseq=16 icode= > <Residue ALA het=  resseq=13 icode= > 4.958143




In [5]:
PBD_file_path 

'/n/data1/hms/ccb/lab/projects/bioplex/BioPlexPy/protein_function_testing/pdb6pgr.ent'

In [6]:
view = nv.show_biopython(structure)
view

NGLWidget()

In [7]:
view = nv.show_pdbid("3pqr")  # load "3pqr" from RCSB PDB and display viewer widget
view

NGLWidget()

In [8]:
# Ask the widget to resize to 500px x 500px.
# NOTE(review): _remote_call is underscore-prefixed, i.e. not part of the
# widget's public interface -- confirm it is still available after upgrades.
view._remote_call("setSize", target="Widget", args=["500px", "500px"])
# Center and zoom molecule
view.center_view()



In [9]:
view

NGLWidget()

In [14]:
view.download_image()

In [24]:
import ipywidgets 
ipywidgets.Text("hello") 

Text(value='hello')

In [25]:
nv.demo()

NGLWidget()