In [1]:
%%autosave 60

In [41]:
# manage JSON format 
# and data analysis
import json as js
import numpy as np
import pandas as pd

# manage XML objects
import xml.etree.ElementTree as ET

# libraries for 
# handling URLs
import requests
from urllib.error import URLError
from urllib.request import Request, urlopen, urlretrieve

# import functions 
# from jupyter notebooks
import import_ipynb
from REST_functions import *

# process-based “threading” interface
from multiprocessing import Pool, Process

Identifiers
------
Execute an advanced search query against [RCSB-PDB](https://www.rcsb.org). `XML` query examples can be found [here](https://www.rcsb.org/pages/webservices/rest-search).

In [3]:
# RCSB-PDB REST search endpoint
api_url = 'https://www.rcsb.org/pdb/rest/search'

# XML query: retrieve all PDB IDs whose records match 'transient receptor potential'.
# .strip() removes the newline that follows the opening triple quote, so the
# XML declaration is the very first thing in the request body.
query_text = """
<?xml version="1.0" encoding="UTF-8"?>
<orgPdbQuery>
<queryType>org.pdb.query.simple.AdvancedKeywordQuery</queryType>
<description>Text Search for: "transient receptor potential"</description>
<keywords>transient receptor potential</keywords>
</orgPdbQuery>
""".strip()

# header for server
header = {'Content-Type': 'application/x-www-form-urlencoded'}

# POST the XML representation of the advanced search; the timeout keeps the
# cell from hanging indefinitely if the server stops responding
response = requests.post(api_url, data=query_text, headers=header, timeout=60)

# check & print response: on success the body is split on whitespace into
# the list of matching PDB IDs
if response.status_code == 200:
    pdb_list = response.text.split()
    print("Found %d PDB entries matching query." % len(pdb_list))
    print("Matches: \n%s" % ",".join(pdb_list))
else:
    # NOTE: pdb_list is not (re)defined on this path, so the cells below
    # cannot run meaningfully after a failed search
    print("Failed to retrieve results")

Found 127 PDB entries matching query.
Matches: 
1IA9,1IAH,1IAJ,2ETA,2ETB,2ETC,2F37,2NYJ,2PNN,2RFA,3E7K,3HRN,3HRO,3J5P,3J5Q,3J5R,3J9J,3J9P,3SUI,3TE3,3VVI,4DX1,4DX2,4GIF,4N5Q,5HI9,5IRX,5IRZ,5IS0,5IWK,5IWP,5IWR,5IWT,5K47,5MKE,5MKF,5OEO,5T4D,5W3S,5WO6,5WO7,5WO8,5WO9,5WOA,5WP6,5YDZ,5YE1,5YE2,5YE5,5YX9,5Z96,5ZBG,5ZX5,6A70,6AYF,6AYG,6B5V,6BBJ,6BCJ,6BCL,6BCO,6BCQ,6BO4,6BO5,6BO8,6BO9,6BOA,6BOB,6BPQ,6BQR,6BQV,6BWD,6BWF,6BWI,6BWJ,6BWM,6C8F,6C8G,6C8H,6CUD,6CV9,6D1W,6D73,6D7L,6D7O,6D7P,6D7Q,6D7S,6D7T,6D7V,6D7X,6DJR,6DJS,6DMR,6DMU,6DMW,6DRJ,6DRK,6DVW,6DVY,6DVZ,6E2F,6E2G,6E7P,6E7Y,6E7Z,6F55,6G1K,6HRR,6HRS,6MHO,6MHS,6MHV,6MHW,6MHX,6MIX,6MIZ,6MJ2,6NR2,6NR3,6NR4,6O1N,6O1P,6O1U,6O20,6OT2,6OT5


Molecules in the entry
------
Execute an advanced search query against [PDBe](http://www.ebi.ac.uk/pdbe) using `REST` calls based on PDB entry data. Calls to [PDBe-API-URL/molecules](http://www.ebi.ac.uk/pdbe/api/pdb/entry/molecules/) provide details of the molecules modelled in the entry, such as entity id, description, type, number of copies in the entry, source organism(s), etc. Details are on the PDBe `REST` API [doc](http://www.ebi.ac.uk/pdbe/api/doc/) page.

In [4]:
# Per-entry records keyed by upper-case "PDB ID"; each value is a dict that
# the processing cells fill in step by step, e.g.:
# 6D7V: { molecule: Transient receptor potential cation channel subfamily V member 6,
#         gene_name: Trpv6,
#         assembly: homo-tetramer,
#         method: x-ray,
#         resolution: 4.3,
#         url: http://www.ebi.ac.uk/pdbe/coordinates/files/6d7v.ccp4}
dataset = dict()

In [5]:
# PDBe endpoint for the molecules modelled in each entry
api_url = """http://www.ebi.ac.uk/pdbe/api/pdb/entry/molecules/"""

# request body: comma-separated list of PDB IDs
query_text = ",".join(pdb_list)

# POST the ID list (a plain comma-separated string, not XML); the timeout
# keeps the cell from hanging indefinitely if the server stops responding
description = requests.post(api_url, data=query_text, timeout=60)

In [6]:
# process the molecules response: store the first molecule name and the first
# gene name found for every entry in `dataset`
if description.status_code == 200:
    # create a JSON object from response
    pdb_molecules = js.loads(description.content.decode('utf-8'))

    # number of PDB matches
    print("Matches: \n%s of %s\n" % (len(pdb_molecules), len(pdb_list)))
    print("Summary:")

    # read summary sections
    for k, v in pdb_molecules.items():
        # macromolecule gene & molecule name
        macromolecules_gene_names = find_json(v, "gene_name")
        macromolecules_mole_names = find_json(v, "molecule_name")

        # fill dataset; setdefault creates the record on first use, so an
        # entry with a gene name but no molecule name no longer raises
        # KeyError, and every entry gets a record to extend later
        record = dataset.setdefault(k.upper(), {})
        if len(macromolecules_mole_names) > 0:
            record['molecule'] = macromolecules_mole_names[0]
        if len(macromolecules_gene_names) > 0:
            record['gene_name'] = macromolecules_gene_names[0]

        # example results ("None" when no gene name was found)
        if len(macromolecules_gene_names) > 0:
            gene_label = macromolecules_gene_names[0]
        else:
            gene_label = "None"
        print('{0}: {1}'.format(k, gene_label), end=" ")

else:
    print("Failed to retrieve results")

Matches: 
127 of 127

Summary:
6dvy: Trpv3 6dvz: Trpv3 6dvw: Trpv3 5wp6: LTRPC4 6d7l: TRP3 6mix: EREG1 6miz: EREG1 6ayg: MCOLN3 5wo7: Trpv6 5wo6: Trpv6 5wo9: Trpv6 5wo8: Trpv6 5ydz: Mcoln1 5iwr: Trpv6 6o20: Ecac1 5iwt: Trpv6 5iwk: Trpv6 6dmw: Ecac1 5woa: Trpv6 6c8g: trpv4 5z96: Trpc4 6d7x: Trpv6 5oeo: CALM 6cud: TRP3 4dx1: TRPV4 4dx2: TRPV4 6bwf: Chak 6bwi: LTRPC4 6bwj: TRPV2 3sui: CALM3 5irz: Trpv1 6ot2: TRPV3 1ia9: Chak 6e7z: MCOLN1 6d1w: PKD2 6f55: PACSIN3 6cv9: Trp6 6d7p: Trpv6 6o1n: Ecac1 6bob: Trpv6 6d7v: Trpv6 5ye1: Mcoln1 6ayf: MCOLN3 6o1u: Ecac1 5ye5: Mcoln1 6o1p: Ecac1 5zx5: Chak 1iah: Chak 1iaj: Chak 5zbg: TRP3 3hro: PKD2 3hrn: PKD2 5k47: PKD2 3j9j: Trpv1 5yx9: TRP6 3j9p: ANKTM1 6e2g: Trpv6 6e2f: ECAC2 2f37: TRPV2 2etc: Sac2b 2etb: Sac2b 2eta: Sac2b 6mho: TRPV3 4gif: PKD2L 6bo9: ECAC2 6bo8: ECAC2 6mhx: TRPV3 6mhs: TRPV3 6mhw: TRPV3 6mhv: TRPV3 6d7o: Trpv6 5mke: PKD2 5is0: Trpv1 6bbj: trpv4 6bwd: Chak 6dmr: Ecac1 6dmu: Ecac1 6drj: trpm2 6drk: trpm2 6e7p: MCOLN1 6c8f: trpv4 2n

Summary
------
Calls to [PDBe-API-URL/summary](http://www.ebi.ac.uk/pdbe/api/pdb/entry/summary/) provide a summary of the properties of a PDB entry, such as the title of the entry, experimental method, assemblies, etc. Details are on the PDBe `REST` API [doc](http://www.ebi.ac.uk/pdbe/api/doc/) page.

In [7]:
# PDBe endpoint for the summary of each entry
api_url = """http://www.ebi.ac.uk/pdbe/api/pdb/entry/summary/"""

# request body: comma-separated list of PDB IDs
query_text = ",".join(pdb_list)

# POST the ID list (a plain comma-separated string, not XML); the timeout
# keeps the cell from hanging indefinitely if the server stops responding
description = requests.post(api_url, data=query_text, timeout=60)

In [8]:
# process the summary response: store the assembly description and the
# experimental method class of every entry in `dataset`
if description.status_code == 200:
    # create a JSON object from response
    pdb_summary = js.loads(description.content.decode('utf-8'))

    # number of PDB matches
    print("Matches: \n%s of %s\n" % (len(pdb_summary), len(pdb_list)))
    print("Summary:")

    # read summary sections
    for k, v in pdb_summary.items():
        # Assembly composition / multimeric state, e.g. "homo-tetramer".
        # The lookup is done once and reused for both fields
        # (assumes find_json returns the same result for the same arguments).
        structure_assembly = find_json(v, "assemblies")[0]
        structure_assemblies_form = structure_assembly['form']
        structure_assemblies_name = structure_assembly['name']
        multimeric_assembly_state = (structure_assemblies_form + "-" +
                                     structure_assemblies_name)
        # experimental method info
        experimental_method_class = find_json(v, "experimental_method_class")

        ## test experimental method class >
        ## should be one of these [x-ray, em, nmr]
        if experimental_method_class[0] not in ['x-ray', 'em', 'nmr']:
            print('unusual experimental method class:\n%s'
                  % experimental_method_class[0])

        # fill dataset; setdefault avoids a KeyError for entries that have
        # no record yet
        dataset.setdefault(k.upper(), {}).update({
            'assembly': multimeric_assembly_state,
            'method': experimental_method_class[0],
        })

        # example results
        print('{0}: {1}'.format(k, experimental_method_class[0]), end =" ")

else:
    print("Failed to retrieve results")

Matches: 
127 of 127

Summary:
6d7v: x-ray 6dvy: em 6dvz: em 6dvw: em 5wp6: em 6d7l: em 6mix: em 5irx: em 6ayg: em 5wo7: x-ray 5wo6: x-ray 5wo9: x-ray 5wo8: x-ray 5ydz: em 5iwr: x-ray 5iwp: x-ray 5iwt: x-ray 5iwk: x-ray 6drj: em 5woa: x-ray 6e7p: em 5z96: em 6d7x: x-ray 5oeo: nmr 6cud: em 4dx1: x-ray 4dx2: x-ray 6bwf: em 6bwi: em 6bwj: x-ray 6bwm: x-ray 5irz: em 6ot2: em 1ia9: x-ray 6e7z: em 6d1w: em 6f55: nmr 6cv9: em 6e7y: em 6boa: em 6bob: em 6c8h: x-ray 5ye1: em 6ayf: em 5ye2: em 5ye5: em 6o1p: em 6nr2: em 6d7t: em 1iah: x-ray 1iaj: x-ray 5zbg: em 3hro: x-ray 3hrn: x-ray 5k47: em 3j9j: em 6mj2: em 3j9p: em 6e2g: em 6e2f: em 2f37: x-ray 2etc: x-ray 2etb: x-ray 2eta: x-ray 6mho: em 4gif: x-ray 6bo9: em 6bo8: em 6mhx: em 6mhs: em 6bo5: em 6bo4: em 6d7o: x-ray 5mke: em 5is0: em 6bbj: em 6dmr: em 6dmu: em 6dmw: em 6drk: em 6c8g: x-ray 6c8f: x-ray 6d7s: em 5hi9: em 6d7q: x-ray 6d7p: x-ray 2nyj: x-ray 6bwd: em 6bqv: em 5yx9: em 6bqr: em 6miz: em 6ot5: em 2rfa: x-ray 3sui: x-ray 6g1k: em 6

Experiments
----
Execute an advanced search query against [PDBe](http://www.ebi.ac.uk/pdbe) using `REST` calls based on PDB entry data. Calls to [PDBe-API-URL/experiment](http://www.ebi.ac.uk/pdbe/api/pdb/entry/experiment/) provide details of the experiment(s) carried out in determining the structure of the entry. Each experiment is described in a separate dictionary. For **X-ray diffraction**, the description consists of resolution, spacegroup, cell dimensions, R and Rfree, refinement program, etc. For **NMR**, details of spectrometer, sample, spectra, refinement, etc. are included. For **EM**, details of specimen, imaging, acquisition, reconstruction, fitting, etc. are included. Details are on the PDBe `REST` API [doc](http://www.ebi.ac.uk/pdbe/api/doc/) page.

In [9]:
# PDBe endpoint for the experiment(s) behind each entry
api_url = """http://www.ebi.ac.uk/pdbe/api/pdb/entry/experiment/"""

# request body: comma-separated list of PDB IDs
query_text = ",".join(pdb_list)

# POST the ID list (a plain comma-separated string, not XML); the timeout
# keeps the cell from hanging indefinitely if the server stops responding
description = requests.post(api_url, data=query_text, timeout=60)

In [10]:
# process the experiment response: store the resolution and the data URL of
# every entry in `dataset`
if description.status_code == 200:
    # create a JSON object from response
    exp_summary = js.loads(description.content.decode('utf-8'))

    # number of PDB matches
    print("Matches: \n%s of %s\n" % (len(exp_summary), len(pdb_list)))

    # read method sections
    for k, v in exp_summary.items():
        # experimental resolution and data file link(s)
        experimental_resolution = find_json(v, "resolution")
        experimental_datasets_url = find_urls(v, k)

        ## check experimental resolution: entries without a numeric value
        ## fall back to the placeholder 0.0
        if (len(experimental_resolution) == 0
                or not isinstance(experimental_resolution[0], (int, float))):
            experimental_resolution = [0.0]

        ## check if working url link was generated
        check_urls(experimental_datasets_url[0])

        # fill dataset; setdefault avoids a KeyError for entries that have
        # no record yet
        # TODO: cross-check the experimental method class of the entry
        dataset.setdefault(k.upper(), {}).update({
            'resolution': experimental_resolution[0],
            'url': experimental_datasets_url[0],
        })

        # results
        print('{0}: {1}'.format(k, experimental_resolution[0]), end =" ")

else:
    print("Failed to retrieve results")

Matches: 
127 of 127

6d7v: 4.3 6dvy: 4 6dvz: 4.24 6dvw: 4.3 5wp6: 3.8 5is0: 3.43 6mix: 3.6 6miz: 6.1 5ye2: 5.8 5wo7: 3.246 5wo6: 3.31 5wo9: 3.7 5wo8: 3.4 5ydz: 5.8 5iwr: 3.85 6o20: 3.3 5iwt: 3.8 5iwk: 3.247 6dmw: 4.4 5woa: 3.899 6c8g: 6.31 5z96: 3.28 6d7x: 3.6 5oeo: 0.0 6cud: 3.3 4dx1: 2.85 4dx2: 2.95 6bwf: 4.1 6bwi: 3.7 6bwj: 3.1 5irx: 2.95 5irz: 3.28 6ot2: 4.1 1ia9: 2 6d7s: 4.34 6d1w: 3.54 6f55: 0.0 6cv9: 3.8 6d7p: 3.37 6boa: 4.2 6bob: 3.9 2nyj: 3.2 5ye1: 5.8 6ayf: 3.62 6o1u: 2.8 5ye5: 5.8 6o1p: 3 6nr2: 4 6d7t: 4.44 1iah: 2.4 1iaj: 2.8 5zbg: 4.36 6ayg: 4.65 3hro: 1.9 3hrn: 1.9 5k47: 4.2 3j9j: 3.275 5yx9: 3.8 3j9p: 4.24 6e2g: 3.6 6e2f: 3.9 2f37: 1.7 2etc: 3.1 2etb: 1.65 2eta: 2.2 6mho: 3.4 4gif: 2.8 6bo9: 4 6bo8: 3.6 6mhx: 4 6mhs: 3.2 6bo5: 3.6 6mhv: 3.5 6d7o: 3.45 5mke: 4.3 6d7l: 4 6bbj: 3.8 6dmr: 3.9 6dmu: 4 6drj: 3.3 6drk: 3.8 6e7p: 3.5 6c8f: 6.5 6e7z: 3.73 5hi9: 4.4 6d7q: 3.497 6e7y: 3.57 6c8h: 6.5 6bwd: 3.7 6bqv: 3.1 6mj2: 6.36 6bqr: 3.2 6ot5: 3.6 2rfa: 1.7 3sui: 1.95 6g1k: 3.6 

Resume
---------
Collect the dataset in a `pandas` data frame, ready to be saved to a `csv` file.

In [11]:
# show dataset_summary as dataframe
data = pd.DataFrame.from_dict(dataset, orient='index').reset_index()
# changing index cols with rename() 
data.rename(columns = {'index':'PDB'}, inplace = True)
# show dataframe
data.head()

Unnamed: 0,PDB,molecule,gene_name,assembly,method,resolution,url
0,1IA9,Transient receptor potential cation channel su...,Chak,homo-dimer,x-ray,2.0,http://www.ebi.ac.uk/pdbe/coordinates/files/1i...
1,1IAH,Transient receptor potential cation channel su...,Chak,homo-dimer,x-ray,2.4,http://www.ebi.ac.uk/pdbe/coordinates/files/1i...
2,1IAJ,Transient receptor potential cation channel su...,Chak,homo-dimer,x-ray,2.8,http://www.ebi.ac.uk/pdbe/coordinates/files/1i...
3,2ETA,Transient receptor potential cation channel su...,Sac2b,homo-monomer,x-ray,2.2,http://www.ebi.ac.uk/pdbe/coordinates/files/2e...
4,2ETB,Transient receptor potential cation channel su...,Sac2b,homo-monomer,x-ray,1.65,http://www.ebi.ac.uk/pdbe/coordinates/files/2e...


Download
--------
Download the files listed in the `url` column of the data frame, using a pool of worker processes.

In [62]:
%%time
# download link
def wget_urls(url):
    """Download http/ftp links
       save files on current path
       
       Keyword arguments:
       urls -- web link http/ftp
    """
    name = url.split("/")[-1]
    # should work well with ftp links
    try:
        urlretrieve(url, name)
        print("Downloaded %s" % url)
    except:
        raise ValueError('Error downloading %s' % url) 

pool = Pool(processes=10)
urls = data['url'][1:20].tolist()

result = [pool.apply_async(wget_urls, args=(link,)) for link in urls]
output = [p.get() for p in result]

pool.terminate()
pool.join()

print(output)

Downloaded http://www.ebi.ac.uk/pdbe/coordinates/files/2nyj.ccp4
Downloaded http://www.ebi.ac.uk/pdbe/coordinates/files/2pnn.ccp4
Downloaded http://www.ebi.ac.uk/pdbe/coordinates/files/2etc.ccp4
Downloaded http://www.ebi.ac.uk/pdbe/coordinates/files/1iaj.ccp4
Downloaded http://www.ebi.ac.uk/pdbe/coordinates/files/3hro.ccp4
Downloaded http://www.ebi.ac.uk/pdbe/coordinates/files/3hrn.ccp4
Downloaded http://www.ebi.ac.uk/pdbe/coordinates/files/3e7k.ccp4
Downloaded http://www.ebi.ac.uk/pdbe/coordinates/files/2rfa.ccp4
Downloaded http://www.ebi.ac.uk/pdbe/coordinates/files/1iah.ccp4
Downloaded http://www.ebi.ac.uk/pdbe/coordinates/files/2eta.ccp4
Downloaded http://www.ebi.ac.uk/pdbe/coordinates/files/2etb.ccp4
Downloaded http://www.ebi.ac.uk/pdbe/coordinates/files/3te3.ccp4
Downloaded http://www.ebi.ac.uk/pdbe/coordinates/files/3sui.ccp4
Downloaded http://www.ebi.ac.uk/pdbe/coordinates/files/2f37.ccp4


ValueError: Error downloading ftp://ftp.ebi.ac.uk/pub/databases/emdb/structures/EMD-5778/map/emd_5778.map.gz

In [22]:
%%bash 
# list the working directory in long format, human-readable sizes, newest first
ls -lht

total 6650768
-rw-r--r--  1 TiNoel  staff   3.3M Jul  1 15:33 1iaj.ccp4
-rw-r--r--  1 TiNoel  staff   5.5M Jul  1 15:33 1iah.ccp4
-rw-r--r--  1 TiNoel  staff    44K Jul  1 15:32 PDBe_GET_maps.ipynb
-rw-r--r--  1 TiNoel  staff   6.0M Jul  1 15:32 emd_9133.map.gz
-rw-r--r--  1 TiNoel  staff     0B Jul  1 15:32 6hrs.ccp4
-rw-r--r--  1 TiNoel  staff   8.0K Jul  1 15:32 6hrr.ccp4
-rw-r--r--  1 TiNoel  staff   8.0K Jul  1 15:32 6d7x.ccp4
-rw-r--r--  1 TiNoel  staff   8.0K Jul  1 15:32 6d7v.ccp4
-rw-r--r--  1 TiNoel  staff   2.0M Jul  1 15:32 6d7p.ccp4
-rw-r--r--  1 TiNoel  staff   8.0K Jul  1 15:32 6c8g.ccp4
-rw-r--r--  1 TiNoel  staff   8.0K Jul  1 15:32 6bwj.ccp4
-rw-r--r--  1 TiNoel  staff   1.6M Jul  1 15:32 emd_7019.map.gz
-rw-r--r--  1 TiNoel  staff    18M Jul  1 15:32 emd_6991.map.gz
-rw-r--r--  1 TiNoel  staff   8.0K Jul  1 15:31 5wo8.ccp4
-rw-r--r--  1 TiNoel  staff   1.4M Jul  1 15:31 5iwr.ccp4
-rw-r--r--  1 TiNoel  staff   1.8M Jul  1 15:31 5iwp.ccp4
-rw-r--r--  1 TiNoel  staff   