# Generate EMDB-IDs list

In [1]:
import ftplib
from ftplib import FTP

servername = "files.rcsb.org"
subdir = "pub/emdb/structures"

# Start from an empty listing so that cells reading `entries_raw` still run
# when the server reports that the directory holds no files.
entries_raw = []

# The context manager closes the control connection once the listing is done.
with FTP(servername) as ftp:
    # NOTE(review): these look like placeholder credentials -- presumably the
    # server accepts any login; confirm, and never put real secrets here.
    ftp.login(user='random_user',passwd='random_password')

    try:
        entries_raw = ftp.nlst(subdir)

    # `ftplib` itself must be imported for this name to resolve; with only
    # `from ftplib import FTP` the handler raised NameError instead.
    except ftplib.error_perm as resp:
        if str(resp) == "550 No files found":
            print("No files in this directory")
        else:
            raise

In [2]:
# Reduce every listed path to its last component and keep only the names
# that contain no dot.
entries_cleaned = [
    entry_name
    for entry_name in (raw_path.rsplit('/', 1)[-1] for raw_path in entries_raw)
    if '.' not in entry_name
]
emdb_entries = entries_cleaned

In [3]:
# Number of entry IDs kept after cleaning the FTP listing.
len(emdb_entries)

24507

# Get experimental annotations

In [4]:
import os
import numpy as np
import requests
import coloredlogs, logging
from time import process_time

In [5]:
# Configure logger
workdir = '/data/dragon000/sanjuan/oxford/cryodb' #os.path.dirname(trajectory_filepath)

coloredlogs.install(level='ERROR')
logging.basicConfig(level=logging.ERROR)

# logging.basicConfig() does nothing once the root logger already has a
# handler, so a second basicConfig(filename=...) call never creates the log
# file. Attach the file handler explicitly instead.
log_path = os.path.join(workdir, 'emdb_annotations.log')
root_logger = logging.getLogger()
already_attached = any(
    isinstance(handler, logging.FileHandler)
    and handler.baseFilename == os.path.abspath(log_path)
    for handler in root_logger.handlers
)
if not already_attached:  # avoid duplicate handlers when the cell is re-run
    file_handler = logging.FileHandler(log_path, mode='w')
    file_handler.setFormatter(
        logging.Formatter('%(name)s - %(levelname)s - %(message)s'))
    root_logger.addHandler(file_handler)

# Logger object for cells that call `logger.error(...)`.
logger = logging.getLogger(__name__)

In [7]:
class DownloadException(Exception):
    """Raised when an EMDB request does not return HTTP status 200."""


def extract_data(x):
    """Return the first specimen preparation of the first structure determination.

    `x` is the decoded JSON of an EMDB "experiment" response. KeyError,
    IndexError or TypeError propagate when the expected nesting is absent.
    """
    determination = x['structure_determination_list']['structure_determination'][0]
    return determination['specimen_preparation_list']['specimen_preparation'][0]


def retrieveInfoEMDB(emdb_id, url_prefix='https://www.ebi.ac.uk/emdb/api/entry/experiment/',
                     timeout=30):
    """Download the specimen-preparation annotations of one EMDB entry.

    Parameters
    ----------
    emdb_id : str
        Entry identifier appended to `url_prefix` (e.g. 'EMD-0001').
    url_prefix : str
        Base URL of the experiment endpoint.
    timeout : float
        Seconds to wait for the server; without it a stalled connection
        would block the whole retrieval loop indefinitely.

    Returns
    -------
    dict or None
        The annotations, or None when the response lacks the expected
        structure (the problem is logged).

    Raises
    ------
    DownloadException
        If the HTTP status code is not 200.
    """
    emdb_url = url_prefix + emdb_id
    respond = requests.get(url=emdb_url, timeout=timeout)

    if respond.status_code != 200:
        logging.error("request status %d" % (respond.status_code))
        raise DownloadException("Error, retrieveInfoFromEMDB request status %d" % (respond.status_code))

    data = respond.json()

    try:
        return extract_data(data)
    except (KeyError, IndexError, TypeError) as e:
        # Use the `logging` module directly (as above): no `logger` object is
        # defined by this cell, so `logger.error` raised NameError here.
        logging.error("%s: experimental annotations not available (%r)", emdb_id, e)
        return None

Iterate over index interval chunks

In [42]:
from operator import itemgetter

N = len(emdb_entries)
d = 100  # number of entries per chunk

# Walk the start offsets in steps of `d` so the final, shorter chunk is kept.
# (round(N/d) dropped it whenever N/d rounded down, e.g. 24507 -> 245 chunks.)
index_intervals = [range(start, min(start + d, N)) for start in range(0, N, d)]

exptl_annotations = {}
# Only the first 5 chunks are fetched -- presumably a trial run; slicing also
# avoids an IndexError when fewer than 5 chunks exist.
for interval in index_intervals[:5]:
    # Plain slicing always yields a sequence; itemgetter(*indices) returns a
    # bare element instead of a tuple when a chunk holds a single index.
    entries_interval = emdb_entries[interval.start:interval.stop]
    try:
        # Store each result as soon as it arrives so a failure part-way through
        # a chunk does not discard the entries already retrieved.
        for emdb_id in entries_interval:
            exptl_annotations[emdb_id] = retrieveInfoEMDB(emdb_id)

    except Exception as e:
        print(e)

In [71]:
# Display the annotations gathered so far, keyed by EMDB entry ID.
exptl_annotations

{'EMD-0001': {'preparation_id': 1,
  'concentration': {'units': 'mg/mL', 'valueOf_': '0.5'},
  'buffer': {'ph': 8.0,
   'component': [{'concentration': {'units': 'mM', 'valueOf_': '10.0'},
     'formula': 'Tris-HCL',
     'name': 'Tris hydrochloride',
     'instance_type': 'component'},
    {'concentration': {'units': 'mM', 'valueOf_': '150.0'},
     'formula': 'NaCL',
     'name': 'sodium chloride',
     'instance_type': 'component'},
    {'concentration': {'units': 'mM', 'valueOf_': '10.0'},
     'formula': 'MgCl2',
     'name': 'magnesium chloride',
     'instance_type': 'component'}]},
  'grid': {'model': 'Quantifoil R1.2/1.3', 'material': 'COPPER'},
  'vitrification': {'cryogen_name': 'ETHANE',
   'chamber_humidity': {'units': 'percentage', 'valueOf_': '95'},
   'chamber_temperature': {'units': 'K', 'valueOf_': '277'},
   'instrument': 'FEI VITROBOT MARK IV'},
  'instance_type': 'single_particle_preparation'},
 'EMD-0002': {'preparation_id': 1,
  'concentration': {'units': 'mg/mL'

Save data

In [14]:
import pickle

# Serialise the collected annotations into a pickle file in the current
# working directory.
with open('test_data.pickle', mode='wb') as pickle_file:
    pickle.dump(exptl_annotations, pickle_file)

<span style='color:red'><b>ISSUE</b></span>
* The connection dies after a certain number of requests (>300 entries)

```python
ConnectionError: HTTPSConnectionPool(host='www.ebi.ac.uk', port=443): Max retries exceeded with url: /emdb/api/entry/experiment/EMD-0348 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f9218e429d0>: Failed to establish a new connection: [Errno -2] Name or service not known'))
```

# Evaluate scope of data availability

# Code Snippets

<span style='color:red'>NEXT</span>
* Find a way to filter only experimental data per ID, test with 10 entries. 
* THEN, estimate how long and how much memory it will take to get these data.

In [None]:
@retry_decorator
def retrieveInfoFromEMDB(emdb_id):
    """Query EMDB for one entry and return metadata about its map.

    NOTE(review): `retry_decorator`, `EMDB_INFO_QUERY`, `logger`, `config`
    and `DownloadException` are not defined in this notebook -- presumably
    they come from the project this snippet was copied from.

    Parameters
    ----------
    emdb_id : str
        Entry identifier; substituted into `EMDB_INFO_QUERY` and used as the
        top-level key of the JSON response.

    Returns
    -------
    dict
        Keys: "voxel_size", "max_val_vox", "grid_shape", "coords_origin",
        "resolution", "pdbs", "axis_order", "date".

    Raises
    ------
    DownloadException
        If the HTTP status is not 200, no fitted PDB id is found, no
        resolution is reported, the resolution exceeds
        `config.WORSE_RESOLUTION_ALLOWED`, or the grid is not cubic.
    """
    # The body previously referred to `emdbId`, a name that does not exist in
    # this function (the parameter is `emdb_id`), so every call raised NameError.
    resp = requests.get(url=EMDB_INFO_QUERY % emdb_id)

    if resp.status_code != 200:
        logger.error("request status %d" % (resp.status_code))
        raise DownloadException("Error, retrieveInfoFromEMDB request status %d" % (resp.status_code))

    data = resp.json()[emdb_id][0]

    # Fitted PDB ids: first look in the deposition record, then fall back to
    # the fitting section of the experiment record.
    try:
        pdbs = data["deposition"]["fitted_pdb_id_list"]
        pdbs = [elem.lower() for elem in pdbs["pdb_id"]]
    except KeyError:
        try:
            pdbs = [elem["pdb_id_list"]["pdb_id"][0].lower() for elem in data["experiment"]["fitting"]]
        except KeyError:
            logger.error("%s has not associated pdb" % (emdb_id))
            raise DownloadException("Error, %s has not associated pdb" % (emdb_id))

    voxel_size = data["map"]["pixel_spacing"]["y"]["value"]
    max_val_vox = data["map"]["statistics"]["maximum"]
    grid_shape = [int(elem) for elem in data["map"]["dimensions"].values()]

    axis_order = data["map"]["axis_order"]
    axis_order = [axis_order["slow"], axis_order["medium"], axis_order["fast"]]
    coords_origin = data["map"]["origin"]
    coords_origin = [coords_origin["section"], coords_origin["row"],
                     coords_origin["column"]]  # order slow, medium, fast
    try:
        resolution = float(data["processing"]["reconstruction"]["resolution_by_author"])
    except KeyError:
        raise DownloadException("Error, no resolution available for %s " % (emdb_id))

    date = data["deposition"]["deposition_date"]
    info = {"voxel_size": voxel_size, "max_val_vox": max_val_vox, "grid_shape": grid_shape,
            "coords_origin": coords_origin,
            "resolution": resolution, "pdbs": pdbs, "axis_order": axis_order, "date": date}

    # Reject entries whose reported resolution value is above the configured limit.
    if resolution > config.WORSE_RESOLUTION_ALLOWED:
        msg = "Error downloading %s. Too bad resolution:\n" % (emdb_id,)
        logger.error(msg)
        raise DownloadException(msg)

    # More than one distinct dimension means the volume is not cubic.
    if len(set(grid_shape)) > 1:
        logger.error(f"Error downloading {emdb_id}: Just cubic volumes are considered")
        raise DownloadException("Error downloading %s. Just cubic volumes are considered:\n" % (emdb_id,))
    return info

In [48]:
# URL of the first entry in the list.
# NOTE(review): `prefix` is not defined in any visible cell -- presumably the
# experiment endpoint used as `url_prefix` in retrieveInfoEMDB; confirm.
myurl = prefix+emdb_entries[0]

In [49]:
# Fetch the URL built above and decode the JSON body for inspection.
resp = requests.get(url=myurl)
data = resp.json()

In [53]:
# Display the full decoded response to explore its structure.
data

{'emdb_id': 'EMD-0001',
 'structure_determination_list': {'structure_determination': [{'structure_determination_id': 1,
    'method': 'singleParticle',
    'aggregation_state': 'particle',
    'specimen_preparation_list': {'specimen_preparation': [{'preparation_id': 1,
       'concentration': {'units': 'mg/mL', 'valueOf_': '0.5'},
       'buffer': {'ph': 8.0,
        'component': [{'concentration': {'units': 'mM', 'valueOf_': '10.0'},
          'formula': 'Tris-HCL',
          'name': 'Tris hydrochloride',
          'instance_type': 'component'},
         {'concentration': {'units': 'mM', 'valueOf_': '150.0'},
          'formula': 'NaCL',
          'name': 'sodium chloride',
          'instance_type': 'component'},
         {'concentration': {'units': 'mM', 'valueOf_': '10.0'},
          'formula': 'MgCl2',
          'name': 'magnesium chloride',
          'instance_type': 'component'}]},
       'grid': {'model': 'Quantifoil R1.2/1.3', 'material': 'COPPER'},
       'vitrification': {'c

In [50]:
# List the fields available in the first structure-determination record.
data['structure_determination_list']['structure_determination'][0].keys()

dict_keys(['structure_determination_id', 'method', 'aggregation_state', 'specimen_preparation_list', 'microscopy_list', 'image_processing', 'instance_type'])

**NOTE** 

This doesn't work with EMDB! It produces no output.

```python
url = 'https://www.ebi.ac.uk/emdb/api/annotations/'
reqs = requests.get(url)
soup = BeautifulSoup(reqs.text, 'html.parser')
 
urls = []
for link in soup.find_all('a'):
    print(link.get('href'))
```    

**NOTES**

To gather all the data I need to compile a list of the currently available non-obsolete entries in EMDB
* Option 1: Take a for loop running serially up to a high number. IDs take a 4-digit numeric format only and seem to run serially, except for a few which I suspect are obsolete. 
* Option 2: Compile a list from here https://files.rcsb.org/pub/emdb/structures/ . Try `ftplib`?

Ruben's functions. Taking bits and pieces, but mostly ideas.

In [None]:
@retry_decorator
def retrieveInfoFromEMDB(emdbId):
    """Request the EMDB record of `emdbId` and summarise its map metadata.

    Returns a dict with the keys "voxel_size", "max_val_vox", "grid_shape",
    "coords_origin", "resolution", "pdbs", "axis_order" and "date".

    Raises DownloadException when the request status is not 200, when no
    fitted PDB id or no resolution can be found, when the resolution is above
    `config.WORSE_RESOLUTION_ALLOWED`, or when the grid is not cubic.
    """
    response = requests.get(url=EMDB_INFO_QUERY % emdbId)
    if response.status_code != 200:
        logger.error("request status %d" % (response.status_code))
        raise DownloadException("Error, retrieveInfoFromEMDB request status %d" % (response.status_code))

    entry = response.json()[emdbId][0]

    # PDB ids: deposition record first, experiment fitting section as fallback.
    try:
        fitted_list = entry["deposition"]["fitted_pdb_id_list"]
        pdbs = [pdb_id.lower() for pdb_id in fitted_list["pdb_id"]]
    except KeyError:
        try:
            pdbs = [fit["pdb_id_list"]["pdb_id"][0].lower() for fit in entry["experiment"]["fitting"]]
        except KeyError:
            logger.error("%s has not associated pdb" % (emdbId))
            raise DownloadException("Error, %s has not associated pdb" % (emdbId))

    # Map geometry and statistics.
    voxel_size = entry["map"]["pixel_spacing"]["y"]["value"]
    max_val_vox = entry["map"]["statistics"]["maximum"]
    grid_shape = [int(dim) for dim in entry["map"]["dimensions"].values()]

    order_record = entry["map"]["axis_order"]
    axis_order = [order_record["slow"], order_record["medium"], order_record["fast"]]

    # Origin listed in the order slow, medium, fast.
    origin_record = entry["map"]["origin"]
    coords_origin = [origin_record["section"], origin_record["row"], origin_record["column"]]

    try:
        resolution = float(entry["processing"]["reconstruction"]["resolution_by_author"])
    except KeyError:
        raise DownloadException("Error, no resolution available for %s " % (emdbId))

    date = entry["deposition"]["deposition_date"]
    info = {
        "voxel_size": voxel_size,
        "max_val_vox": max_val_vox,
        "grid_shape": grid_shape,
        "coords_origin": coords_origin,
        "resolution": resolution,
        "pdbs": pdbs,
        "axis_order": axis_order,
        "date": date,
    }

    if resolution > config.WORSE_RESOLUTION_ALLOWED:
        msg = "Error downloading %s. Too bad resolution:\n" % (emdbId,)
        logger.error(msg)
        raise DownloadException(msg)

    # Several distinct dimensions means the volume is not cubic.
    distinct_dims = set(grid_shape)
    if len(distinct_dims) > 1:
        logger.error(f"Error downloading {emdbId}: Just cubic volumes are considered")
        raise DownloadException("Error downloading %s. Just cubic volumes are considered:\n" % (emdbId,))

    return info

In [None]:
@retry_decorator
def downloadEMDB(emdbId, mapsOutDir):
    """Download and decompress the map of `emdbId` into `mapsOutDir`.

    The map is fetched with a `wget ... | zcat` shell pipeline into
    `<emdbId>.map.tmp` and renamed to `<emdbId>.map` on success. If the final
    file already exists, nothing is downloaded.

    NOTE(review): `EMDB_DONWLOAD_QUERY` and `TIMEOUT_TIME` are defined outside
    this notebook; the query presumably holds two `%s` placeholders (entry id
    and its numeric part) -- confirm.

    Returns
    -------
    str
        Path of the map file.

    Raises
    ------
    DownloadException
        If the pipeline wrote anything to stderr.
    ValueError
        If `emdbId` does not split into exactly two parts on "-".
    """
    outName = os.path.join(mapsOutDir, emdbId + '.map')
    logger.info("Trying to download emdb %s to %s" % (emdbId, outName))

    if os.path.isfile(outName):
        # Logger.warn is a deprecated alias of Logger.warning.
        logger.warning("Already downloaded emdbId %s ->%s " % (emdbId, outName))
        return outName

    tmpName = outName + ".tmp"
    _, num = emdbId.split("-")  # only the numeric part is needed
    # NOTE(review): the command is assembled by string formatting and run with
    # shell=True; ids or paths containing shell metacharacters would be
    # interpreted by bash. Only call with trusted values.
    cmd = ('wget -qO- ' + EMDB_DONWLOAD_QUERY + ' --timeout=' +str(TIMEOUT_TIME)+
           '  | zcat  > %s') % (emdbId, num, tmpName)
    logger.info(cmd)
    p = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE, shell=True, executable="/bin/bash")
    out = p.communicate()
    # Any output on stderr is treated as a failed download.
    if len(out[1]) > 0:
        logger.error(out)
        tryToRemove(tmpName)
        msg = "Error downloading (wget) emdbId %s to %s. %s" % (emdbId, outName, out)
        raise DownloadException(msg)
    os.rename(tmpName, outName)

    return outName

def _retrieveHalfMaps(ftp, emdbId, outName):
    """Download and gunzip both half maps of `emdbId` over an open FTP session.

    Returns the list of the two temporary file names
    (`<outName>_half_1.tmp.mrc`, `<outName>_half_2.tmp.mrc`).
    Raises DownloadException if the entry directory cannot be entered or a
    half map is missing; other ftplib errors propagate to the caller.
    """
    try:
        ftp.cwd(EMDB_FTP_DIR % emdbId)
    except ftplib.all_errors as e:
        msg = f"FTTP error for {emdbId}. It probabaly does not contain half-maps: " + str(e)
        logger.error(msg)
        raise DownloadException(msg)

    fnames = ftp.nlst()
    half_names_inServer = [None, None]
    logger.info("Downaloading half maps: "+ " "+" ".join(fnames))
    for fname in fnames:
        # Raw string: "\." in a normal string literal is an invalid escape
        # sequence. Group 2 captures the half-map number (1 or 2).
        match_objs = re.match(r".*half[-_\.]*(map_?)*([12])", fname)
        if match_objs:
            half_names_inServer[int(match_objs.group(2)) - 1] = fname
    logger.info(half_names_inServer)
    if None in half_names_inServer:
        raise DownloadException(f"Error, half map not available for {emdbId}")

    tmp_halfs = []
    for halfNum in range(1, 3):
        tmp_half_name = outName + "_half_%d.tmp.mrc" % halfNum
        tmp_halfs.append(tmp_half_name)
        # The compressed file is buffered in memory, then gunzipped to disk.
        with BytesIO() as flo:
            ftp.retrbinary('RETR ' + half_names_inServer[halfNum - 1], flo.write)
            flo.seek(0)
            with open(tmp_half_name, 'wb') as fout, gzip.GzipFile(fileobj=flo) as fgzip:
                shutil.copyfileobj(fgzip, fout)
    return tmp_halfs


@retry_decorator
def downloadEMDB_halfMaps(emdbId, mapsOutDir, onlyAverage=False):
    """Download the two half maps of `emdbId` and save their average.

    The average is written to `<mapsOutDir>/<emdbId>.map`. If that file
    already exists nothing is downloaded and the returned list is empty.

    NOTE(review): `EMDB_FTP_SERVER`, `EMDB_FTP_DIR`, `TIMEOUT_TIME`,
    `loadVol`, `saveVol` and `tryToRemove` are defined outside this notebook.

    Parameters
    ----------
    emdbId : str
        Entry identifier, substituted into `EMDB_FTP_DIR`.
    mapsOutDir : str
        Directory receiving the output files.
    onlyAverage : bool
        If True the temporary half-map files are removed after averaging;
        otherwise they are renamed (".tmp" dropped) and returned.

    Returns
    -------
    tuple
        `(outName, halfFnames)`.

    Raises
    ------
    DownloadException
        If the entry directory is unavailable or a half map is missing.
    ftplib errors (`ftplib.all_errors`)
        Logged once and re-raised for any other FTP failure.
    """
    outName = os.path.join(mapsOutDir, emdbId + '.map')
    logger.info("Trying to download emdbID %s to %s" % (emdbId, outName))
    halfFnames = []
    if not os.path.isfile(outName):
        try:
            # Random delay before connecting.
            time.sleep(random.randint(0,TIMEOUT_TIME))
            ftp = ftplib.FTP(EMDB_FTP_SERVER)
            try:
                ftp.login()
                tmp_halfs = _retrieveHalfMaps(ftp, emdbId, outName)
            finally:
                # Always release the connection; previously it was never closed.
                ftp.close()
        except ftplib.all_errors as e:
            logger.error(f"FTTP error for {emdbId}: " + str(e))
            raise

        # Average the two half maps, reusing the header of the first one.
        data = loadVol(tmp_halfs[0])[0]
        data += loadVol(tmp_halfs[1])[0]
        data /= 2.
        saveVol(outName, data, fname_headerLike=tmp_halfs[0])
        if onlyAverage:
            for tmp_half_name in tmp_halfs:
                tryToRemove(tmp_half_name)
        else:
            for tmp_half_name in tmp_halfs:
                new_fname = tmp_half_name.replace(".tmp", "")
                os.rename(tmp_half_name, new_fname)
                halfFnames.append(new_fname)
    else:
        # Logger.warn is a deprecated alias of Logger.warning.
        logger.warning("Already downloaded emdbId %s ->%s " % (emdbId, outName))

    return outName, halfFnames