# Predict Helix Capping Residues #

The goal is to identify residues just before an alpha helix begins or the residues just after the helix ends. This will improve secondary structure predictors becuase they often extend too far or do not start at the right place. 

The CapsDB has annoted sequences of structures of helix capping residues that can be used to train a deep nueral net. We will use a Bidirectional LSTM using phi/psi features to see if it will those will be good predictors.

## 1. Download data ##

## 2. Generate Features ##
### MMTF Pyspark Imports ###

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.webfilters import Pisces
from mmtfPyspark.filters import ContainsLProteinChain
from mmtfPyspark.mappers import StructureToPolymerChains
from mmtfPyspark.ml import ProteinSequenceEncoder
import numpy as np
import pandas as pd
import math
import os

### Custom imports ###

In [2]:
import secondaryStructureExtractorFull
#import mmtfToASA

### Configure Spark Context ###

In [3]:
spark = SparkSession.builder.master("local[4]").appName("DeepCap").getOrCreate()

### Create SQLContext ###

In [4]:
from pyspark.sql import SQLContext
from pyspark.sql.functions import concat, col, lit, array_contains

sqlContext = SQLContext(spark)

### Read in filtered cap+MMTF data from parquet file###

In [5]:
# Read output of above get_dataset operation from parquet file
parquetPath = '/home/ec2-user/SageMaker/ProteinFragmenter/datacaps.parquet'
dataframe = sqlContext.read.parquet(parquetPath)
data = dataframe.toPandas()
data = data.drop('__index_level_0__', axis=1)

capsdb = sqlContext.read.parquet('caps_descriptors.parquet')

In [6]:
# Add letter abbreviation for easy checking between PDB and CAPS-DB
resi_abbr_map = {'ALA': 'A'
,'ARG': 'R'
,'ASN': 'N'
,'ASP': 'D'
,'CYS': 'C'
,'GLN': 'Q'
,'GLU': 'E'
,'GLY': 'G'
,'HIS': 'H'
,'ILE': 'I'
,'LEU': 'L'
,'LYS': 'K'
,'MET': 'M'
,'PHE': 'F'
,'PRO': 'P'
,'PYL': 'O'
,'SER': 'S'
,'SEC': 'U'
,'THR': 'T'
,'TRP': 'W'
,'TYR': 'Y'
,'VAL': 'V'
,'ASX': 'B'
,'GLX': 'Z'
,'XAA': 'X'
,'XLE': 'J'}

### Get Torsion angle and secondary structure info ###

In [7]:
df1 = capsdb.toPandas()
df = pd.merge(data, df1, left_on=('pdbId','chain'), right_on=('pdbid','chain'), how='inner')
df = df[['pdbId', 'chain', 'resi', 'resn', 'phi', 'psi', 'startcap', 'endcap']]
df['resabbr'] = df['resn'].map(resi_abbr_map)

### Need to specify mapping since PDB and CAPS-DB don't use same numbering scheme for residues ###

In [11]:
pdbid_chain = df1[['pdbid', 'chain']].groupby(['pdbid', 'chain']).max().reset_index()
pdbid_chain['combined'] = pdbid_chain.pdbid + "." + pdbid_chain.chain
pdbid_chain = pdbid_chain["combined"].unique()

In [None]:
# Generate a dictionary of offsets between PDB and Uniprot (CAPS-DB, hopefully) residue numbering.
# If these are all consistent within files, they can be used for the mapping - doing this because
# some residues seem to be missing from the mapping xml files and should be able to be interpolated 
# this way.

import sifts_mapper as smap

offset_dict = dict()
resimap_list = []
stopped_iter = [] # These are chains where the generator for the mapping stopped early

for pc in pdbid_chain:
    pdbid = pc[:4]
    chain = pc[5:]
    if pdbid not in []: #['2ou1', '2wgk', '3u42', '4fmy']:
        temp = df1[(df1.pdbid == pdbid) & (df1.chain == chain)]

        start, end = temp.startcap.min(), temp.endcap.max()
        resilist = [i for i in range(start, end+1)]
        
        if len(resilist) == 1:
            resis = resilist[0]
        else:
            resis = ",".join([str(i) for i in resilist])

        try:
            rmap = smap.uniprot_to_pdb_resi(pdbid, chain, resis, source="PDBe")
            rmap = [i for i in rmap]
        except:
            resilist = []
            rmap = []
            pass
        
        if len(resilist) != len(rmap):
            stopped_iter.append(pc)
            resilist = resilist[:len(rmap)]
        if len(resilist) > 0:        
            dictionary = dict(zip(resilist, rmap))
            map_df = pd.DataFrame(data = {'resi': resilist, 'rmap': rmap})
            

            # Add all mapping offsets to dataframe
            d = {'pdbid': [pdbid for i in resilist],
                 'chain': [chain for i in resilist],
                 'resi': resilist,
                 'rmap': [i for i in rmap]}
            resimap_list.append(pd.DataFrame(d))

            # Add unique mapping offsets to set
            offsets = set()
            for k in dictionary.keys():
                if not math.isnan(dictionary[k]):
                    offsets.add(k - dictionary[k])
            offset_dict[pc] = offsets

resimap_df = pd.concat(resimap_list)


Saving aws:us-east-1:molmimic-sifts :: a1/1a1x.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1a1x.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading a1/1a1x.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: a6/1a62.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1a62.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading a6/1a62.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: a7/1a73.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1a73.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading a7/1a73.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: a8/1a8l.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1a8l.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 



Loading ah/1ah7.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: al/1al3.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1al3.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading al/1al3.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: ao/1aoc.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1aoc.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading ao/1aoc.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: ao/1aol.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1aol.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading ao/1aol.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: ar/1arb.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1arb.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecti

Saving aws:us-east-1:molmimic-sifts :: c7/1c7s.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1c7s.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading c7/1c7s.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: c9/1c96.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1c96.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading c9/1c96.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: cb/1cb8.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1cb8.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading cb/1cb8.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: cc/1cc8.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1cc8.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 

Saving aws:us-east-1:molmimic-sifts :: dd/1dd9.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1dd9.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading dd/1dd9.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: dd/1ddw.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1ddw.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading dd/1ddw.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: df/1dfm.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1dfm.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading df/1dfm.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: df/1dfu.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1dfu.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 

Saving aws:us-east-1:molmimic-sifts :: ec/1eca.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1eca.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading ec/1eca.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: ee/1eer.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1eer.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading ee/1eer.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: ee/1eex.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1eex.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading ee/1eex.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: ee/1eex.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1eex.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 

Loading f0/1f00.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: f0/1f0k.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1f0k.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading f0/1f0k.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: f0/1f0n.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1f0n.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading f0/1f0n.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: f0/1f0x.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1f0x.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading f0/1f0x.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: f1/1f1m.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1f1m.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecti

Saving aws:us-east-1:molmimic-sifts :: fx/1fx2.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1fx2.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading fx/1fx2.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: fy/1fye.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1fye.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading fy/1fye.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: g1/1g12.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1g12.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading g1/1g12.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: g3/1g3k.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1g3k.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 

Saving aws:us-east-1:molmimic-sifts :: gq/1gq8.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1gq8.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading gq/1gq8.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: gq/1gqe.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1gqe.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading gq/1gqe.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: gq/1gqi.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1gqi.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading gq/1gqi.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: gs/1gs5.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1gs5.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 

Saving aws:us-east-1:molmimic-sifts :: hb/1hbn.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1hbn.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading hb/1hbn.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: hd/1hdh.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1hdh.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading hd/1hdh.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: hd/1hdo.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1hdo.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading hd/1hdo.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: he/1he1.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1he1.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 

Saving aws:us-east-1:molmimic-sifts :: i7/1i7q.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1i7q.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading i7/1i7q.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: i7/1i7w.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1i7w.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading i7/1i7w.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: i8/1i88.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1i88.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading i8/1i88.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: ia/1iap.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1iap.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 

Saving aws:us-east-1:molmimic-sifts :: j5/1j5p.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1j5p.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading j5/1j5p.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: j5/1j5u.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1j5u.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading j5/1j5u.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: j7/1j77.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1j77.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading j7/1j77.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: j7/1j7x.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1j7x.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 

Saving aws:us-east-1:molmimic-sifts :: jr/1jr7.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1jr7.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading jr/1jr7.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: ju/1juh.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1juh.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading ju/1juh.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: jx/1jx6.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1jx6.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading jx/1jx6.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: jy/1jy1.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1jy1.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 

Saving aws:us-east-1:molmimic-sifts :: k8/1k8w.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1k8w.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading k8/1k8w.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: k9/1k92.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1k92.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading k9/1k92.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: k9/1k94.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1k94.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading k9/1k94.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: ka/1ka1.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1ka1.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 

Saving aws:us-east-1:molmimic-sifts :: l6/1l6r.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1l6r.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading l6/1l6r.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: l6/1l6x.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1l6x.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading l6/1l6x.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: l7/1l7a.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1l7a.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading l7/1l7a.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: l9/1l9x.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1l9x.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 

Saving aws:us-east-1:molmimic-sifts :: m2/1m22.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1m22.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading m2/1m22.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: m2/1m2d.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1m2d.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading m2/1m2d.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: m4/1m48.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1m48.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading m4/1m48.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: m4/1m4j.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1m4j.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 

Saving aws:us-east-1:molmimic-sifts :: mw/1mws.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1mws.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading mw/1mws.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: mz/1mzw.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1mzw.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading mz/1mzw.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: n0/1n08.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1n08.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading n0/1n08.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: n0/1n0w.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1n0w.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 

Loading nr/1nrj.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: ns/1nsz.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1nsz.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading ns/1nsz.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: nt/1nth.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1nth.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading nt/1nth.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: nt/1nty.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1nty.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading nt/1nty.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: nu/1nu0.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1nu0.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecti

Saving aws:us-east-1:molmimic-sifts :: og/1ogd.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1ogd.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading og/1ogd.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: og/1ogo.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1ogo.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading og/1ogo.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: og/1ogq.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1ogq.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading og/1ogq.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: oh/1oh0.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1oh0.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 

Loading p5/1p5z.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: p6/1p6o.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1p6o.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading p6/1p6o.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: p9/1p90.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1p90.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading p9/1p90.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: pb/1pbj.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1pbj.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading pb/1pbj.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: pb/1pbw.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1pbw.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecti

Loading q6/1q6z.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: q7/1q7e.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1q7e.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading q7/1q7e.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: q7/1q7l.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1q7l.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading q7/1q7l.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: q7/1q7l.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1q7l.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading q7/1q7l.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: q8/1q8d.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1q8d.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecti

Loading qw/1qw2.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: qw/1qw9.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1qw9.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading qw/1qw9.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: qw/1qwg.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1qwg.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading qw/1qwg.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: qw/1qwo.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1qwo.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading qw/1qwo.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: qw/1qwr.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1qwr.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecti

Saving aws:us-east-1:molmimic-sifts :: rk/1rk6.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1rk6.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading rk/1rk6.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: rk/1rki.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1rki.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading rk/1rki.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: ro/1ro7.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1ro7.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading ro/1ro7.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: ro/1roc.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1roc.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 

Loading s5/1s5u.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: s6/1s6c.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1s6c.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading s6/1s6c.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: s7/1s7i.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1s7i.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading s7/1s7i.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: s7/1s7z.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1s7z.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading s7/1s7z.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: s9/1s99.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1s99.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecti

Saving aws:us-east-1:molmimic-sifts :: sx/1sx5.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1sx5.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading sx/1sx5.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: sz/1sz7.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1sz7.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading sz/1sz7.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: sz/1szh.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1szh.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading sz/1szh.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: sz/1szo.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1szo.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 

Saving aws:us-east-1:molmimic-sifts :: tj/1tjy.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1tjy.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading tj/1tjy.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: tk/1tke.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1tke.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading tk/1tke.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: tl/1tl2.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1tl2.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading tl/1tl2.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: to/1toa.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1toa.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 

Loading u8/1u84.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: u8/1u8v.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1u8v.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading u8/1u8v.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: u9/1u9l.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1u9l.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading u9/1u9l.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: ua/1ua4.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1ua4.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading ua/1ua4.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: uc/1ucd.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1ucd.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecti

Saving aws:us-east-1:molmimic-sifts :: ux/1ux6.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1ux6.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading ux/1ux6.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: uz/1uz3.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1uz3.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading uz/1uz3.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: uz/1uzx.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1uzx.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading uz/1uzx.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: v0/1v0w.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1v0w.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 

Loading vh/1vh5.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: vh/1vhn.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1vhn.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading vh/1vhn.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: vi/1vi0.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1vi0.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading vi/1vi0.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: vi/1vi6.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1vi6.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading vi/1vi6.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: vj/1vjf.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1vjf.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecti

Loading w0/1w07.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: w0/1w0h.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1w0h.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading w0/1w0h.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: w1/1w1h.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1w1h.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading w1/1w1h.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: w2/1w23.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1w23.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading w2/1w23.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: w2/1w2f.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1w2f.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecti

Saving aws:us-east-1:molmimic-sifts :: wm/1wmh.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1wmh.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading wm/1wmh.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: wm/1wmh.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1wmh.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading wm/1wmh.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: wn/1wn2.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1wn2.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading wn/1wn2.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: wn/1wna.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1wna.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 

Saving aws:us-east-1:molmimic-sifts :: x6/1x6i.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1x6i.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading x6/1x6i.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: x6/1x6o.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1x6o.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading x6/1x6o.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: x7/1x7d.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1x7d.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading x7/1x7d.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: x8/1x8b.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1x8b.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 

Saving aws:us-east-1:molmimic-sifts :: xq/1xqo.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1xqo.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading xq/1xqo.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: xs/1xs0.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1xs0.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading xs/1xs0.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: xs/1xsv.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1xsv.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading xs/1xsv.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: xs/1xsz.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1xsz.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 

Saving aws:us-east-1:molmimic-sifts :: yg/1ygt.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1ygt.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading yg/1ygt.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: yh/1yht.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1yht.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading yh/1yht.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: yj/1yj7.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1yj7.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading yj/1yj7.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: yk/1ykd.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1ykd.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 

Saving aws:us-east-1:molmimic-sifts :: z2/1z2w.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1z2w.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading z2/1z2w.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: z3/1z3e.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1z3e.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading z3/1z3e.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: z3/1z3e.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1z3e.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading z3/1z3e.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: z3/1z3x.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1z3x.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 

Saving aws:us-east-1:molmimic-sifts :: zt/1zth.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1zth.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading zt/1zth.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: zv/1zva.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1zva.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading zv/1zva.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: zv/1zvt.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1zvt.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading zv/1zvt.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: zx/1zxx.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/1zxx.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 

Saving aws:us-east-1:molmimic-sifts :: an/2anx.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2anx.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading an/2anx.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: ao/2ao9.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2ao9.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading ao/2ao9.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: ao/2aot.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2aot.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading ao/2aot.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: ap/2ap3.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2ap3.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 

Loading bb/2bb6.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: bb/2bbr.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2bbr.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading bb/2bbr.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: bf/2bf6.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2bf6.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading bf/2bf6.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: bf/2bfd.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2bfd.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading bf/2bfd.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: bh/2bhu.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2bhu.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecti

Saving aws:us-east-1:molmimic-sifts :: c0/2c0g.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2c0g.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading c0/2c0g.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: c0/2c0n.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2c0n.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading c0/2c0n.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: c1/2c1v.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2c1v.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading c1/2c1v.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: c2/2c2i.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2c2i.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 

Saving aws:us-east-1:molmimic-sifts :: cv/2cve.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2cve.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading cv/2cve.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: cv/2cvi.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2cvi.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading cv/2cvi.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: cw/2cw9.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2cw9.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading cw/2cw9.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: cw/2cws.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2cws.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 

Saving aws:us-east-1:molmimic-sifts :: dd/2ddx.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2ddx.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading dd/2ddx.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: de/2de3.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2de3.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading de/2de3.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: de/2de6.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2de6.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading de/2de6.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: de/2dej.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2dej.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 

Saving aws:us-east-1:molmimic-sifts :: dy/2dyo.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2dyo.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading dy/2dyo.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: e1/2e11.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2e11.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading e1/2e11.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: e1/2e1f.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2e1f.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading e1/2e1f.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: e2/2e2o.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2e2o.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 

Saving aws:us-east-1:molmimic-sifts :: en/2end.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2end.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading en/2end.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: en/2eng.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2eng.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading en/2eng.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: ep/2epl.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2epl.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading ep/2epl.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: eq/2eq7.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2eq7.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 

Saving aws:us-east-1:molmimic-sifts :: fa/2fao.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2fao.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading fa/2fao.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: fb/2fb5.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2fb5.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading fb/2fb5.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: fb/2fb6.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2fb6.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading fb/2fb6.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: fb/2fba.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2fba.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 

Loading fp/2fph.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: fq/2fq3.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2fq3.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading fq/2fq3.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: fr/2fr5.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2fr5.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading fr/2fr5.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: fr/2fre.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2fre.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading fr/2fre.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: fs/2fsh.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2fsh.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecti

Loading ga/2gau.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: gb/2gb4.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2gb4.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading gb/2gb4.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: gd/2gdm.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2gdm.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading gd/2gdm.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: gf/2gf6.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2gf6.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading gf/2gf6.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: gh/2ghs.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2ghs.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecti

Saving aws:us-east-1:molmimic-sifts :: gw/2gwn.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2gwn.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading gw/2gwn.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: gx/2gx5.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2gx5.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading gx/2gx5.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: gz/2gz4.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2gz4.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading gz/2gz4.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: gz/2gz6.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2gz6.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 

Loading hf/2hft.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: hh/2hhc.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2hhc.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading hh/2hhc.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: hi/2hiy.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2hiy.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading hi/2hiy.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: hj/2hje.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2hje.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading hj/2hje.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: hj/2hjn.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2hjn.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecti

Saving aws:us-east-1:molmimic-sifts :: i4/2i49.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2i49.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading i4/2i49.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: i4/2i4l.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2i4l.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading i4/2i4l.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: i5/2i53.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2i53.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading i5/2i53.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: i5/2i5h.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2i5h.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 

Loading il/2ilk.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: il/2ilr.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2ilr.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading il/2ilr.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: im/2im8.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2im8.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading im/2im8.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: im/2imf.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2imf.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading im/2imf.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: im/2imh.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2imh.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecti

Saving aws:us-east-1:molmimic-sifts :: j6/2j6b.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2j6b.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading j6/2j6b.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: j6/2j6g.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2j6g.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading j6/2j6g.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: j6/2j6l.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2j6l.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading j6/2j6l.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: j7/2j7q.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2j7q.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 

Loading ml/2mlt.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: nm/2nml.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2nml.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading nm/2nml.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: nn/2nnu.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2nnu.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading nn/2nnu.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: nn/2nnu.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2nnu.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading nn/2nnu.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: no/2noo.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2noo.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecti

Saving aws:us-east-1:molmimic-sifts :: o4/2o4t.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2o4t.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading o4/2o4t.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: o4/2o4v.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2o4v.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading o4/2o4v.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: o5/2o5h.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2o5h.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading o5/2o5h.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: o5/2o5v.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2o5v.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 

Loading oh/2ohw.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: oi/2oit.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2oit.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading oi/2oit.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: oi/2oiw.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2oiw.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading oi/2oiw.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: oi/2oiz.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2oiz.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading oi/2oiz.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: oj/2oj5.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2oj5.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecti

Saving aws:us-east-1:molmimic-sifts :: oy/2oyo.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2oyo.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading oy/2oyo.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: oz/2oze.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2oze.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading oz/2oze.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: oz/2ozt.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2ozt.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading oz/2ozt.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: p0/2p02.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2p02.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 

Saving aws:us-east-1:molmimic-sifts :: pa/2pa8.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2pa8.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading pa/2pa8.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: pa/2pag.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2pag.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading pa/2pag.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: pd/2pd1.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2pd1.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading pd/2pd1.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: pf/2pfi.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2pfi.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 

Saving aws:us-east-1:molmimic-sifts :: pv/2pv2.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2pv2.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading pv/2pv2.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: pv/2pv4.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2pv4.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading pv/2pv4.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: pw/2pw0.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2pw0.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading pw/2pw0.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: pw/2pww.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2pww.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 

Saving aws:us-east-1:molmimic-sifts :: qg/2qgm.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2qgm.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading qg/2qgm.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: qg/2qgq.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2qgq.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading qg/2qgq.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: qg/2qgu.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2qgu.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading qg/2qgu.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: qg/2qgy.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2qgy.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 

Saving aws:us-east-1:molmimic-sifts :: qr/2qrl.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2qrl.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading qr/2qrl.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: qr/2qru.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2qru.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading qr/2qru.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: qs/2qsi.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2qsi.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading qs/2qsi.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: qs/2qsx.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2qsx.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 

Saving aws:us-east-1:molmimic-sifts :: r6/2r6j.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2r6j.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading r6/2r6j.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: r6/2r6o.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2r6o.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading r6/2r6o.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: r6/2r6z.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2r6z.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading r6/2r6z.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: r7/2r75.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2r75.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region 

Loading rj/2rj2.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: rj/2rji.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2rji.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading rj/2rji.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: rk/2rk3.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2rk3.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading rk/2rk3.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: rk/2rk9.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2rk9.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading rk/2rk9.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: rk/2rkh.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2rkh.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecti

Loading v0/2v0p.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: v2/2v26.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2v26.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading v2/2v26.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: v2/2v2f.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2v2f.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading v2/2v2f.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: v2/2v2g.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2v2g.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecting to bucket molmimic-sifts in region us-east-1
Loading v2/2v2g.xml.gz from S3IOStore
Saving aws:us-east-1:molmimic-sifts :: v2/2v2p.xml.gz to /home/ec2-user/SageMaker/ProteinFragmenter/2v2p.xml.gz
Connecting to bucket molmimic-sifts in region us-east-1
Connecti

In [8]:
import pickle

#resimap_df.to_pickle('resimap_df.pkl')
resimap_df = pd.read_pickle('resimap_df.pkl')

In [23]:
#resimap_df[resimap_df.pdbid == "1aol"]
#resimap_df[resimap_df.rmap.isnull()]
resimap_df.iloc[30000,:]

pdbid    1ftr
chain       A
resi      215
rmap      215
Name: 188, dtype: object

In [325]:
#len(resimap_list)
#resimap_df.head()

#pdbid_chain
#pdbid_chain = pdbid_chain[pdbid_chain.pdbid == '2ou1']
#offset_dict
#resimap_df.head()
temp = pd.DataFrame(data = {'resi': [i+9 for i in range(10)], 'rmap': [i+3 if i < 5 else i for i in range(10)]})
temp['offset'] = temp.resi - temp.rmap
temp.groupby('offset').count().reset_index()

Unnamed: 0,offset,resi,rmap
0,6,5,5
1,9,5,5


In [None]:
offset_dict_single = {k: offset_dict[k].pop() for k in offset_dict.keys() if (len(offset_dict[k]) == 1)} # this destroys offset_dict
offset_dict_single


temp = resimap_df.iloc[np.where(np.isnan(resimap_df.rmap))[0],:]
#temp['rmap2'] = 
temp['pdbid_chain'] = temp.iloc[:,0] + "." + temp.iloc[:,1]
#temp.pdbid_chain.map(offset_dict_single)
[print(i) for i in temp.pdbid_chain.unique()]

In [9]:
# Set residue locations without mapping to zero

resimap_df.loc[resimap_df.rmap.isnull() == True, 'rmap'] = 0

resimap_df['rmap'] = resimap_df.rmap.astype(int)
#resimap_df
#print(df.columns)
#resimap_df[resimap_df.isnull().rmap == True].shape
#resimap_df[resimap_df.rmap == 0].shape

resimap_df['rmap2'] = resimap_df['resi']
resimap_df['resi'] = resimap_df['rmap']
resimap_df['rmap'] = resimap_df['rmap2']
resimap_df = resimap_df.drop('rmap2', axis=1)

In [10]:
resimap_df

Unnamed: 0,pdbid,chain,resi,rmap
0,1a1x,A,56,56
1,1a1x,A,57,57
2,1a1x,A,58,58
3,1a1x,A,59,59
4,1a1x,A,60,60
5,1a1x,A,61,61
6,1a1x,A,62,62
7,1a1x,A,63,63
8,1a1x,A,64,64
9,1a1x,A,65,65


In [11]:
# map cap resi numbers to pbd data resi numbers

data_map = pd.merge(df, resimap_df, left_on=['pdbId', 'chain', 'startcap'], right_on=['pdbid', 'chain', 'resi'], how='left')
data_map = data_map.drop(['startcap'], axis=1)
data_map['startcap'] = data_map['rmap']
data_map['resi'] = data_map['resi_x']
data_map = data_map.drop(['resi_x','resi_y'], axis=1)
data_map = data_map.iloc[:,[0,1,2,3,4,5,6,9,10]]

data_map = pd.merge(data_map, resimap_df, left_on=['pdbId', 'chain', 'endcap'], right_on=['pdbid', 'chain', 'resi'], how='left')
data_map = data_map.drop(['endcap'], axis=1)
data_map['endcap'] = data_map['rmap']
data_map['resi'] = data_map['resi_x']
data_map = data_map.drop(['resi_x','resi_y'], axis=1)
data_map = data_map.iloc[:,[0,1,2,3,4,5,6,9,10]]

# If incomplete data about startcap or endcap location (from missing mapping PDBe<-->Uniprot), set cap to 0
data_map.loc[data_map.startcap.isnull() == True, ['startcap', 'endcap']] = 0
data_map.loc[data_map.endcap.isnull() == True, ['startcap', 'endcap']] = 0
data_map['startcap'] = data_map.startcap.astype(int)
data_map['endcap'] = data_map.endcap.astype(int)

data_map['resi'] = data_map.resi+1 #data_map.apply(lambda x: x.resi+1)

data_map

Unnamed: 0,pdbId,chain,resn,phi,psi,resabbr,startcap,endcap,resi
0,2ygn,A,THR,,163.677383,T,0,0,2
1,2ygn,A,THR,,163.677383,T,0,0,2
2,2ygn,A,GLY,-66.660973,160.703186,G,0,0,3
3,2ygn,A,GLY,-66.660973,160.703186,G,0,0,3
4,2ygn,A,SER,-123.853607,-7.871733,S,0,0,4
5,2ygn,A,SER,-123.853607,-7.871733,S,0,0,4
6,2ygn,A,LEU,-74.896896,137.483932,L,0,0,5
7,2ygn,A,LEU,-74.896896,137.483932,L,0,0,5
8,2ygn,A,TYR,-134.419830,140.864288,Y,0,0,6
9,2ygn,A,TYR,-134.419830,140.864288,Y,0,0,6


In [24]:
data_map[data_map.pdbId == '1ftr']

Unnamed: 0,pdbId,chain,resn,phi,psi,resabbr,startcap,endcap,resi
955212,1ftr,A,MET,,132.874710,M,119,119,1
955213,1ftr,A,MET,,132.874710,M,80,81,1
955214,1ftr,A,MET,,132.874710,M,174,175,1
955215,1ftr,A,MET,,132.874710,M,251,252,1
955216,1ftr,A,MET,,132.874710,M,27,29,1
955217,1ftr,A,MET,,132.874710,M,268,270,1
955218,1ftr,A,MET,,132.874710,M,42,44,1
955219,1ftr,A,MET,,132.874710,M,97,100,1
955220,1ftr,A,MET,,132.874710,M,126,132,1
955221,1ftr,A,MET,,132.874710,M,191,204,1


In [25]:
#data[(data.pdbId == '4lfy') & (data.resi >= 173)]

#data_map['resa_dmap'] = data_map.resn.map(resi_abbr_map)

d = data_map[data_map.pdbId == '1ftr']
g = d.groupby('resi').resabbr.max().reset_index()
print(g.to_string())

     resi resabbr
0       1       M
1       2       E
2       3       I
3       4       N
4       5       G
5       6       V
6       7       E
7       8       I
8       9       E
9      10       D
10     11       T
11     12       F
12     13       A
13     14       E
14     15       A
15     16       F
16     17       E
17     18       A
18     19       K
19     20       M
20     21       A
21     22       R
22     23       V
23     24       L
24     25       I
25     26       T
26     27       A
27     28       A
28     29       S
29     30       H
30     31       K
31     32       W
32     33       A
33     34       M
34     35       I
35     36       A
36     37       V
37     38       K
38     39       E
39     40       A
40     41       T
41     42       G
42     43       F
43     44       G
44     45       T
45     46       S
46     47       V
47     48       I
48     49       M
49     50       C
50     51       P
51     52       A
52     53       E
53     54       A
54     55 

In [66]:
#resimap_df
#df[df.pdbId == '1aol']

In [26]:
df1[df1.pdbid == '1ftr']

Unnamed: 0,nid,capID,pdbid,chain,type,start,end,startcap,endcap,lengthCAP,lengthHELIX,D,delta,seq,ss,__index_level_0__
127,128,1ftr_A_Nt2,1ftr,A,Nt,119,125,119,119,1,6,5.165,27.515,RVGYKLS,-HHHHHH,127
5564,5565,1ftr_A_Nt1,1ftr,A,Nt,80,96,80,81,2,15,6.596,102.831,NDEDELKEQLLDRIGQC,SSHHHHHHHHHHHHHHH,5564
5567,5568,1ftr_A_Nt3,1ftr,A,Nt,174,190,174,175,2,15,7.193,101.846,ESQPAGLQAAEAAVDAI,SSHHHHHHHHHHHHHHH,5567
5568,5569,1ftr_A_Nt4,1ftr,A,Nt,251,267,251,252,2,15,6.459,98.509,LNEEAVKEAMRVGIEAA,SSHHHHHHHHHHHHHHH,5568
6034,6035,1ftr_A_Nt0,1ftr,A,Nt,27,41,27,29,3,12,7.836,117.389,AASHKWAMIAVKEAT,-SSHHHHHHHHHHHH,6034
45930,45931,1ftr_A_Ct4,1ftr,A,Ct,253,270,268,270,3,15,9.478,16.865,EEAVKEAMRVGIEAACQQ,HHHHHHHHHHHHHHHTTS,6723
47934,47935,1ftr_A_Ct0,1ftr,A,Ct,30,44,42,44,3,12,9.42,39.983,HKWAMIAVKEATGFG,HHHHHHHHHHHHSS-,8727
48933,48934,1ftr_A_Ct1,1ftr,A,Ct,82,100,97,100,4,15,10.255,33.672,EDELKEQLLDRIGQCVMTA,HHHHHHHHHHHHHHHTTTS,9726
53616,53617,1ftr_A_Ct2,1ftr,A,Ct,120,132,126,132,7,6,8.837,75.112,VGYKLSFFGDGYQ,HHHHHHGGGTT--,4279
66540,66541,1ftr_A_Ct3,1ftr,A,Ct,176,204,191,204,14,15,15.289,136.694,QPAGLQAAEAAVDAIKGVEGAYAPFPGGI,HHHHHHHHHHHHHHHTTSTT----SGGG-,7509


In [27]:
#data_map
df = data_map

### Create labels

In [28]:
df['is_cap'] = df.apply(lambda x: 1 if (x['resi'] >= x['startcap'] and x['resi'] <= x['endcap']) else 0, axis=1)
df_caps = df.groupby(["pdbId", "chain", "resi"])['is_cap'].max().reset_index()

In [29]:
data_caps = pd.merge(data, df_caps, left_on=('pdbId','chain', 'resi'), right_on=('pdbId','chain', 'resi'), how='inner')

In [30]:
from Bio.PDB.Polypeptide import aa3
one_hot_encoded = pd.DataFrame(data_caps.resn.apply(lambda x: secondaryStructureExtractorFull.get_residue(x)).tolist(), columns=aa3)
one_hot_encoded.head()
data_caps = data_caps.join(one_hot_encoded)
data_caps.head()

Unnamed: 0,pdbId,chain,resi,resn,phi,psi,is_cap,ALA,CYS,ASP,...,MET,ASN,PRO,GLN,ARG,SER,THR,VAL,TRP,TYR
0,2ygn,A,2,GLY,-66.660973,160.703186,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2ygn,A,3,SER,-123.853607,-7.871733,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,2ygn,A,4,LEU,-74.896896,137.483932,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2ygn,A,5,TYR,-134.41983,140.864288,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,2ygn,A,6,LEU,-139.275024,127.621544,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
#df.is_cap.sum()
data_caps.head()

Unnamed: 0,pdbId,chain,resi,resn,phi,psi,is_cap,ALA,CYS,ASP,...,MET,ASN,PRO,GLN,ARG,SER,THR,VAL,TRP,TYR
0,2ygn,A,2,GLY,-66.660973,160.703186,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2ygn,A,3,SER,-123.853607,-7.871733,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,2ygn,A,4,LEU,-74.896896,137.483932,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2ygn,A,5,TYR,-134.41983,140.864288,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,2ygn,A,6,LEU,-139.275024,127.621544,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Define functions for feature extraction

In [32]:

def is_cap(pdbId, chain, resi, is_cap):
    if is_cap == 1:
        return(1)
    elif is_cap == 0:
        return(0)
    else:
        raise ValueError("is_cap must be 0 or 1")

def angle_to_cos(angle):
    if(angle == 0 or np.isnan(angle)):
        return 0
    else:
        return np.cos(np.pi * angle/180)

def angle_to_sin(angle):
    if(angle == 0 or np.isnan(angle)):
        return 0
    else:
        return np.sin(np.pi * angle/180)
    
def parse_feature_file(filename):
    # Read file
    df = pd.read_csv(filename, sep="\t", skiprows=range(10))
    df = df.reset_index()
    # Set column names
    cnames=[
    "RESIDUE_CLASS1_IS_HYDROPHOBIC",
    "RESIDUE_CLASS1_IS_CHARGED",
    "RESIDUE_CLASS1_IS_POLAR",
    "RESIDUE_CLASS1_IS_UNKNOWN",
    "RESIDUE_CLASS2_IS_NONPOLAR",
    "RESIDUE_CLASS2_IS_POLAR",
    "RESIDUE_CLASS2_IS_BASIC",
    "RESIDUE_CLASS2_IS_ACIDIC",
    "RESIDUE_CLASS2_IS_UNKNOWN",
    ]
    l = [["{}_SHELL{}".format(c, i) for c in cnames] for i in range(6)]
    cnames = [item for sublist in l for item in sublist]
    cnames.insert(0, "env")
    cnames.extend(["hash", "x", "y", "z", "VERBOSITY", "location"])
    df.columns = cnames
    
    df = df.drop(["hash", "x", "y", "z", "VERBOSITY"], axis=1)
    
    # filter out non-AA structures
    aminoAcids = ['ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HIS',
    'ILE', 'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR', 'TRP', 
    'TYR', 'VAL']
    df['residue'] = df.location.str[:3]
    df = df[df.residue.isin(aminoAcids)]
    
    # Split identifiers

    # split residue number
    df_residue = df.location.str[3:].str.split(":", 1, expand=True)
    df_residue.columns = ["ordernum", "chainatom"]
    df = pd.merge(df, df_residue, left_index=True, right_index=True)

    # split chain
    df_chain = df.chainatom.str.split("@", 1, expand=True)
    df_chain.columns = ["chain", "atom"]
    df = pd.merge(df, df_chain, left_index=True, right_index=True)
    
    df['pdbId'] = df.env.str[4:8]

    df = df.drop(["env", "location", "chainatom"], axis=1)
    
    #df.ordernum = df.ordernum.astype(int)

    # Aggregate to residue level
    groups = df.groupby(['chain', 'ordernum', 'residue'], sort=False)
    groupnums = groups.ngroup(ascending=True)
    df_agg_max = groups.max().reset_index()
    df_agg_max = df_agg_max.drop(['atom', 'ordernum'], axis=1)
    df_agg_max['resi'] = df_agg_max.index+1
    return(df_agg_max)


# Parse feature files to get additional features

In [None]:
files = os.listdir("feature/feature-3.1.0/feature_files")
filelist = []
f = open("feature_parsing.log", "w")
for i, filename in enumerate(files):
    print(i, filename)
    f.write("File {}: protein {}\n".format(i, filename))
    df = parse_feature_file("feature/feature-3.1.0/feature_files/{}".format(filename))
    filelist.append(df)

f.close()
feature_df = pd.concat(filelist)

In [161]:
# Use standard scaling on feature dataframe

#feature_df.apply(np.max, axis=0)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_df = pd.DataFrame(scaler.fit_transform(feature_df.iloc[:,2:56]), columns=feature_df.columns[2:56])

id_df = feature_df.iloc[:,[56,0,57,1]] 
id_df = id_df.reset_index().drop('index', axis=1)
feature_df2 = pd.merge(id_df, scaled_df, left_index=True, right_index=True)
feature_df2

Unnamed: 0,pdbId,chain,resi,residue,RESIDUE_CLASS1_IS_HYDROPHOBIC_SHELL0,RESIDUE_CLASS1_IS_CHARGED_SHELL0,RESIDUE_CLASS1_IS_POLAR_SHELL0,RESIDUE_CLASS1_IS_UNKNOWN_SHELL0,RESIDUE_CLASS2_IS_NONPOLAR_SHELL0,RESIDUE_CLASS2_IS_POLAR_SHELL0,...,RESIDUE_CLASS2_IS_UNKNOWN_SHELL4,RESIDUE_CLASS1_IS_HYDROPHOBIC_SHELL5,RESIDUE_CLASS1_IS_CHARGED_SHELL5,RESIDUE_CLASS1_IS_POLAR_SHELL5,RESIDUE_CLASS1_IS_UNKNOWN_SHELL5,RESIDUE_CLASS2_IS_NONPOLAR_SHELL5,RESIDUE_CLASS2_IS_POLAR_SHELL5,RESIDUE_CLASS2_IS_BASIC_SHELL5,RESIDUE_CLASS2_IS_ACIDIC_SHELL5,RESIDUE_CLASS2_IS_UNKNOWN_SHELL5
0,4ncj,A,1,MET,1.239261,-0.539470,-0.618927,-0.280102,1.204150,-0.673007,...,-0.044339,1.147298,1.179390,-1.027858,-0.088224,0.980710,-0.967822,0.343334,1.643986,-0.050447
1,4ncj,A,2,LYS,-0.827754,1.677912,-0.618927,-0.280102,-0.852432,-0.673007,...,-0.044339,0.334886,0.525266,-0.525959,-0.542149,0.195332,-0.421891,-0.445199,1.050757,-0.050447
2,4ncj,A,3,LEU,1.239261,-0.539470,-0.618927,-0.280102,1.204150,-0.673007,...,-0.044339,2.095112,0.961349,-1.362458,-0.996074,1.896985,-1.331776,-0.182355,1.050757,-0.050447
3,4ncj,A,4,GLU,-0.827754,1.677912,-0.618927,-0.280102,-0.852432,-0.673007,...,-0.044339,0.334886,1.615473,-1.027858,-0.088224,0.195332,-0.967822,2.183243,0.160914,-0.050447
4,4ncj,A,5,ARG,-0.827754,1.677912,-0.618927,-0.280102,-0.852432,-0.673007,...,-0.044339,1.011896,2.487638,-1.027858,0.365701,0.849814,-0.421891,0.080489,3.720287,-0.050447
5,4ncj,A,6,VAL,1.239261,-0.539470,-0.618927,-0.280102,1.204150,-0.673007,...,-0.044339,1.553504,0.307225,-1.195158,-0.996074,1.373400,-1.149799,-0.970887,1.347372,-0.050447
6,4ncj,A,7,THR,-0.827754,-0.539470,1.639459,-0.280102,-0.852432,1.509584,...,-0.044339,0.064081,0.961349,-0.358659,-0.996074,-0.066461,-0.239914,0.080489,0.457529,-0.050447
7,4ncj,A,8,VAL,1.239261,-0.539470,-0.618927,-0.280102,1.204150,-0.673007,...,-0.044339,1.147298,-0.128858,1.147040,-0.996074,0.980710,1.397879,0.343334,1.050757,-0.050447
8,4ncj,A,9,LYS,-0.827754,1.677912,-0.618927,-0.280102,-0.852432,-0.673007,...,-0.044339,-0.748331,0.743307,0.979740,-0.996074,-0.851840,0.669971,0.869022,0.457529,-0.050447
9,4ncj,A,10,ASN,-0.827754,-0.539470,1.639459,-0.280102,-0.852432,1.509584,...,-0.044339,0.334886,0.743307,0.477841,0.365701,0.195332,0.669971,0.869022,0.160914,-0.050447


In [33]:
# Read/Write feature vector
#feature_df2.to_csv("Feature_vectors.csv")
feature_df2 = pd.read_csv("full_dataset_features/Feature_vectors_max_scaled.csv", index_col=0)

  mask |= (ar1 == a)


In [34]:
feature_df2

Unnamed: 0,pdbId,chain,resi,residue,RESIDUE_CLASS1_IS_HYDROPHOBIC_SHELL0,RESIDUE_CLASS1_IS_CHARGED_SHELL0,RESIDUE_CLASS1_IS_POLAR_SHELL0,RESIDUE_CLASS1_IS_UNKNOWN_SHELL0,RESIDUE_CLASS2_IS_NONPOLAR_SHELL0,RESIDUE_CLASS2_IS_POLAR_SHELL0,...,RESIDUE_CLASS2_IS_UNKNOWN_SHELL4,RESIDUE_CLASS1_IS_HYDROPHOBIC_SHELL5,RESIDUE_CLASS1_IS_CHARGED_SHELL5,RESIDUE_CLASS1_IS_POLAR_SHELL5,RESIDUE_CLASS1_IS_UNKNOWN_SHELL5,RESIDUE_CLASS2_IS_NONPOLAR_SHELL5,RESIDUE_CLASS2_IS_POLAR_SHELL5,RESIDUE_CLASS2_IS_BASIC_SHELL5,RESIDUE_CLASS2_IS_ACIDIC_SHELL5,RESIDUE_CLASS2_IS_UNKNOWN_SHELL5
0,4ncj,A,1,MET,1.239261,-0.539470,-0.618927,-0.280102,1.204150,-0.673007,...,-0.044339,1.147298,1.179390,-1.027858,-0.088224,0.980710,-0.967822,0.343334,1.643986,-0.050447
1,4ncj,A,2,LYS,-0.827754,1.677912,-0.618927,-0.280102,-0.852432,-0.673007,...,-0.044339,0.334886,0.525266,-0.525959,-0.542149,0.195332,-0.421891,-0.445199,1.050757,-0.050447
2,4ncj,A,3,LEU,1.239261,-0.539470,-0.618927,-0.280102,1.204150,-0.673007,...,-0.044339,2.095112,0.961349,-1.362458,-0.996074,1.896985,-1.331776,-0.182355,1.050757,-0.050447
3,4ncj,A,4,GLU,-0.827754,1.677912,-0.618927,-0.280102,-0.852432,-0.673007,...,-0.044339,0.334886,1.615473,-1.027858,-0.088224,0.195332,-0.967822,2.183243,0.160914,-0.050447
4,4ncj,A,5,ARG,-0.827754,1.677912,-0.618927,-0.280102,-0.852432,-0.673007,...,-0.044339,1.011896,2.487638,-1.027858,0.365701,0.849814,-0.421891,0.080489,3.720287,-0.050447
5,4ncj,A,6,VAL,1.239261,-0.539470,-0.618927,-0.280102,1.204150,-0.673007,...,-0.044339,1.553504,0.307225,-1.195158,-0.996074,1.373400,-1.149799,-0.970887,1.347372,-0.050447
6,4ncj,A,7,THR,-0.827754,-0.539470,1.639459,-0.280102,-0.852432,1.509584,...,-0.044339,0.064081,0.961349,-0.358659,-0.996074,-0.066461,-0.239914,0.080489,0.457529,-0.050447
7,4ncj,A,8,VAL,1.239261,-0.539470,-0.618927,-0.280102,1.204150,-0.673007,...,-0.044339,1.147298,-0.128858,1.147040,-0.996074,0.980710,1.397879,0.343334,1.050757,-0.050447
8,4ncj,A,9,LYS,-0.827754,1.677912,-0.618927,-0.280102,-0.852432,-0.673007,...,-0.044339,-0.748331,0.743307,0.979740,-0.996074,-0.851840,0.669971,0.869022,0.457529,-0.050447
9,4ncj,A,10,ASN,-0.827754,-0.539470,1.639459,-0.280102,-0.852432,1.509584,...,-0.044339,0.334886,0.743307,0.477841,0.365701,0.195332,0.669971,0.869022,0.160914,-0.050447


In [35]:
#data_caps
data_caps2 = data_caps.merge(feature_df2, left_on=["pdbId", "chain", "resi"], right_on=["pdbId", "chain", "resi"])

In [96]:
#data_caps2.columns
#data_caps.shape
#feature_df.shape
#data_caps[data_caps.pdbId == "2ygn"]
#feature_df[feature_df.pdbId == "2ygn"]
#data_caps.groupby(['pdbId', 'chain']).ngroup().unique().shape
#feature_df.groupby(['pdbId', 'chain']).ngroup().unique().shape
#data_caps2.groupby(['pdbId', 'chain']).ngroup().unique().shape
train_chains[1].shape

(122, 78)

# Process data into list of arrays

In [63]:
groups = data_caps2.groupby(["pdbId", "chain"])
                           # num pdbs,    max len of seqs, num features

# Check max length of protein chains
# maxlen = 0
# for i, ((pdbid, chain), group) in enumerate(groups):
#     l = 0
#     for j, featuretuple in enumerate(group.itertuples()):
#         l += 1
#         if l > maxlen:
#             maxlen = l
# print(maxlen)

In [64]:
train_chains = []
label_chains = []
laglabel_chains = []

for i, ((pdbid, chain), group) in enumerate(groups):
    # Create empty arrays
    train_chain = np.zeros((1300,78), dtype=float) # max chain length is 1288 residues
    label_chain = np.zeros((1300,1), dtype=int)
    laglabel_chain = np.zeros((1301,1), dtype=int)
    
    # Populate arrays
    for j, featuretuple in enumerate(group.itertuples()):
        train_chain[j, :] = (angle_to_cos(featuretuple.phi), 
                             angle_to_sin(featuretuple.phi), 
                             angle_to_cos(featuretuple.psi), 
                             angle_to_sin(featuretuple.psi),
                            featuretuple.ALA,
                            featuretuple.CYS,
                            featuretuple.ASP,
                            featuretuple.GLU,
                            featuretuple.PHE,
                            featuretuple.GLY,
                            featuretuple.HIS,
                            featuretuple.ILE,
                            featuretuple.LYS,
                            featuretuple.LEU,
                            featuretuple.MET,
                            featuretuple.ASN,
                            featuretuple.PRO,
                            featuretuple.GLN,
                            featuretuple.ARG,
                            featuretuple.SER,
                            featuretuple.THR,
                            featuretuple.VAL,
                            featuretuple.TRP,
                            featuretuple.TYR,
                            featuretuple.RESIDUE_CLASS1_IS_HYDROPHOBIC_SHELL0,
                            featuretuple.RESIDUE_CLASS1_IS_CHARGED_SHELL0,
                            featuretuple.RESIDUE_CLASS1_IS_POLAR_SHELL0,
                            featuretuple.RESIDUE_CLASS1_IS_UNKNOWN_SHELL0,
                            featuretuple.RESIDUE_CLASS2_IS_NONPOLAR_SHELL0,
                            featuretuple.RESIDUE_CLASS2_IS_POLAR_SHELL0,
                            featuretuple.RESIDUE_CLASS2_IS_BASIC_SHELL0,
                            featuretuple.RESIDUE_CLASS2_IS_ACIDIC_SHELL0,
                            featuretuple.RESIDUE_CLASS2_IS_UNKNOWN_SHELL0,
                            featuretuple.RESIDUE_CLASS1_IS_HYDROPHOBIC_SHELL1,
                            featuretuple.RESIDUE_CLASS1_IS_CHARGED_SHELL1,
                            featuretuple.RESIDUE_CLASS1_IS_POLAR_SHELL1,
                            featuretuple.RESIDUE_CLASS1_IS_UNKNOWN_SHELL1,
                            featuretuple.RESIDUE_CLASS2_IS_NONPOLAR_SHELL1,
                            featuretuple.RESIDUE_CLASS2_IS_POLAR_SHELL1,
                            featuretuple.RESIDUE_CLASS2_IS_BASIC_SHELL1,
                            featuretuple.RESIDUE_CLASS2_IS_ACIDIC_SHELL1,
                            featuretuple.RESIDUE_CLASS2_IS_UNKNOWN_SHELL1,
                            featuretuple.RESIDUE_CLASS1_IS_HYDROPHOBIC_SHELL2,
                            featuretuple.RESIDUE_CLASS1_IS_CHARGED_SHELL2,
                            featuretuple.RESIDUE_CLASS1_IS_POLAR_SHELL2,
                            featuretuple.RESIDUE_CLASS1_IS_UNKNOWN_SHELL2,
                            featuretuple.RESIDUE_CLASS2_IS_NONPOLAR_SHELL2,
                            featuretuple.RESIDUE_CLASS2_IS_POLAR_SHELL2,
                            featuretuple.RESIDUE_CLASS2_IS_BASIC_SHELL2,
                            featuretuple.RESIDUE_CLASS2_IS_ACIDIC_SHELL2,
                            featuretuple.RESIDUE_CLASS2_IS_UNKNOWN_SHELL2,
                            featuretuple.RESIDUE_CLASS1_IS_HYDROPHOBIC_SHELL3,
                            featuretuple.RESIDUE_CLASS1_IS_CHARGED_SHELL3,
                            featuretuple.RESIDUE_CLASS1_IS_POLAR_SHELL3,
                            featuretuple.RESIDUE_CLASS1_IS_UNKNOWN_SHELL3,
                            featuretuple.RESIDUE_CLASS2_IS_NONPOLAR_SHELL3,
                            featuretuple.RESIDUE_CLASS2_IS_POLAR_SHELL3,
                            featuretuple.RESIDUE_CLASS2_IS_BASIC_SHELL3,
                            featuretuple.RESIDUE_CLASS2_IS_ACIDIC_SHELL3,
                            featuretuple.RESIDUE_CLASS2_IS_UNKNOWN_SHELL3,
                            featuretuple.RESIDUE_CLASS1_IS_HYDROPHOBIC_SHELL4,
                            featuretuple.RESIDUE_CLASS1_IS_CHARGED_SHELL4,
                            featuretuple.RESIDUE_CLASS1_IS_POLAR_SHELL4,
                            featuretuple.RESIDUE_CLASS1_IS_UNKNOWN_SHELL4,
                            featuretuple.RESIDUE_CLASS2_IS_NONPOLAR_SHELL4,
                            featuretuple.RESIDUE_CLASS2_IS_POLAR_SHELL4,
                            featuretuple.RESIDUE_CLASS2_IS_BASIC_SHELL4,
                            featuretuple.RESIDUE_CLASS2_IS_ACIDIC_SHELL4,
                            featuretuple.RESIDUE_CLASS2_IS_UNKNOWN_SHELL4,
                            featuretuple.RESIDUE_CLASS1_IS_HYDROPHOBIC_SHELL5,
                            featuretuple.RESIDUE_CLASS1_IS_CHARGED_SHELL5,
                            featuretuple.RESIDUE_CLASS1_IS_POLAR_SHELL5,
                            featuretuple.RESIDUE_CLASS1_IS_UNKNOWN_SHELL5,
                            featuretuple.RESIDUE_CLASS2_IS_NONPOLAR_SHELL5,
                            featuretuple.RESIDUE_CLASS2_IS_POLAR_SHELL5,
                            featuretuple.RESIDUE_CLASS2_IS_BASIC_SHELL5,
                            featuretuple.RESIDUE_CLASS2_IS_ACIDIC_SHELL5,
                            featuretuple.RESIDUE_CLASS2_IS_UNKNOWN_SHELL5
                            )
        label_chain[j,0] = is_cap(featuretuple.pdbId, featuretuple.chain, featuretuple.resi, featuretuple.is_cap)
        if (j > 0):
            laglabel_chain[j-1,0] = label_chain[j,0]
    
    # Trim zeros
    trimmed_train = train_chain[~np.all(train_chain == 0, axis=1)]
    trimmed_label = label_chain[:trimmed_train.shape[0]]
    trimmed_laglabel = laglabel_chain[:trimmed_train.shape[0]]
    
    # Add chain data to lists of arrays
    train_chains.append(trimmed_train)
    label_chains.append(trimmed_label)
    laglabel_chains.append(trimmed_laglabel)
    

In [58]:
#print(label_chains[0])
label_chains[0].shape

(105, 1)

# Write training data to pickle file

In [None]:
import pickle
pickle_out = open("pickled_data/train_chains_78.pickle","wb")
pickle.dump(train_chains, pickle_out)
pickle_out.close()

pickle_out = open("pickled_data/label_chains_78.pickle","wb")
pickle.dump(label_chains, pickle_out)
pickle_out.close()

pickle_out = open("pickled_data/laglabel_chains_78.pickle","wb")
pickle.dump(laglabel_chains, pickle_out)
pickle_out.close()

In [26]:
spark.stop()

# The code below reads in 1-dim (binary) labels and writes back out as 2-dim labels (one-hot)

In [None]:
import pickle
label_chain_in = open("pickled_data/label_chains_78.pickle","rb")
labels = pickle.load(label_chain_in)

newlabels = []
for i, l in enumerate(labels):
    temp = np.zeros([l.shape[0], 2], dtype=int)
    temp[:,1] = l[:,0]
    temp[:,0] = (l[:,0]+1)%2
    newlabels.append(temp)

pickle_out = open("pickled_data/label_chains_78.pickle","wb")
pickle.dump(newlabels, pickle_out)
pickle_out.close()

In [None]:
laglabel_chain_in = open("pickled_data/laglabel_chains_78.pickle","rb")
labelslag = pickle.load(laglabel_chain_in)

newlabelslag = []
for i, l in enumerate(labelslag):
    temp = np.zeros([l.shape[0], 2], dtype=int)
    temp[:,1] = l[:,0]
    temp[:,0] = (l[:,0]+1)%2
    newlabelslag.append(temp)

pickle_out = open("pickled_data/laglabel_chains_78.pickle","wb")
pickle.dump(newlabelslag, pickle_out)
pickle_out.close()

# The code below reads in train/label and writes out lists sorted by chain length

In [31]:
import pickle
train_chain_in = open("pickled_data/train_chains.pickle","rb")
train = pickle.load(train_chain_in)

lens = [len(chain) for chain in train]
inds = range(len(train))
lenSeries = pd.Series(data=lens, index=inds).sort_values()
newInds = lenSeries.index.values
newlist = []
[newlist.append(train[i]) for i in newInds]

pickle_out = open("pickled_data/train_chains_sorted.pickle","wb")
pickle.dump(newlist, pickle_out)
pickle_out.close()

# now sort label list
label_chain_in = open("pickled_data/label_chains.pickle","rb")
labels = pickle.load(label_chain_in)

newlist2 = []
[newlist2.append(labels[i]) for i in newInds]

pickle_out = open("pickled_data/label_chains_sorted.pickle","wb")
pickle.dump(newlist2, pickle_out)
pickle_out.close()

# now sort laglabel list
laglabel_chain_in = open("pickled_data/laglabel_chains.pickle","rb")
labelslag = pickle.load(laglabel_chain_in)

newlist3 = []
[newlist3.append(labelslag[i]) for i in newInds]

pickle_out = open("pickled_data/laglabel_chains_sorted.pickle","wb")
pickle.dump(newlist3, pickle_out)
pickle_out.close()