# Predict Helix Capping Residues #

The goal is to identify the residues just before an alpha helix begins or just after the helix ends. This will improve secondary structure predictors because they often extend helices too far or do not start them at the right place.

The CapsDB has annotated sequences of structures of helix capping residues that can be used to train a deep neural net. We will use a bidirectional LSTM with phi/psi features to see whether those are good predictors.

## 1. Download data ##

## 2. Generate Features ##
### MMTF Pyspark Imports ###

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.webfilters import Pisces
from mmtfPyspark.filters import ContainsLProteinChain
from mmtfPyspark.mappers import StructureToPolymerChains
from mmtfPyspark.ml import ProteinSequenceEncoder
import numpy as np
import pandas as pd
import math
import os

### Custom imports ###

In [2]:
import secondaryStructureExtractorFull
#import mmtfToASA

### Configure Spark Context ###

In [3]:
# Local Spark session named "DeepCap", using the "local[8]" master.
spark = (
    SparkSession.builder
    .master("local[8]")
    .appName("DeepCap")
    .getOrCreate()
)

### Create SQLContext ###

In [4]:
from pyspark.sql import SQLContext
from pyspark.sql.functions import concat, col, lit, array_contains

# SQLContext's first positional argument is a SparkContext, not a
# SparkSession. Pass the session's context and hand the session over
# explicitly so the SQLContext reuses it.
sqlContext = SQLContext(spark.sparkContext, spark)

### Read in filtered cap+MMTF data from parquet file ###

In [17]:
# Load the residue-level table written by the earlier get_dataset step and
# convert it to pandas, discarding the index column that the parquet
# round-trip stored.
parquetPath = '/home/ec2-user/SageMaker/ProteinFragmenter/datacaps.parquet'
dataframe = sqlContext.read.parquet(parquetPath)
data = dataframe.toPandas().drop(['__index_level_0__'], axis='columns')

# CAPS-DB cap descriptors (kept as a Spark DataFrame at this point).
capsdb = sqlContext.read.parquet('caps_descriptors.parquet')

In [6]:
# Three-letter -> one-letter residue codes, used to cross-check residues
# between the PDB data and CAPS-DB.
resi_abbr_map = {
    'ALA': 'A',
    'ARG': 'R',
    'ASN': 'N',
    'ASP': 'D',
    'CYS': 'C',
    'GLN': 'Q',
    'GLU': 'E',
    'GLY': 'G',
    'HIS': 'H',
    'ILE': 'I',
    'LEU': 'L',
    'LYS': 'K',
    'MET': 'M',
    'PHE': 'F',
    'PRO': 'P',
    'PYL': 'O',
    'SER': 'S',
    'SEC': 'U',
    'THR': 'T',
    'TRP': 'W',
    'TYR': 'Y',
    'VAL': 'V',
    'ASX': 'B',
    'GLX': 'Z',
    'XAA': 'X',
    'XLE': 'J',
}

### Get Torsion angle and secondary structure info ###

In [9]:
# Pair every residue row with every CAPS-DB annotation on the same
# structure/chain, keep the columns used downstream, and attach the
# one-letter residue code.
df1 = capsdb.toPandas()
residue_caps = pd.merge(
    data,
    df1,
    how='inner',
    left_on=['pdbId', 'chain'],
    right_on=['pdbid', 'chain'],
)
kept_columns = ['pdbId', 'chain', 'resi', 'resn', 'phi', 'psi', 'startcap', 'endcap']
df = residue_caps[kept_columns].assign(
    resabbr=residue_caps['resn'].map(resi_abbr_map)
)

### Need to specify mapping since PDB and CAPS-DB don't use same numbering scheme for residues ###

In [11]:
# Array of the distinct "<pdbid>.<chain>" identifiers present in CAPS-DB.
chain_keys = df1[['pdbid', 'chain']].groupby(['pdbid', 'chain']).max().reset_index()
chain_keys['combined'] = chain_keys['pdbid'] + "." + chain_keys['chain']
pdbid_chain = chain_keys['combined'].unique()

In [None]:
# Generate a dictionary of offsets between PDB and Uniprot (CAPS-DB, hopefully)
# residue numbering. If these are all consistent within files, they can be used
# for the mapping - doing this because some residues seem to be missing from the
# mapping xml files and should be able to be interpolated this way.

import sifts_mapper as smap

offset_dict = dict()  # "<pdbid>.<chain>" -> set of distinct (resi - rmap) offsets
resimap_list = []     # one mapping DataFrame per chain
stopped_iter = []     # chains where the generator for the mapping stopped early

for pc in pdbid_chain:
    # Identifiers are "<pdbid>.<chain>" with a four-character pdbid.
    pdbid = pc[:4]
    chain = pc[5:]
    temp = df1[(df1.pdbid == pdbid) & (df1.chain == chain)]

    # Request the mapping for the whole span covered by this chain's caps.
    start, end = temp.startcap.min(), temp.endcap.max()
    resilist = list(range(start, end + 1))

    if len(resilist) == 1:
        resis = resilist[0]
    else:
        resis = ",".join(str(i) for i in resilist)

    try:
        rmap = list(smap.uniprot_to_pdb_resi(pdbid, chain, resis, source="PDBe"))
    except Exception:
        # Best effort: a chain whose mapping lookup fails is skipped.
        resilist = []
        rmap = []

    if len(resilist) != len(rmap):
        stopped_iter.append(pc)
        # Keep only the residues that actually have a mapping entry.
        common = min(len(resilist), len(rmap))
        resilist = resilist[:common]
        rmap = rmap[:common]
    if not resilist:
        continue

    # Add all mappings for this chain to the list of dataframes
    resimap_list.append(pd.DataFrame({
        'pdbid': [pdbid] * len(resilist),
        'chain': [chain] * len(resilist),
        'resi': resilist,
        'rmap': list(rmap),
    }))

    # Add unique mapping offsets to set (residues without a mapping are NaN)
    offsets = set()
    for resi, mapped in zip(resilist, rmap):
        if not math.isnan(mapped):
            offsets.add(resi - mapped)
    offset_dict[pc] = offsets

resimap_df = pd.concat(resimap_list)


In [12]:
import pickle

# Cached copy of the residue mapping table. Uncomment the next line to
# (re)write the cache from the in-memory `resimap_df`; as written, the cell
# loads the cached file and replaces any in-memory `resimap_df`.
#resimap_df.to_pickle('resimap_df.pkl')
resimap_df = pd.read_pickle('resimap_df.pkl')

In [325]:
# Toy example: ten residues whose first five map with one offset and whose
# last five map with another, to show how counting rows per offset exposes
# chains with inconsistent numbering.
toy_resi = list(range(9, 19))
toy_rmap = [i + 3 if i < 5 else i for i in range(10)]
temp = pd.DataFrame({'resi': toy_resi, 'rmap': toy_rmap})
temp['offset'] = temp['resi'] - temp['rmap']
temp.groupby('offset').count().reset_index()

Unnamed: 0,offset,resi,rmap
0,6,5,5
1,9,5,5


In [None]:
# Chains whose mapped residues all share one numbering offset. The offset is
# read with next(iter(...)) rather than pop() so offset_dict is left intact
# and the cell can be re-run.
offset_dict_single = {
    key: next(iter(offsets))
    for key, offsets in offset_dict.items()
    if len(offsets) == 1
}

# Rows without a mapped residue number; copy so adding a column does not
# write into a view of resimap_df.
temp = resimap_df[resimap_df['rmap'].isnull()].copy()
temp['pdbid_chain'] = temp['pdbid'] + "." + temp['chain']

# List every chain that has at least one unmapped residue.
for unmapped_chain in temp['pdbid_chain'].unique():
    print(unmapped_chain)

In [352]:
# Keep only the residues that received a mapped number, then print the column
# names of both tables that are merged next.
resimap_df = resimap_df[resimap_df['rmap'].notnull()]
for frame in (resimap_df, df):
    print(frame.columns)

Index(['pdbid', 'chain', 'resi', 'rmap'], dtype='object')
Index(['pdbId', 'chain', 'resi', 'resn', 'phi', 'psi', 'startcap', 'endcap',
       'resabbr'],
      dtype='object')


In [13]:
# map cap resi numbers to pdb data resi numbers
#
# NOTE(review): both joins only keep rows whose startcap/endcap has an entry
# in resimap_df; startcap and endcap themselves are restored from the join key
# (resimap_df.resi), not replaced by the mapped number (rmap). Confirm whether
# rmap was meant to be used here.

# Join on the start cap; 'resi' exists on both sides, so pandas suffixes it
# (resi_x = residue number from df, resi_y = join key from resimap_df).
start_join = pd.merge(
    df,
    resimap_df,
    left_on=['pdbId', 'chain', 'startcap'],
    right_on=['pdbid', 'chain', 'resi'],
)
start_join['startcap'] = start_join['resi_y']
data_map = start_join[
    ['pdbId', 'chain', 'resi_x', 'resn', 'phi', 'psi', 'endcap', 'resabbr', 'startcap']
]

# Join on the end cap; the rmap kept in the result comes from this join.
end_join = pd.merge(
    data_map,
    resimap_df,
    left_on=['pdbId', 'chain', 'endcap'],
    right_on=['pdbid', 'chain', 'resi'],
)
end_join['endcap'] = end_join['resi']
end_join['resi'] = end_join['resi_x']
data_map = end_join[
    ['pdbId', 'chain', 'resn', 'phi', 'psi', 'resabbr', 'startcap', 'rmap', 'endcap', 'resi']
]

data_map

Unnamed: 0,pdbId,chain,resn,phi,psi,resabbr,startcap,rmap,endcap,resi
0,2ygn,A,THR,,163.677383,T,40,72.0,41,1
1,2ygn,A,GLY,-66.660973,160.703186,G,40,72.0,41,2
2,2ygn,A,SER,-123.853607,-7.871733,S,40,72.0,41,3
3,2ygn,A,LEU,-74.896896,137.483932,L,40,72.0,41,4
4,2ygn,A,TYR,-134.419830,140.864288,Y,40,72.0,41,5
5,2ygn,A,LEU,-139.275024,127.621544,L,40,72.0,41,6
6,2ygn,A,TRP,-152.167755,166.833832,W,40,72.0,41,7
7,2ygn,A,ILE,-108.079048,119.799377,I,40,72.0,41,8
8,2ygn,A,ASP,-61.786110,150.193756,D,40,72.0,41,9
9,2ygn,A,ALA,-47.469296,-38.584801,A,40,72.0,41,10


In [22]:
#data[(data.pdbId == '4lfy') & (data.resi >= 173)]

#data_map['resa_dmap'] = data_map.resn.map(resi_abbr_map)

#data_map
df

Unnamed: 0,pdbId,chain,resi,resn,phi,psi,startcap,endcap,resabbr
0,2ygn,A,1,THR,,163.677383,40,41,T
1,2ygn,A,1,THR,,163.677383,50,52,T
2,2ygn,A,2,GLY,-66.660973,160.703186,40,41,G
3,2ygn,A,2,GLY,-66.660973,160.703186,50,52,G
4,2ygn,A,3,SER,-123.853607,-7.871733,40,41,S
5,2ygn,A,3,SER,-123.853607,-7.871733,50,52,S
6,2ygn,A,4,LEU,-74.896896,137.483932,40,41,L
7,2ygn,A,4,LEU,-74.896896,137.483932,50,52,L
8,2ygn,A,5,TYR,-134.419830,140.864288,40,41,Y
9,2ygn,A,5,TYR,-134.419830,140.864288,50,52,Y


In [23]:
#data_map
# Rebind `df` to the merged table so the labelling cell that follows works on
# the rows that survived the residue-mapping joins.
df = data_map

### Create labels

In [None]:
# A row is labelled 1 when its residue number lies inside the row's
# [startcap, endcap] window (both ends inclusive), otherwise 0.
inside_cap = (df['resi'] >= df['startcap']) & (df['resi'] <= df['endcap'])
df['is_cap'] = inside_cap.astype(int)

# A residue appears once per cap annotation on its chain; it is a cap residue
# if any of those rows is labelled 1.
df_caps = df.groupby(["pdbId", "chain", "resi"])['is_cap'].max().reset_index()

In [None]:
# Attach the per-residue cap label to the residue table; the inner join keeps
# only residues that have a label.
residue_key = ['pdbId', 'chain', 'resi']
data_caps = data.merge(df_caps, how='inner', on=residue_key)

In [None]:
from Bio.PDB.Polypeptide import aa3

# Encode each residue name with the project helper and spread the result over
# one column per entry of aa3 (presumably a one-hot vector -- confirm against
# secondaryStructureExtractorFull.get_residue).
encoded_rows = [
    secondaryStructureExtractorFull.get_residue(resn)
    for resn in data_caps.resn
]
one_hot_encoded = pd.DataFrame(encoded_rows, columns=aa3)
data_caps = data_caps.join(one_hot_encoded)
data_caps.head()

In [None]:
data_caps.head()

# Define functions for feature extraction

In [None]:

def is_cap(pdbId, chain, resi, is_cap):
    """Validate a binary cap label and return it as a plain 0 or 1.

    pdbId, chain and resi identify the residue but are not used by the check.

    Raises:
        ValueError: if is_cap equals neither 0 nor 1.
    """
    if is_cap == 1:
        return 1
    if is_cap == 0:
        return 0
    raise ValueError("is_cap must be 0 or 1")

def angle_to_cos(angle):
    """Return the cosine of an angle given in degrees.

    An angle that is exactly 0 or NaN yields 0 instead of a cosine.
    """
    unusable = angle == 0 or np.isnan(angle)
    return 0 if unusable else np.cos(np.pi * angle/180)

def angle_to_sin(angle):
    """Return the sine of an angle given in degrees.

    An angle that is exactly 0 or NaN yields 0 instead of a sine.
    """
    unusable = angle == 0 or np.isnan(angle)
    return 0 if unusable else np.sin(np.pi * angle/180)
    
def parse_feature_file(filename):
    """Parse one tab-separated feature file into one row per residue.

    The first 10 lines of the file are skipped. After the index is moved into
    a column, the table must have 61 columns: an environment id, 9 residue
    class features for each of 6 shells, and "hash", "x", "y", "z",
    "VERBOSITY", "location".

    Rows are kept only when the first three characters of ``location`` are one
    of the 20 standard amino-acid codes. The rest of ``location`` is split at
    the first ":" into a residue order number and a chain/atom part, which is
    split at the first "@" into chain and atom (so location presumably looks
    like ``ALA1:A@CA`` -- confirm against the feature files).

    Args:
        filename: path of the feature file to read.

    Returns:
        A DataFrame with columns ``chain``, ``residue``, the 54 shell feature
        columns, ``pdbId`` (characters 4-7 of the environment id) and ``resi``
        (1-based row position in the output). Each row holds the column-wise
        maximum over all rows of one (chain, order number, residue) group,
        with groups in order of first appearance.
    """
    # Read file
    df = pd.read_csv(filename, sep="\t", skiprows=range(10))
    df = df.reset_index()

    # Set column names: env, then 9 features x 6 shells, then the trailing
    # bookkeeping columns.
    shell_features = [
        "RESIDUE_CLASS1_IS_HYDROPHOBIC",
        "RESIDUE_CLASS1_IS_CHARGED",
        "RESIDUE_CLASS1_IS_POLAR",
        "RESIDUE_CLASS1_IS_UNKNOWN",
        "RESIDUE_CLASS2_IS_NONPOLAR",
        "RESIDUE_CLASS2_IS_POLAR",
        "RESIDUE_CLASS2_IS_BASIC",
        "RESIDUE_CLASS2_IS_ACIDIC",
        "RESIDUE_CLASS2_IS_UNKNOWN",
    ]
    cnames = ["env"]
    cnames.extend(
        "{}_SHELL{}".format(c, i) for i in range(6) for c in shell_features
    )
    cnames.extend(["hash", "x", "y", "z", "VERBOSITY", "location"])
    df.columns = cnames

    df = df.drop(["hash", "x", "y", "z", "VERBOSITY"], axis=1)

    # filter out non-AA structures
    aminoAcids = ['ALA', 'ARG', 'ASN', 'ASP', 'CYS', 'GLN', 'GLU', 'GLY', 'HIS',
                  'ILE', 'LEU', 'LYS', 'MET', 'PHE', 'PRO', 'SER', 'THR', 'TRP',
                  'TYR', 'VAL']
    df['residue'] = df.location.str[:3]
    df = df[df.residue.isin(aminoAcids)]

    # Split identifiers

    # split residue number; `n` must be passed by keyword because newer pandas
    # rejects it as a positional argument
    df_residue = df.location.str[3:].str.split(":", n=1, expand=True)
    df_residue.columns = ["ordernum", "chainatom"]
    df = pd.merge(df, df_residue, left_index=True, right_index=True)

    # split chain
    df_chain = df.chainatom.str.split("@", n=1, expand=True)
    df_chain.columns = ["chain", "atom"]
    df = pd.merge(df, df_chain, left_index=True, right_index=True)

    df['pdbId'] = df.env.str[4:8]

    df = df.drop(["env", "location", "chainatom"], axis=1)

    # Aggregate to residue level, keeping residues in order of appearance
    groups = df.groupby(['chain', 'ordernum', 'residue'], sort=False)
    df_agg_max = groups.max().reset_index()
    df_agg_max = df_agg_max.drop(['atom', 'ordernum'], axis=1)
    df_agg_max['resi'] = df_agg_max.index+1
    return(df_agg_max)


# Parse feature files to get additional features

In [None]:
# Parse every file in the feature directory and stack the per-file tables,
# logging the file order to feature_parsing.log.
feature_dir = "feature/feature-3.1.0/feature_files"
files = os.listdir(feature_dir)
filelist = []
# The context manager closes the log even if a file fails to parse.
with open("feature_parsing.log", "w") as f:
    for i, filename in enumerate(files):
        print(i, filename)
        f.write("File {}: protein {}\n".format(i, filename))
        # Use a dedicated name so the notebook-level `df` is not overwritten
        # by the last parsed file.
        file_features = parse_feature_file(os.path.join(feature_dir, filename))
        filelist.append(file_features)

feature_df = pd.concat(filelist)

In [161]:
# Standard-scale the feature columns (positions 2-55 of feature_df) and put
# the identifier columns back in front of them.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
raw_features = feature_df.iloc[:, 2:56]
scaled_df = pd.DataFrame(
    scaler.fit_transform(raw_features),
    columns=raw_features.columns,
)

# Identifier columns in the order pdbId, chain, resi, residue; the index is
# reset so it lines up with scaled_df's fresh 0..n-1 index.
id_df = feature_df.iloc[:, [56, 0, 57, 1]].reset_index(drop=True)
feature_df2 = pd.merge(id_df, scaled_df, left_index=True, right_index=True)
feature_df2

Unnamed: 0,pdbId,chain,resi,residue,RESIDUE_CLASS1_IS_HYDROPHOBIC_SHELL0,RESIDUE_CLASS1_IS_CHARGED_SHELL0,RESIDUE_CLASS1_IS_POLAR_SHELL0,RESIDUE_CLASS1_IS_UNKNOWN_SHELL0,RESIDUE_CLASS2_IS_NONPOLAR_SHELL0,RESIDUE_CLASS2_IS_POLAR_SHELL0,...,RESIDUE_CLASS2_IS_UNKNOWN_SHELL4,RESIDUE_CLASS1_IS_HYDROPHOBIC_SHELL5,RESIDUE_CLASS1_IS_CHARGED_SHELL5,RESIDUE_CLASS1_IS_POLAR_SHELL5,RESIDUE_CLASS1_IS_UNKNOWN_SHELL5,RESIDUE_CLASS2_IS_NONPOLAR_SHELL5,RESIDUE_CLASS2_IS_POLAR_SHELL5,RESIDUE_CLASS2_IS_BASIC_SHELL5,RESIDUE_CLASS2_IS_ACIDIC_SHELL5,RESIDUE_CLASS2_IS_UNKNOWN_SHELL5
0,4ncj,A,1,MET,1.239261,-0.539470,-0.618927,-0.280102,1.204150,-0.673007,...,-0.044339,1.147298,1.179390,-1.027858,-0.088224,0.980710,-0.967822,0.343334,1.643986,-0.050447
1,4ncj,A,2,LYS,-0.827754,1.677912,-0.618927,-0.280102,-0.852432,-0.673007,...,-0.044339,0.334886,0.525266,-0.525959,-0.542149,0.195332,-0.421891,-0.445199,1.050757,-0.050447
2,4ncj,A,3,LEU,1.239261,-0.539470,-0.618927,-0.280102,1.204150,-0.673007,...,-0.044339,2.095112,0.961349,-1.362458,-0.996074,1.896985,-1.331776,-0.182355,1.050757,-0.050447
3,4ncj,A,4,GLU,-0.827754,1.677912,-0.618927,-0.280102,-0.852432,-0.673007,...,-0.044339,0.334886,1.615473,-1.027858,-0.088224,0.195332,-0.967822,2.183243,0.160914,-0.050447
4,4ncj,A,5,ARG,-0.827754,1.677912,-0.618927,-0.280102,-0.852432,-0.673007,...,-0.044339,1.011896,2.487638,-1.027858,0.365701,0.849814,-0.421891,0.080489,3.720287,-0.050447
5,4ncj,A,6,VAL,1.239261,-0.539470,-0.618927,-0.280102,1.204150,-0.673007,...,-0.044339,1.553504,0.307225,-1.195158,-0.996074,1.373400,-1.149799,-0.970887,1.347372,-0.050447
6,4ncj,A,7,THR,-0.827754,-0.539470,1.639459,-0.280102,-0.852432,1.509584,...,-0.044339,0.064081,0.961349,-0.358659,-0.996074,-0.066461,-0.239914,0.080489,0.457529,-0.050447
7,4ncj,A,8,VAL,1.239261,-0.539470,-0.618927,-0.280102,1.204150,-0.673007,...,-0.044339,1.147298,-0.128858,1.147040,-0.996074,0.980710,1.397879,0.343334,1.050757,-0.050447
8,4ncj,A,9,LYS,-0.827754,1.677912,-0.618927,-0.280102,-0.852432,-0.673007,...,-0.044339,-0.748331,0.743307,0.979740,-0.996074,-0.851840,0.669971,0.869022,0.457529,-0.050447
9,4ncj,A,10,ASN,-0.827754,-0.539470,1.639459,-0.280102,-0.852432,1.509584,...,-0.044339,0.334886,0.743307,0.477841,0.365701,0.195332,0.669971,0.869022,0.160914,-0.050447


In [None]:
# Read/Write feature vector
# Uncomment the next line to write the scaled feature table to CSV; as
# written, the cell loads the CSV, using its first column as the index.
#feature_df2.to_csv("Feature_vectors.csv")
feature_df2 = pd.read_csv("Feature_vectors.csv", index_col=0)

In [12]:
feature_df2

Unnamed: 0,pdbId,chain,resi,residue,RESIDUE_CLASS1_IS_HYDROPHOBIC_SHELL0,RESIDUE_CLASS1_IS_CHARGED_SHELL0,RESIDUE_CLASS1_IS_POLAR_SHELL0,RESIDUE_CLASS1_IS_UNKNOWN_SHELL0,RESIDUE_CLASS2_IS_NONPOLAR_SHELL0,RESIDUE_CLASS2_IS_POLAR_SHELL0,...,RESIDUE_CLASS2_IS_UNKNOWN_SHELL4,RESIDUE_CLASS1_IS_HYDROPHOBIC_SHELL5,RESIDUE_CLASS1_IS_CHARGED_SHELL5,RESIDUE_CLASS1_IS_POLAR_SHELL5,RESIDUE_CLASS1_IS_UNKNOWN_SHELL5,RESIDUE_CLASS2_IS_NONPOLAR_SHELL5,RESIDUE_CLASS2_IS_POLAR_SHELL5,RESIDUE_CLASS2_IS_BASIC_SHELL5,RESIDUE_CLASS2_IS_ACIDIC_SHELL5,RESIDUE_CLASS2_IS_UNKNOWN_SHELL5
0,4ncj,A,1,MET,1.239261,-0.539470,-0.618927,-0.280102,1.204150,-0.673007,...,-0.044339,1.147298,1.179390,-1.027858,-0.088224,0.980710,-0.967822,0.343334,1.643986,-0.050447
1,4ncj,A,2,LYS,-0.827754,1.677912,-0.618927,-0.280102,-0.852432,-0.673007,...,-0.044339,0.334886,0.525266,-0.525959,-0.542149,0.195332,-0.421891,-0.445199,1.050757,-0.050447
2,4ncj,A,3,LEU,1.239261,-0.539470,-0.618927,-0.280102,1.204150,-0.673007,...,-0.044339,2.095112,0.961349,-1.362458,-0.996074,1.896985,-1.331776,-0.182355,1.050757,-0.050447
3,4ncj,A,4,GLU,-0.827754,1.677912,-0.618927,-0.280102,-0.852432,-0.673007,...,-0.044339,0.334886,1.615473,-1.027858,-0.088224,0.195332,-0.967822,2.183243,0.160914,-0.050447
4,4ncj,A,5,ARG,-0.827754,1.677912,-0.618927,-0.280102,-0.852432,-0.673007,...,-0.044339,1.011896,2.487638,-1.027858,0.365701,0.849814,-0.421891,0.080489,3.720287,-0.050447
5,4ncj,A,6,VAL,1.239261,-0.539470,-0.618927,-0.280102,1.204150,-0.673007,...,-0.044339,1.553504,0.307225,-1.195158,-0.996074,1.373400,-1.149799,-0.970887,1.347372,-0.050447
6,4ncj,A,7,THR,-0.827754,-0.539470,1.639459,-0.280102,-0.852432,1.509584,...,-0.044339,0.064081,0.961349,-0.358659,-0.996074,-0.066461,-0.239914,0.080489,0.457529,-0.050447
7,4ncj,A,8,VAL,1.239261,-0.539470,-0.618927,-0.280102,1.204150,-0.673007,...,-0.044339,1.147298,-0.128858,1.147040,-0.996074,0.980710,1.397879,0.343334,1.050757,-0.050447
8,4ncj,A,9,LYS,-0.827754,1.677912,-0.618927,-0.280102,-0.852432,-0.673007,...,-0.044339,-0.748331,0.743307,0.979740,-0.996074,-0.851840,0.669971,0.869022,0.457529,-0.050447
9,4ncj,A,10,ASN,-0.827754,-0.539470,1.639459,-0.280102,-0.852432,1.509584,...,-0.044339,0.334886,0.743307,0.477841,0.365701,0.195332,0.669971,0.869022,0.160914,-0.050447


In [None]:
# Add the scaled feature columns to each labelled residue (inner join on the
# residue key).
residue_key = ["pdbId", "chain", "resi"]
data_caps2 = data_caps.merge(feature_df2, on=residue_key)

In [96]:
#data_caps2.columns
#data_caps.shape
#feature_df.shape
#data_caps[data_caps.pdbId == "2ygn"]
#feature_df[feature_df.pdbId == "2ygn"]
#data_caps.groupby(['pdbId', 'chain']).ngroup().unique().shape
#feature_df.groupby(['pdbId', 'chain']).ngroup().unique().shape
#data_caps2.groupby(['pdbId', 'chain']).ngroup().unique().shape
train_chains[1].shape

(122, 78)

# Process data into list of arrays

In [None]:
# One group per (pdbId, chain); the loop further down turns each group into
# one feature array and its label arrays.
groups = data_caps2.groupby(["pdbId", "chain"])
# The commented-out block below was used to find the longest chain.

# Check max length of protein chains
# maxlen = 0
# for i, ((pdbid, chain), group) in enumerate(groups):
#     l = 0
#     for j, featuretuple in enumerate(group.itertuples()):
#         l += 1
#         if l > maxlen:
#             maxlen = l
# print(maxlen)

In [None]:
# Build, for every chain, a (residues x 78) feature array, a (residues x 1)
# label array and a shifted ("lag") label array.
#
# Feature column order: cos(phi), sin(phi), cos(psi), sin(psi), the 20
# residue indicator columns, then 9 residue-class features for each of
# shells 0-5.
one_hot_cols = [
    'ALA', 'CYS', 'ASP', 'GLU', 'PHE', 'GLY', 'HIS', 'ILE', 'LYS', 'LEU',
    'MET', 'ASN', 'PRO', 'GLN', 'ARG', 'SER', 'THR', 'VAL', 'TRP', 'TYR',
]
shell_feature_names = [
    "RESIDUE_CLASS1_IS_HYDROPHOBIC",
    "RESIDUE_CLASS1_IS_CHARGED",
    "RESIDUE_CLASS1_IS_POLAR",
    "RESIDUE_CLASS1_IS_UNKNOWN",
    "RESIDUE_CLASS2_IS_NONPOLAR",
    "RESIDUE_CLASS2_IS_POLAR",
    "RESIDUE_CLASS2_IS_BASIC",
    "RESIDUE_CLASS2_IS_ACIDIC",
    "RESIDUE_CLASS2_IS_UNKNOWN",
]
shell_cols = [
    "{}_SHELL{}".format(name, shell)
    for shell in range(6)
    for name in shell_feature_names
]
n_features = 4 + len(one_hot_cols) + len(shell_cols)  # 78

train_chains = []
label_chains = []
laglabel_chains = []

for (pdbid, chain), group in groups:
    # Size the arrays from the chain itself. This removes the fixed 1300-row
    # limit and the "trim all-zero rows" step, which could drop a real residue
    # and shift the labels out of alignment with the features.
    n_res = len(group)

    train_chain = np.zeros((n_res, n_features), dtype=float)
    train_chain[:, 0] = [angle_to_cos(a) for a in group['phi']]
    train_chain[:, 1] = [angle_to_sin(a) for a in group['phi']]
    train_chain[:, 2] = [angle_to_cos(a) for a in group['psi']]
    train_chain[:, 3] = [angle_to_sin(a) for a in group['psi']]
    train_chain[:, 4:] = group[one_hot_cols + shell_cols].values.astype(float)

    # is_cap() validates that every label is 0 or 1.
    label_chain = np.zeros((n_res, 1), dtype=int)
    label_chain[:, 0] = [
        is_cap(row_pdb, row_chain, row_resi, row_flag)
        for row_pdb, row_chain, row_resi, row_flag in zip(
            group['pdbId'], group['chain'], group['resi'], group['is_cap']
        )
    ]

    # Lag labels: entry j holds the label of residue j+1. The original code
    # filled such an array but then sliced label_chain instead, so the lag
    # labels were never saved.
    # NOTE(review): the length n_res + 1 (one trailing zero row more than the
    # features) is kept from the original slice -- confirm it is what the
    # consumer of laglabel_chains expects.
    laglabel_chain = np.zeros((n_res + 1, 1), dtype=int)
    laglabel_chain[:n_res - 1, 0] = label_chain[1:, 0]

    # Add chain data to lists of arrays
    train_chains.append(train_chain)
    label_chains.append(label_chain)
    laglabel_chains.append(laglabel_chain)
    

# Write training data to pickle file

In [None]:
import pickle

# Write each list to its own pickle file. The context manager closes the file
# even if pickling fails part-way.
for pickle_path, chain_list in (
    ("pickled_data/train_chains_78.pickle", train_chains),
    ("pickled_data/label_chains_78.pickle", label_chains),
    ("pickled_data/laglabel_chains_78.pickle", laglabel_chains),
):
    with open(pickle_path, "wb") as pickle_out:
        pickle.dump(chain_list, pickle_out)

In [26]:
spark.stop()

# The code below reads in 1-dim (binary) labels and writes back out as 2-dim labels (one-hot)

In [None]:
import pickle

# Rewrite the label file in place: each (n, 1) binary label array becomes an
# (n, 2) array with column 0 = "not cap" and column 1 = "cap".
with open("pickled_data/label_chains_78.pickle", "rb") as label_chain_in:
    labels = pickle.load(label_chain_in)

newlabels = []
for chain_labels in labels:
    if chain_labels.shape[1] == 2:
        # Already converted (the cell was re-run on its own output).
        # Converting again would swap the meaning of the two columns.
        newlabels.append(chain_labels)
        continue
    one_hot = np.zeros([chain_labels.shape[0], 2], dtype=int)
    one_hot[:, 1] = chain_labels[:, 0]
    one_hot[:, 0] = (chain_labels[:, 0] + 1) % 2
    newlabels.append(one_hot)

with open("pickled_data/label_chains_78.pickle", "wb") as pickle_out:
    pickle.dump(newlabels, pickle_out)

In [None]:
import pickle

# Rewrite the lag-label file in place: each (n, 1) binary array becomes an
# (n, 2) array with column 0 = "not cap" and column 1 = "cap".
with open("pickled_data/laglabel_chains_78.pickle", "rb") as laglabel_chain_in:
    labelslag = pickle.load(laglabel_chain_in)

newlabelslag = []
for chain_labels in labelslag:
    if chain_labels.shape[1] == 2:
        # Already converted (the cell was re-run on its own output).
        # Converting again would swap the meaning of the two columns.
        newlabelslag.append(chain_labels)
        continue
    one_hot = np.zeros([chain_labels.shape[0], 2], dtype=int)
    one_hot[:, 1] = chain_labels[:, 0]
    one_hot[:, 0] = (chain_labels[:, 0] + 1) % 2
    newlabelslag.append(one_hot)

with open("pickled_data/laglabel_chains_78.pickle", "wb") as pickle_out:
    pickle.dump(newlabelslag, pickle_out)

# The code below reads in train/label and writes out lists sorted by chain length

In [31]:
import pickle

# Reorder the feature, label and lag-label lists so chains run from shortest
# to longest, applying the same permutation to all three.
# NOTE(review): this reads the un-suffixed "*_chains.pickle" files, while the
# cells above write "*_chains_78.pickle" -- confirm which files are intended.
with open("pickled_data/train_chains.pickle", "rb") as train_chain_in:
    train = pickle.load(train_chain_in)

# Permutation of chain indices ordered by chain length
lens = [len(chain_array) for chain_array in train]
inds = range(len(train))
lenSeries = pd.Series(data=lens, index=inds).sort_values()
newInds = lenSeries.index.values

newlist = [train[i] for i in newInds]
with open("pickled_data/train_chains_sorted.pickle", "wb") as pickle_out:
    pickle.dump(newlist, pickle_out)

# now sort label list
with open("pickled_data/label_chains.pickle", "rb") as label_chain_in:
    labels = pickle.load(label_chain_in)

newlist2 = [labels[i] for i in newInds]
with open("pickled_data/label_chains_sorted.pickle", "wb") as pickle_out:
    pickle.dump(newlist2, pickle_out)

# now sort laglabel list
with open("pickled_data/laglabel_chains.pickle", "rb") as laglabel_chain_in:
    labelslag = pickle.load(laglabel_chain_in)

newlist3 = [labelslag[i] for i in newInds]
with open("pickled_data/laglabel_chains_sorted.pickle", "wb") as pickle_out:
    pickle.dump(newlist3, pickle_out)