In [4]:
from helpers.helper import get_cath
import numpy as np
from Bio import SeqIO
from os import listdir
from os.path import isfile, join
import pandas as pd


In [5]:
def boundaries(seq, domain):
    # return the boundaries from a sequence and a domain
    # seq is only used for getting the length
    bounds = np.zeros((len(seq)), dtype=np.int8)

    for k, v in domain.items():
        boundary_positions = v.split(',')
        for b in boundary_positions:
            indices = [int(i) for i in b.split('-')]
            for i in indices:
                bounds[i-1] = 1    
    return np.array(bounds, dtype=np.float64)

def boundaries8(seq, domain):
    # return the boundaries from a sequence and a domain
    # seq is only used for getting the length
    bounds = np.zeros((len(seq)), dtype=np.int8)

    for k, v in domain.items():
        boundary_positions = v.split(',')
        for b in boundary_positions:
            indices = [int(i) for i in b.split('-')]
            for i in indices:
                low = max(i - 8, 0)
                high = min(i + 8, len(seq))
                for j in range(low, high + 1):
                    bounds[j - 1] = 1    
    return np.array(bounds, dtype=np.float64)


def one_hot_seq(seq):
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    encoded = np.array([1 if elt == 'A' else 0 for elt in seq],dtype=np.float64)
    # start from the second element since the first one is A and was created above
    for amino_acid in amino_acids[1:]:
        new = np.array([1 if elt == amino_acid else 0 for elt in seq])
        encoded = np.vstack((encoded, new))

    return encoded

def int_encoding(seq):
    amino_acids = 'XACDEFGHIKLMNPQRSTVWY'
    lookup = {}
    for i, aa in enumerate(amino_acids):
        lookup[aa] = i

    encoding = [lookup[aa] for aa in seq]
    return np.array(encoding, dtype=np.float64) 

In [13]:
mypath = '../data/pdb/bulk/1.5029/data'
files = [f for f in listdir(mypath) if isfile(join(mypath, f))]

In [14]:
cath = get_cath()

In [16]:
def data_to_df(encode_fn):
    counter = 0

    # number of proteins/chains not found in the cath object (from the file)
    miss = 0

    df = pd.DataFrame(columns=['in', 'out'])
    df['in'] = df['in'].astype(object)
    df['out'] = df['out'].astype(object)

    for i, f in enumerate(sorted(files)):
        pdb_code = f[:4]
        
        pdb_file_path = f'../data/pdb/bulk/1.5029/data/{f}' 

        if not os.path.isfile(pdb_file_path):
            # print(f'{pdb_file_path} does not exist.')
            continue

        chains = {record.id: record.seq for record in SeqIO.parse(pdb_file_path, 'pdb-seqres')}
        
        for j, chain in enumerate(chains.keys()):
            query_chain = chains[chain]
            seq = str(query_chain)
            try:
                domain = cath[pdb_code][chain[-1]]
                input = encode_fn(seq)
                target_data = boundaries8(seq, domain)
                new_df = pd.DataFrame({'in':[input], 'out':[target_data]})
                df = pd.concat([df,new_df], ignore_index=True)
                counter += 1
            except KeyError:
                miss += 1
        
        if counter % 500 == 0:
            print(f'Written {counter} files')

    print(f'Missed: {miss}')
    return df

In [17]:
df = data_to_df(one_hot_seq)
df.to_pickle('../data/cnn/data.csv')

Written 1000 files
Written 2500 files
Written 4500 files
Written 8000 files
Written 9000 files
Missed: 986


# Section 2
In this section, instead of using one-hot encoding, the amino acids are encoded as integer numbers:

A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y -> 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20

In [38]:
def pad(x, size):
    if len(x) < size:
        x = np.pad(x, (0, size - len(x)), 'constant', constant_values=(0, 0))
    return x

In [47]:
df

Unnamed: 0,in,out
0,"[18, 10, 16, 13, 1, 3, 9, 17, 12, 18, 9, 1, 1,...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,"[11, 7, 10, 17, 13, 4, 4, 9, 16, 1, 18, 17, 1,...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,"[18, 10, 16, 13, 1, 3, 9, 17, 12, 18, 9, 1, 1,...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,"[11, 7, 10, 17, 13, 4, 4, 9, 16, 1, 18, 17, 1,...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,"[18, 10, 16, 13, 1, 3, 9, 17, 12, 18, 9, 1, 1,...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...
9164,"[17, 17, 20, 9, 10, 8, 10, 12, 6, 9, 17, 10, 9...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9165,"[11, 9, 12, 1, 17, 5, 20, 10, 10, 3, 12, 3, 17...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9166,"[14, 6, 4, 8, 1, 8, 1, 8, 13, 1, 7, 18, 15, 10...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9167,"[11, 9, 12, 1, 17, 5, 20, 10, 10, 3, 12, 3, 17...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [49]:
df = data_to_df(int_encoding)
df['in'] = df['in'].map(lambda x : pad(x, 1500))
df['out'] = df['out'].map(lambda x : pad(x, 1500))
df.to_pickle('../data/rnn/data.csv')

Written 1000 files
Written 2500 files
Written 4500 files
Written 8000 files
Written 9000 files
Missed: 986
