In [24]:
from helpers.helper import get_cath
import numpy as np
from Bio import SeqIO
from os import listdir
from os.path import isfile, join


In [95]:
def boundaries(seq, domain):
    """Flag the first and last position of every domain segment.

    Args:
        seq: Sequence whose length fixes the size of the returned array.
        domain: Mapping from a domain label to a comma-separated string of
            ``start-end`` segments, e.g. ``{'1': '1-91,251-339'}``. Only the
            values are read; the labels are ignored.

    Returns:
        A 1-D ``np.int8`` array with ``len(seq)`` entries, holding 1 at each
        segment endpoint and 0 everywhere else.
    """
    marks = np.zeros((len(seq)), dtype=np.int8)

    for segments in domain.values():
        for segment in segments.split(','):
            endpoints = [int(token) for token in segment.split('-')]
            for position in endpoints:
                # Positions in the segment string are treated as 1-based.
                marks[position - 1] = 1
    return marks


def one_hot_seq(seq):
    """One-hot encode a sequence over a fixed 20-letter amino-acid alphabet.

    Args:
        seq: Iterable of single-character residue codes (typically a str).

    Returns:
        A 2-D array with one row per letter of ``'ARNDCQEGHILKMFPSTWYV'``
        (in that order) and one column per position of ``seq``. An entry is
        1 where the residue equals the row's letter, otherwise 0; a residue
        outside the alphabet yields an all-zero column.
    """
    amino_acids = 'ARNDCQEGHILKMFPSTWYV'
    # Build every row in one pass rather than stacking them one at a time.
    rows = [
        [int(residue == amino_acid) for residue in seq]
        for amino_acid in amino_acids
    ]
    return np.array(rows)

In [96]:
# Hard-coded example sequence; presumably a manual test input for
# boundaries() / one_hot_seq() -- TODO confirm. `seq` is reassigned inside
# the processing loop in a later cell.
seq = 'APPKTTVRWCTISSAEEKKCNSLKDHMQQERVTLSCVQKATYLDCIKAISNNEADAISLDGGQVFEAGLAPYKLKPIAAEVYERSGGSTTSYYAVAVVKKGTDFMIKDLRGKTSCHTGLGRSAGWNIPIGTLIHREDIEWEGIESGISEQAVAKFFSASCVPGATIEQKLCRQCKGDAKTKCLRNGPYSGYSGAFQCLKDGKGDVAFVKHTTVQENAPEEKDEYELLCLDGSRQPVDSYKTCNWARVAAHAVVARDDSKIDDIWSFLGMQAYSLGVDTTSDFHLFGPPGKKDPVLKDLLFKDSAIMLKRVPELMDSQLYLGFEYYSAIQSLRKDQLTVGPRENKIQWCAVGKDEKSKCDRWSVVSNGEVECTILDDNKDCIVKITKGEADAISLDGGFVYTAGVCGLVPVVGESYEDETQCSKDEEQPAYYFAVAVVKKSSAITWNNLQGKKSCHTAVGRTAGWNIPMGLIHNKTGSCDFDDYFSEGCAPGSPPNSRLCKLCQGSGENLLEKCVASSHEKYYGYTGALRCLVEQGDVAFIKHSTVGENVSGSNKDDWAKGLTRDDFELLCTNGKRAKTMDYKTCHLAKVPTHAVVARPEKANKIRELLEGQEKLFGLHGTEKERFMMFQSQTKDLLFKALTKCLVKLRQGITYKEFLGDEYYASVASLNTCNPSDLLQVCTFLEDK'

# Domain label -> comma-separated 'start-end' segments, the format that
# boundaries() parses. Also reassigned inside the processing loop.
domain = {'1': '1-91,251-339', '2': '92-250', '3': '340-433,590-686', '4': '434-589'}

# NOTE(review): absolute path into one user's home directory, while the
# processing loop builds its file paths relative to '../data/pdb/bulk/1.5029/data'.
# Consider deriving both from a single configurable data directory.
mypath = '/home/alexandros/Desktop/l4-individual-project/data/pdb/bulk/1.5029/data'
# Names of the regular files (sub-directories excluded) directly inside mypath.
files = [f for f in listdir(mypath) if isfile(join(mypath, f))]

In [99]:
# Domain annotations from the project helper; indexed below as
# cath[pdb_code][chain_id].
cath = get_cath()
# Number of files fully processed before the current one; the loop stops
# once three files have been processed (files that are skipped as missing
# do not count).
c = 0

for f in sorted(files):
    # The first four characters of the file name are used as the PDB code.
    pdb_code = f[:4]

    print(pdb_code)

    pdb_file_path = f'../data/pdb/bulk/1.5029/data/{f}'

    # Use the `isfile` imported at the top of the notebook: the bare `os`
    # module is never imported, so `os.path.isfile` raises NameError on a
    # fresh kernel.
    if not isfile(pdb_file_path):
        print(f'{pdb_file_path} does not exist.')
        continue

    # Map each SEQRES record id to its sequence.
    chains = {record.id: record.seq for record in SeqIO.parse(pdb_file_path, 'pdb-seqres')}

    for chain, query_chain in chains.items():
        print(chain)
        seq = str(query_chain)
        try:
            # Record ids are printed in the form '1A00:A'; the last character
            # is used as the chain key into the CATH table.
            domain = cath[pdb_code][chain[-1]]
        except KeyError:
            # Only the lookup is guarded, so a KeyError raised anywhere else
            # is not misreported as a missing CATH entry.
            print(f'{chain} does not exist in cath_domain_boundaries.txt')
        else:
            boundaries_arr = boundaries(seq, domain)
            print(seq)
            print(domain)
            print(one_hot_seq(seq))
            print(boundaries_arr)

    if c == 2:
        break
    c += 1

1a00
1A00:A
VLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSFPTTKTYFPHFDLSHGSAQVKGHGKKVADALTNAVAHVDDMPNALSALSDLHAHKLRVDPVNFKLLSHCLLVTLAAHLPAEFTPAVHASLDKFLASVSTVLTSKYR
{'0': '1-141'}
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 1 0]
 [1 0 0 ... 0 0 0]]
[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
1A00:B
MHLTPEEKSAVTALWGKVNVDEVGGEALGRLLVVYPYTQRFFESFGDLSTPDAVMGNPKVKAHGKKVLGAFSDGLAHLDNLKGTFATLSELHCDKLHVDPENFRLLGNVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANALAHKYH
{'0': '1-146'}
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 0]]
[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0