In [1]:
import pickle
import random
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from Bio import Align

from dataset import *
from utils import *
from models import *

In [2]:
# NOTE(review): the recorded run of this cell raised NameError, i.e. FTransformer
# was not brought into scope by the imports above — confirm where it is defined
# (or drop the cell) so the notebook survives Restart & Run All.
model = FTransformer()

NameError: name 'FTransformer' is not defined

# FrameSlidingTransformer

In [12]:
def frame_slice(x, frame_size=6):
    """
    Split each sequence into consecutive, non-overlapping frames.

    Args:
        x: tensor of shape (B, L, E); L must be a multiple of frame_size
        frame_size: number of consecutive positions per frame
    Returns:
        frames: tensor of shape (B * L // frame_size, frame_size, E), ordered
            batch-major (all frames of sequence 0, then sequence 1, ...)
    Raises:
        RuntimeError: if L is not divisible by frame_size
    """
    B, L, E = x.shape
    if L % frame_size != 0:
        # Fail with an actionable message instead of the opaque
        # "shape '[...]' is invalid for input of size ..." from the reshape.
        raise RuntimeError(
            f"sequence length {L} is not divisible by frame_size {frame_size}"
        )
    # reshape returns a view when possible and copies only when it has to
    # (e.g. for non-contiguous inputs).
    frames = x.reshape(B * (L // frame_size), frame_size, E)

    return frames

x = torch.ones(16, 72, 32)

x1 = frame_slice(x, frame_size=3)
x1.shape

torch.Size([384, 3, 32])

In [8]:
B, L, E

(16, 71, 32)

In [9]:
L // frame_size

23

In [10]:
16*23*3*32, 16*71*32

(35328, 36352)

In [7]:
# Scratch reproduction of the first reshape inside frame_slice.
frame_size = 3

B, L, E = x.shape
# NOTE: view() needs L to be an exact multiple of frame_size. In the recorded
# run x had L = 71, so L // frame_size == 23 and 16*23*3*32 != 16*71*32,
# which is why this cell's output is a RuntimeError.
x1 = x.view(B, L // frame_size, frame_size, E)
x1.shape

RuntimeError: shape '[16, 23, 3, 32]' is invalid for input of size 36352

# SequenceEncoder

In [2]:
model = SequenceEncoder(seq_encoder_type="transformer", num_layers=2, embed_size=64, hidden=512, dropout=0, nhead=4)
model

SequenceEncoder(
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0, inplace=False)
  )
  (seq_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
        )
        (linear1): Linear(in_features=64, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=64, bias=True)
        (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
        )
        (linear1): Linear

In [4]:
# transformer
x = torch.ones(16, 48, 64)
model(x).shape

torch.Size([16, 48, 64])

In [5]:
# lstm
# NOTE(review): the only `model` built in a visible cell uses
# seq_encoder_type="transformer"; presumably it was rebuilt with an LSTM
# encoder in a cell that is no longer here — TODO construct it explicitly so
# this cell does not depend on hidden kernel state.
x = torch.ones(16, 48, 64)
# Index [0] takes the first element of the model's return value.
model(x)[0].shape

torch.Size([16, 48, 64])

# SetModel

In [2]:
embed_size = 32
hidden = 64

# Use the GPU when one is present, otherwise fall back to CPU so the cell
# does not crash on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = SetModel(embed_size=embed_size,
                 hidden=hidden,
                 num_layers=2,
                 dropout=0.1,
                 k4kmer=5,
                 use_pretrain=False,
                 use_coattn=True,
                 seq_encoder_type="transformer",
                 num_heads=4,
                 num_inds=6,
                 num_outputs=6,
                 ln=False).to(device)

# Count only trainable parameters (requires_grad=True).
print("total params: ", sum(p.numel() for p in model.parameters() if p.requires_grad))

model

total params:  506721


SetModel(
  (embedding): Embedding(27, 32)
  (seq_encoder): SequenceEncoder(
    (pos_encoder): PositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (seq_encoder): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=32, out_features=32, bias=True)
          )
          (linear1): Linear(in_features=32, out_features=2048, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=2048, out_features=32, bias=True)
          (norm1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
        (1): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynami

In [4]:
x = torch.ones(12, 5, 32)
y = torch.mean(x, dim=1)
y.shape

torch.Size([12, 32])

In [3]:
para = ["+ABCD-##", "+ABCD-##", "+ABCD-##"]
epi = ["+AD-##", "+AD-##", "+AD-##"]

out = model(para, epi)
out, out.shape

torch.Size([3, 8, 32])
torch.Size([12, 5, 32])
torch.Size([12, 5, 32])
after mean  torch.Size([12, 32])
torch.Size([3, 4, 32])
torch.Size([3, 6, 32])
torch.Size([3, 6, 32]) torch.Size([3, 6, 32])


(tensor([[0.4661],
         [0.4661],
         [0.4671]], device='cuda:0', grad_fn=<SigmoidBackward0>),
 torch.Size([3, 1]))

# nn.TransformerEncoder / nn.TransformerEncoderLayer

In [7]:
encoder_layer = nn.TransformerEncoderLayer(d_model=256, nhead=4, batch_first=True)
seq_encoder = nn.TransformerEncoder(encoder_layer=encoder_layer, num_layers=2)
seq_encoder

TransformerEncoder(
  (layers): ModuleList(
    (0): TransformerEncoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
      )
      (linear1): Linear(in_features=256, out_features=2048, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (linear2): Linear(in_features=2048, out_features=256, bias=True)
      (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerEncoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
      )
      (linear1): Linear(in_features=256, out_features=2048, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (linear2): Linear(in_features=2048, out_featu

# kmer

In [23]:
def kmer_embed(seqs, k=3):
    """
    Collect every length-k sliding window (stride 1) from each sequence.

    Args:
        seqs: iterable of per-sequence tensors indexed as [position, feature]
        k: window length
    Returns:
        tensor holding the windows of all sequences concatenated along dim 0;
        for a (B, L, E) input the shape is (B * (L - k + 1), k, E)
    """
    windows_per_seq = []
    for seq in seqs:
        n_windows = len(seq) - k + 1
        windows = [seq[start:start + k, :] for start in range(n_windows)]
        # (n_windows, k, E) for this sequence
        windows_per_seq.append(torch.stack(windows, dim=0))

    # Concatenate along dim 0, so windows from all sequences share one axis.
    return torch.vstack(windows_per_seq)

In [24]:
x = torch.ones(16, 48, 32)

y = kmer_embed(x, k=3)
y.shape

torch.Size([736, 3, 32])

In [21]:
def kmer(seq, k=3):
    """
    Return all length-k sliding windows (stride 1) of a sliceable sequence.

    The number of windows is derived from len(seq); when k exceeds len(seq)
    the result is an empty list.
    """
    windows = []
    last_start = len(seq) - k
    for start in range(last_start + 1):
        windows.append(seq[start:start + k])
    return windows

s = "+ABCDEFG-##"
kmer(s, k=3)

['+AB', 'ABC', 'BCD', 'CDE', 'DEF', 'EFG', 'FG-', 'G-#', '-##']

In [24]:
s = torch.ones(2,3,4)
kmer(s, k=3)

[]

In [None]:
batch, seq, hidden

In [74]:
def kmer_embed(seqs, k=3):
    """
    Mean-pool every length-k sliding window (stride 1) of each sequence.

    Args:
        seqs: iterable of per-sequence tensors indexed as [position, feature]
        k: window length
    Returns:
        tensor stacking the pooled windows per sequence; for a (B, L, E)
        input the shape is (B, L - k + 1, E)
    """
    pooled_per_seq = []
    for seq in seqs:
        n_windows = len(seq) - k + 1
        window_means = []
        for start in range(n_windows):
            # Average the k positions of this window into one vector.
            window_means.append(seq[start:start + k, :].mean(dim=0))
        pooled_per_seq.append(torch.vstack(window_means))

    return torch.stack(pooled_per_seq)

# Toy batch: two sequences of five positions with 3-dimensional features.
seqs = torch.tensor(
        [[[1., 1., 1.],
         [2., 2., 2.],
         [3., 3., 3.],
         [4., 4., 4.],
         [5., 5., 5.]],

        [[5., 5., 5.],
         [1., 1., 1.],
         [2., 2., 2.],
         [3., 3., 3.],
         [4., 4., 4.]]])

# Window length for the inline computation below; defined here so the cell
# does not depend on a leftover kernel global.
k = 3

# Inline mean-pooling over sliding windows, written out step by step so the
# intermediate lists can be inspected.
ngram_li = []
for seq in seqs:
    ngram = [torch.mean(seq[i:i+k, :], dim=0) for i in range(len(seq)-k+1)]
    ngram_li.append(torch.vstack(ngram))

final = torch.stack(ngram_li)
final, final.shape

(tensor([[[2.0000, 2.0000, 2.0000],
          [3.0000, 3.0000, 3.0000],
          [4.0000, 4.0000, 4.0000]],
 
         [[2.6667, 2.6667, 2.6667],
          [2.0000, 2.0000, 2.0000],
          [3.0000, 3.0000, 3.0000]]]),
 torch.Size([2, 3, 3]))

In [77]:
res = kmer_embed(seqs, k=3)
res, res.shape

(tensor([[[2.0000, 2.0000, 2.0000],
          [3.0000, 3.0000, 3.0000],
          [4.0000, 4.0000, 4.0000]],
 
         [[2.6667, 2.6667, 2.6667],
          [2.0000, 2.0000, 2.0000],
          [3.0000, 3.0000, 3.0000]]]),
 torch.Size([2, 3, 3]))

# align

In [2]:
# NOTE: the '/' separators are part of both strings, so they are aligned and
# counted in the lengths like any other character.
target = "GGSISSGDSY/IYYSGST/ARHVGDLRVNDAFDI/SSNIGNNF/DSD/GTWDRSLSVVV"
query = "GGSISSGDSY/IYYSGST/ARHVGDLRVNDAFDI/SSNIGNNF/DSD/GTWDRSLSVVV"

# A single aligner is enough; every match contributes 1.0 to the score.
aligner = Align.PairwiseAligner(match_score=1.0)

# Normalise by the longer sequence so that identical inputs score 1.0.
score = aligner.score(target, query) / max(len(target), len(query))
score

1.0

# How to get attention weights (inspecting the attention projection parameters)

In [2]:
model = nn.Transformer()

model

Transformer(
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, in

In [67]:
model.encoder.layers[-1].state_dict()['self_attn.in_proj_weight'].shape, model.encoder.layers[-1].state_dict()['self_attn.in_proj_weight']

(torch.Size([1536, 512]),
 tensor([[ 0.0470, -0.0028,  0.0163,  ..., -0.0414,  0.0355, -0.0227],
         [ 0.0279,  0.0492,  0.0412,  ...,  0.0506, -0.0014, -0.0139],
         [ 0.0475,  0.0345,  0.0353,  ...,  0.0470, -0.0176, -0.0121],
         ...,
         [-0.0519, -0.0491,  0.0292,  ...,  0.0364, -0.0531, -0.0331],
         [-0.0325,  0.0164,  0.0195,  ..., -0.0243, -0.0272, -0.0313],
         [-0.0150,  0.0301, -0.0027,  ...,  0.0330, -0.0533, -0.0485]]))

In [76]:
model.decoder.layers[-1].state_dict()['self_attn.out_proj.weight'].shape, model.decoder.layers[-1].state_dict()['self_attn.out_proj.weight']

(torch.Size([512, 512]),
 tensor([[ 0.0727,  0.0576,  0.0258,  ...,  0.0696,  0.0730, -0.0514],
         [-0.0191, -0.0108, -0.0535,  ...,  0.0299, -0.0091,  0.0592],
         [ 0.0610, -0.0025, -0.0308,  ..., -0.0552, -0.0339, -0.0194],
         ...,
         [ 0.0689, -0.0239, -0.0136,  ..., -0.0097, -0.0262, -0.0103],
         [ 0.0089, -0.0052, -0.0376,  ...,  0.0385, -0.0002, -0.0738],
         [-0.0671, -0.0263, -0.0067,  ...,  0.0348, -0.0179,  0.0467]]))

In [58]:
model.decoder.layers[-1].state_dict()['self_attn.in_proj_weight'].shape, model.decoder.layers[-1].state_dict()['self_attn.in_proj_weight']

(torch.Size([1536, 512]),
 tensor([[ 0.0138,  0.0402,  0.0142,  ...,  0.0350, -0.0382,  0.0134],
         [ 0.0315, -0.0208, -0.0037,  ...,  0.0329,  0.0324,  0.0524],
         [ 0.0132, -0.0481, -0.0317,  ..., -0.0450, -0.0170, -0.0525],
         ...,
         [ 0.0151,  0.0279,  0.0342,  ...,  0.0286, -0.0049, -0.0132],
         [ 0.0532, -0.0446, -0.0320,  ..., -0.0411,  0.0289, -0.0172],
         [ 0.0206,  0.0100,  0.0536,  ..., -0.0281, -0.0387,  0.0047]]))

In [42]:
model.decoder.layers[-1].self_attn.in_proj_weight.shape, model.decoder.layers[-1].self_attn.in_proj_weight

(torch.Size([1536, 512]),
 Parameter containing:
 tensor([[ 0.0138,  0.0402,  0.0142,  ...,  0.0350, -0.0382,  0.0134],
         [ 0.0315, -0.0208, -0.0037,  ...,  0.0329,  0.0324,  0.0524],
         [ 0.0132, -0.0481, -0.0317,  ..., -0.0450, -0.0170, -0.0525],
         ...,
         [ 0.0151,  0.0279,  0.0342,  ...,  0.0286, -0.0049, -0.0132],
         [ 0.0532, -0.0446, -0.0320,  ..., -0.0411,  0.0289, -0.0172],
         [ 0.0206,  0.0100,  0.0536,  ..., -0.0281, -0.0387,  0.0047]],
        requires_grad=True))

In [39]:
dir(model.decoder.layers[-1].self_attn)

['T_destination',
 '__annotations__',
 '__call__',
 '__class__',
 '__constants__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__slotnames__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_apply',
 '_backward_hooks',
 '_buffers',
 '_call_impl',
 '_forward_hooks',
 '_forward_pre_hooks',
 '_get_backward_hooks',
 '_get_name',
 '_is_full_backward_hook',
 '_load_from_state_dict',
 '_load_state_dict_post_hooks',
 '_load_state_dict_pre_hooks',
 '_maybe_warn_non_full_backward_hook',
 '_modules',
 '_named_members',
 '_non_persistent_buffers_set',
 '_parameters',
 '_qkv_same_embed_dim',
 '_register_load_state_dict_pre_hook',
 '_register_state_dict_hook',
 '_replicate_for_data_parallel',
 '_reset_p

In [36]:
model.decoder._modules["layers"][-1].multihead_attn._parameters["in_proj_weight"].shape

torch.Size([1536, 512])

In [31]:
model.decoder._modules["layers"][-1].multihead_attn._parameters

OrderedDict([('in_proj_weight',
              Parameter containing:
              tensor([[-0.0294, -0.0511,  0.0045,  ...,  0.0381,  0.0291,  0.0405],
                      [-0.0378, -0.0161,  0.0465,  ..., -0.0185, -0.0045,  0.0329],
                      [-0.0140,  0.0475, -0.0318,  ...,  0.0461,  0.0257,  0.0066],
                      ...,
                      [-0.0535,  0.0461, -0.0355,  ..., -0.0245,  0.0480, -0.0154],
                      [-0.0343, -0.0304,  0.0292,  ..., -0.0300,  0.0525,  0.0037],
                      [ 0.0369, -0.0137, -0.0355,  ...,  0.0328,  0.0130, -0.0061]],
                     requires_grad=True)),
             ('q_proj_weight', None),
             ('k_proj_weight', None),
             ('v_proj_weight', None),
             ('in_proj_bias',
              Parameter containing:
              tensor([0., 0., 0.,  ..., 0., 0., 0.], requires_grad=True))])

In [30]:
dir(model.decoder._modules["layers"][-1].multihead_attn)

['T_destination',
 '__annotations__',
 '__call__',
 '__class__',
 '__constants__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__slotnames__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_apply',
 '_backward_hooks',
 '_buffers',
 '_call_impl',
 '_forward_hooks',
 '_forward_pre_hooks',
 '_get_backward_hooks',
 '_get_name',
 '_is_full_backward_hook',
 '_load_from_state_dict',
 '_load_state_dict_post_hooks',
 '_load_state_dict_pre_hooks',
 '_maybe_warn_non_full_backward_hook',
 '_modules',
 '_named_members',
 '_non_persistent_buffers_set',
 '_parameters',
 '_qkv_same_embed_dim',
 '_register_load_state_dict_pre_hook',
 '_register_state_dict_hook',
 '_replicate_for_data_parallel',
 '_reset_p

In [22]:
dir(model.decoder._modules["layers"][-1])

['T_destination',
 '__annotations__',
 '__call__',
 '__class__',
 '__constants__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__slotnames__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_apply',
 '_backward_hooks',
 '_buffers',
 '_call_impl',
 '_ff_block',
 '_forward_hooks',
 '_forward_pre_hooks',
 '_get_backward_hooks',
 '_get_name',
 '_is_full_backward_hook',
 '_load_from_state_dict',
 '_load_state_dict_post_hooks',
 '_load_state_dict_pre_hooks',
 '_maybe_warn_non_full_backward_hook',
 '_mha_block',
 '_modules',
 '_named_members',
 '_non_persistent_buffers_set',
 '_parameters',
 '_register_load_state_dict_pre_hook',
 '_register_state_dict_hook',
 '_replicate_for_data_parallel',
 '_sa

In [3]:
dir(model)

['T_destination',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_apply',
 '_backward_hooks',
 '_buffers',
 '_call_impl',
 '_forward_hooks',
 '_forward_pre_hooks',
 '_get_backward_hooks',
 '_get_name',
 '_is_full_backward_hook',
 '_load_from_state_dict',
 '_load_state_dict_post_hooks',
 '_load_state_dict_pre_hooks',
 '_maybe_warn_non_full_backward_hook',
 '_modules',
 '_named_members',
 '_non_persistent_buffers_set',
 '_parameters',
 '_register_load_state_dict_pre_hook',
 '_register_state_dict_hook',
 '_replicate_for_data_parallel',
 '_reset_parameters',
 '_save_to_state_dict',
 '_slow_forward',
 '_sta

# data EDA

In [2]:
# NOTE(review): the file carries a .json extension but is read as a pickle.
# pickle.load can execute arbitrary code — only load files from a trusted source.
with open("../../MSAI_Project/codes/data_files/data.json", "rb") as f:
    data = pickle.load(f)
type(data), len(data)

(list, 5359)

In [3]:
data[0].keys()

dict_keys(['pdb', 'Hchain', 'Lchain', 'Achain', 'Hseq', 'Lseq', 'Aseq', 'L1', 'L2', 'L3', 'H1', 'H2', 'H3', 'Hpos', 'Lpos', 'Apos'])

In [4]:
len(data[0]["Hpos"])

123

In [5]:
len(data[0]["Lpos"])

110

In [6]:
len(data[0]["Apos"][0])

392

In [7]:
data[0]["Apos"][0]

[array([[ 81.858,  -7.199, 197.021],
        [ 81.735,  -6.449, 198.265],
        [ 80.284,  -6.421, 198.728],
        [ 79.74 ,  -5.362, 199.036]], dtype=float32),
 array([[ 79.66 ,  -7.596, 198.767],
        [ 78.278,  -7.837, 199.157],
        [ 77.269,  -7.526, 198.055],
        [ 76.075,  -7.745, 198.261]], dtype=float32),
 array([[ 77.695,  -7.022, 196.897],
        [ 76.793,  -6.774, 195.785],
        [ 77.323,  -7.296, 194.458],
        [ 76.538,  -7.469, 193.52 ]], dtype=float32),
 array([[ 78.63 ,  -7.552, 194.352],
        [ 79.196,  -8.074, 193.112],
        [ 78.916,  -9.554, 192.915],
        [ 78.921, -10.028, 191.772]], dtype=float32),
 array([[ 78.667, -10.296, 193.99 ],
        [ 78.411, -11.718, 193.885],
        [ 76.953, -12.063, 193.664],
        [ 76.593, -13.242, 193.593]], dtype=float32),
 array([[ 76.106, -11.047, 193.552],
        [ 74.68 , -11.249, 193.323],
        [ 74.461, -11.614, 191.862],
        [ 75.008, -10.971, 190.958]], dtype=float32),
 array([[ 

# test utils.py/get_knearest_epi

In [22]:
import heapq

In [41]:
mode = 0
K = 48  # number of antigen residues kept as the epitope

# mode 0: the epitope is the K antigen residues nearest to the antibody
if mode == 0:
    for i in range(len(data)):
        # Join the per-chain residue lists along the residue axis so that
        # Apos[Aidx] lines up with Aseq[Aidx]. (np.hstack would join along
        # axis 1, which only coincides with this for a single chain.)
        Apos = np.concatenate(data[i]["Apos"], axis=0)
        Aseq = "".join(data[i]["Aseq"])

        # One (distance to nearest antibody residue, antigen index) per residue.
        epitope = []
        for Aidx in range(len(Aseq)):
            # Scan both the heavy and the light chain for the closest residue.
            nearest_dist = np.inf
            for chain_pos in (data[i]["Hpos"], data[i]["Lpos"]):
                for residue_pos in chain_pos:
                    # Euclidean distance between the first coordinate row of
                    # each residue (presumably the first backbone atom — TODO confirm).
                    cur_dist = np.sqrt(np.sum((Apos[Aidx][0] - residue_pos[0]) ** 2))
                    nearest_dist = np.min([cur_dist, nearest_dist])

            epitope.append((nearest_dist, Aidx))

        # Keep the K closest residues, then restore sequence order.
        epitope_heap = heapq.nsmallest(K, epitope, key=lambda pair: pair[0])
        epitope_index = sorted(idx for _, idx in epitope_heap)

        data[i]["epitope"] = "".join(Aseq[idx] for idx in epitope_index)
        # Demo only: stop after the first entry.
        break

In [42]:
data[0].keys()

dict_keys(['pdb', 'Hchain', 'Lchain', 'Achain', 'Hseq', 'Lseq', 'Aseq', 'L1', 'L2', 'L3', 'H1', 'H2', 'H3', 'Hpos', 'Lpos', 'Apos', 'epitope'])

In [47]:
data[0]["epitope"]

'NMEVSCYEASISDFACSKKMTGKLTMNNKHPWHAADTGTPHWMDGAKG'

In [49]:
# The context manager closes (and flushes) the file as soon as the dump is done.
with open("./demo_data.pkl", "wb") as f:
    pickle.dump(data, f)

In [50]:
# Read the dump back to check that it round-trips; the file is closed on exit.
with open("./demo_data.pkl", "rb") as f:
    data1 = pickle.load(f)
len(data1)

5359

In [52]:
data1[0].keys()

dict_keys(['pdb', 'Hchain', 'Lchain', 'Achain', 'Hseq', 'Lseq', 'Aseq', 'L1', 'L2', 'L3', 'H1', 'H2', 'H3', 'Hpos', 'Lpos', 'Apos', 'epitope'])

In [51]:
len(data)

5359

In [38]:
# aseq = "".join(data[i]["Aseq"])
# epi_index = data[0]["epitope"]

# "".join([aseq[i] for i in epi_index])

'NMEVSCYEASISDFACSKKMTGKLTMNNKHPWHAADTGTPHWMDGAKG'

In [45]:
epitope_heap

[(4.025998115539551, 120),
 (4.572774887084961, 122),
 (4.610614776611328, 279),
 (4.689812660217285, 123),
 (4.719902038574219, 125),
 (4.904451370239258, 121),
 (5.198207378387451, 233),
 (5.715717315673828, 232),
 (5.928899765014648, 204),
 (5.9649763107299805, 124),
 (6.015800476074219, 205),
 (6.222243785858154, 278),
 (6.478855133056641, 126),
 (6.5371809005737305, 207),
 (6.65501070022583, 280),
 (6.8116655349731445, 206),
 (6.899214744567871, 119),
 (7.3167405128479, 231),
 (7.490304946899414, 127),
 (7.526638507843018, 234),
 (7.834255218505859, 235),
 (7.8423590660095215, 208),
 (7.913382530212402, 60),
 (8.017367362976074, 277),
 (8.02734375, 230),
 (8.091531753540039, 225),
 (8.226497650146484, 66),
 (8.301057815551758, 59),
 (8.370485305786133, 228),
 (8.457207679748535, 61),
 (8.466352462768555, 276),
 (8.509766578674316, 62),
 (8.640408515930176, 63),
 (8.71596622467041, 229),
 (8.930583953857422, 203),
 (8.934435844421387, 54),
 (9.243196487426758, 55),
 (9.259186744689

In [46]:
epitope

[(32.8936653137207, 0),
 (30.107135772705078, 1),
 (30.423547744750977, 2),
 (32.810401916503906, 3),
 (31.96693992614746, 4),
 (30.500171661376953, 5),
 (30.18033218383789, 6),
 (30.703096389770508, 7),
 (32.23325729370117, 8),
 (34.15449905395508, 9),
 (37.3301887512207, 10),
 (38.003326416015625, 11),
 (40.524864196777344, 12),
 (42.45100784301758, 13),
 (42.081947326660156, 14),
 (41.65396499633789, 15),
 (43.87492370605469, 16),
 (42.427913665771484, 17),
 (40.55342102050781, 18),
 (38.62664031982422, 19),
 (36.73564529418945, 20),
 (36.21327590942383, 21),
 (33.63908386230469, 22),
 (32.01983642578125, 23),
 (28.799253463745117, 24),
 (27.46338653564453, 25),
 (24.877246856689453, 26),
 (24.013507843017578, 27),
 (26.24178695678711, 28),
 (28.833772659301758, 29),
 (30.73711585998535, 30),
 (33.57426071166992, 31),
 (35.20677947998047, 32),
 (38.42568588256836, 33),
 (40.62495040893555, 34),
 (43.51736831665039, 35),
 (43.81686782836914, 36),
 (41.41398239135742, 37),
 (38.782077