# Peptide encoding to low-dim vector

Encoding of epitopes using the autoencoder from the TEIM paper.

[![DOI:10.1038/s42256-023-00634-4](https://zenodo.org/badge/DOI/10.1038/s42256-023-00634-4.svg)](https://doi.org/10.1038/s42256-023-00634-4)


To run the encoding, the following code MUST be copied into a file called `data_process.py` in the same directory as this notebook:

```
from Bio.Align import substitution_matrices
import numpy as np
import torch
import sys

# Add the current working directory to the module search path.
sys.path.append('.')

# Use the CUDA device when one is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def GetBlosumMat(residues_list):
    """Build a BLOSUM62 score matrix restricted to the given residues.

    Args:
        residues_list: Sequence of residue letters. Row/column ``i`` of the
            result corresponds to ``residues_list[i]``.

    Returns:
        A square numpy array with one row and one column per entry of
        ``residues_list``. Cells for residue pairs found in BLOSUM62 hold the
        substitution score; all other cells stay 0.
    """
    size = len(residues_list)
    matrix = np.zeros([size, size])
    table = substitution_matrices.load('BLOSUM62')
    for key, value in table.items():
        # Only pairs made of two requested residues contribute; letters of
        # the BLOSUM62 alphabet that were not requested are ignored.
        if key[0] in residues_list and key[1] in residues_list:
            row = residues_list.index(key[0])
            col = residues_list.index(key[1])
            # Write both orientations so the result is always symmetric.
            matrix[row, col] = value
            matrix[col, row] = value
    return matrix


class Tokenizer:
    """Convert between amino-acid letters and integer token ids.

    Id 0 is the padding symbol ``'-'``; ids 1..20 are the residues in the
    order listed in ``res_all``.
    """

    def __init__(self,):
        # The ambiguity letters (X, B, J, Z, O, U) are deliberately left out of
        # the vocabulary, so there is no catch-all token.
        self.res_all = ['G', 'A', 'V', 'L', 'I', 'F', 'W', 'Y', 'D', 'N',
                     'E', 'K', 'Q', 'M', 'S', 'T', 'C', 'P', 'H', 'R'] #+ ['X'] #BJZOU
        self.tokens = ['-'] + self.res_all # '-' for padding encoding

    def tokenize(self, index): # int 2 str
        """Return the token stored at position ``index`` of the vocabulary."""
        return self.tokens[index]

    def id(self, token): # str 2 int
        """Return the integer id of ``token`` (matched case-insensitively).

        An unknown alphabetic token falls back to the id of ``'X'`` when the
        vocabulary contains it.

        Raises:
            ValueError: if ``token`` is not in the vocabulary and no ``'X'``
                fallback applies. Previously this surfaced as an unrelated
                "'X' is not in list" error, or as a silent ``None`` for
                non-alphabetic tokens.
        """
        try:
            return self.tokens.index(token.upper())
        except ValueError:
            # 'X' is only usable as a fallback if the vocabulary has it.
            if str.isalpha(token) and 'X' in self.tokens:
                print('Error letter in the sequences:', token)
                return self.tokens.index('X')
            raise ValueError(
                f'Error letter in the sequences: {token!r} is not one of '
                f'the known tokens {self.tokens}'
            ) from None

    def tokenize_list(self, seq):
        """Map an iterable of integer ids to their tokens."""
        return [self.tokenize(i) for i in seq]

    def id_list(self, seq):
        """Map an iterable of tokens (e.g. a peptide string) to integer ids."""
        return [self.id(s) for s in seq]

    def embedding_mat(self):
        """Return the square embedding matrix, one row per token.

        Starts from the identity matrix and overwrites the residue-by-residue
        sub-block (rows/columns 1..len(res_all)) with BLOSUM62 scores, so the
        padding token keeps its one-hot row.
        """
        blosum62 = GetBlosumMat(self.res_all)
        mat = np.eye(len(self.tokens))
        mat[1:len(self.res_all) + 1, 1:len(self.res_all) + 1] = blosum62
        return mat
```


In [1]:
# CODE FROM TEIM PAPER
#     Path on GitHub Repo:     TEIM/scripts/data_process.py

import torch
import torch.nn as nn
import numpy as np

# Use the CUDA device when one is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    
class View(nn.Module):
    """Layer that reshapes its input to ``shape`` while keeping dimension 0."""

    def __init__(self, *shape):
        super().__init__()
        # Target shape of everything after the leading (batch) dimension.
        self.shape = shape

    def forward(self, input):
        """Return ``input`` viewed as ``(input.shape[0], *self.shape)``."""
        leading = input.shape[0]
        return input.view(leading, *self.shape)
    

class AutoEncoder(nn.Module):
    """Convolutional autoencoder over token-id sequences.

    Token ids are embedded with the matrix supplied by the tokenizer, encoded
    by two Conv1d blocks, squeezed into one vector of size ``dim_hid``, then
    expanded and decoded back to per-position scores over the vocabulary.

    Args:
        tokenizer: Object whose ``embedding_mat()`` returns a 2-D array of
            shape ``(vocab_size, dim_emb)``.
        dim_hid: Channel count of the conv blocks and size of the vector.
        len_seq: Sequence length the linear layers are sized for.
    """

    def __init__(
        self,
        tokenizer,
        dim_hid,
        len_seq,
    ):
        super().__init__()
        emb_matrix = tokenizer.embedding_mat()
        n_tokens, emb_width = emb_matrix.shape
        self.embedding_module = nn.Embedding.from_pretrained(
            torch.FloatTensor(emb_matrix),
            padding_idx=0,
        )
        # NOTE: sub-module names and creation order are kept as-is so that
        # state_dict keys stay compatible with saved checkpoints.
        self.encoder = nn.Sequential(
            *self._conv_bn_relu(nn.Conv1d, emb_width, dim_hid),
            *self._conv_bn_relu(nn.Conv1d, dim_hid, dim_hid),
        )

        flat_width = len_seq * dim_hid
        self.seq2vec = nn.Sequential(
            nn.Flatten(),
            nn.Linear(flat_width, dim_hid),
            nn.ReLU(),
        )
        self.vec2seq = nn.Sequential(
            nn.Linear(dim_hid, flat_width),
            nn.ReLU(),
            View(dim_hid, len_seq),
        )
        self.decoder = nn.Sequential(
            *self._conv_bn_relu(nn.ConvTranspose1d, dim_hid, dim_hid),
            *self._conv_bn_relu(nn.ConvTranspose1d, dim_hid, dim_hid),
        )
        self.out_layer = nn.Linear(dim_hid, n_tokens)

    @staticmethod
    def _conv_bn_relu(conv_cls, channels_in, channels_out):
        """Return [conv(kernel 3, padding 1), BatchNorm1d, ReLU] as a list."""
        return [
            conv_cls(channels_in, channels_out, kernel_size=3, padding=1),
            nn.BatchNorm1d(channels_out),
            nn.ReLU(),
        ]

    def forward(self, inputs):
        """Run the autoencoder on a batch of token ids.

        Args:
            inputs: Integer tensor of token ids; assumed to be
                ``(batch, len_seq)`` — TODO confirm with callers.

        Returns:
            Tuple ``(out, seq_enc, vec, indices)``: the output-layer scores,
            the encoder feature map, the ``seq2vec`` vector, and ``None``
            (the fourth slot is a placeholder that is never filled).
        """
        token_ids = inputs.long()
        embedded = self.embedding_module(token_ids)

        # The conv blocks take channels on dim 1, hence the transposes.
        seq_enc = self.encoder(embedded.transpose(1, 2))
        vec = self.seq2vec(seq_enc)
        decoded = self.decoder(self.vec2seq(vec))
        out = self.out_layer(decoded.transpose(1, 2))
        return out, seq_enc, vec, None


def load_ae_model(tokenizer, path='./epi_ae.ckpt',):
    """Build the epitope autoencoder and load its weights from ``path``.

    The model is created with ``dim_hid=32`` and ``len_seq=12`` and switched
    to eval mode before the weights are loaded.

    Args:
        tokenizer: Passed to ``AutoEncoder`` to provide the embedding matrix.
        path: Checkpoint file; its content is used directly as a state dict.

    Returns:
        The ``AutoEncoder`` with the checkpoint weights loaded.
    """
    model = AutoEncoder(tokenizer=tokenizer, dim_hid=32, len_seq=12)
    model.eval()

    checkpoint = torch.load(path, map_location=device)
    # Every key loses its first 6 characters before loading — presumably a
    # 'model.'-style prefix added when the checkpoint was saved; TODO confirm.
    renamed = {}
    for key, tensor in checkpoint.items():
        renamed[key[6:]] = tensor
    model.load_state_dict(renamed)
    return model


class PretrainedEncoder:
    """Encode epitope sequences with the pretrained autoencoder.

    Args:
        tokenizer: Tokenizer used both to build the model (via
            ``load_ae_model``) and to turn sequences into token ids.
    """

    # Epitope lengths accepted by `infer`; the model input is MAX_LEN wide.
    MIN_LEN = 8
    MAX_LEN = 12

    def __init__(self, tokenizer):
        self.ae_model = load_ae_model(tokenizer)
        self.tokenizer = tokenizer

    def encode_pretrained_epi(self, epi_seqs):
        """Encode ``epi_seqs`` and return ``(enc_seq, enc_vec)``.

        ``enc_seq`` is the zero-padded token-id matrix that was fed to the
        model (last item of ``infer``); ``enc_vec`` is the model's vector
        output (third item of ``infer``).

        Raises:
            ValueError: see ``infer``.
        """
        enc = self.infer(epi_seqs)
        enc_vec = enc[2]
        enc_seq = enc[-1]
        return enc_seq, enc_vec

    def infer(self, seqs):
        """Run the autoencoder on a batch of epitope sequences.

        Each sequence is tokenized and centred in a row of ``MAX_LEN`` ids,
        with the padding id 0 on both sides.

        Args:
            seqs: Non-empty sequence of epitope strings, each 8 to 12 long.

        Returns:
            List ``[out, seq_enc, vec, indices, encoding]``: argmax of the
            model output over its last axis, the encoder features and the
            vector as numpy arrays, the model's fourth return value passed
            through unchanged, and the int32 token-id matrix.

        Raises:
            ValueError: if ``seqs`` is empty or any length is outside
                [8, 12]. (An explicit raise instead of ``assert``, so the
                check survives ``python -O``.)
        """
        # # seqs encoding
        n_seqs = len(seqs)
        if n_seqs == 0:
            raise ValueError('No epitope sequences were given')
        len_seqs = [len(seq) for seq in seqs]
        if min(len_seqs) < self.MIN_LEN or max(len_seqs) > self.MAX_LEN:
            raise ValueError(
                f'Lengths of epitopes must be within [{self.MIN_LEN}, {self.MAX_LEN}]'
            )
        encoding = np.zeros([n_seqs, self.MAX_LEN], dtype='int32')
        for i, (seq, len_seq) in enumerate(zip(seqs, len_seqs)):
            # Centre the sequence: length 8 starts at column 2, 9/10 at
            # column 1, 11/12 at column 0.
            start = (self.MAX_LEN - len_seq) // 2
            encoding[i, start:start + len_seq] = self.tokenizer.id_list(seq)
        # # pretrained ae features
        inputs = torch.from_numpy(encoding)
        # Inference only: every result is detached below, so skip building
        # the autograd graph.
        with torch.no_grad():
            out, seq_enc, vec, indices = self.ae_model(inputs)
        out = np.argmax(out.detach().cpu().numpy(), -1)
        return [
            out,
            seq_enc.detach().cpu().numpy(),
            vec.detach().cpu().numpy(),
            indices,
            encoding
        ]
    

In [2]:
# Manually load Tokenizer from their params
# The tokenizer object is read out of the checkpoint's stored hyper-parameters.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source. Unpickling presumably needs the Tokenizer class from
# data_process.py to be importable — confirm.
tokenizer = torch.load('base_model.ckpt', map_location=torch.device('cpu'))['hyper_parameters']['model_args']['tokenizer']
# Builds the autoencoder and loads its weights (see load_ae_model).
epi_encoder = PretrainedEncoder(tokenizer)

In [3]:
import pandas as pd

# NOTE(review): absolute, machine-specific path — point it at your own copy of the data.
sample_train_file = '/home/bsccns/Documents/PhD/neoantigen_predictor/nimbus/data/raw/pHLA_binding/NetMHCpan_train/c000_ba'
# Space-separated file read without a header row.
df = pd.read_csv(sample_train_file, sep=' ', header=None)
# Column 0 is taken as the list of sequences (presumably the peptides — verify against the file format).
seqs_epi_raw = df[0].values.tolist()

In [4]:
# Filter epitopes to 8 to 12 AA, the only lengths PretrainedEncoder.infer accepts.
# Both bounds are enforced: a single peptide shorter than 8 AA would otherwise
# make the encoder reject the whole batch.
seqs_epi = []
n_larger = 0
n_shorter = 0
for s in seqs_epi_raw:
    if len(s) > 12:
        n_larger += 1
    elif len(s) < 8:
        n_shorter += 1
    else:
        seqs_epi.append(s)

print(f'{n_larger} peptides out of {len(seqs_epi_raw)} were longer than 12 AA')
# Only report short peptides when there are any, so the output is unchanged
# for data sets that contain none.
if n_shorter:
    print(f'{n_shorter} peptides out of {len(seqs_epi_raw)} were shorter than 8 AA')

88 peptides out of 41206 were longer than 12 AA


In [5]:
# Uncomment to try a single toy peptide instead of the data set:
# seqs_epi = ['AAAAAAAA']
# encoding_epi: zero-padded token-id matrix fed to the model (one row per peptide);
# epi_vec: the autoencoder's vector output for each peptide.
encoding_epi, epi_vec = epi_encoder.encode_pretrained_epi(seqs_epi)

# Display the vectors of the first five peptides.
epi_vec[:5]

array([[4.237492  , 2.5128953 , 3.5028358 , 1.0474035 , 5.3211064 ,
        2.8364499 , 0.        , 1.3840181 , 3.142711  , 2.7802851 ,
        3.1927416 , 3.7346904 , 1.4508824 , 2.1306121 , 1.5529792 ,
        1.6950285 , 3.5080004 , 2.1665826 , 1.6668513 , 1.2255855 ,
        1.2941527 , 2.1354334 , 5.0765424 , 2.4932017 , 1.3577513 ,
        2.8210332 , 3.274586  , 2.3020668 , 2.0123103 , 2.1274827 ,
        3.5270824 , 1.6267327 ],
       [4.237492  , 2.5128953 , 3.5028358 , 1.0474035 , 5.3211064 ,
        2.8364499 , 0.        , 1.3840181 , 3.142711  , 2.7802851 ,
        3.1927416 , 3.7346904 , 1.4508824 , 2.1306121 , 1.5529792 ,
        1.6950285 , 3.5080004 , 2.1665826 , 1.6668513 , 1.2255855 ,
        1.2941527 , 2.1354334 , 5.0765424 , 2.4932017 , 1.3577513 ,
        2.8210332 , 3.274586  , 2.3020668 , 2.0123103 , 2.1274827 ,
        3.5270824 , 1.6267327 ],
       [5.2254972 , 2.711007  , 3.3354301 , 1.0361304 , 5.320385  ,
        1.7822485 , 0.        , 0.63811386, 3.4599