In [1]:
import h5py
import pickle
import pandas as pd
import numpy as np
import re
import random
%matplotlib inline

from translator import SignalTranslator

# Load the pickled character table; `ctable.encode` / `ctable.decode` are used
# further down to convert between sequence strings and token arrays.
# NOTE(review): `ctable` is rebound in the next cell from
# '../data/ctable_copies/ctable_token_master.pkl', so the object loaded here is
# replaced before any visible use -- confirm whether this load is still needed.
# pickle.load can execute arbitrary code: only open files from a trusted source.
with open('../outputs/ctable_token.pkl', 'rb') as f:
    ctable = pickle.load(f)

In [2]:
# Helper functions for tokenizing new inputs
# `alphabet` lists the token characters; encode() prepends '$', appends '.',
# and pads with ' ' -- all three are part of this alphabet.
alphabet = ' .$ACDEFGHIKLMNPQRSTUVWXYZ'
# Width of every encoded input row written by to_h5py().
# NOTE(review): generate_sps() truncates sequences to 100 characters, not 105 --
# confirm which limit is intended.
max_len_in = 107 # max length of prot seq (105 aa) + 2 for tokens
# Not referenced by the code visible in this notebook; presumably the maximum
# generated output length (the model log prints max_trans_length=72) -- TODO confirm.
max_len_out = 72
# Size of the last axis when the character table is one-hot (see encode/to_h5py).
n_chars = len(alphabet)

# Character table used by encode(), to_h5py() and generate_sps().
# This rebinds the `ctable` loaded in the first cell.
# pickle.load can execute arbitrary code: only open files from a trusted source.
with open('../data/ctable_copies/ctable_token_master.pkl', 'rb') as f:
    ctable = pickle.load(f)

def encode(seqs, max_len, ctable):
    """Encode sequence strings into one numeric array via ``ctable``.

    Each sequence is wrapped as ``'$' + seq + '.'`` and right-padded with
    spaces up to ``max_len`` characters before being handed to
    ``ctable.encode(text, max_len)``.

    Args:
        seqs: sized iterable of sequence strings.
        max_len: padded length of every row.
        ctable: object exposing a ``one_hot`` flag and an
            ``encode(text, max_len)`` method.

    Returns:
        ``np.ndarray`` of shape ``(len(seqs), max_len, n_chars)`` when
        ``ctable.one_hot`` is truthy, otherwise ``(len(seqs), max_len)``.
    """
    if ctable.one_hot:
        shape = (len(seqs), max_len, n_chars)
    else:
        shape = (len(seqs), max_len)
    X = np.zeros(shape)

    for row, raw in enumerate(seqs):
        # ljust leaves text longer than max_len untouched, exactly like
        # multiplying the pad character by a negative count.
        framed = ('$' + raw + '.').ljust(max_len)
        X[row] = ctable.encode(framed, max_len)
    return X

def to_h5py(seqs, fname, ctable, chunksize=500):
    """Encode ``seqs`` and write them to dataset ``'X'`` of a new HDF5 file.

    The file at ``fname`` is opened in ``'w'`` mode, so an existing file is
    overwritten. Sequences are encoded with ``encode(..., max_len_in, ctable)``
    and written ``chunksize`` rows at a time.

    Args:
        seqs: sliceable, sized collection of sequence strings.
        fname: path of the HDF5 file to create.
        ctable: character table passed through to ``encode``; its ``one_hot``
            flag selects a 3-D ``(n, max_len_in, n_chars)`` or a 2-D
            ``(n, max_len_in)`` dataset.
        chunksize: number of sequences encoded and written per step.

    Raises:
        ValueError: if ``chunksize`` is smaller than 1.
    """
    if chunksize < 1:
        raise ValueError('chunksize must be a positive integer')

    n_seqs = len(seqs)
    if ctable.one_hot:
        shape = (n_seqs, max_len_in, n_chars)
    else:
        shape = (n_seqs, max_len_in)

    with h5py.File(fname, 'w') as f:
        X = f.create_dataset('X', shape)
        for start in range(0, n_seqs, chunksize):
            # Clamp the final chunk explicitly; every row is encoded and
            # written exactly once, so no separate pass over the tail is needed.
            stop = min(start + chunksize, n_seqs)
            X[start:stop] = encode(seqs[start:stop], max_len_in, ctable)

In [3]:
def remove_signal_peptide(df):
    """Return a copy of ``df`` with the annotated signal peptide cut off.

    The ``"Signal peptide"`` column is searched for a range of the form
    ``<start>..<end>``; the first ``<end>`` characters are removed from the
    value in ``"Sequence"``. Rows whose annotation is missing (not a string)
    or contains no such range keep their sequence unchanged.

    Args:
        df: DataFrame with ``"Signal peptide"`` and ``"Sequence"`` columns.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    signal_col = "Signal peptide"
    seq_col = "Sequence"

    def get_sp_length(signal_peptide):
        # A missing annotation arrives as NaN (a float), which re.search
        # rejects with TypeError; treat it as "no signal peptide".
        if not isinstance(signal_peptide, str):
            return 0
        match = re.search(r'\d+\.\.(\d+)', signal_peptide)
        return int(match.group(1)) if match else 0

    df_copy = df.copy()
    # Pairing the two columns directly (rather than a row-wise apply) also
    # behaves correctly for an empty frame.
    df_copy[seq_col] = [
        seq[get_sp_length(annotation):]
        for seq, annotation in zip(df_copy[seq_col], df_copy[signal_col])
    ]
    return df_copy

In [4]:
def generate_sps(inputs: pd.DataFrame, clf: SignalTranslator, beam_size):
    """Generate signal peptides for the sequences in ``inputs``.

    Sequences are truncated to their first 100 characters, tokenized into the
    scratch file ``'../data/temp_tokens.hdf5'`` (overwritten on every call),
    read back through ``SignalTranslator.generator_from_h5_noy`` and
    translated with ``clf.translate_batch``.

    Args:
        inputs: object indexable by ``'Sequence'``. A DataFrame yields all of
            its sequences; a single row (Series) also works because a scalar
            string is wrapped into a one-element Series.
        clf: loaded model exposing ``translate_batch(src, beam=...)``.
        beam_size: value forwarded as ``beam`` to ``translate_batch``.

    Returns:
        DataFrame with columns ``"source"`` (each element of ``src[0]``
        decoded with the module-level ``ctable``) and ``"sp"`` (first element
        of the ``translate_batch`` result).
    """
    test_seqs = [s[:100] for s in pd.Series(inputs['Sequence']).values]
    test_filename = '../data/temp_tokens.hdf5'
    to_h5py(test_seqs, test_filename, ctable)

    # Open read-only and let the context manager close the handle even if the
    # generator raises.
    with h5py.File(test_filename, 'r') as file:
        batches = SignalTranslator.generator_from_h5_noy(file, 64, shuffle=False, use_cuda=False)
        # NOTE(review): only the first item the generator yields is used. If
        # 64 is the batch size, inputs beyond the first 64 sequences would be
        # silently ignored -- confirm against generator_from_h5_noy.
        src = next(batches)  # src is prot sequence

    # Only the decoded sequences are needed; the remaining outputs are dropped.
    decoded, *_ = clf.translate_batch(src, beam=beam_size)

    results = pd.DataFrame({"source": [ctable.decode(s.data.cpu().numpy()) for s in src[0]],
                            "sp": decoded})
    return results

In [5]:
# Load a model from a saved checkpoint file.
chkpt_name = 'SIM99_550_12500_64_6_5_0.1_64_100_0.0001_-0.03_99'
chkpt = "../outputs/models/model_checkpoints/{}.chkpt".format(chkpt_name)
clf = SignalTranslator.load_model(chkpt)

Namespace(cuda=False, d_inner_hid=1100, d_k=64, d_model=550, d_v=64, d_word_vec=550, dropout=0.1, embs_share_weight=True, max_token_seq_len=107, n_head=5, n_layers=6, proj_share_weight=True, src_vocab_size=27, tgt_vocab_size=27) Namespace(beam_size=1, ctable=<tools.CharacterTable object at 0x7fc1a0886160>, max_trans_length=72, n_best=1) Namespace(d_model=None, decay_power=-0.03, lr_max=0.0001, n_warmup_steps=12500, optim=<class 'torch.optim.adam.Adam'>)
position_encoding
position_encoding
Initiated Transformer with 27403200 parameters.


In [13]:
# Load input proteins and remove existing signal peptides
df = pd.read_csv("../data/pharmaceutical_proteins.tsv", sep="\t")
df = remove_signal_peptide(df)

# Read additional sequences, one per line. Lines starting with ">" are skipped,
# and blank lines are dropped so they do not become empty sequences.
# These sequences are appended as-is; they are not passed through
# remove_signal_peptide.
with open("../data/mature_sequences.txt") as f:
    seqs = [line.strip() for line in f if line.strip() and not line.startswith(">")]

df = pd.concat([df, pd.DataFrame({"Sequence": seqs})], ignore_index=True)

In [14]:
# One pass over every protein together.
full_sps = generate_sps(df, clf=clf, beam_size=10)

# One pass per protein: each row is handed to generate_sps on its own.
individual_sps = pd.concat(
    [generate_sps(row, clf, beam_size=10) for _, row in df.iterrows()],
    ignore_index=True,
)

# Repeated passes over random subsets: 20 draws of 15 proteins and
# 100 draws of 5 proteins, each draw seeded by its position.
random_combn_sps = pd.concat(
    [generate_sps(df.sample(n=15, random_state=seed), clf, beam_size=5) for seed in range(20)]
)
random_combn_sps_smol = pd.concat(
    [generate_sps(df.sample(n=5, random_state=seed), clf, beam_size=5) for seed in range(100)]
)

# Pool every run and keep the first occurrence of each distinct "sp" value.
all_sps = pd.concat(
    [full_sps, individual_sps, random_combn_sps, random_combn_sps_smol]
).drop_duplicates(subset="sp", ignore_index=True)

  result = self.forward(*input, **kwargs)
  out = self.model.prob_projection(dec_output)


In [16]:
# Save the pooled, de-duplicated signal peptides as CSV, omitting the index column.
all_sps.to_csv("../outputs/results/generated_sps.csv", index=False)