# 1. Computing embeddings from a protein sequence

In [None]:
import _pickle as cPickle
import numpy as np
import h5py
from operator import itemgetter
import copy
import random
from tqdm import tqdm
from collections import Counter, defaultdict

import sys
sys.path.append("../utils/")
import encoders as en

## 1.1. Get One-Hot Encoding

In [None]:
# Ubuntu                    22.04
# CUDA                      12.4
# Driver Version            550.54.14

# miniconda3                24.1.2

# python                    3.10.9 
# numpy                     1.26.4

In [None]:
# Example protein sequence used as input for the encoder below.
single_sequence = 'MNLLCCCCCSNMAPNQRVTRKWELFAGRNKFYCDGLLMSAPHTGVFYLTCILITGTSALF'

# One-hot encoder provided by the local `encoders` module (imported as `en`).
encoder = en.OHE_Encoder()
# NOTE(review): the layout of the returned object is defined in the `encoders`
# module, not in this notebook — check it there before interpreting the shape.
x = encoder.encode_sequence(single_sequence)
# Bare trailing expression so the notebook displays the shape of the encoding.
x.shape

(21, 1)

## 1.2. Get BLOSUM-62 Encoding

In [None]:
# Ubuntu                    22.04
# CUDA                      12.4
# Driver Version            550.54.14

# miniconda3                24.1.2

# python                    3.10.9 
# numpy                     1.26.4

In [None]:
# Example protein sequence used as input for the encoder below.
single_sequence = 'MNLLCCCCCSNMAPNQRVTRKWELFAGRNKFYCDGLLMSAPHTGVFYLTCILITGTSALF'

# BLOSUM-62 encoder provided by the local `encoders` module (imported as `en`).
encoder = en.BL62_Encoder()
# NOTE(review): the layout of the returned object is defined in the `encoders`
# module, not in this notebook — check it there before interpreting the shape.
x = encoder.encode_sequence(single_sequence)
# Bare trailing expression so the notebook displays the shape of the encoding.
x.shape

(23, 1)

## 1.3. Get ESM-2 Encoding

In [None]:
# Ubuntu                    22.04
# CUDA                      12.4
# Driver Version            550.54.14

# miniconda3                24.1.2

# python                    3.10.9 
# numpy                     1.26.4
# torch                     2.4.1
# transformers              4.39.2
# fair-esm                  2.0.0

In [None]:
# Example protein sequence used as input for the encoder below.
single_sequence = 'MNLLCCCCCSNMAPNQRVTRKWELFAGRNKFYCDGLLMSAPHTGVFYLTCILITGTSALF'

# ESM-2 encoder provided by the local `encoders` module (imported as `en`).
encoder = en.ESM2_Encoder()
# NOTE(review): the layout of the returned object is defined in the `encoders`
# module, not in this notebook — check it there before interpreting the shape.
x = encoder.encode_sequence(single_sequence)
# Bare trailing expression so the notebook displays the shape of the encoding.
x.shape

(1280, 1)

## 1.4. Get ProtT5 Encoding

In [None]:
# Ubuntu                    22.04
# CUDA                      12.4
# Driver Version            550.54.14

# miniconda3                24.1.2

# python                    3.10.9 
# numpy                     1.26.4
# torch                     2.4.1
# transformers              4.39.2

In [None]:
# Example protein sequence used as input for the encoder below.
single_sequence = 'MNLLCCCCCSNMAPNQRVTRKWELFAGRNKFYCDGLLMSAPHTGVFYLTCILITGTSALF'

# ProtT5 encoder provided by the local `encoders` module (imported as `en`).
# The recorded output of this cell shows it loading a ProtT5 model when run.
encoder = en.T5_Encoder()
# NOTE(review): the layout of the returned object is defined in the `encoders`
# module, not in this notebook — check it there before interpreting the shape.
x = encoder.encode_sequence(single_sequence)
# Bare trailing expression so the notebook displays the shape of the encoding.
x.shape

Loading ProtT5 model: Rostlab/prot_t5_xl_half_uniref50-enc
Using half precision on GPU.


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


(1024, 1)

## 1.5. Get xTrimoPGLM Encoding

In [None]:
# Ubuntu                    22.04
# CUDA                      12.4
# Driver Version            550.54.14

# miniconda3                24.1.2

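# python                    3.13.2
# numpy                     2.2.4
# torch                     2.6.0
# transformers              4.51.1
# deepspeed                 0.16.5

# NOTE: the python/numpy/torch/transformers versions listed here differ from
# those listed for sections 1.1–1.4 (python 3.10.9, numpy 1.26.4, torch 2.4.1,
# transformers 4.39.2), so this section needs its own environment/kernel.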

In [None]:
# Example protein sequence used as input for the encoder below.
single_sequence = 'MNLLCCCCCSNMAPNQRVTRKWELFAGRNKFYCDGLLMSAPHTGVFYLTCILITGTSALF'

# xTrimoPGLM encoder provided by the local `encoders` module (imported as `en`).
encoder = en.xTrimoPGLM_Encoder()
# NOTE(review): the layout of the returned object is defined in the `encoders`
# module, not in this notebook — check it there before interpreting the shape.
x = encoder.encode_sequence(single_sequence)
# Bare trailing expression so the notebook displays the shape of the encoding.
x.shape

# 2. Example: saving to a pickle file

In [None]:
# Write the encoding and its label into one pickle file as two consecutive
# records. `x` is whatever the most recently executed encoder cell assigned,
# so run the desired encoder cell immediately before this one.
# Reading the file back requires two load calls on the same open handle, in
# the same order: first the encoding, then the label.
with open("example.pickle", 'wb') as file:
    cPickle.dump(x, file)
    cPickle.dump(np.array([0.]), file)   # [0.] for negative, [1.] for positive