# Encoding example 
Examples of one-hot encoding and DCA encoding. The complete encoding process for all datasets (including the UniRep and eUniRep encodings) is in the `encoding.py` script.

In [None]:
from scripts.utils import read_variants, read_a2m, generate_unambiguous_homologs
from Bio import SeqIO
import numpy as np

# Paths to the input files used by the cells below, relative to the notebook's working directory.
wt_fasta = '../Data/bg_strsq/bg_strsq.fasta'                        # read as FASTA just below; also passed to read_variants
sequences_file = '../Data/bg_strsq/BG_STRSQ_Abate2015_encoded.csv'  # passed to read_variants
homologs_file = '../Data/bg_strsq/bg_strsq_jhmmer.a2m'              # passed to read_a2m
params_file = '../Data/bg_strsq/bg_strsq_plmc.params'               # passed to SequencesDCAEncoder

# Hand the path directly to SeqIO.read so Biopython opens *and closes* the file itself.
# Wrapping it in a bare open(...) left the file handle unclosed for the life of the kernel.
wt = np.array(SeqIO.read(wt_fasta, 'fasta'))

In [None]:
# Read the variants file. read_variants also receives the wild-type FASTA path and start_pos=2.
# NOTE(review): start_pos presumably is the numbering of the first residue — confirm in scripts.utils.
sequences, y, variants = read_variants(sequences_file, wt_fasta, start_pos=2)

# Quick sanity check on the sizes of what was loaded.
print('sequences shape: ', sequences.shape)
print('y shape: ', y.shape)

In [30]:
# Read the homologs from the .a2m alignment file.
homologs = read_a2m(homologs_file)

# Report the size of the loaded homolog array.
print('homologs shape: ', homologs.shape)

homologs shape:  (124487, 478)


In [10]:
# Convert homologs that contain ambiguous symbols such as X into sequences made only of amino acids.
# Only the first 1000 homologs are passed in, and 10 processes are requested.
# NOTE(review): mode='random' suggests the result is not deterministic and no seed is set here —
# confirm whether generate_unambiguous_homologs is reproducible across runs.
unambiguous_homologs = generate_unambiguous_homologs(homologs[:1000,:], n_processes=10, mode='random')
print('unambiguous homologs shape: ', unambiguous_homologs.shape)

unambiguous homologs shape:  (2250, 478)


### One-hot encoding

In [19]:
from scripts.encoding import SequencesOneHotEncoder

# Build the one-hot encoder from the wild-type sequence; start_pos=2 is the same value given to read_variants.
encoder = SequencesOneHotEncoder(wt, start_pos=2)

In [20]:
# One-hot encode the labeled sequences.
# NOTE(review): the resulting axes are presumably (sequence, position, amino acid) — confirm in scripts.encoding.
Xl = encoder.encode(sequences)

print('Labeled sequences shape: ', Xl.shape)

Labeled sequences shape:  (2598, 478, 20)


In [21]:
# One-hot encode the unambiguous homologs, which serve as the unlabeled sequences.
Xu = encoder.encode(unambiguous_homologs)

print('Unlabeled sequences shape: ', Xu.shape)

Unlabeled sequences shape:  (134, 478, 20)


### DCA encoding

In [19]:
from scripts.encoding import SequencesDCAEncoder

# Build the DCA encoder from the wild-type sequence and the parameters file.
# NOTE(review): the positional 2 presumably is start_pos, as in the read_variants call — confirm
# against the SequencesDCAEncoder signature.
# NOTE(review): this rebinds `encoder`; re-running the one-hot cells after this one would use the DCA encoder.
encoder = SequencesDCAEncoder(wt, 2, params_file)

No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)


In [23]:
# DCA-encode the labeled variants; the call also returns a label vector, which replaces `y`.
# NOTE(review): `y` is both an input and rebound here, so re-running this cell passes the
# already-returned `y` back in together with the original `sequences` — verify that this is safe.
Xl, y = encoder.encode_variants(sequences, y, variants)

print('Labeled sequences shape: ', Xl.shape)

Labeled sequences shape:  (2598, 435)


(2598, 435)

In [24]:
# The number of instances can be smaller after encoding, so check the shape of the `y` returned by encode_variants.
print('y shape: ', y.shape)

y shape:  (2598,)


In [25]:
# DCA-encode the unambiguous homologs, which serve as the unlabeled sequences.
Xu = encoder.encode_homologs(unambiguous_homologs)

print('Unlabeled sequences shape: ', Xu.shape)

Unlabeled sequences shape:  (134, 435)
