# Encoding example
Examples of DCA, UniRep, eUniRep and PAM250 encoding.

The complete encoding process for all datasets is in the `encoding.py` script.

In [1]:
from scripts.utils import read_variants, read_a2m, generate_unambiguous_homologs
from Bio import SeqIO
import numpy as np

#Input files of the bg_strsq dataset used in this example
wt_fasta = 'raw_data/bg_strsq/bg_strsq.fasta'
sequences_file = 'raw_data/bg_strsq/BG_STRSQ_Abate2015_encoded.csv'
homologs_file = 'raw_data/bg_strsq/bg_strsq_jhmmer.a2m'
params_file = 'raw_data/bg_strsq/bg_strsq_plmc.params'

#Read the wild-type record inside a context manager so the FASTA file is
#closed as soon as it has been parsed (a bare open() leaves the handle open)
with open(wt_fasta) as handle:
    wt = np.array(SeqIO.read(handle, 'fasta'))

In [2]:
#Read the variants from the csv file
sequences, y, variants = read_variants(sequences_file, wt_fasta, start_pos=2)

#Report the shape of what was loaded
for label, loaded in (('sequences shape: ', sequences), ('y shape: ', y)):
    print(label, loaded.shape)

sequences shape:  (2598, 478)
y shape:  (2598,)


In [3]:
#Load the homologous sequences from the a2m file
homologs = read_a2m(homologs_file)

homologs_shape = homologs.shape
print('homologs shape: ', homologs_shape)

homologs shape:  (124487, 478)


In [4]:
#Only the first 1000 homologs are converted in this example
homologs_subset = homologs[:1000,:]

#Convert homologs with ambiguous symbols like X to amino acid only sequences
unambiguous_homologs = generate_unambiguous_homologs(
    homologs_subset,
    n_processes=10,
    mode='random',
)
print('unambiguous homologs shape: ', unambiguous_homologs.shape)

unambiguous homologs shape:  (1000, 478)


### DCA encoding

In [5]:
from scripts.encoding import SequencesDCAEncoder

#Read the wild-type record inside a context manager so the FASTA file is
#closed after parsing (a bare open() leaves the handle open)
with open(wt_fasta) as handle:
    wt = np.array(SeqIO.read(handle, 'fasta'))
start_pos = 2 #depends on the dataset 

#The DCA encoder is built from the wild type, its start position and the params file
encoder = SequencesDCAEncoder(wt, start_pos, params_file)

In [8]:
#Encode the labeled variants. The encoder may eliminate some instances, so
#Xl and y can hold fewer instances than were passed in.
encoded_variants = encoder.encode_variants(sequences, y, variants)
Xl, y, indexes = encoded_variants

print('Labeled sequences shape: ', Xl.shape)

Labeled sequences shape:  (2598, 435)


In [9]:
#y was re-assigned from the encoder output, so its number of instances can be smaller than before encoding. 
print('y shape: ', y.shape)

y shape:  (2598,)


In [10]:
#Encode the unambiguous homologs as the unlabeled sequences
Xu = encoder.encode_homologs(unambiguous_homologs)

xu_shape = Xu.shape
print('Unlabeled sequences shape: ', xu_shape)

Unlabeled sequences shape:  (1000, 435)


### UniRep encoding

In [11]:
from scripts.encoding import UniRepEncoder

#Read the wild-type record inside a context manager so the FASTA file is
#closed after parsing (a bare open() leaves the handle open)
with open(wt_fasta) as handle:
    wt = np.array(SeqIO.read(handle, 'fasta'))
start_pos = 2 #depends on the dataset 

#The UniRep encoder only needs the wild type and its start position
encoder = UniRepEncoder(wt, start_pos)

In [12]:
#Encode the labeled sequences.
#This method cannot encode the homologous sequences as they contain gaps.
Xl = encoder.encode(sequences)

xl_shape = Xl.shape
print('Labeled sequences shape: ', xl_shape)

Labeled sequences shape:  (2598, 1900)


### eUniRep encoding

In [None]:
from scripts.encoding import EUniRepEncoder

#Read the wild-type record inside a context manager so the FASTA file is
#closed after parsing (a bare open() leaves the handle open)
with open(wt_fasta) as handle:
    wt = np.array(SeqIO.read(handle, 'fasta'))
start_pos = 2 #depends on the dataset

#The encoder uses the homologous sequences to fine-tune.
#The execution of this cell is very long.
encoder = EUniRepEncoder(wt, start_pos, homologs)

In [None]:
#Encode the labeled sequences.
#This method cannot encode the homologous sequences as they contain gaps.
Xl = encoder.encode(sequences)

xl_shape = Xl.shape
print('Labeled sequences shape: ', xl_shape)

### PAM250 encoding

In [15]:
from scripts.encoding import PAM250Encoder

#Read the wild-type record inside a context manager so the FASTA file is
#closed after parsing (a bare open() leaves the handle open)
with open(wt_fasta) as handle:
    wt = np.array(SeqIO.read(handle, 'fasta'))
start_pos = 2 #depends on the dataset

#The PAM250 encoder only needs the wild type and its start position
encoder = PAM250Encoder(wt, start_pos)

In [16]:
#Encode the labeled sequences
Xl = encoder.encode(sequences)

xl_shape = Xl.shape
print('Labeled sequences shape: ', xl_shape)

Labeled sequences shape:  (2598, 478, 20)


In [17]:
#The same encode method is applied to the unambiguous homologs
Xu = encoder.encode(unambiguous_homologs)

xu_shape = Xu.shape
print('Unlabeled sequences shape: ', xu_shape)

Unlabeled sequences shape:  (1000, 478, 20)
