In this section, you will delve into the intriguing world of genomic sequence encoding using advanced
techniques from the realm of hyper-dimensional computing. Genomic sequences, which consist of nucleotide bases
represented by the letters A, C, G, and T, hold vital information about an organism’s DNA. By developing
an encoder specifically designed for genomic sequences, you will uncover unique patterns and structures
inherent in the genetic data. To accomplish this, you will employ the power of hyper-dimensional spaces
and explore the concept of capacity in relation to three crucial hyperparameters: encoded sequence length,
window length, and the number of sequences bundled together. Use the genomic library dataset provided.
1


In [73]:
import torch
import numpy as np
import math

In [67]:
# Load the genome library. Only the first line of the training file is read.
with open('../genome_dataset/train.txt') as f:
    # readline() keeps the trailing newline; strip it so that it is never
    # treated as a genomic symbol when the library is encoded (the encoder
    # converts every character with int()).
    traindata = f.readline().strip()

import pandas as pd
# Validation and test query tables; later cells read their
# 'genome_sequence' and 'label' columns.
valid_data = pd.read_csv('../genome_dataset/valid.csv')
test_data = pd.read_csv("../genome_dataset/test.csv")

In [74]:
class Genome_Sequence_Model(object):
    """Hyperdimensional encoder and detector for genomic sequences.

    Every symbol of the alphabet is assigned a random basis hypervector.
    A sequence is encoded by multiplying together the basis vectors of its
    symbols, each cyclically rotated by the symbol's position, and squashing
    the product with tanh. A library is encoded by sliding a window over it
    and summing the encodings of ``memory_length`` consecutive windows into
    one memory row.

    Arguments:
     * dimension (int, > 0): dimensionality of the hypervectors.
     * sequence_length (int, > 0): number of genomic elements per window.
     * memory_length (int, > 0): number of encoded windows bundled together
                                 into one memory row.
     * window_length (int, > 0): stride of the sliding window that moves
                                 through the genomic library.
     * probability_distribution (str): distribution the basis is drawn from:
                                 'Gaussian' (mean 0, std 1; the default),
                                 'Uniform' (between -1 and 1) or
                                 'Laplacian' (loc 0, scale 1).
     * classnum (int, > 0): number of basis hypervectors (alphabet size).
                            Each character of a sequence is converted with
                            int() and used as an index into the basis.
    """

    def __init__(self, dimension: int, sequence_length: int, memory_length: int,
                 window_length: int, probability_distribution='Gaussian', classnum=4):
        self.dimension = dimension
        self.seqlen = sequence_length
        self.memlen = memory_length
        self.winlen = window_length
        self.classnum = classnum
        self.probdist = probability_distribution
        self.letter_hypervector = self.generate_basis(probability_distribution)
        self.cos = torch.nn.CosineSimilarity()

    def generate_basis(self, probability_distribution):
        """Return a (classnum, dimension) tensor of random basis hypervectors.

        Raises:
            ValueError: if ``probability_distribution`` is not one of
                'Uniform', 'Gaussian' or 'Laplacian'.
        """
        size = (self.classnum, self.dimension)
        if probability_distribution == 'Uniform':
            return torch.empty(size).uniform_(-1, 1)
        if probability_distribution == 'Gaussian':
            return torch.Tensor(np.random.normal(0, 1, size))
        if probability_distribution == 'Laplacian':
            return torch.Tensor(np.random.laplace(0, 1, size=size))
        # Fail loudly instead of returning None and crashing later in encoding.
        raise ValueError(
            f"Unsupported probability distribution: {probability_distribution!r}"
        )

    def encode_sequence(self, seq):
        """Encode ``seq`` into a single hypervector of length ``dimension``.

        Each character is converted with int() to select a basis vector; the
        vector is rotated left by the character's position and multiplied
        element-wise into the running product, which is finally passed
        through tanh.
        """
        encoded_sequence = torch.ones(self.dimension)
        for position, symbol in enumerate(seq):
            # Use this instance's own basis, not whichever model happens to be
            # bound to a module-level name.
            basis = self.letter_hypervector[int(symbol)]
            # Position-dependent cyclic shift (permutation of the hypervector).
            encoded_sequence *= torch.cat([basis[position:], basis[:position]])
        return torch.tanh(encoded_sequence)

    def generate_subseq(self, gen_lib):
        """Split ``gen_lib`` into windows of ``seqlen`` taken every ``winlen``.

        A library no longer than ``seqlen`` is returned as a single window.
        """
        if len(gen_lib) <= self.seqlen:
            return [gen_lib]
        # Sliding window through the genome library.
        return [gen_lib[start:start + self.seqlen]
                for start in range(0, len(gen_lib) - self.seqlen + 1, self.winlen)]

    def encode_library(self, genlib_data):
        """Encode the library into ``self.genlib_hypervector``.

        Row ``k`` holds the sum of the encodings of windows
        ``k*memlen .. (k+1)*memlen - 1``. Progress is printed every 100
        windows.
        """
        subsequences = self.generate_subseq(genlib_data)
        # Size the memory by the number of windows actually produced. Sizing
        # it by the raw library length leaves all-zero rows whenever the
        # stride or window length reduces the window count, and those rows
        # put an artificial floor of 0 under the maximum similarity.
        row_count = math.ceil(len(subsequences) / float(self.memlen))
        self.genlib_hypervector = torch.zeros((row_count, self.dimension))
        for i, subsequence in enumerate(subsequences):
            if i % 100 == 0:
                print(i)
            self.genlib_hypervector[i // self.memlen] += self.encode_sequence(subsequence)

    def detect_cossim(self, query_sequence):
        """Return the highest cosine similarity (0-dim tensor) between the
        encoded query and any row of the encoded library.

        ``encode_library`` must have been called first.
        """
        query_hypervector = self.encode_sequence(query_sequence)
        # Add a leading axis so the query broadcasts row-wise against the memory.
        similarities = self.cos(self.genlib_hypervector, query_hypervector.unsqueeze(0))
        return similarities.max()

    def detect_sequence(self, query_sequence, threshold=0.5):
        """Return a boolean 0-dim tensor: True when the best cosine similarity
        of the query against the library is strictly greater than ``threshold``.
        """
        return self.detect_cossim(query_sequence) > threshold

In [75]:
# Baseline model: 10000-dimensional Gaussian basis, 100-symbol windows taken
# with stride 1, and 1000 encoded windows bundled into each memory row.
gsm = Genome_Sequence_Model(
    dimension=10000,
    sequence_length=100,
    memory_length=1000,
    window_length=1,
    probability_distribution='Gaussian',
)

In [None]:
# Encode the training library into the model's bundled memory hypervectors
# (the method prints a progress counter every 100 windows).
gsm.encode_library(traindata)

In [92]:
# Score each validation sequence once, then sweep the decision threshold from
# 0.0001 to 0.0998 and print, for every threshold, how many validation labels
# the thresholded prediction matches.
valid_data['cos_predict'] = valid_data['genome_sequence'].map(gsm.detect_cossim)
for step in range(1, 999):
    threshold = step / 10000
    print(threshold)
    valid_data[f'predict_{threshold}'] = valid_data['cos_predict'].map(
        lambda score: score >= threshold
    )
    agreement = valid_data['label'] == valid_data[f'predict_{threshold}']
    print(sum(agreement))

0.0001
50
0.0002
50
0.0003
50
0.0004
50
0.0005
50
0.0006
51
0.0007
51
0.0008
51
0.0009
51
0.001
51
0.0011
51
0.0012
51
0.0013
50
0.0014
50
0.0015
50
0.0016
50
0.0017
50
0.0018
50
0.0019
50
0.002
50
0.0021
50
0.0022
50
0.0023
50
0.0024
49
0.0025
49
0.0026
48
0.0027
48
0.0028
48
0.0029
48
0.003
49
0.0031
49
0.0032
49
0.0033
50
0.0034
50
0.0035
50
0.0036
50
0.0037
50
0.0038
49
0.0039
49
0.004
48
0.0041
48
0.0042
48
0.0043
47
0.0044
47
0.0045
47
0.0046
46
0.0047
47
0.0048
47
0.0049
46
0.005
46
0.0051
47
0.0052
47
0.0053
47
0.0054
46
0.0055
46
0.0056
46
0.0057
46
0.0058
46
0.0059
46
0.006
46
0.0061
44
0.0062
44
0.0063
44
0.0064
43
0.0065
43
0.0066
43
0.0067
43
0.0068
43
0.0069
43
0.007
43
0.0071
43
0.0072
42
0.0073
41
0.0074
41
0.0075
41
0.0076
41
0.0077
41
0.0078
41
0.0079
41
0.008
41
0.0081
41
0.0082
41
0.0083
41
0.0084
42
0.0085
42
0.0086
42
0.0087
42
0.0088
41
0.0089
41
0.009
42
0.0091
42
0.0092
41
0.0093
41
0.0094
40
0.0095
40
0.0096
40
0.0097
40
0.0098
40
0.0099
41
0.01
41
0.0101
41
0

53
0.0953
53
0.0954
53
0.0955
53
0.0956
53
0.0957
52
0.0958
52
0.0959
52
0.096
52
0.0961
52
0.0962
52
0.0963
53
0.0964
53
0.0965
53
0.0966
53
0.0967
53
0.0968
53
0.0969
53
0.097
53
0.0971
53
0.0972
53
0.0973
53
0.0974
53
0.0975
53
0.0976
53
0.0977
53
0.0978
53
0.0979
53
0.098
53
0.0981
53
0.0982
53
0.0983
53
0.0984
53
0.0985
53
0.0986
53
0.0987
53
0.0988
53
0.0989
53
0.099
53
0.0991
53
0.0992
53
0.0993
53
0.0994
53
0.0995
53
0.0996
52
0.0997
52
0.0998
52


In [93]:
# Keep the raw similarity score of every test sequence, and label a sequence
# as detected when its score exceeds 0.0993 (one of the thresholds tried in
# the validation sweep).
test_data['predict_cos'] = test_data['genome_sequence'].map(gsm.detect_cossim)
test_data['predict'] = test_data['genome_sequence'].map(
    lambda sequence: gsm.detect_sequence(sequence, 0.0993)
)

In [95]:
# Report accuracy as a real percentage: the number of matching labels divided
# by the number of test rows. Printing the raw match count with a '%' sign is
# only correct when the test set happens to contain exactly 100 rows.
# max(..., 1) avoids a division by zero on an empty test table.
print(f"Test Acc : {100 * sum(test_data['label'] == test_data['predict']) / max(len(test_data), 1):g} % ")

Test Acc : 53 % 


### Hyperparameter Exploration

In [None]:
# Grid search over sequence length, memory length (windows bundled per row),
# window stride and hypervector dimension. For every combination a fresh model
# encodes the training library; the decision threshold is set to the lowest
# similarity score among the positive validation sequences, and the number of
# correctly labelled test sequences is stored in `acc` under the
# (sequence length, memory length, stride, dimension) key.
acc = {}
for seq_len in range(60, 110, 10):
    for mem_len in range(400, 1100, 100):
        for stride in range(1, 4):
            for dim in [256, 512, 4096, 10000]:
                print(seq_len, mem_len, stride, dim)
                gsm = Genome_Sequence_Model(
                    dimension=dim,
                    sequence_length=seq_len,
                    memory_length=mem_len,
                    window_length=stride,
                    probability_distribution='Gaussian',
                )
                gsm.encode_library(traindata)
                positives = valid_data[valid_data['label'] == 1]['genome_sequence']
                th = min(positives.map(gsm.detect_cossim))
                test_data['predict'] = test_data['genome_sequence'].map(
                    lambda sequence: gsm.detect_sequence(sequence, th)
                )
                acc[(seq_len, mem_len, stride, dim)] = sum(
                    test_data['label'] == test_data['predict']
                )

In [101]:
# Turn the {(seqlen, memorylen, window, dim): correct_count} mapping into a
# results table. All rows are collected first and the frame is built once:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call. The count is divided by 100 to give an accuracy fraction, which
# assumes a 100-row test set.
result_df = pd.DataFrame(
    [
        {'seqlen': seqlen, 'memorylen': memorylen, 'window': window, 'dim': dim,
         'acc': correct / 100}
        for (seqlen, memorylen, window, dim), correct in acc.items()
    ],
    columns=['seqlen', 'memorylen', 'window', 'dim', 'acc'],
)

In [107]:
# Show only the configurations whose accuracy is above 0.53.
result_df.loc[result_df['acc'] > 0.53]

Unnamed: 0,seqlen,memorylen,window,dim,acc
103,70,500,2,10000,0.56
133,70,800,1,512,0.55
144,70,900,1,256,0.54
281,90,600,2,512,0.59
336,100,400,1,256,0.54
344,100,400,3,256,0.55
384,100,800,1,256,0.54
405,100,900,3,512,0.54


In [105]:
# Save the hyperparameter-search results table as a CSV file in the current
# working directory.
result_df.to_csv('./genome_detect_result.csv')