In this section, you will delve into the intriguing world of genomic sequence encoding using advanced
techniques from the realm of hyper-dimensional computing. Genomic sequences, which consist of nucleotide
bases represented by the letters A, C, G, and T, hold vital information about an organism’s DNA. By
developing an encoder specifically designed for genomic sequences, you will uncover unique patterns and
structures inherent in the genetic data. To accomplish this, you will employ the power of hyper-dimensional
spaces and explore the concept of capacity in relation to three crucial hyperparameters: encoded sequence
length, window length, and the number of sequences bundled together. Use the provided genomic library dataset.
1


In [89]:
import torch
import numpy as np

In [199]:
import pandas as pd

# The genome library is read from the first line of train.txt. Strip the
# surrounding whitespace so the trailing newline kept by readline() is never
# treated as a sequence symbol by the encoder.
with open('../genome_dataset/train.txt') as f:
    traindata = f.readline().strip()

# Validation and test sets are CSV tables; later cells read their
# 'genome_sequence' and 'label' columns.
valid_data = pd.read_csv('../genome_dataset/valid.csv')
test_data = pd.read_csv("../genome_dataset/test.csv")

In [208]:
class Genome_Sequence_Model(object):
    """Hyperdimensional encoder and membership detector for genomic sequences.

    Each symbol gets a random basis hypervector. A sequence is encoded by
    multiplying together the position-rotated basis vectors of its symbols,
    and a library is the sum (bundle) of the encodings of its sliding windows.

    Arguments:
     * dimension (int, > 0): Dimensionality of the hypervectors.
     * sequence_length (int, > 0): Number of symbols in each window taken
       from the library.
     * memory_length (int, > 0): Exclusive upper bound on the window start
       offsets used when bundling the library (see ``generate_subseq``).
     * window_length (int, > 0): Stride of the sliding window.
     * probability_distribution (str): 'Gaussian' (mean 0, std 1), 'Uniform'
       (on [-1, 1)) or 'Laplacian' (loc 0, scale 1).
     * classnum (int): Number of distinct symbols, i.e. rows of the basis.
    """

    def __init__(self, dimension: int, sequence_length: int, memory_length: int, window_length: int, probability_distribution='Gaussian', classnum=4):
        self.dimension = dimension
        self.seqlen = sequence_length
        self.memlen = memory_length
        self.winlen = window_length
        self.classnum = classnum
        self.probdist = probability_distribution
        self.letter_hypervector = self.generate_basis(probability_distribution)
        self.cos = torch.nn.CosineSimilarity()

    def generate_basis(self, probability_distribution):
        """Return a (classnum, dimension) tensor of random basis hypervectors.

        Raises:
            ValueError: if ``probability_distribution`` is not one of
                'Uniform', 'Gaussian' or 'Laplacian'.
        """
        size = (self.classnum, self.dimension)
        if probability_distribution == 'Uniform':
            return torch.empty(size).uniform_(-1, 1)
        if probability_distribution == 'Gaussian':
            return torch.Tensor(np.random.normal(0, 1, size))
        if probability_distribution == 'Laplacian':
            return torch.Tensor(np.random.laplace(0, 1, size=size))
        # Fail here instead of returning None and crashing later on indexing.
        raise ValueError(
            f"Unsupported probability distribution: {probability_distribution!r}"
        )

    def encode_sequence(self, seq):
        """Encode ``seq`` into a single hypervector of length ``dimension``.

        Every element of ``seq`` must convert with ``int()`` to a row index of
        the basis. The basis vector of the element at position ``p`` is rotated
        left by ``p`` places, all rotated vectors are multiplied element-wise,
        and the product is squashed with ``tanh``.
        """
        encoded_sequence = torch.ones(self.dimension)
        for p, c in enumerate(seq):
            # Use this instance's basis, not whatever a global happens to hold.
            base = self.letter_hypervector[int(c)]
            # Rotation by position makes the binding order-sensitive.
            encoded_sequence *= torch.cat([base[p:], base[:p]])
        return torch.tanh(encoded_sequence)

    def generate_subseq(self, gen_lib):
        """Return the sliding windows of ``gen_lib`` that get bundled.

        A library no longer than ``sequence_length`` is returned whole as the
        only window. Otherwise windows of ``sequence_length`` symbols start at
        offsets 0, winlen, 2*winlen, ... below
        ``min(memory_length, len(gen_lib) - sequence_length + 1)``.
        """
        if len(gen_lib) <= self.seqlen:
            return [gen_lib]
        last_start = min(self.memlen, len(gen_lib) - self.seqlen + 1)
        return [gen_lib[i:i + self.seqlen]
                for i in range(0, last_start, self.winlen)]

    def encode_library(self, genlib_data):
        """Bundle the window encodings of ``genlib_data`` into
        ``self.genlib_hypervector`` (shape (1, dimension)), replacing any
        previously encoded library."""
        self.genlib_hypervector = torch.zeros((1, self.dimension))
        for m in self.generate_subseq(genlib_data):
            self.genlib_hypervector += self.encode_sequence(m)

    def detect_cossim(self, query_sequence):
        """Return the cosine similarity (0-dim tensor) between the encoded
        query and the library hypervector.

        ``encode_library`` must have been called first, since it is the only
        place that sets ``self.genlib_hypervector``.
        """
        query_hypervector = self.encode_sequence(query_sequence)
        cossim = self.cos(self.genlib_hypervector, query_hypervector)
        return cossim[0]

    def detect_sequence(self, query_sequence, threshold=0.5):
        """Return a 0-dim boolean tensor: True when the query's cosine
        similarity to the library is strictly greater than ``threshold``."""
        return self.detect_cossim(query_sequence) > threshold

In [214]:
# Encoder configuration for this experiment: 10000-dimensional hypervectors,
# 100-symbol windows, stride 1, Gaussian basis vectors.
gsm = Genome_Sequence_Model(
    dimension=10000,
    sequence_length=100,
    memory_length=1000,
    window_length=1,
    probability_distribution='Gaussian',
)

In [215]:
# Bundle the sliding-window encodings of the training genome into the
# model's library hypervector.
gsm.encode_library(genlib_data=traindata)

In [216]:
# Score every validation sequence once against the encoded library.
valid_data['cos_predict'] = valid_data['genome_sequence'].map(gsm.detect_cossim)

# Sweep thresholds 0.001 .. 0.998; for each one, store the predictions in
# their own column and print the threshold followed by the number of
# validation rows whose prediction equals the label.
for t in range(1, 999):
    threshold = t / 1000
    column = f'predict_{threshold}'
    print(threshold)
    valid_data[column] = valid_data['cos_predict'].map(lambda score: score >= threshold)
    matches = valid_data['label'] == valid_data[column]
    print(sum(matches))

0.001
61
0.002
57
0.003
53
0.004
53
0.005
54
0.006
56
0.007
54
0.008
53
0.009
53
0.01
53
0.011
53
0.012
53
0.013
53
0.014
52
0.015
51
0.016
51
0.017
51
0.018
50
0.019
50
0.02
50
0.021
50
0.022
50
0.023
50
0.024
50
0.025
50
0.026
50
0.027
50
0.028
50
0.029
50
0.03
50
0.031
50
0.032
50
0.033
50
0.034
50
0.035
50
0.036
50
0.037
50
0.038
50
0.039
50
0.04
50
0.041
50
0.042
50
0.043
50
0.044
50
0.045
50
0.046
50
0.047
50
0.048
50
0.049
50
0.05
50
0.051
50
0.052
50
0.053
50
0.054
50
0.055
50
0.056
50
0.057
50
0.058
50
0.059
50
0.06
50
0.061
50
0.062
50
0.063
50
0.064
50
0.065
50
0.066
50
0.067
50
0.068
50
0.069
50
0.07
50
0.071
50
0.072
50
0.073
50
0.074
50
0.075
50
0.076
50
0.077
50
0.078
50
0.079
50
0.08
50
0.081
50
0.082
50
0.083
50
0.084
50
0.085
50
0.086
50
0.087
50
0.088
50
0.089
50
0.09
50
0.091
50
0.092
50
0.093
50
0.094
50
0.095
50
0.096
50
0.097
50
0.098
50
0.099
50
0.1
50
0.101
50
0.102
50
0.103
50
0.104
50
0.105
50
0.106
50
0.107
50
0.108
50
0.109
50
0.11
50
0.111
50
0.112
50
0.11

In [217]:
# Encode and score each test sequence once, then derive the label from the
# stored score instead of re-encoding every sequence a second time.
# 0.001 is the threshold with the highest match count (61) in the validation
# sweep output; the strict '>' is the same comparison detect_sequence applies.
test_data['predict_cos'] = test_data['genome_sequence'].map(lambda x : gsm.detect_cossim(x))
test_data['predict'] = test_data['predict_cos'].map(lambda score : score > 0.001)

In [218]:
# Report accuracy as a true percentage: the count of correct rows divided by
# the number of test rows. The bare count is only a percentage when the test
# set has exactly 100 rows.
hits = sum(test_data['label'] == test_data['predict'])
test_accuracy = 100 * int(hits) / len(test_data)
print(f"Test Acc : {test_accuracy:g} % ")

Test Acc : 61 % 
