In [47]:

class Encoder :
    ''' this class reads a file in binary and produces an encrypted DNA sequence using the dictionaries below. it also segments
    the sequence by inserting a location-specific tag every 256 nucleotides'''

    dec2BinQuinTable = {       # Our dict of lists used as a cipher to substitute all 16 4-bit combinations with pairs of nucleotides
        0: ["0000", "TA"],                
        1: ["0001", "GB"],    
        2: ["0010", "GC"], 
        3: ["0011", "AG"],    
        4: ["0100", "AT"],      #       B is one of the synthetic hachimoji nucleotides. it will be used the same way
        5: ["0101", "BA"],      #       A, C, T, and G are used, as "coding" nucleotides. the other 3 will not encode
        6: ["0110", "TB"],      #       file information but will be used to track location throughout the sequence
        7: ["0111", "BC"],
        8: ["1000", "BG"],
        9: ["1001", "BT"],
        10: ["1010", "CA"],
        11: ["1011", "CB"],
        12: ["1100", "TC"],
        13: ["1101", "CG"],
        14: ["1110", "CT"],
        15: ["1111", "GA"]
        }
    
    bin2QuinTable = {value[0]:value[1] for key, value in dec2BinQuinTable.items()} # dict with binary as key and nuc pair as value

    dec2QuinTable = {key:value[1] for key, value in dec2BinQuinTable.items()} # dict with base-10 digit as key and nuc pair as value 

    quin2BinTable = {value[1]:value[0] for key, value in dec2BinQuinTable.items()} # dict with nuc pair as key and binary as value

    codingNucsList = [code[1] for code in dec2BinQuinTable.values()] # ordered list of nuc pairs

    def __init__(self, binFile):
        self.binFile = binFile

    def decimal_to_quint(self, decimal_num):
        ''' create a dictionary to map decimal values to quint characters'''
        quint_num = ''  # initialize the hex number string
        if decimal_num == 0: 
            quint_num = "TA"
        while decimal_num > 0:
            remainder = decimal_num % 16
            quint_num = Encoder.dec2QuinTable[remainder] + quint_num
            decimal_num //= 16
        return quint_num 

    def convertToDNA(self):
        ''' takes in binary file and converts to DNA using dictionary'''
        DNAseq = ""
        for i in range(0,len(self.binFile),4):
            nibble = self.binFile[i:i+4]  # nibble = a group of four bits
            if nibble in Encoder.bin2QuinTable:
                DNAseq += Encoder.bin2QuinTable[nibble]
        return DNAseq

    def addTags(self):
        ''' add location tags 256 characters apart throughout the sequence, so that the sequence is segmented evenly and easy to parse through '''

        DNAseq = self.convertToDNA() 
        tag_num = 0
        taggedSeq = str()
        startPosition = 0
        endPosition = 0
        end = len(DNAseq)

        while startPosition < end:
            locationTag = self.makeTag(tag_num)
            endPosition = startPosition + 256 - len(locationTag) #ensures there is always a "P" (start of location tag) every 256 chars, regardless of variable length of location tag

            if (endPosition > end):
                endPosition = end
            taggedSeq += locationTag + DNAseq[startPosition:endPosition]
            startPosition = endPosition
            tag_num += 1
        
        locationTag = self.makeTag(tag_num, True) #creates final location tag specific to the end of the file
        taggedSeq += locationTag + DNAseq[startPosition:endPosition]    
        
        taggedSeq = self.addFillerNucs(taggedSeq) #check to see if last slice of string is 256 nucleotides long, if not, add filler nucs 

        return taggedSeq
        
    def makeTag(self, count,at_end=False):
        '''create location tags that code for numbers, so each tag contains information about where in the sequence it is located'''

        start =  "PS"    # non-coding hachimoji nucs that flag the beginnings and ends of each location tag
        end = "PS"   
        separator = "W"     # another non-coding nuc used in location tags at the start & end of sequence, to separate the location marker from the file bound
        file_bound = 0    #each seq has two file bounds -- think "book ends"
        locationTag = self.decimal_to_quint(count)
        
        if count == 0: #specific for creating location tag #1 
            headerString = start + Encoder.dec2BinQuinTable[file_bound][1] + separator + Encoder.dec2BinQuinTable[0][1] + end        
        elif at_end == True: #specific for creating the final location tag
            file_bound+=1
            headerString = start + Encoder.dec2BinQuinTable[file_bound][1] + separator + self.decimal_to_quint(count) + end
        else: 
            headerString = start + locationTag + end 
        
        return headerString

    def addFillerNucs(self, string):
        '''ensures that the seq has 256 nucs between each position tag; if the seq is not long enough for the last two position
        tags to be 256 nucs apart, add filler "TG"s (non-coding) to achieve proper segmenting'''

        p_indices = [i for i in range(len(string)) if string[i] == "P"] #P is the start of every location tag
    
        if len(p_indices) < 3: # If there are fewer than 3 occurrences of "P" this will not work
            print("error")
            exit()

        second_to_last_p_index = p_indices[-2] #start of final position tag
        fourth_to_last_p_index = p_indices[-4] #start of second-to-last position tag
        current_distance = second_to_last_p_index - fourth_to_last_p_index 
        desired_distance = 256

        if current_distance == desired_distance:
            return string
        else:
            difference = desired_distance - current_distance
            tg_string = "TG" * (difference // 2) #divide by 2 because 2 chars in "TG"

            if difference % 2 != 0: #if difference is odd, add a solo "T" after the string of "TG"s
                tg_string += "T"

            string = string[:second_to_last_p_index] + tg_string + string[second_to_last_p_index:] #insert the string of "TG"s after all coding data and just before final position tag

        return string

class Decoder :
    ''' this class converts our encoded sequence back to its original file'''

    quin2BinTable = {'TA': '0000', 'GB': '0001', 'GC': '0010', 'AG': '0011', 'AT': '0100', 'BA': '0101', 'TB': '0110', 'BC': '0111', 'BG': '1000', 'BT': '1001', 'CA': '1010', 'CB': '1011', 'TC': '1100', 'CG': '1101', 'CT': '1110', 'GA': '1111'} 

    def __init__(self, taggedSeq):
        self.taggedSeq = taggedSeq 

    def removeTags(self):
        ''' removes location tags by deleting everything between two instances of "PS", including the "PS"s'''

        if "PS" in self.taggedSeq:
            while True:
                start = self.taggedSeq.find("PS")
                if start == -1:
                    break
                end = self.taggedSeq.find("PS", start+2)
                if end == -1:
                    break
                distance = end - start
                if distance < 256: #checks to make sure we're deleting location tags and not the coding nucleotides between PS's from different tags
                    self.taggedSeq = self.taggedSeq[:start] + self.taggedSeq[end+2:]
                untaggedSeq = self.taggedSeq
        else: 
            untaggedSeq = self.taggedSeq
        return self.taggedSeq

    def removeFillerNucs(self):
        ''' removes "TG"s from end of sequence (as well as the lone "T" filler nuc)'''
        untaggedSeq = self.removeTags()
        fillerNucs = "TG" 
        startIdx = 0
        if fillerNucs in untaggedSeq:
            while True:
                filler = untaggedSeq.find(fillerNucs, startIdx)
                if filler == -1:
                    # no more occurrences of "TG" found
                    bareSeq = untaggedSeq
                    break
                else:
                    bareSeq = untaggedSeq[:filler]
                    # set startIdx to begin searching after the last occurrence found
                    startIdx = filler + len(fillerNucs)
        else:
            # no occurrences of TG found
            bareSeq = untaggedSeq
        
        if bareSeq.endswith("T"): # Delete filler "T" from the end of the strand if there is one
            bareSeq = bareSeq[:-1]
        return bareSeq

    def convertToBin(self):
        ''' use cipher dictionary to convert back to original binary file '''
        bareSeq = self.removeFillerNucs()
        binFile = ""
        for i in range(0,len(bareSeq),2):
            nuc_pair = bareSeq[i:i+2]  
            if nuc_pair in Decoder.quin2BinTable:
                binFile += Decoder.quin2BinTable[nuc_pair]
        return binFile

def main():
    inputBinFile = "0100111010100000000011010100111000001110001111100011111000111110001111101111110000110100110111100100001011111010010010100100111111010000110100001010110100111010111011101111000011001100001101001101010000100100001001000010111101000010001111000010000011110000000000100000111100000000111000000010000011110000111011001111010000111110000011110010001111101111001000111101010011100100101011010000001001001111001000000010000011100000010000110000110100000011010011111111111000101111000011000011010011010100101011011111111000111010000111111101000001100101010101110000100101110000011101000000010111100000011101000000001000100100010011111100111111100000001000000010000000100000001001001010111110101100001000001101010000100011000000100100000000111100010000001111001000100100010011101010111111010000110011111010000011011010110101001111101011100000001111100011010001000011111000110100010001000010001111100011010001001100110101000010111000000000001000111101110100000010001111010000001011111100000011000100001000111010001111100011101011100011111100000100000011110011000011011101010001001010111111001111110001001100001111000000001000110011001110100110110001000010010001000100011001001000010111111010001000100000110111100011111000111110001110100010010000100100001001001101010000100100001011111100111111100011101010101100111100001010110011111100111111100000101011000000001011110000001001000010010001001111110011111110000000100000001000000010000010100000000011010100111000001110001111100011111000111110001111101111110000110100110111100100001011111010010010100100111111010000110100001010110100111010111011101111000011001100001101001101010000100100001001000010111101000010001111000010000011110000000000100000111100000000111000000010000011110000111011001111010000111110000011110010001111101111001000111101010011100100101011010000001001001111001000000010000011100000010000110000110100000011010011111111111000101111000011000011010011010100101011011111111000111010000111111101000011101111110000110100110111100100001011111010010010100100111111010000110100001010110100111010111011101111000011001100001101001101010000100100001001000010111101000010001111000010000011110000000000100000111100000000111000000010000011110000111011001111010000111110000011110010001111101111001000111101010011100100101011010000001001001111001000000010000011100000010000110000110100000011010011111111111000101111000011000011010011010100101011011111111000111010000100011110101000001001101000111101101001101100010000100100010001000110010010000101111100100011111011100010000000100000001000000010010010101010101011001111101011010000110101001101000000100100001001001101010001001101010011010011000011011100010011100011111001000011101010100000000011010100111000001110111111100000001000000010000000100000001001001010111110101100001000001101010000100011000000100100000000111100010000001111001000100100010011101010111111010000110011111010000011011010110101001111101011100000001111100011010001000011111000110100010001000010001111100011010001001100110101000010111000000000001000111101110100000010001111010000001011111100000011000100001000111010001111100011101011100011111100000100000011110011000011011101010001001010111111001111110001001100001111000000001000100100010011111100111111100000001000000010000000100000001001001010111110101100001000001101010000100011000000100100000000111100010000001111001000100100010011101010111111010000110011111010000011011010110101001111101011100000001111100011010001000011111000110100010001000010001111100011010001001100110101000010111000000000001000111101110100000010001111010000001011111100000011000100001000111010001111100011101011100011111100000100000011110011000011011101010001001010111111001111110001001100001111000000001000110011000111111101000010101111101011000010000011010100001000110000001001000000011001010101011100001001011100000111010000000101111000000111010000000010001001001001101010100000000001010110000101000011101000100010000011011110001111100011111000111010001001000010010000100100110101000010010000101111110011111110001110101010110011110000101011001111110011111110000010101100000000101111000000100100001001000100111111001111111000000010000000100000001000000010010010101111101011000010000011010100001000110000001001000000001111000100000011110010001001000100111010101111110100001100111110100000110110101101010011111010111000000011111000110100010000111110001101000100010000100011111000110100010011001101010000101110101000000000110101001110000011100011111000111110001111100011111011111100001101001101111001000010111110100100101001001111110100001101000010101101001110101110111011110000110011000010000000100000111000000100001100001101000000110100111111111110001011110000110000110100110101001010110111111110001110100001111111010000010000111110000011110010001111101111001000111101010011100100101011010000001001001111001000000010000011100000010000110000110100000011010011111111111000101111000011000011010011010100101011011111111000111010000111111101000010101101001110101110111011110000110011000011010011010100001001000010010000101111010000100011110000100000111100000000001000001111000000001110000000100000111100001110110011110100001111100000111100100011111011110010001111010100111001001010110100000010010011110010000000100000111000000100001100001101000000110100111111111110001011110000110000110100110101001010110111111110001110100001"
    inputEncodedSeq = "PST/WT_PSA/CCCCCCTCATATAC/ATCTTACTA/CTAGC/AGCTA_5T//TCAGA/CGCTA/GCGACAA/C/A/GACGTACGTA/ACGAGCACTC_PSA/CCCCCCTCATATAC/ATCTTACTA/CTAGC/AGCTA_5T//TCAGA/CGCTA/GCGACAA/C/A/GACGTACGTA/ACGAGCACTCT/A_PSA/CCCCCCTCATATAC/ATCTTACTA/CTAGC/AGCTA_5T//TCAGA/CGCTA/GCGACAA/C/A/GACGTACGTA/ACGAGCACTCT/A_PSA/CCCCCCTCATATAC/ATCTTACTA/CTAGC/AGCTA_5T//TCAGA/CGCTA/GCGACAA/C/A/GACGTACGTA/ACGAGCACTCT/A_PSA/CCCCCCTCATATAC/ATCTTACTA/CTAGC/AGCTA_5T//TCAGA/CGCTA/GCGACAA/C/A_PSA/CCCCCCTCATATAC/ATCTTACTA/CTAGC/AGCTA_5T//TCAGA/CGCTA/GCGACAA/C/A/GACGTACGTA/ACGAGCACTCT/A_PSA/CCCCCCTCATATAC/ATCTTACTA/CTAGC/AGCTA_5T//TCAGA/CGCTA/GCGACAA/C/A/GACGTACGTA/ACGAGCACTCT/A_PSA/CCCCCCTCATATAC/ATCTTACTA/CTAGC/AGCTA_5T//TCAGA/CGCTA/GCGACAA/C/A/GACGTACGTA/ACGAGCACTCT/A_PSA/CCCCCCTCATATAC/ATCTTACTA/CTAGC/AGCTA_5T//TCAGA/CGCTA/GCGACAA/C/A/GACGTACGTA/ACGAGCACTCT/A/GACGTACGTA/ACGAGCACTCT/AT/AT/TCT5/GATA/ATGCATG/AT/CGAATGCAGTCGCTAG/TATAGCTAG/TAT/CTTA/CTAG/TAC/TCGAATAGCTTA/AGCAGCTGAGCAGC/A/CTATCACGTA/CATGAGC/AGCTA/TTAA/AXTACGTAAGATG_GACTGCG/TATCAGATCG/PSGBPSTCAC/GGACTAGCAGBTGTG/GTGTGTGT/T/TGTGTGTG//TG/GT//GTGTGTGTGTGTGTGTGTGTG/GTGTGTGTGTGTGTGTGTGTGTGTGT/TGTGTG/GT/TPSGBWCBPS"
    
    myEncoder = Encoder(inputBinFile) #use to encode a binary file
    myDecoder = Decoder(inputEncodedSeq) #use to decode DNA seq

    print("The binary file:")
    binFile = myDecoder.convertToBin()
    print(binFile)

    print("\nThe encoded sequence:")
    encodedSeq = myEncoder.addTags()
    print(encodedSeq)
    
    # For debugging: 
    import re 
    locations = [m.start() for m in re.finditer('PS', inputEncodedSeq)] # verify locations of position tags, they should be every 256 chars
    print("\n ERROR: LOCATION TAGS OUT OF PLACE\n Tags at indices:", locations[::2], "\n\n\n\n\n") #count only the "PS"s at start of location tag

if __name__ == '__main__':
    main()



The binary file:
110001000100110000000000001000100000101011110010000011011111001011001110101000000000010011100000111011100011001111101100001111011110001011111010111111010000110100001101001110101110111011101010000000000100111000001110111000110011111011000011110111100010111110101100010001001100000000000010001000001010111100100000110111110010110011000100010011000000000000100010000010101111001000001101111100101100111111010000110100001101001110101110111001001110111100001010110110101101111000110100001111100011010011100011110100000010001110100010001110100010000011001010001100111110111111010100101011111100110011111110001110100001

The encoded sequence:
PSTAWTAPSATCTCATATACGATCTTACTAGCTAGCTAGCTAGCTGATCAGATCGCTATGCGACAATCAATGACGTACGTACACGAGCACTCTGATATCTCAGATCGATGCATGCATGCGAATGCAGTCGCTAGATATAGCTAGATATACTTAGCTAGATACTTCGAATAGCTTAGAGCAGCTGAGCAGCGATCTATCACGTAGCATGAGCTAGCTACTTAATAGTACGTAAGATGAGACTGCGATATCAGATCGAPSGBPSTCACGGACTAGCAGBGACGTATBBABABCTABTBCTABCATTABACTTABCATTAGCGCATATGATCGACTTAGCTAGCTAGCTA