In [3]:
import requests
from pathlib import Path
from bs4 import BeautifulSoup
import re

In [12]:
# raw data paths
gencode_path = '../../GENCODE43/protein_coding/'
bed = Path(gencode_path) / 'BED6__protein_coding_strict/'
fa = Path(gencode_path) / 'FA_protein_coding_strict_mRNA/'

def find_new_transcript(content):
    """Method to extract a specific transcript ID from an HTML document.

    Keyword Arguments:
    req -- Python request object content

    Returns:
    A string either empty or containing the transcript ID.
    """
    # parse the HTML document
    soup = BeautifulSoup(content, 'html.parser')
    # check if a specific table exists
    if soup.find(id='transcripts_table'):
        # if so extract the transcript ID
        href = soup.find(id='transcripts_table').tbody.td.a.attrs['href']
        transcript = re.sub(r'.*(ENST0\d+)', r'\1', href)
        print('   Current canonical transcript is', transcript)
    else:
        # if not return an empty string
        transcript = ''
        print('   No current transcript found!')

    return transcript

def check_files_and_update_df(old_tid, new_tid):
    """Cross reference the transcript ID with files in the gencode data set
    (bad hack as it uses variables globally defined at the beginning of this notebook!)

    Keyword Arguments:
    transcript -- the transcript ID string
    """
    # search and count files with a given name
    bed_file_list = list(bed.glob(new_tid + '*.bed'))
    bed_files = len(bed_file_list)
    fa_file_list = list(fa.glob(new_tid + '*.fasta'))
    fa_files = len(fa_file_list)

    # check how many files were found
    if bed_files == 1 and fa_files == 1:
        # if it's 1 everything is perfect
        print('   FA and BED files found. Updating dataframe with current information')
        # update dataframe
        # df2.loc[ df2['EnsemblTranscriptID'] == old_tid, 'bed_files' ] = bed_files
        # df2.loc[ df2['EnsemblTranscriptID'] == old_tid, 'fa_files' ] = fa_files
        # df2.loc[ df2['EnsemblTranscriptID'] == old_tid, 'bed'] = str(bed_file_list[0])
        # df2.loc[ df2['EnsemblTranscriptID'] == old_tid, 'fa'] = str(fa_file_list[0])
        # df2.loc[ df2['EnsemblTranscriptID'] == old_tid, 'EnsemblTranscriptID' ] = new_tid
        return True
    else:
        # if there are many manual processing is needed
        print('   FA and BED file count invalid. File lists', bed_file_list, fa_file_list)

    return False

In [18]:
url = 'https://www.ensembl.org/Homo_sapiens/Transcript/Idhistory?t=ENST00000442201'
r = requests.get(url)

In [19]:
soup = BeautifulSoup(r.content, 'html.parser')

table_content = soup.tbody;

# print(table_content)

tid = 'ENST00000442201'

for i in table_content.find_all_next('td'):
    if i.find('a') and re.search(r'.*Gene.*ENSG\d+', i.a.attrs['href']):
        href = i.a.attrs['href']
        gene = re.sub(r'.*(ENSG0\d+)', r'\1', href)
        print('   Transcript is deprecated, resolved gene is', gene)

        url = 'https://www.ensembl.org/Homo_sapiens/Gene/Idhistory?g=' + gene
        r = requests.get(url)
        new_tid = find_new_transcript(r.content)
        if check_files_and_update_df(tid, new_tid): break

   Transcript is deprecated, resolved gene is ENSG00000145075
   Current canonical transcript is ENST00000471307
   FA and BED file count invalid. File lists [] []
   Transcript is deprecated, resolved gene is ENSG00000284862
   Current canonical transcript is ENST00000476379
   FA and BED files found. Updating dataframe with current information


In [21]:
tid = 'ENST00000000442'
line = '>ENST00000000442.11:0-2274'

m = re.match(r'>' + tid + r'\.\d+:0-(?P<trans_length>\d+)', line)

m.group('trans_length')

'2274'

In [23]:
import pandas as pd

# raw data file and path
datafile = '../data/preproc_stage2.csv'

# sanity check if the file exists
if not Path(datafile).is_file():
    print('Data file not found!')

# reading the data into a dataframe and looking at the first entries
df = pd.read_csv(datafile)
df

Unnamed: 0,EnsemblTranscriptID,Adrenal_PTR,Appendices_PTR,Brain_PTR,Colon_PTR,Duodenum_PTR,Endometrium_PTR,Esophagus_PTR,Fallopiantube_PTR,Fat_PTR,...,Thyroid_PTR,Tonsil_PTR,Urinarybladder_PTR,5UTR start,5UTR stop,CDS start,CDS stop,3UTR start,3UTR stop,transcript
0,ENST00000263100,,8.277,,,,,,7.841,,...,,,,0,55,55,1543,1543,3382,ATTGCTGCAGACGCTCACCCCAGACACTCACTGCACCGGAGTGAGC...
1,ENST00000373993,,,,5.135,5.371,,,,,...,,,,0,92,92,1877,1877,2044,ATAATCAAGGAAACCTTTTCCGGGTGGGGATCTCTGAAATTACTCA...
2,ENST00000318602,6.290,6.328,5.948,5.811,6.068,5.383,5.881,6.119,6.410,...,6.060,5.675,5.8286,0,70,70,4495,4495,4610,GGGACCAGATGGATTGTAGGGAGTAGGGTACAATACAGTCTGTTCT...
3,ENST00000299698,,,3.995,,,,4.129,,,...,,5.249,,0,31,31,4396,4396,5127,GACCCTGGAAAAATCTGTCTCACCCACAAAGATGTGGGCTCAGCTC...
4,ENST00000401850,3.843,4.601,,,,,4.013,3.683,,...,,,4.2430,0,490,490,1552,1552,2321,TGCACTTCTGTGCCTCAATTTCCTCATCTGTAGGGTGGGGGTGGTG...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11560,ENST00000374888,,,,,,,,,,...,,4.681,,0,38,38,2450,2450,5467,CTCGGCTCTGGTTCCAGCCGAGCCTCTCGGACGCAGAGATGGAAAT...
11561,ENST00000294353,4.461,5.013,5.047,4.566,5.184,4.826,5.102,4.670,5.756,...,4.250,4.439,4.1460,0,201,201,2436,2436,8143,GGAGTCTGCGCTCTGGTTCGGGCTGCGGCTGCGGCTGCGGCTGCGG...
11562,ENST00000322764,5.664,5.524,5.478,5.915,5.811,5.817,5.943,5.509,4.931,...,5.598,5.968,5.3358,0,80,80,1799,1799,2228,GCAGAGTCTGCGGACCCGGCGCCGAGGCGGCCACCCGAGACGCGGC...
11563,ENST00000381638,5.112,4.918,5.139,5.190,5.442,5.602,4.715,4.956,5.033,...,5.038,5.130,5.0619,0,135,135,9021,9021,11466,AGGAAGCCGGAAGCCGCAGGGGCCGCCGTCGTCTCCTCCGCGTCCC...


In [41]:
count = 0

start_codons = [
    ['A', 'T', 'G'],
    ['C', 'T', 'G']
]

stop_codons = [
    ['T', 'A', 'A'],
    ['T', 'A', 'G'],
    ['T', 'G', 'A']
]

for index, row in df.iterrows():
    print('CDS start', row['CDS start'], 'CDS stop', row['CDS stop'])
    print(row['transcript'])

    CDS_length = row['CDS stop'] - row['CDS start']
    if (CDS_length % 3) != 0:
        print(row['EnsemblTranscriptID'], 'CDS length is NOT a multiple of three!')
    else:
        print('CDS structure ok.')

    CDS = list(row['transcript'])
    
    CDS_start_codon = CDS[row['CDS start']:row['CDS start']+3]
    CDS_stop_codon = CDS[row['CDS stop']-3:row['CDS stop']]

    print('Start codon', CDS_start_codon, 'Stop codon', CDS_stop_codon)

    if not CDS_start_codon in start_codons:
        print(row['EnsemblTranscriptID'], 'Start codon invalid!', CDS_start_codon)
    else:
        print('Start codon ok.')

    if not CDS_stop_codon in stop_codons:
        print(row['EnsemblTranscriptID'], 'Stop codon invalid!', CDS_stop_codon)
    else:
        print('Stop codon ok.')
    
    count += 1
    if count == 1: break

CDS start 55 CDS stop 1543
ATTGCTGCAGACGCTCACCCCAGACACTCACTGCACCGGAGTGAGCGCGACCATCATGTCCATGCTCGTGGTCTTTCTCTTGCTGTGGGGTGTCACCTGGGGCCCAGTGACAGAAGCAGCCATATTTTATGAGACGCAGCCCAGCCTGTGGGCAGAGTCCGAATCACTGCTGAAACCCTTGGCCAATGTGACGCTGACGTGCCAGGCCCACCTGGAGACTCCAGACTTCCAGCTGTTCAAGAATGGGGTGGCCCAGGAGCCTGTGCACCTTGACTCACCTGCCATCAAGCACCAGTTCCTGCTGACGGGTGACACCCAGGGCCGCTACCGCTGCCGCTCGGGCTTGTCCACAGGATGGACCCAGCTGAGCAAGCTCCTGGAGCTGACAGGGCCAAAGTCCTTGCCTGCTCCCTGGCTCTCGATGGCGCCAGTGTCCTGGATCACCCCCGGCCTGAAAACAACAGCAGTGTGCCGAGGTGTGCTGCGGGGTGTGACTTTTCTGCTGAGGCGGGAGGGCGACCATGAGTTTCTGGAGGTGCCTGAGGCCCAGGAGGATGTGGAGGCCACCTTTCCAGTCCATCAGCCTGGCAACTACAGCTGCAGCTACCGGACCGATGGGGAAGGCGCCCTCTCTGAGCCCAGCGCTACTGTGACCATTGAGGAGCTCGCTGCACCACCACCGCCTGTGCTGATGCACCATGGAGAGTCCTCCCAGGTCCTGCACCCTGGCAACAAGGTGACCCTCACCTGCGTGGCTCCCCTGAGTGGAGTGGACTTCCAGCTACGGCGCGGGGAGAAAGAGCTGCTGGTACCCAGGAGCAGCACCAGCCCAGATCGCATCTTCTTTCACCTGAACGCGGTGGCCCTGGGGGATGGAGGTCACTACACCTGCCGCTACCGGCTGCATGACAACCAAAACGGCTGGTCCGGGGACAGCGCGCCGGTCGAGCTGATTCTGAGCGATGAGACGCTG

In [37]:
foo = [1, 2, 3, 4, 5, 6]
foo[4-3:4]

[2, 3, 4]

In [4]:
import numpy as np
from keras.utils import pad_sequences

def seq_onehot(seq):
    mapping = {
        'A' : 0,
        'T' : 1,
        'G' : 2,
        'C' : 3
    }
    seq2 = [ mapping[i] for i in seq ]
    return np.eye(4)[seq2]

raw_sequences = ['ATGC', 'AAA', 'GCTAAT', 'CGAT']

ohe_sequences = [ seq_onehot(s) for s in raw_sequences ]
pad_sequences(ohe_sequences, padding='post')

2023-11-19 23:32:53.887258: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-19 23:32:53.888577: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-19 23:32:53.906128: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-19 23:32:53.906145: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-19 23:32:53.906631: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to

array([[[1, 0, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 1],
        [0, 0, 0, 0],
        [0, 0, 0, 0]],

       [[1, 0, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]],

       [[0, 0, 1, 0],
        [0, 0, 0, 1],
        [0, 1, 0, 0],
        [1, 0, 0, 0],
        [1, 0, 0, 0],
        [0, 1, 0, 0]],

       [[0, 0, 0, 1],
        [0, 0, 1, 0],
        [1, 0, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 0, 0],
        [0, 0, 0, 0]]], dtype=int32)

In [52]:
foo = np.array(
        [[1, 0, 0, 0],
         [0, 1, 0, 0],
         [0, 0, 1, 0],
         [0, 0, 0, 1]],
        dtype=int)

print(len(foo))

target = 6

print(foo)
foo = np.pad(foo, ((0,target-len(foo)), (0,0)), mode='constant')
print(foo)

4
[[1 0 0 0]
 [0 1 0 0]
 [0 0 1 0]
 [0 0 0 1]]
[[1 0 0 0]
 [0 1 0 0]
 [0 0 1 0]
 [0 0 0 1]
 [0 0 0 0]
 [0 0 0 0]]


In [55]:
def onehot_tissue(idx):
    t = np.eye(29, dtype=int)[idx]
    return t

In [56]:
for i in range(1, 30):
    foo = i - 1
    print(onehot_tissue(foo))

[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0

In [5]:
def hotenc(seq, u5_start=-1, u5_stop=-1, c_start=-1, c_stop=-1, u3_start=-1, u3_stop=-1, multi=True):
    if multi: encoded_sequence = np.empty((len(seq),10), dtype=int)
    else: encoded_sequence = np.empty((len(seq),4), dtype=int)
    
    for idx, x in enumerate(seq):
        if multi: encoded = np.zeros((10,), dtype=int)
        else: encoded = np.zeros((4,), dtype=int)
        
        # one hot encoding the sequence
        mapping = { 'A' : 0, 'T' : 1, 'G' : 2, 'C' : 3 }
        encoded[0:4] = np.eye(4)[mapping[x]]

        if multi:
            # one hot encoding the region
            if c_start != -1 and idx >= c_start and idx < c_stop: encoded[4:7] = [0, 1, 0]
            elif u5_start != -1 and idx >= u5_start and idx < u5_stop: encoded[4:7] = [1, 0, 0]
            elif u3_start != -1 and idx >= u3_start and idx < u3_stop: encoded[4:7] = [0, 0, 1]
            else: encoded[4:7] = [0, 0, 0]

            # one hot encoding the codon position
            if idx >= c_start and idx < c_stop:
                if u5_start == -1: u5_offset = 0
                else: u5_offset = u5_stop
                # get the position within the coding region by subtracting the 5'UTR offset
                pos = (idx - u5_offset) % 3
                encoded[7:10] = np.eye(3)[pos]
            else:
                encoded[7:10] = [0, 0, 0]      

        encoded_sequence[idx] = encoded

    return encoded_sequence

In [6]:
UTR5_start = 0
UTR5_stop = 7
CDS_start = 7
CDS_stop = 16
UTR3_start = 16
UTR3_stop = 21

raw_sequence = 'ATCCCGAGTTTCGATAGGTG'

hotenc(raw_sequence, UTR5_start, UTR5_stop, CDS_start, CDS_stop, UTR3_start, UTR3_stop, True)

array([[1, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 1, 0, 1, 0, 0],
       [0, 1, 0, 0, 0, 1, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 1, 0, 0, 0, 1],
       [0, 1, 0, 0, 0, 1, 0, 1, 0, 0],
       [0, 0, 0, 1, 0, 1, 0, 0, 1, 0],
       [0, 0, 1, 0, 0, 1, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 1, 0, 1, 0, 0],
       [0, 1, 0, 0, 0, 1, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 1, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 1, 0, 0, 0]])

In [15]:
from keras.preprocessing.text import one_hot

text = "I am fourty five years old. People suck. I do not give a fuck. I am with my dogs who are my best friends."

wc = {}
for w in text.split():
    # print(w)
    if w in wc.keys():
        wc[w] += 1
    else:
        wc[w] = 1

voc_size = len(wc.keys())
print(wc)
print(voc_size)

one_hot(text, voc_size)

{'I': 3, 'am': 2, 'fourty': 1, 'five': 1, 'years': 1, 'old.': 1, 'People': 1, 'suck.': 1, 'do': 1, 'not': 1, 'give': 1, 'a': 1, 'fuck.': 1, 'with': 1, 'my': 2, 'dogs': 1, 'who': 1, 'are': 1, 'best': 1, 'friends.': 1}
20


[14,
 8,
 3,
 9,
 3,
 12,
 15,
 3,
 14,
 1,
 11,
 9,
 16,
 15,
 14,
 8,
 5,
 5,
 1,
 17,
 8,
 5,
 11,
 14]

In [41]:
def build_kmers(seq, ksize):
    voc = {}
    kmer_count = (len(seq) - ksize) + 1
    print('k-mer count:', kmer_count)
    
    for i in range(0, kmer_count):
        kmer = seq[i:i+ksize]
        if kmer in voc.keys():
            voc[kmer] += 1
        else:
            voc[kmer] = 1

    return voc

In [40]:
raw_sequence = 'ATGCTGA'
raw_sequence = 'ATCCCGAGTTTCGATAGGTG'
raw_sequence = 'AGAAGGCAGCCTCGGTCTCTGGGCGGCGGCGGCGGCCCACTCTGCCCTGGCCGCGCTGTGTGGTGACCGCAGGCCCCAGACATGAGGGCGGCCCGTGCTCTGCTGCCCCTGCTGCTGCAGGCCTGCTGGACAGCCGCGCAGGATGAGCCGGAGACCCCGAGGGCCGTGGCCTTCCAGGACTGCCCCGTGGACCTGTTCTTTGTGCTGGACACCTCTGAGAGCGTGGCCCTGAGGCTGAAGCCCTACGGGGCCCTCGTGGACAAAGTCAAGTCCTTCACCAAGCGCTTCATCGACAACCTGAGGGACAGGTACTACCGCTGTGACCGAAACCTGGTGTGGAACGCAGGCGCGCTGCACTACAGTGACGAGGTGGAGATCATCCAAGGCCTCACGCGCATGCCTGGCGGCCGCGACGCACTCAAAAGCAGCGTGGACGCGGTCAAGTACTTTGGGAAGGGCACCTACACCGACTGCGCTATCAAGAAGGGGCTGGAGCAGCTCCTCGTGGGGGGCTCCCACCTGAAGGAGAATAAGTACCTGATTGTGGTGACCGACGGGCACCCCCTGGAGGGCTACAAGGAACCCTGTGGGGGGCTGGAGGATGCTGTGAACGAGGCCAAGCACCTGGGCGTCAAAGTCTTCTCGGTGGCCATCACACCCGACCACCTGGAGCCGCGTCTGAGCATCATCGCCACGGACCACACGTACCGGCGCAACTTCACGGCGGCTGACTGGGGCCAGAGCCGCGACGCAGAGGAGGCCATCAGCCAGACCATCGACACCATCGTGGACATGATCAAAAATAACGTGGAGCAAGTGTGCTGCTCCTTCGAATGCCAGCCTGCAAGAGGACCTCCGGGGCTCCGGGGCGACCCCGGCTTTGAGGGAGAACGAGGCAAGCCGGGGCTCCCAGGAGAGAAGGGAGAAGCCGGAGATCCTGGAAGACCCGGGGACCTCGGACCTGTTGGGTACCAGGGAATGAAGGGAGAAAAAGGGAGCCGTGGGGAGAAGGGCTCCAGGGGACCCAAGGGCTACAAGGGAGAGAAGGGCAAGCGTGGCATCGACGGGGTGGACGGCGTGAAGGGGGAGATGGGGTACCCAGGCCTGCCAGGCTGCAAGGGCTCGCCCGGGTTTGACGGCATTCAAGGACCCCCTGGCCCCAAGGGAGACCCCGGTGCCTTTGGACTGAAAGGAGAAAAGGGCGAGCCTGGAGCTGACGGGGAGGCGGGGAGACCAGGGAGCTCGGGACCATCTGGAGACGAGGGCCAGCCGGGAGAGCCTGGGCCCCCCGGAGAGAAAGGAGAGGCGGGCGACGAGGGGAACCCAGGACCTGACGGTGCCCCCGGGGAGCGGGGTGGCCCTGGAGAGAGAGGACCACGGGGGACCCCAGGCACGCGGGGACCAAGAGGAGACCCTGGTGAAGCTGGCCCGCAGGGTGATCAGGGAAGAGAAGGCCCCGTTGGTGTCCCTGGAGACCCGGGCGAGGCTGGCCCTATCGGACCTAAAGGCTACCGAGGCGATGAGGGTCCCCCAGGGTCCGAGGGTGCCAGAGGAGCCCCAGGACCTGCCGGACCCCCTGGAGACCCGGGGCTGATGGGTGAAAGGGGAGAAGACGGCCCCGCTGGAAATGGCACCGAGGGCTTCCCCGGCTTCCCCGGGTATCCGGGCAACAGGGGCGCTCCCGGGATAAACGGCACGAAGGGCTACCCCGGCCTCAAGGGGGACGAGGGAGAAGCCGGGGACCCCGGAGACGATAACAACGACATTGCACCCCGAGGAGTCAAAGGAGCAAAGGGGTACCGGGGTCCCGAGGGCCCCCAGGGACCCCCAGGACACCAAGGACCGCCTGGGCCGGACGAATGCGAGATTTTGGACATCATCATGAAAATGTGCTCTTGCTGTGAATGCAAGTGCGGCCCCATCGACCTCCTGTTCGTGCTGGACAGCTCAGAGAGCATTGGCCTGCAGAACTTCGAGATTGCCAAGGACTTCGTCGTCAAGGTCATCGACCGGCTGAGCCGGGACGAGCTGGTCAAGTTCGAGCCAGGGCAGTCGTACGCGGGTGTGGTGCAGTACAGCCACAGCCAGATGCAGGAGCACGTGAGCCTGCGCAGCCCCAGCATCCGGAACGTGCAGGAGCTCAAGGAAGCCATCAAGAGCCTGCAGTGGATGGCGGGCGGCACCTTCACGGGGGAGGCCCTGCAGTACACGCGGGACCAGCTGCTGCCGCCCAGCCCGAACAACCGCATCGCCCTGGTCATCACTGACGGGCGCTCAGACACTCAGAGGGACACCACACCGCTCAACGTGCTCTGCAGCCCCGGCATCCAGGTGGTCTCCGTGGGCATCAAAGACGTGTTTGACTTCATCCCAGGCTCAGACCAGCTCAATGTCATTTCTTGCCAAGGCCTGGCACCATCCCAGGGCCGGCCCGGCCTCTCGCTGGTCAAGGAGAACTATGCAGAGCTGCTGGAGGATGCCTTCCTGAAGAATGTCACCGCCCAGATCTGCATAGACAAGAAGTGTCCAGATTACACCTGCCCCATCACGTTCTCCTCCCCGGCTGACATCACCATCCTGCTGGACGGCTCCGCCAGCGTGGGCAGCCACAACTTTGACACCACCAAGCGCTTCGCCAAGCGCCTGGCCGAGCGCTTCCTCACAGCGGGCAGGACGGACCCCGCCCACGACGTGCGGGTGGCGGTGGTGCAGTACAGCGGCACGGGCCAGCAGCGCCCAGAGCGGGCGTCGCTGCAGTTCCTGCAGAACTACACGGCCCTGGCCAGTGCCGTCGATGCCATGGACTTTATCAACGACGCCACCGACGTCAACGATGCCCTGGGCTATGTGACCCGCTTCTACCGCGAGGCCTCGTCCGGCGCTGCCAAGAAGAGGCTGCTGCTCTTCTCAGATGGCAACTCGCAGGGCGCCACGCCCGCTGCCATCGAGAAGGCCGTGCAGGAAGCCCAGCGGGCAGGCATCGAGATCTTCGTGGTGGTCGTGGGCCGCCAGGTGAATGAGCCCCACATCCGCGTCCTGGTCACCGGCAAGACGGCCGAGTACGACGTGGCCTACGGCGAGAGCCACCTGTTCCGTGTCCCCAGCTACCAGGCCCTGCTCCGCGGTGTCTTCCACCAGACAGTCTCCAGGAAGGTGGCGCTGGGCTAGCCCACCCTGCACGCCGGCACCAAACCCTGTCCTCCCACCCCTCCCCACTCATCACTAAACAGAGTAAAATGTGATGCGAATTTTCCCGACCAACCTGATTCGCTAGATTTTTTTTAAGGAAAAGCTTGGAAAGCCAGGACACAACGCTGCTGCCTGCTTTGTGCAGGGTCCTCCGGGGCTCAGCCCTGAGTTGGCATCACCTGCGCAGGGCCCTCTGGGGCTCAGCCCTGAGCTAGTGTCACCTGCACAGGGCCCTCTGAGGCTCAGCCCTGAGCTGGCGTCACCTGTGCAGGGCCCTCTGGGGCTCAGCCCTGAGCTGGCCTCACCTGGGTTCCCCACCCCGGGCTCTCCTGCCCTGCCCTCCTGCCCGCCCTCCCTCCTGCCTGCGCAGCTCCTTCCCTAGGCACCTCTGTGCTGCATCCCACCAGCCTGAGCAAGACGCCCTCTCGGGGCCTGTGCCGCACTAGCCTCCCTCTCCTCTGTCCCCATAGCTGGTTTTTCCCACCAATCCTCACCTAACAGTTACTTTACAATTAAACTCAAAGCAAGCTCTTCTCCTCAGCTTGGGGCAGCCATTGGCCTCTGTCTCGTTTTGGGAAACCAAGGTCAGGAGGCCGTTGCAGACATAAATCTCGGCGACTCGGCCCCGTCTCCTGAGGGTCCTGCTGGTGACCGGCCTGGACCTTGGCCCTACAGCCCTGGAGGCCGCTGCTGACCAGCACTGACCCCGACCTCAGAGAGTACTCGCAGGGGCGCTGGCTGCACTCAAGACCCTCGAGATTAACGGTGCTAACCCCGTCTGCTCCTCCCTCCCGCAGAGACTGGGGCCTGGACTGGACATGAGAGCCCCTTGGTGCCACAGAGGGCTGTGTCTTACTAGAAACAACGCAAACCTCTCCTTCCTCAGAATAGTGATGTGTTCGACGTTTTATCAAAGGCCCCCTTTCTATGTTCATGTTAGTTTTGCTCCTTCTGTGTTTTTTTCTGAACCATATCCATGTTGCTGACTTTTCCAAATAAAGGTTTTCACTCCTC'

print('sequence length:', len(raw_sequence))

v = build_kmers(raw_sequence, 3)
print('vocabulary length:', len(v.keys()))
v

sequence length: 4203
k-mer count: 4201
vocabulary length: 64


{'AGA': 86,
 'GAA': 57,
 'AAG': 80,
 'AGG': 119,
 'GGC': 148,
 'GCA': 83,
 'CAG': 109,
 'AGC': 87,
 'GCC': 143,
 'CCT': 127,
 'CTC': 93,
 'TCG': 41,
 'CGG': 91,
 'GGT': 51,
 'GTC': 44,
 'TCT': 43,
 'CTG': 144,
 'TGG': 99,
 'GGG': 153,
 'GCG': 67,
 'CCC': 161,
 'CCA': 99,
 'CAC': 75,
 'ACT': 34,
 'TGC': 89,
 'CCG': 92,
 'CGC': 70,
 'GCT': 101,
 'TGT': 42,
 'GTG': 81,
 'TGA': 56,
 'GAC': 104,
 'ACC': 103,
 'ACA': 48,
 'CAT': 48,
 'ATG': 34,
 'GAG': 119,
 'CGT': 44,
 'GGA': 110,
 'GAT': 28,
 'CGA': 56,
 'CTT': 40,
 'TTC': 41,
 'TCC': 72,
 'GTT': 24,
 'TTT': 40,
 'TTG': 27,
 'CTA': 28,
 'TAC': 30,
 'ACG': 61,
 'CAA': 64,
 'AAA': 40,
 'AGT': 27,
 'TCA': 66,
 'ATC': 45,
 'AAC': 37,
 'GTA': 15,
 'TAT': 9,
 'AAT': 18,
 'ATA': 10,
 'TAA': 14,
 'ATT': 14,
 'TAG': 10,
 'TTA': 10}

In [38]:
# build the embedding vocabulary

import itertools

perms = [''.join(x) for x in itertools.product('ATGC', repeat=3)]

print(len(perms))
perms

64


['AAA',
 'AAT',
 'AAG',
 'AAC',
 'ATA',
 'ATT',
 'ATG',
 'ATC',
 'AGA',
 'AGT',
 'AGG',
 'AGC',
 'ACA',
 'ACT',
 'ACG',
 'ACC',
 'TAA',
 'TAT',
 'TAG',
 'TAC',
 'TTA',
 'TTT',
 'TTG',
 'TTC',
 'TGA',
 'TGT',
 'TGG',
 'TGC',
 'TCA',
 'TCT',
 'TCG',
 'TCC',
 'GAA',
 'GAT',
 'GAG',
 'GAC',
 'GTA',
 'GTT',
 'GTG',
 'GTC',
 'GGA',
 'GGT',
 'GGG',
 'GGC',
 'GCA',
 'GCT',
 'GCG',
 'GCC',
 'CAA',
 'CAT',
 'CAG',
 'CAC',
 'CTA',
 'CTT',
 'CTG',
 'CTC',
 'CGA',
 'CGT',
 'CGG',
 'CGC',
 'CCA',
 'CCT',
 'CCG',
 'CCC']