In [1]:
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
import os
import random
import Levenshtein as Lev
np.random.seed(42)
random.seed(42)
%matplotlib inline

In [2]:
# Source of the sequence below (NCBI nuccore accession JQ394803.1, FASTA view):
# https://www.ncbi.nlm.nih.gov/nuccore/JQ394803.1?report=fasta
plasmid = """TACTAGTAGCGGCCGCTGCAGTCCGGCAAAAAAACGGGCAAGGTGTCACCACCCTGCCCTTTTTCTTTAA
AACCGAAAAGATTACTTCGCGTTATGCAGGCTTCCTCGCTCACTGACTCGCTGCGCTCGGTCGTTCGGCT
GCGGCGAGCGGTATCAGCTCACTCAAAGGCGGTAATCTCGAGTCCCGTCAAGTCAGCGTAATGCTCTGCC
AGTGTTACAACCAATTAACCAATTCTGATTAGAAAAACTCATCGAGCATCAAATGAAACTGCAATTTATT
CATATCAGGATTATCAATACCATATTTTTGAAAAAGCCGTTTCTGTAATGAAGGAGAAAACTCACCGAGG
CAGTTCCATAGGATGGCAAGATCCTGGTATCGGTCTGCGATTCCGACTCGTCCAACATCAATACAACCTA
TTAATTTCCCCTCGTCAAAAATAAGGTTATCAAGTGAGAAATCACCATGAGTGACGACTGAATCCGGTGA
GAATGGCAAAAGCTTATGCATTTCTTTCCAGACTTGTTCAACAGGCCAGCCATTACGCTCGTCATCAAAA
TCACTCGCATCAACCAAACCGTTATTCATTCGTGATTGCGCCTGAGCGAGACGAAATACGCGATCGCTGT
TAAAAGGACAATTACAAACAGGAATCGAATGCAACCGGCGCAGGAACACTGCCAGCGCATCAACAATATT
TTCACCTGAATCAGGATATTCTTCTAATACCTGGAATGCTGTTTTCCCGGGGATCGCAGTGGTGAGTAAC
CATGCATCATCAGGAGTACGGATAAAATGCTTGATGGTCGGAAGAGGCATAAATTCCGTCAGCCAGTTTA
GTCTGACCATCTCATCTGTAACATCATTGGCAACGCTACCTTTGCCATGTTTCAGAAACAACTCTGGCGC
ATCGGGCTTCCCATACAATCGATAGATTGTCGCACCTGATTGCCCGACATTATCGCGAGCCCATTTATAC
CCATATAAATCAGCATCCATGTTGGAATTTAATCGCGGCCTCGAGCAAGACGTTTCCCGTTGAATATGGC
TCATAACACCCCTTGTATTACTGTTTATGTAAGCAGACAGTTTTATTGTTCATGATGATATATTTTTATC
TTGTGCAATGTAACATCAGAGATTTTGAGACACAACGTGGCTTTGTTGAATAAATCGAACTTTTGCTGAG
TTGAAGGATCAGATCACGCATCTTCCCGACAACGCAGACCGTTCCGTGGCAAAGCAAAAGTTCAAAATCA
CCAACTGGTCCACCTACAACAAAGCTCTCATCAACCGTGGCTCCCTCACTTTCTGGCTGGATGATGGGGC
GATTCAGGCCTGGTATGAGTCAGCAACACCTTCTTCACGAGGCAGACCTCAGCGCTAGCGGAGTGTATAC
TGGCTTACTATGTTGGCACTGATGAGGGTGTCAGTGAAGTGCTTCATGTGGCAGGAGAAAAAAGGCTGCA
CCGGTGCGTCAGCAGAATATGTGATACAGGATATATTCCGCTTCCTCGCTCACTGACTCGCTACGCTCGG
TCGTTCGACTGCGGCGAGCGGAAATGGCTTACGAACGGGGCGGAGATTTCCTGGAAGATGCCAGGAAGAT
ACTTAACAGGGAAGTGAGAGGGCCGCGGCAAAGCCGTTTTTCCATAGGCTCCGCCCCCCTGACAAGCATC
ACGAAATCTGACGCTCAAATCAGTGGTGGCGAAACCCGACAGGACTATAAAGATACCAGGCGTTTCCCCT
GGCGGCTCCCTCGTGCGCTCTCCTGTTCCTGCCTTTCGGTTTACCGGTGTCATTCCGCTGTTATGGCCGC
GTTTGTCTCATTCCACGCCTGACACTCAGTTCCGGGTAGGCAGTTCGCTCCAAGCTGGACTGTATGCACG
AACCCCCCGTTCAGTCCGACCGCTGCGCCTTATCCGGTAACTATCGTCTTGAGTCCAACCCGGAAAGACA
TGCAAAAGCACCACTGGCAGCAGCCACTGGTAATTGATTTAGAGGAGTTAGTCTTGAAGTCATGCGCCGG
TTAAGGCTAAACTGAAAGGACAAGTTTTGGTGACTGCGCTCCTCCAAGCCAGTTACCTCGGTTCAAAGAG
TTGGTAGCTCAGAGAACCTTCGAAAAACCGCCCTGCAAGGCGGTTTTTTCGTTTTCAGAGCAAGAGATTA
CGCGCAGACCAAAACGATCTCAAGAAGATCATCTTATTAAGGGGTCTGACGCTCAGTGGAACGAAAACTC
ACGTTAAGGGATTTTGGTCATGAGATTATCAAAAAGGATCTTCACCTAGATCCTTTTAAATTAAAAATGA
AGTTTTAAATCAATCTAAAGTATATATGAGTAAACTTGGTCTGACAGTTACCAATGCTTAATCAGTGAGG
CACCTATCTCAGCGATCTGTCTATTTCGTTCATCCATAGTTGCCTGACTCCCCGTCGTGTAGATAACTAC
GATACGGGAGGGCTTACCATCTGGCCCCAGTGCTGCAATGATACCGCGAGACCCACGCTCACCGGCTCCA
GATTTATCAGCAATAAACCAGCCAGCCGGAAGGGCCGAGCGCAGAAGTGGTCCTGCAACTTTATCCGCCT
CCATCCAGTCTATTCCATGGTGCCACCTGACGTCTAAGAAACCATTATTATCATGACATTAACCTATAAA
AATAGGCGTATCACGAGGCAGAATTTCAGATAAAAAAAATCCTTAGCTTTCGCTAAGGATGATTTCTGGA
ATTCGCGGCCGCTTCTAGAGTAACACCGTGCGTGTTGACTATTTTACCTCTGGCGGTGATAATGGTTGCT
ACTAGAGAAAGAGGAGAAATACTAGATGGTGAGCAAGGGCGAGGAGCTGTTCACCGGGGTGGTGCCCATC
CTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGTCCGGCGAGGGCGAGGGCGATGCCA
CCTACGGCAAGCTGACCCTGAAGTTCATCTGCACCACCGGCAAGCTGCCCGTGCCCTGGCCCACCCTCGT
GACCACCTTCGGCTACGGCCTGCAATGCTTCGCCCGCTACCCCGACCACATGAAGCTGCACGACTTCTTC
AAGTCCGCCATGCCCGAAGGCTACGTCCAGGAGCGCACCATCTTCTTCAAGGACGACGGCAACTACAAGA
CCCGCGCCGAGGTGAAGTTCGAGGGCGACACCCTGGTGAACCGCATCGAGCTGAAGGGCATCGACTTCAA
GGAGGACGGCAACATCCTGGGGCACAAGCTGGAGTACAACTACAACAGCCACAACGTCTATATCATGGCC
GACAAGCAGAAGAACGGCATCAAGGTGAACTTCAAGATCCGCCACAACATCGAGGACGGCAGCGTGCAGC
TCGCCGACCACTACCAGCAGAACACCCCCATCGGCGACGGCCCCGTGCTGCTGCCCGACAACCACTACCT
GAGCTACCAGTCCGCCCTGAGCAAAGACCCCAACGAGAAGCGCGATCACATGGTCCTGCTGGAGTTCGTG
ACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGTAATAATACTAGAGCCAGGCATCAAATA
AAACGAAAGGCTCAGTCGAAAGACTGGGCCTTTCGTTTTATCTGTTGTTTGTCGGTGAACGCTCTCTACT
AGAGTCACACTGGCTCACCTTCGGGTGGGCCTTTCTGCGTTTATA""".replace('\n','')

In [3]:
plasmid

'TACTAGTAGCGGCCGCTGCAGTCCGGCAAAAAAACGGGCAAGGTGTCACCACCCTGCCCTTTTTCTTTAAAACCGAAAAGATTACTTCGCGTTATGCAGGCTTCCTCGCTCACTGACTCGCTGCGCTCGGTCGTTCGGCTGCGGCGAGCGGTATCAGCTCACTCAAAGGCGGTAATCTCGAGTCCCGTCAAGTCAGCGTAATGCTCTGCCAGTGTTACAACCAATTAACCAATTCTGATTAGAAAAACTCATCGAGCATCAAATGAAACTGCAATTTATTCATATCAGGATTATCAATACCATATTTTTGAAAAAGCCGTTTCTGTAATGAAGGAGAAAACTCACCGAGGCAGTTCCATAGGATGGCAAGATCCTGGTATCGGTCTGCGATTCCGACTCGTCCAACATCAATACAACCTATTAATTTCCCCTCGTCAAAAATAAGGTTATCAAGTGAGAAATCACCATGAGTGACGACTGAATCCGGTGAGAATGGCAAAAGCTTATGCATTTCTTTCCAGACTTGTTCAACAGGCCAGCCATTACGCTCGTCATCAAAATCACTCGCATCAACCAAACCGTTATTCATTCGTGATTGCGCCTGAGCGAGACGAAATACGCGATCGCTGTTAAAAGGACAATTACAAACAGGAATCGAATGCAACCGGCGCAGGAACACTGCCAGCGCATCAACAATATTTTCACCTGAATCAGGATATTCTTCTAATACCTGGAATGCTGTTTTCCCGGGGATCGCAGTGGTGAGTAACCATGCATCATCAGGAGTACGGATAAAATGCTTGATGGTCGGAAGAGGCATAAATTCCGTCAGCCAGTTTAGTCTGACCATCTCATCTGTAACATCATTGGCAACGCTACCTTTGCCATGTTTCAGAAACAACTCTGGCGCATCGGGCTTCCCATACAATCGATAGATTGTCGCACCTGATTGCCCGACATTATCGCGAGCCCATTTATACCCATATAAATCAGCATCCA

In [4]:
# Load the pickled validation frame; its column layout is reused further down
# as the template for the sliding-window frame.
datasets_dir = '../../../data'
# NOTE(security): unpickling can execute arbitrary code -- only load pickles
# that were produced locally / come from a trusted source.
# The context manager closes the file handle deterministically instead of
# leaving it open until garbage collection.
with open(os.path.join(datasets_dir, 'tts/val_x_no_nan.pkl'), "rb") as val_file:
    data = pickle.load(val_file)
data

Unnamed: 0_level_0,sequence,bacterial_resistance_ampicillin,bacterial_resistance_kanamycin,bacterial_resistance_spectinomycin,bacterial_resistance_chloramphenicol,copy_number_high_copy,copy_number_unknown,copy_number_low_copy,growth_strain_dh5alpha,growth_strain_neb_stable,...,species_budding_yeast,species_zebrafish,species_rat,species_mustard_weed,species_nematode,bacterial_resistance_other,growth_strain_other,growth_temp_other,selectable_markers_other,species_other
addgene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
48.0,GTACGAGCTCGGATCCCTAGTCCAGTGTGGTGGAATTCTGCAGATA...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1092.0,GAATATTTTGTTTACTTTAGAAGGAGATATACCATGGGCCATCATC...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1136.0,TCGCGCGTTTCGGTGATGACGGTGAAAACCTCTGACACATGCAGCT...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1162.0,GACGAAAGGGCCTCGTGATACGCCTATTTTTATAGGTTAATGTCAT...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1183.0,TTTACTTCTTAATTCTCTTTTAAGTTAGTCTTTTTTTTAGTTTTAA...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1188.0,TGGATCCCCCGGGCTGCAGGAATTAATTCTGCAGATATCCATCACA...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1191.0,ATGACTATACAAAACTAGCTAGGAATTCTTGAAGACGAAAGGGCCT...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1210.0,TCGCGCGTTTCGGTGATGACGGTGAAAACCTCTGACACATGCAGCT...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1232.0,CATCACGATTACGATATCCCAACGACCGAAAACCTGTATTTTCAGG...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1244.0,TGCGNCATTCCCTCTGAATATTTTGTTTACTTTAAGAAGGAGATAT...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
data.columns

Index(['sequence', 'bacterial_resistance_ampicillin',
       'bacterial_resistance_kanamycin', 'bacterial_resistance_spectinomycin',
       'bacterial_resistance_chloramphenicol', 'copy_number_high_copy',
       'copy_number_unknown', 'copy_number_low_copy', 'growth_strain_dh5alpha',
       'growth_strain_neb_stable', 'growth_strain_top10',
       'growth_strain_stbl3', 'growth_strain_xl1_blue', 'growth_strain_dh10b',
       'growth_strain_ccdb_survival', 'growth_temp_37', 'growth_temp_30',
       'selectable_markers_neomycin', 'selectable_markers_puromycin',
       'selectable_markers_hygromycin', 'selectable_markers_ura3',
       'selectable_markers_blasticidin', 'selectable_markers_zeocin',
       'selectable_markers_leu2', 'selectable_markers_trp1',
       'selectable_markers_his3', 'species_human', 'species_synthetic',
       'species_mouse', 'species_fly', 'species_budding_yeast',
       'species_zebrafish', 'species_rat', 'species_mustard_weed',
       'species_nematode', 'bacte

In [6]:
set(plasmid).issubset(set('ACTG'))

True

In [7]:
# Slide a 1024-character window across the plasmid one position at a time,
# recording every subsequence together with its [start, end) coordinates.
# This cell (re)initialises the three accumulator lists that later cells append to.
window_size = 1024
seqs = []
window_start_idxs = []
window_end_idxs = []
# Later cells index `plasmid` as a per-character array, so keep this rebinding.
plasmid = np.array(list(plasmid))
# Join once up front: slicing a str yields the same window text without
# re-copying the whole array on every iteration.
plasmid_str = ''.join(plasmid)
for idx in range(len(plasmid_str) - window_size + 1):
    seqs.append(plasmid_str[idx:idx + window_size])
    window_start_idxs.append(idx)
    window_end_idxs.append(idx + window_size)

In [8]:
seqs[0]

'TACTAGTAGCGGCCGCTGCAGTCCGGCAAAAAAACGGGCAAGGTGTCACCACCCTGCCCTTTTTCTTTAAAACCGAAAAGATTACTTCGCGTTATGCAGGCTTCCTCGCTCACTGACTCGCTGCGCTCGGTCGTTCGGCTGCGGCGAGCGGTATCAGCTCACTCAAAGGCGGTAATCTCGAGTCCCGTCAAGTCAGCGTAATGCTCTGCCAGTGTTACAACCAATTAACCAATTCTGATTAGAAAAACTCATCGAGCATCAAATGAAACTGCAATTTATTCATATCAGGATTATCAATACCATATTTTTGAAAAAGCCGTTTCTGTAATGAAGGAGAAAACTCACCGAGGCAGTTCCATAGGATGGCAAGATCCTGGTATCGGTCTGCGATTCCGACTCGTCCAACATCAATACAACCTATTAATTTCCCCTCGTCAAAAATAAGGTTATCAAGTGAGAAATCACCATGAGTGACGACTGAATCCGGTGAGAATGGCAAAAGCTTATGCATTTCTTTCCAGACTTGTTCAACAGGCCAGCCATTACGCTCGTCATCAAAATCACTCGCATCAACCAAACCGTTATTCATTCGTGATTGCGCCTGAGCGAGACGAAATACGCGATCGCTGTTAAAAGGACAATTACAAACAGGAATCGAATGCAACCGGCGCAGGAACACTGCCAGCGCATCAACAATATTTTCACCTGAATCAGGATATTCTTCTAATACCTGGAATGCTGTTTTCCCGGGGATCGCAGTGGTGAGTAACCATGCATCATCAGGAGTACGGATAAAATGCTTGATGGTCGGAAGAGGCATAAATTCCGTCAGCCAGTTTAGTCTGACCATCTCATCTGTAACATCATTGGCAACGCTACCTTTGCCATGTTTCAGAAACAACTCTGGCGCATCGGGCTTCCCATACAATCGATAGATTGTCGCACCTGATTGCCCGACATTATCGCGAGCCCATTTATACCCATATAAATCAGCATCCA

In [9]:
seqs[-1]

'ATAGGCGTATCACGAGGCAGAATTTCAGATAAAAAAAATCCTTAGCTTTCGCTAAGGATGATTTCTGGAATTCGCGGCCGCTTCTAGAGTAACACCGTGCGTGTTGACTATTTTACCTCTGGCGGTGATAATGGTTGCTACTAGAGAAAGAGGAGAAATACTAGATGGTGAGCAAGGGCGAGGAGCTGTTCACCGGGGTGGTGCCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGTCCGGCGAGGGCGAGGGCGATGCCACCTACGGCAAGCTGACCCTGAAGTTCATCTGCACCACCGGCAAGCTGCCCGTGCCCTGGCCCACCCTCGTGACCACCTTCGGCTACGGCCTGCAATGCTTCGCCCGCTACCCCGACCACATGAAGCTGCACGACTTCTTCAAGTCCGCCATGCCCGAAGGCTACGTCCAGGAGCGCACCATCTTCTTCAAGGACGACGGCAACTACAAGACCCGCGCCGAGGTGAAGTTCGAGGGCGACACCCTGGTGAACCGCATCGAGCTGAAGGGCATCGACTTCAAGGAGGACGGCAACATCCTGGGGCACAAGCTGGAGTACAACTACAACAGCCACAACGTCTATATCATGGCCGACAAGCAGAAGAACGGCATCAAGGTGAACTTCAAGATCCGCCACAACATCGAGGACGGCAGCGTGCAGCTCGCCGACCACTACCAGCAGAACACCCCCATCGGCGACGGCCCCGTGCTGCTGCCCGACAACCACTACCTGAGCTACCAGTCCGCCCTGAGCAAAGACCCCAACGAGAAGCGCGATCACATGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGTAATAATACTAGAGCCAGGCATCAAATAAAACGAAAGGCTCAGTCGAAAGACTGGGCCTTTCGTTTTATCTGTTGTTTGTCGGTGAACGCTCTCTACTAGAGTCACACTGGCTCACCT

In [10]:
window_start_idxs[-1]

2661

In [11]:
window_end_idxs[-1]

3685

In [12]:
# Report how many windows exist so far and confirm the three parallel lists
# are still the same length.
print(len(seqs))
assert len(window_start_idxs) == len(seqs) == len(window_end_idxs)

2662


In [13]:
# Repeat the scan with a 512-character window, appending to the lists built so far.
# NOTE: this cell appends to `seqs` / `window_*_idxs`; re-running it on its own
# adds the same windows a second time.
window_size = 512
# Join once: string slicing gives the same window text without copying the
# whole sequence array on every iteration.
plasmid_str = ''.join(plasmid)
for idx in range(len(plasmid_str) - window_size + 1):
    seqs.append(plasmid_str[idx:idx + window_size])
    window_start_idxs.append(idx)
    window_end_idxs.append(idx + window_size)

In [14]:
len(seqs)

5836

In [15]:
seqs[-1]

'GGCGACACCCTGGTGAACCGCATCGAGCTGAAGGGCATCGACTTCAAGGAGGACGGCAACATCCTGGGGCACAAGCTGGAGTACAACTACAACAGCCACAACGTCTATATCATGGCCGACAAGCAGAAGAACGGCATCAAGGTGAACTTCAAGATCCGCCACAACATCGAGGACGGCAGCGTGCAGCTCGCCGACCACTACCAGCAGAACACCCCCATCGGCGACGGCCCCGTGCTGCTGCCCGACAACCACTACCTGAGCTACCAGTCCGCCCTGAGCAAAGACCCCAACGAGAAGCGCGATCACATGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGTAATAATACTAGAGCCAGGCATCAAATAAAACGAAAGGCTCAGTCGAAAGACTGGGCCTTTCGTTTTATCTGTTGTTTGTCGGTGAACGCTCTCTACTAGAGTCACACTGGCTCACCTTCGGGTGGGCCTTTCTGCGTTTATA'

In [16]:
# Repeat the scan with a 256-character window, appending to the lists built so far.
# NOTE: this cell appends to `seqs` / `window_*_idxs`; re-running it on its own
# adds the same windows a second time.
window_size = 256
# Join once: string slicing gives the same window text without copying the
# whole sequence array on every iteration.
plasmid_str = ''.join(plasmid)
for idx in range(len(plasmid_str) - window_size + 1):
    seqs.append(plasmid_str[idx:idx + window_size])
    window_start_idxs.append(idx)
    window_end_idxs.append(idx + window_size)

In [17]:
len(seqs)

9266

In [18]:
seqs[-1]

'TGAGCTACCAGTCCGCCCTGAGCAAAGACCCCAACGAGAAGCGCGATCACATGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGTAATAATACTAGAGCCAGGCATCAAATAAAACGAAAGGCTCAGTCGAAAGACTGGGCCTTTCGTTTTATCTGTTGTTTGTCGGTGAACGCTCTCTACTAGAGTCACACTGGCTCACCTTCGGGTGGGCCTTTCTGCGTTTATA'

In [19]:
# Repeat the scan with a 128-character window, appending to the lists built so far.
# NOTE: this cell appends to `seqs` / `window_*_idxs`; re-running it on its own
# adds the same windows a second time.
window_size = 128
# Join once: string slicing gives the same window text without copying the
# whole sequence array on every iteration.
plasmid_str = ''.join(plasmid)
for idx in range(len(plasmid_str) - window_size + 1):
    seqs.append(plasmid_str[idx:idx + window_size])
    window_start_idxs.append(idx)
    window_end_idxs.append(idx + window_size)

In [20]:
len(seqs)

12824

In [21]:
seqs[-1]

'CAGGCATCAAATAAAACGAAAGGCTCAGTCGAAAGACTGGGCCTTTCGTTTTATCTGTTGTTTGTCGGTGAACGCTCTCTACTAGAGTCACACTGGCTCACCTTCGGGTGGGCCTTTCTGCGTTTATA'

In [22]:
# Repeat the scan with a 64-character window, appending to the lists built so far.
# NOTE: this cell appends to `seqs` / `window_*_idxs`; re-running it on its own
# adds the same windows a second time.
window_size = 64
# Join once: string slicing gives the same window text without copying the
# whole sequence array on every iteration.
plasmid_str = ''.join(plasmid)
for idx in range(len(plasmid_str) - window_size + 1):
    seqs.append(plasmid_str[idx:idx + window_size])
    window_start_idxs.append(idx)
    window_end_idxs.append(idx + window_size)

In [23]:
len(seqs)

16446

In [24]:
seqs[-1]

'TCGGTGAACGCTCTCTACTAGAGTCACACTGGCTCACCTTCGGGTGGGCCTTTCTGCGTTTATA'

In [25]:
# Repeat the scan with a 32-character window, appending to the lists built so far.
# NOTE: this cell appends to `seqs` / `window_*_idxs`; re-running it on its own
# adds the same windows a second time.
window_size = 32
# Join once: string slicing gives the same window text without copying the
# whole sequence array on every iteration.
plasmid_str = ''.join(plasmid)
for idx in range(len(plasmid_str) - window_size + 1):
    seqs.append(plasmid_str[idx:idx + window_size])
    window_start_idxs.append(idx)
    window_end_idxs.append(idx + window_size)

In [26]:
len(seqs)

20100

In [27]:
seqs[-1]

'CTCACCTTCGGGTGGGCCTTTCTGCGTTTATA'

In [28]:
# Finally, repeat the scan with a 16-character window (the smallest size used),
# appending to the lists built so far.
# NOTE: this cell appends to `seqs` / `window_*_idxs`; re-running it on its own
# adds the same windows a second time.
window_size = 16
# Join once: string slicing gives the same window text without copying the
# whole sequence array on every iteration.
plasmid_str = ''.join(plasmid)
for idx in range(len(plasmid_str) - window_size + 1):
    seqs.append(plasmid_str[idx:idx + window_size])
    window_start_idxs.append(idx)
    window_end_idxs.append(idx + window_size)

In [29]:
len(seqs)

23770

In [30]:
seqs[-1]

'CCTTTCTGCGTTTATA'

In [31]:
# Build an all-zero frame with one row per window and one column for every
# column of `data` except the first (the sequence column is added separately).
label_columns = data.columns.values[1:]
mut_data = pd.DataFrame(
    np.zeros((len(seqs), len(data.columns) - 1)),
    columns=label_columns,
)
mut_data

Unnamed: 0,bacterial_resistance_ampicillin,bacterial_resistance_kanamycin,bacterial_resistance_spectinomycin,bacterial_resistance_chloramphenicol,copy_number_high_copy,copy_number_unknown,copy_number_low_copy,growth_strain_dh5alpha,growth_strain_neb_stable,growth_strain_top10,...,species_budding_yeast,species_zebrafish,species_rat,species_mustard_weed,species_nematode,bacterial_resistance_other,growth_strain_other,growth_temp_other,selectable_markers_other,species_other
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
# Put the window sequences in front of the zero-filled columns, side by side.
sequence_frame = pd.DataFrame({'sequence': seqs})
mut_df = pd.concat([sequence_frame, mut_data], axis=1)
mut_df

Unnamed: 0,sequence,bacterial_resistance_ampicillin,bacterial_resistance_kanamycin,bacterial_resistance_spectinomycin,bacterial_resistance_chloramphenicol,copy_number_high_copy,copy_number_unknown,copy_number_low_copy,growth_strain_dh5alpha,growth_strain_neb_stable,...,species_budding_yeast,species_zebrafish,species_rat,species_mustard_weed,species_nematode,bacterial_resistance_other,growth_strain_other,growth_temp_other,selectable_markers_other,species_other
0,TACTAGTAGCGGCCGCTGCAGTCCGGCAAAAAAACGGGCAAGGTGT...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ACTAGTAGCGGCCGCTGCAGTCCGGCAAAAAAACGGGCAAGGTGTC...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,CTAGTAGCGGCCGCTGCAGTCCGGCAAAAAAACGGGCAAGGTGTCA...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,TAGTAGCGGCCGCTGCAGTCCGGCAAAAAAACGGGCAAGGTGTCAC...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,AGTAGCGGCCGCTGCAGTCCGGCAAAAAAACGGGCAAGGTGTCACC...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,GTAGCGGCCGCTGCAGTCCGGCAAAAAAACGGGCAAGGTGTCACCA...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,TAGCGGCCGCTGCAGTCCGGCAAAAAAACGGGCAAGGTGTCACCAC...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,AGCGGCCGCTGCAGTCCGGCAAAAAAACGGGCAAGGTGTCACCACC...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,GCGGCCGCTGCAGTCCGGCAAAAAAACGGGCAAGGTGTCACCACCC...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,CGGCCGCTGCAGTCCGGCAAAAAAACGGGCAAGGTGTCACCACCCT...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
# The generated frame must have exactly the same columns, in the same order, as `data`.
assert list(mut_df.columns) == list(data.columns)

In [34]:
# Persist the sliding-window frame. The context manager guarantees the file is
# flushed and closed as soon as the dump finishes, instead of relying on
# garbage collection of an anonymous handle.
with open('../../../data/deteRNNt_exploration/JQ394803_scanning_subsequence_x.pkl', "wb") as out_file:
    pickle.dump(mut_df, out_file)

In [35]:
# Table of [start, end) coordinates, one row per entry collected in the
# window index lists.
window_info = pd.DataFrame(
    dict(window_start=window_start_idxs, window_end=window_end_idxs)
)
window_info

Unnamed: 0,window_start,window_end
0,0,1024
1,1,1025
2,2,1026
3,3,1027
4,4,1028
5,5,1029
6,6,1030
7,7,1031
8,8,1032
9,9,1033


In [36]:
# Persist the window coordinate table. The context manager guarantees the file
# is flushed and closed as soon as the dump finishes, instead of relying on
# garbage collection of an anonymous handle.
with open('../../../data/deteRNNt_exploration/JQ394803_scanning_subsequence_window_info.pkl', "wb") as out_file:
    pickle.dump(window_info, out_file)