In [2]:
## Import required packages
## Base modules
import random
import numpy as np
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#Preprocessing modules
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.manifold import TSNE
## Pytorch NN modules
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
%matplotlib inline

In [3]:
# Check local GPU is running, uncomment and run if using a local GPU
#torch.cuda.current_device()
#torch.cuda.device(0)
#torch.cuda.device_count()
#torch.cuda.get_device_name(0)

In [4]:
## We'll build a helper function to generate simulated reads.
## Could've gone for specifying probabilities per base,
## but as it's a field test we're keeping it simple.

def random_dna_sequence(length):
    return np.array(list(''.join(random.choice('ACTG') for _ in range(length))))

# Function for generating n x m R matrix,
# where n is the number of reads (total_reads)
# and m is each read's length (read_length).

def create_read_matrix(total_reads,read_length):
    read_matrix = []
    for i in range(total_reads):
        read_matrix.append(random_dna_sequence(read_length))
    return np.vstack(read_matrix)

#Build a function that simultaneously generates unique, desired length k-mers 
##from a previously generated read_matrix and times said k-mer occured.
## Returns both objects ordered by occurence

def create_kmer_matrix(read_matrix,kmer_size):
    kmer_matrix = []
    for i in range(read_matrix.shape[0]):
        for j in range(len(read_matrix[i])-kmer_size+1):
            kmer_matrix.append(read_matrix[i][j:j+kmer_size])
    kmer_matrix, counts = np.unique(kmer_matrix,axis=0,return_counts=1)
    #inner index to sort by count occurences
    sort_index = np.argsort(-counts)
    return kmer_matrix[sort_index], counts[sort_index]

## function to turn our kmer matrix into a one hot matrix.

def oneshotonehot(kmer_matrix):
    
    # Not yet robust, breaks when single vector is entered;
    # look into reshaping
    
    #Define bases and fit labels to use

    
    oneshotonehot_matrix = []
    bases = ['A','C','T','G']
    label_encoder = LabelEncoder()
    base_encoder = label_encoder.fit(bases)
    for i in range(len(kmer_matrix)):
        encoded_base_seq = base_encoder.transform(kmer_matrix[i])
        reshaped_vector = encoded_base_seq.reshape(-1)
        one_hot_vector = np.eye(len(bases))[reshaped_vector] 
        oneshotonehot_matrix.append(one_hot_vector)
    return oneshotonehot_matrix

In [36]:
seq = random_dna_sequence(10)
print(seq)

['G' 'A' 'A' 'G' 'C' 'A' 'A' 'A' 'A' 'A']


In [42]:
rm = create_read_matrix(10,10)
print(rm)

[['C' 'A' 'C' 'T' 'G' 'T' 'C' 'G' 'C' 'G']
 ['G' 'G' 'G' 'G' 'C' 'G' 'T' 'A' 'A' 'T']
 ['T' 'G' 'G' 'C' 'G' 'G' 'A' 'G' 'C' 'C']
 ['A' 'G' 'C' 'A' 'A' 'A' 'A' 'G' 'T' 'T']
 ['T' 'A' 'A' 'A' 'G' 'C' 'A' 'C' 'A' 'C']
 ['C' 'T' 'T' 'A' 'A' 'T' 'A' 'A' 'G' 'C']
 ['C' 'A' 'A' 'T' 'C' 'G' 'C' 'T' 'G' 'A']
 ['T' 'G' 'G' 'A' 'G' 'C' 'A' 'A' 'A' 'T']
 ['G' 'T' 'G' 'C' 'G' 'G' 'G' 'A' 'G' 'G']
 ['C' 'C' 'G' 'G' 'T' 'A' 'C' 'A' 'A' 'A']]


In [44]:
km = create_kmer_matrix(rm,4)
print(km)

(array([['G', 'G', 'A', 'G'],
       ['A', 'G', 'C', 'A'],
       ['C', 'A', 'A', 'A'],
       ['G', 'A', 'G', 'C'],
       ['G', 'G', 'C', 'G'],
       ['T', 'C', 'G', 'C'],
       ['T', 'A', 'A', 'T'],
       ['A', 'A', 'G', 'C'],
       ['G', 'C', 'A', 'A'],
       ['A', 'A', 'A', 'G'],
       ['G', 'C', 'G', 'G'],
       ['G', 'C', 'G', 'T'],
       ['G', 'C', 'A', 'C'],
       ['G', 'C', 'T', 'G'],
       ['G', 'G', 'G', 'A'],
       ['G', 'G', 'G', 'C'],
       ['G', 'G', 'G', 'G'],
       ['A', 'A', 'A', 'A'],
       ['G', 'T', 'A', 'A'],
       ['G', 'A', 'G', 'G'],
       ['G', 'T', 'C', 'G'],
       ['G', 'T', 'G', 'C'],
       ['T', 'A', 'A', 'A'],
       ['T', 'A', 'A', 'G'],
       ['T', 'A', 'C', 'A'],
       ['T', 'G', 'C', 'G'],
       ['T', 'G', 'G', 'A'],
       ['T', 'G', 'G', 'C'],
       ['G', 'G', 'T', 'A'],
       ['G', 'T', 'A', 'C'],
       ['C', 'T', 'G', 'T'],
       ['T', 'G', 'T', 'C'],
       ['A', 'A', 'A', 'T'],
       ['A', 'A', 'G', 'T'],
       ['A', 

In [57]:
kmers,counts = create_kmer_matrix(create_read_matrix(100000,100),6)

In [154]:
kmers

array([['A', 'G', 'A', 'G', 'G', 'T'],
       ['C', 'A', 'A', 'G', 'T', 'C'],
       ['C', 'G', 'G', 'T', 'A', 'G'],
       ...,
       ['C', 'C', 'A', 'T', 'A', 'A'],
       ['C', 'C', 'G', 'G', 'C', 'T'],
       ['T', 'T', 'C', 'G', 'T', 'G']], dtype='<U1')

In [None]:
# Create a kmer matrix for testing the model
kmers,counts = create_kmer_matrix(create_read_matrix(100000,100),6)

In [519]:
np.array(zz[0]).shape

(6,)

In [512]:
np.arange(3)

array([0, 1, 2])

In [511]:
zz

[array([0, 2, 0, 2, 2, 3]),
 array([1, 0, 0, 2, 3, 1]),
 array([1, 2, 2, 3, 0, 2]),
 array([1, 2, 0, 3, 1, 2]),
 array([2, 2, 3, 0, 1, 2]),
 array([3, 3, 2, 0, 2, 1]),
 array([3, 1, 2, 3, 0, 3]),
 array([3, 0, 2, 0, 2, 2]),
 array([1, 3, 1, 2, 3, 3]),
 array([0, 1, 0, 0, 2, 3]),
 array([2, 0, 1, 0, 0, 2]),
 array([2, 1, 2, 3, 1, 0]),
 array([0, 2, 1, 1, 3, 1]),
 array([0, 1, 0, 0, 3, 1]),
 array([2, 1, 2, 0, 3, 1])]

In [505]:
onehot_encoder = OneHotEncoder(sparse=False)
onehot_encoder.fit(zz)

for i in range(len(zz)):
    reshaped_array = zz[i].reshape(len(zz[i]),1)
    encoded_base_seq = onehot_encoder.transform(reshaped_array)
    print(encoded_base_seq)
    


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


ValueError: X has different shape than during fitting. Expected 6, got 1.

In [495]:
len(zz)

15

In [507]:
one_hot_matrix(zz)

[array([[1., 0., 0.],
        [0., 1., 0.],
        [1., 0., 0.],
        [0., 1., 0.],
        [0., 1., 0.],
        [0., 0., 1.]]), array([[0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 0., 1.],
        [0., 1., 0., 0.]]), array([[0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 0., 0., 1.],
        [1., 0., 0., 0.],
        [0., 0., 1., 0.]]), array([[0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [1., 0., 0., 0.],
        [0., 0., 0., 1.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.]]), array([[0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 0., 0., 1.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.]]), array([[0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [0., 0., 1., 0.],
        [1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.]]), array([[0., 0., 0., 1.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
    

In [None]:
onehot_encoder

In [469]:
zz = encoded_matrix(kmers[0:15])

In [467]:
one_hot_matrix(zz)

IndexError: boolean index did not match indexed array along dimension 0; dimension is 36 but corresponding boolean dimension is 6

In [425]:
print(encoded_matrix(kmers[0:5]))

[array([0, 2, 0, 2, 2, 3]), array([1, 0, 0, 2, 3, 1]), array([1, 2, 2, 3, 0, 2]), array([1, 2, 0, 3, 1, 2]), array([2, 2, 3, 0, 1, 2])]


In [426]:
print(kmers[0:5])

[['A' 'G' 'A' 'G' 'G' 'T']
 ['C' 'A' 'A' 'G' 'T' 'C']
 ['C' 'G' 'G' 'T' 'A' 'G']
 ['C' 'G' 'A' 'T' 'C' 'G']
 ['G' 'G' 'T' 'A' 'C' 'G']]


In [339]:
zz = kmers[0:5]

In [340]:
len(zz)

5

In [347]:
print((one_hot_matrix(zz)))
print(zz)

[array([[1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.]]), array([[0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.]]), array([[0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.]]), array([[0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.]]), array([[0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.]])]
[['A' 'G' 'A' 'G' 'G' 'T']
 ['C' 'A' 'A' 'G' 'T' 'C']
 ['C' 'G' 'G' 'T' 'A' 'G']
 ['C' 'G' 'A' 'T' 'C' 'G']
 ['G' 'G' 'T' 'A' 'C' 'G']]


In [273]:
kmers[0]

array(['A', 'G', 'A', 'G', 'G', 'T'], dtype='<U1')

In [260]:
kmers[0]

array(['A', 'G', 'A', 'G', 'G', 'T'], dtype='<U1')

In [261]:
kmers[1000]

array(['A', 'A', 'C', 'C', 'G', 'C'], dtype='<U1')

In [None]:
base_encoded = base_encoded.reshape(len(base_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(base_encoded)


array(['C', 'G', 'G', 'T', 'A', 'G'], dtype='<U1')

In [101]:
bases = np.array(["A","C","T","G"])
onehot_encoder.fit(bases.reshape(-1,1))

OneHotEncoder(categorical_features=None, categories=None,
       dtype=<class 'numpy.float64'>, handle_unknown='error',
       n_values='auto', sparse=False)

In [102]:
z = onehot_encoder.fit_transform(kmers)

In [105]:
z[2]

array([0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1.,
       0., 0., 0., 0., 0., 1., 0.])

In [84]:
seq_encoder.categories_

AttributeError: 'LabelEncoder' object has no attribute 'categories_'

In [81]:
seq_encoder.transform(kmers).toarray()

ValueError: bad input shape (4096, 6)

In [58]:
## Define optimizer, RMSprop as per TTLT
## Model was trained for 200 epochs
## Learning rate = 0.001, alpha = 0.99, momentum = 0
### alpha and momentum correspond to pytorch's default value
### for the torch.optim.RMSprop class; 
### therefore, we'll only specify learning rat and epochs.
n_epoch = 200
learning_rate = 0.001
optim.RMSprop()

# Build LSTM
## Layers = 2, 256 hidden units per layer.
## Embedding output size = 2

bi_lstm = nn.LSTM(input_size=24, hidden_size = 256 , num_layers = 2, bidirectional=True)
reverse_lstm = nn.LSTM(input_size=24, hidden_size = 256 , num_layers = 2, bidirectional=True)


bi_output, bi_hidden = bi_lstm()
reverse_output, reverse_hidden = bi_lstm()

TypeError: __init__() missing 1 required positional argument: 'params'

In [None]:
# Build prediction layer
## MLP
## layers = 2, sizes 150 and 100, respectively
## Activation function = ReLU