In [926]:
## Import required packages
## Base modules
import random
import numpy as np
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Encoding Modules
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.manifold import TSNE

In [3]:
# Check that the local GPU is visible to PyTorch: uncomment the lines below and run them (after an `import torch`) if using a local GPU
#torch.cuda.current_device()
#torch.cuda.device(0)
#torch.cuda.device_count()
#torch.cuda.get_device_name(0)

In [937]:
## We'll build a helper function to generate simulated reads.
## Could've gone for specifying probabilities per base,
## but as it's a field test we're keeping it simple.

def random_dna_sequence(length):
    """Draw a random DNA sequence and return it as an array of bases.

    Every base is picked independently with ``random.choice`` from the
    alphabet 'ACTG', so the result depends on the current state of the
    ``random`` module's global generator.

    Parameters
    ----------
    length : int
        Number of bases to draw.

    Returns
    -------
    numpy.ndarray
        1-D array with one single-character string per drawn base.
    """
    alphabet = 'ACTG'
    drawn_bases = [random.choice(alphabet) for _ in range(length)]
    return np.array(drawn_bases)

# Function for generating n x m R matrix,
# where n is the number of reads (total_reads)
# and m is each read's length (read_length).

def create_read_matrix(total_reads,read_length):
    """Simulate ``total_reads`` random reads and stack them into one matrix.

    Parameters
    ----------
    total_reads : int
        Number of reads to generate (rows of the result).
    read_length : int
        Number of bases in every read (columns of the result).

    Returns
    -------
    numpy.ndarray
        Array with one read per row, each produced by
        ``random_dna_sequence(read_length)``.
    """
    simulated_reads = [random_dna_sequence(read_length) for _ in range(total_reads)]
    return np.vstack(simulated_reads)

# Build a function that extracts every k-mer of the desired length from a
## previously generated read_matrix and counts how many times each unique k-mer occurred.
## Returns both objects ordered by descending occurrence count.

def create_kmer_matrix(read_matrix,kmer_size):
    """Collect the unique k-mers of a read matrix together with their counts.

    Every window of ``kmer_size`` consecutive bases is taken from every read;
    identical windows are merged and counted.

    Parameters
    ----------
    read_matrix : numpy.ndarray
        Array whose rows are reads (one base per element).
    kmer_size : int
        Length of the k-mers to extract; must be at least 1.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        The unique k-mers (one per row) and the number of times each one
        occurred, both ordered by descending count. K-mers with the same
        count keep the sorted order in which ``np.unique`` returns them.

    Raises
    ------
    ValueError
        If ``kmer_size`` is smaller than 1.
    """
    if kmer_size < 1:
        raise ValueError(f"kmer_size must be at least 1, got {kmer_size}")

    # Slide a window of kmer_size bases along every read.
    kmers = [
        read[start:start + kmer_size]
        for read in read_matrix
        for start in range(len(read) - kmer_size + 1)
    ]

    unique_kmers, counts = np.unique(kmers, axis=0, return_counts=True)

    # A stable sort keeps ties in np.unique's order, so the row order of the
    # result is fully determined by the input instead of by the sort algorithm.
    descending = np.argsort(-counts, kind='stable')

    return unique_kmers[descending], counts[descending]

## function to turn our kmer matrix into a one hot matrix.

def oneshotonehot(kmer_matrix):
    """One-hot encode every k-mer of a ``(kmers, counts)`` pair.

    Parameters
    ----------
    kmer_matrix : sequence
        Two-item sequence as returned by ``create_kmer_matrix``: item 0 holds
        the k-mers (one per row, one base per element) and item 1 their
        counts. A single k-mer given as a 1-D array of bases is accepted too.

    Returns
    -------
    tuple of (list of numpy.ndarray, object)
        A list with one ``(kmer_length, 4)`` float array per k-mer, and the
        counts exactly as they were passed in. The one-hot columns are
        ordered A, C, G, T.

    Raises
    ------
    ValueError
        If any base is not one of A, C, G or T.
    """
    vectors, counts = kmer_matrix[0], kmer_matrix[1]

    # Column order of the encoding. It is spelled out in sorted order because
    # that is the order the previous LabelEncoder-based version actually used
    # (LabelEncoder sorts the classes it is fitted on).
    bases = np.array(['A', 'C', 'G', 'T'])

    kmers = np.asarray(vectors, dtype=str)
    if kmers.size == 0:
        return [], counts
    if kmers.ndim == 1:
        # A lone k-mer: promote it to a one-row matrix.
        kmers = kmers[np.newaxis, :]

    # Compare every base against the alphabet in one broadcasted operation:
    # the result has shape (n_kmers, kmer_length, 4).
    matches = kmers[..., np.newaxis] == bases

    recognised = matches.any(axis=-1)
    if not recognised.all():
        unknown = sorted(set(kmers[~recognised].tolist()))
        raise ValueError(f"k-mers contain bases outside 'ACGT': {unknown}")

    return list(matches.astype(np.float64)), counts

## One final wrapper function to generate per-sample data on the spot

def generate_kmer_data(total_reads, read_length, kmer_size, number_of_samples):
    """Simulate several samples and return their encoded k-mers in flat lists.

    For each sample a fresh read matrix is generated, its k-mers are counted
    and one-hot encoded, and the results are appended to the running lists.

    Parameters
    ----------
    total_reads : int
        Number of reads simulated per sample.
    read_length : int
        Length of every simulated read.
    kmer_size : int
        Length of the k-mers extracted from the reads.
    number_of_samples : int
        How many independent samples to simulate.

    Returns
    -------
    tuple of (list, list, list)
        ``X``: one-hot encoded k-mers; ``Y``: the count of each k-mer;
        ``sample_num``: the 1-based sample number each k-mer came from.
        The three lists are aligned element by element.
    """
    X, Y, sample_num = [], [], []

    for sample_id in range(1, number_of_samples + 1):
        reads = create_read_matrix(total_reads, read_length)
        encoded_kmers, kmer_counts = oneshotonehot(create_kmer_matrix(reads, kmer_size))

        X.extend(encoded_kmers)
        Y.extend(kmer_counts)
        # One label per k-mer so the three lists stay aligned.
        sample_num.extend([sample_id] * len(encoded_kmers))

    return X, Y, sample_num

In [941]:
# We will now generate and load data with our previously created helper functions
## X corresponds to one-hot encoded sequence vectors per kmer sequence
## y corresponds to the number of times said kmer was counted
## sample corresponds to the sample identifier

## We'll import modules to help split our data into test and train sets
from sklearn.model_selection import train_test_split

## Seed the `random` module (used by random_dna_sequence) so the simulated
## reads, and therefore every split below, are identical on each run.
SEED = 1
random.seed(SEED)

X, y, sample = generate_kmer_data(total_reads = 100, read_length = 10 , kmer_size = 5, number_of_samples=2)


## X, y and sample are passed to the same train_test_split call so that,
## after shuffling, every k-mer keeps its count and its sample label.


# Initial split (80% train, 20% test)
X_train, X_test, y_train, y_test, sample_train, sample_test = train_test_split(
    X, y, sample, test_size=0.2, random_state=SEED)

# Splitting training values to obtain a validation set
## Computing validation set from the previously computed 80% training data
## 25% "test" on this validation split amounts to 20% of overall data
## Training is the 60% remaining
X_train, X_val, y_train, y_val, sample_train, sample_val = train_test_split(
    X_train, y_train, sample_train, test_size=0.25, random_state=SEED)

In [58]:
## Pytorch NN modules
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
%matplotlib inline

## Define optimizer, RMSprop as per TTLT
## Model was trained for 200 epochs
## Learning rate = 0.001, alpha = 0.99, momentum = 0
### alpha and momentum correspond to pytorch's default values
### for the torch.optim.RMSprop class;
### therefore, we'll only specify learning rate and epochs.
import torch  # the imports above bind only `nn`, `F` and `optim`, not `torch` itself

n_epoch = 200
learning_rate = 0.001

# Build LSTM
## Layers = 2, 256 hidden units per layer.
## Embedding output size = 2
## NOTE(review): input_size=24 does not match the 4-column one-hot rows
## produced by oneshotonehot -- confirm the intended feature layout.

bi_lstm = nn.LSTM(input_size=24, hidden_size = 256 , num_layers = 2, bidirectional=True)
reverse_lstm = nn.LSTM(input_size=24, hidden_size = 256 , num_layers = 2, bidirectional=True)

## RMSprop has to be given the parameters it will update, so it is created
## after the networks and receives the weights of both of them.
optimizer = optim.RMSprop(
    list(bi_lstm.parameters()) + list(reverse_lstm.parameters()),
    lr=learning_rate,
)

## An LSTM is called on a tensor of shape (seq_len, batch, input_size) and
## returns (output, (h_n, c_n)). The all-zero tensor is only a placeholder
## that lets the cell run end to end.
## TODO: replace it with batches built from X_train, and give reverse_lstm
## whatever reversed view of the sequence the model calls for.
example_input = torch.zeros(1, 1, bi_lstm.input_size)

bi_output, bi_hidden = bi_lstm(example_input)
reverse_output, reverse_hidden = reverse_lstm(example_input)

In [None]:
# Build prediction layer
## MLP
## layers = 2, sizes 150 and 100, respectively
## Activation function = ReLU