In [2]:
## Import required packages
import torch
import random
import numpy as np
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.manifold import TSNE
%matplotlib inline

In [3]:
# Check local GPU is running, uncomment and run if using a local GPU
#torch.cuda.current_device()
#torch.cuda.device(0)
#torch.cuda.device_count()
#torch.cuda.get_device_name(0)

In [118]:
## We'll build a function to generate simulated reads.
## Could've gone for specifying probabilities per base,
## but as it's a field test we're keeping it simple.

def random_dna_sequence(length):
    return np.array(list(''.join(random.choice('ACTG') for _ in range(length))))

# Function for generating n x m R matrix,
# where n is the number of reads (total_reads)
# and m is each read's length (read_length).

def create_read_matrix(total_reads,read_length):
    read_matrix = []
    for i in range(total_reads):
        read_matrix.append(random_dna_sequence(read_length))
    return np.vstack(read_matrix)

#Build a function that simultaneously generates unique, desired length k-mers 
##from a previously generated read_matrix and times said k-mer occured.
## Returns both objects ordered by occurence

def create_kmer_matrix(read_matrix,kmer_size):
    kmer_matrix = []
    for i in range(read_matrix.shape[0]):
        for j in range(len(read_matrix[i])-kmer_size+1):
            kmer_matrix.append(read_matrix[i][j:j+kmer_size])
    kmer_matrix, counts = np.unique(kmer_matrix,axis=0,return_counts=1)
    #inner index to sort by count occurences
    sort_index = np.argsort(-counts)
    return kmer_matrix[sort_index], counts[sort_index]

In [140]:
x,y = create_kmer_matrix(create_read_matrix(10,100),10)

In [143]:
x[1]

array(['G', 'G', 'C', 'C', 'A', 'A', 'A', 'C', 'T', 'G'], dtype='<U1')

In [148]:
## One hot encoding of sequences
## define the bases we'll use an convert values to an array.
sequence = x[3]
values = sequence
print(values)
#encoding to integers
base_encoder = LabelEncoder()
base_encoded = base_encoder.fit_transform(values)
print(base_encoded)
# binary encode
onehot_encoder = OneHotEncoder(sparse=False)
base_encoded = base_encoded.reshape(len(base_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(base_encoded)
print(onehot_encoded)

['G' 'G' 'C' 'C' 'C' 'T' 'C' 'T' 'A' 'A']
[2 2 1 1 1 3 1 3 0 0]
[[0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]]


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
