# Deepsea Validation Txt File Generator

This notebook generates the file `deepseavalidation.txt` for use by t2t-decoder. The file contains validation data for all 919 labels on 8000 sequences, one sequence per line: the DNA sequence, a tab, then the comma-separated label values.

Requirements:

- valid.mat file from DeepSea


In [6]:
import pandas as pd
from scipy.io import loadmat
import numpy as np

In [17]:
def matrix2sequence(matrix, reverse=False):
    '''
    Converts a one-hot encoded matrix of size n x 4 to a DNA sequence.
    Used to sanity check conversions.

    Row encodings: [1,0,0,0] -> A, [0,1,0,0] -> G, [0,0,1,0] -> C,
    [0,0,0,1] -> T and the all-zero row [0,0,0,0] -> N.

    :param matrix n x 4 matrix (must support matrix.shape and matrix[i, :])
    :param reverse if True, returns reverse complement
    :return the decoded sequence as a str with one character per row
    :raises ValueError if a row is not one of the five encodings above
    '''
    forward = {
        (1, 0, 0, 0): 'A',
        (0, 1, 0, 0): 'G',
        (0, 0, 1, 0): 'C',
        (0, 0, 0, 1): 'T',
        (0, 0, 0, 0): 'N',
    }
    complement = {'A': 'T', 'G': 'C', 'C': 'G', 'T': 'A', 'N': 'N'}

    letters = []
    for i in range(matrix.shape[0]):
        base = np.asarray(matrix[i, :])
        # Only a flat row can match an encoding; tolist() yields plain Python
        # numbers so integer, float and boolean rows all hit the same keys.
        letter = forward.get(tuple(base.tolist())) if base.ndim == 1 else None
        if letter is None:
            # Interpolate the offending row into the message instead of
            # passing it as a second exception argument.
            raise ValueError('Invalid encoding for base pair %s' % (base,))
        letters.append(letter)

    if reverse:
        # Reverse complement: complement each base and read back to front.
        letters = [complement[letter] for letter in reversed(letters)]

    return ''.join(letters)

In [18]:
# Location of the DeepSea valid.mat file required by this notebook;
# change this path to match the local environment.
datafile = "/data/epitome/tmp/deepsea_train/valid.mat"

In [19]:
# Read valid.mat and pull out the two arrays used below:
# 'validdata' (the targets written out per sequence) and
# 'validxdata' (the encoded input sequences).
mat = loadmat(datafile)
targets, inputs = mat['validdata'], mat['validxdata']


In [21]:
# Decode every validation example into its DNA string. Each entry of
# `inputs` is transposed first so it is passed to matrix2sequence in the
# n x 4 layout that function expects.
sequences = [matrix2sequence(encoded.transpose()) for encoded in inputs]

In [35]:
# Save file: one line per validation sequence, written as
# "<sequence>\t<comma-separated targets>\n".
# The with-block closes the file even if a write or a str() conversion
# raises part-way through, so a partial run never leaks the handle.
with open("deepseavalidation.txt", "w+") as f:
    for i, sequence in enumerate(sequences):
        # targets[i] is the row of label values for the i-th sequence.
        target_str = ','.join(str(t) for t in targets[i])
        f.write("%s\t%s\n" % (sequence, target_str))