In [7]:
ls

config.py                 functions.pyc          ssfunctions.ipynb
config.pyc                learn.py               SSfunctions.py
daskClusterLSF.py         learnssmodel.py        sspredictor.py
Dataset.py                lsfconfig.py           test.py
distributedTensorFlow.py  physicalpropTable.csv  test.pyc
[0m[01;34mexamples[0m/                 README.md
functions.py              [01;34mSSdataset[0m/


In [37]:
import pandas as pd
import numpy as np
import keras
from Bio import SeqIO
import glob
import pdb
import functools

from sklearn.preprocessing import OneHotEncoder
from keras.models import Model
from keras.layers import Input, Dense , LSTM , Bidirectional

def protsec2numpy(sec, windowlen, propdict=None, verbose= False):
	#window should be an odd number
	#slice up the sequence into window sized chunks
	padding = (windowlen-1)/2
	try:
		originallen = len(sec)
	except TypeError:
		return None
	sec = ['-']*int(padding) + list(sec) + int(padding)*['-']
	sechunks=  [ [ x for x in str(sec[i:i + windowlen]) ]  for i in range(0, originallen)]
    return sechunks

def seq2vec(seq, propdict, verbose = False):
	#countmat is length of sequence
	propmat = np.zeros(( len(propdict) , len(seq) ))
	seqvec = np.asarray( [char for char in seq] )
	for i,prop in enumerate(propdict):
		vals = np.vectorize(propdict[prop].get)(seqvec)
		propmat[i,:] = vals.ravel()
	return propmat

def econdedDSSP(ssStr,intdico,encoder):
	'''
	H = alpha-helix
	B = residue in isolated beta-bridge
	E = extended strand, participates in beta ladder
	G = 3-helix (310 helix)
	I = 5 helix (pi-helix)
	T = hydrogen bonded turn
	S = bend
	'''
	try:
		intvec = np.asarray([ intdico[char] for char in ssStr])
		onehot = encoder.transform(intvec.reshape(-1, 1))
		return onehot
	except:
		#print(ssStr)
		return(ssStr)


IndentationError: unindent does not match any outer indentation level (<tokenize>, line 23)

In [56]:
import csv
def loadDict(csvfile):    
	with open(csvfile , 'r') as filestr:
		final = {}
		propdict= csv.DictReader(filestr)
		for row in propdict:
			for key in row.keys():
				if key != 'letter Code' and key!= 'Amino Acid Name' and key!= '':
					if key not in final:
						final[key]={}
					final[key][row['letter Code']] = float(row[key])
	return final


def datagenerator(fastas , n=100 , windowlen= 13, embeddingprot=None , embeddingSS=None):
	#yield string for x and y to make a df block of n sequences to learn with
	for fasta in fastas:
		fastaIter = SeqIO.parse(fasta, "fasta")
		seqDict={}
		for seq in fastaIter:
			chainID = str(seq.description)
			ID = chainID[0:6]
			if ID not in seqDict:
				seqDict[ID]= {}
			if 'secstr' in seq.description:
				seqDict[ID]['SS']= ''.join(seq.seq)
			else:
				seqDict[ID]['AA']= ''.join(seq.seq)

			if len(seqDict)>n:
				df = pd.DataFrame.from_dict(seqDict, orient= 'index')
				yield df
				seqDict={}




#init encoder for ss
sspath = '/home/cactuskid/Dropbox/machine_learning/SSdataset/'
fastas = glob.glob(sspath +'*.txt')
print(fastas)

#window of amino acids to inspect in lstm
windowlen = 13

#first part of the network, layered lstm
LSTMoutdim = 30
LSTMlayers = 3
#second part of the network, dense decoder
Denselayers = 3
Denseoutdim = 30
#save itnerval
saveinterval = 100
verbose = True

proppath = '/home/cactuskid/Dropbox/machine_learning/physicalpropTable.csv'
propdict = loadDict(proppath)

#pdb.set_trace()


encoder = OneHotEncoder()
encoder.fit( np.asarray( np.asarray(range(7)).reshape(-1,1) ) )



intdico = { charval : int(i) for i,charval in enumerate(['H', 'B', 'E', 'G', 'I', 'T', 'S'] ) }
ssencoder = functools.partial( econdedDSSP , intdico= intdico , encoder = encoder)
prot2sec = functools.partial( protsec2numpy , windowlen= windowlen , propdict= propdict )
generator = datagenerator(fastas, n = 10 , windowlen= windowlen ,  embeddingprot = prot2sec , embeddingSS = ssencoder)

testdf = next(generator)

['/home/cactuskid/Dropbox/machine_learning/SSdataset/ss.txt']


In [57]:
print(testdf)

                                                       AA  \
101M:A  MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...   
102L:A  MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAAKSE...   
102M:A  MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...   
103L:A  MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNSLDAAK...   
103M:A  MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...   
104L:A  MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSAA...   
104L:B  MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSAA...   
104M:A  VLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRF...   
105M:A  VLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRF...   
106M:A  MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...   
107L:A  MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKGEL...   

                                                       SS  
101M:A  HHHHHHHHHHHHHHGGGHHHHHHHHHHHHHHHGGGGGGTTTTTSHH...  
102L:A  HHHHHHHHHEEEEEETTSEEEETTEEEESSSTTTHHHHHHHHHHTS...  
102M:A  HHHHHHHHHHHHHHGGGHHHHHHHHHHHHHHHGGGGGGTTTTTSHH...  
103L:A  HHHHHHHHHEEEEEETTSE

In [58]:
row = next(testdf.iterrows())

In [59]:
print(row[1]['AA'])

rowout = protsec2numpy(row[1]['AA'], 11, propdict=None, verbose= False)

MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRVKHLKTEAEMKASEDLKKHGVTVLTALGAILKKKGHHEAELKPLAQSHATKHKIPIKYLEFISEAIIHVLHSRHPGNFGADAQGAMNKALELFRKDIAAKYKELGYQG


In [61]:

from array import array
''.join(rowout[0])

SyntaxError: invalid syntax (<ipython-input-61-4fe4f3f228b2>, line 3)