Skip to content

Commit

Permalink
cafa3 targets
Browse files Browse the repository at this point in the history
  • Loading branch information
coolmaksat committed Nov 24, 2016
1 parent 3e7d997 commit aa0f6bb
Show file tree
Hide file tree
Showing 11 changed files with 120 additions and 163 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,3 +1,5 @@
data/
venv/
*.pyc
*.out
*.pdf
4 changes: 4 additions & 0 deletions README.md
Expand Up @@ -63,3 +63,7 @@ viruses
# Experiments with embeddings
80/20 with - 0.785514 34121, 0.855387 35816, 0.798833 31559
80/20 without - 0.745071 34121


# CAFA 2 Challenge experiments
0.380500 66123, 0.498349 66311, 0.560893 71437
85 changes: 56 additions & 29 deletions cafa.py
Expand Up @@ -27,7 +27,7 @@ def read_fasta(filename):
if line.startswith('>'):
if seq != '':
data.append(seq)
line = line.split()[0]
line = line.split()[1]
seq = line + '\t'
else:
seq += line
Expand Down Expand Up @@ -64,15 +64,15 @@ def get_annotations():


def fasta2tabs():
    """Flatten the CAFA3 target FASTA files into one tab-separated file.

    Reads every ``*.fasta`` file directly under ``Targets/`` in the CAFA3
    targets directory, converts each record to ``id<TAB>sequence`` lines via
    ``read_fasta``, and writes all records to ``data/cafa3/targets.txt``,
    one record per line.

    NOTE(review): reconstructed from a flattened diff — the old CAFA2
    recursive ``.tfa`` scan was removed in favour of a flat scan of the
    ``Targets/`` directory.
    """
    cafa_root = 'data/cafa3/CAFA3_targets/'
    data = list()
    # Flat scan: CAFA3 keeps all target FASTA files in one Targets/ folder.
    for fl in os.listdir(cafa_root + 'Targets/'):
        if fl.endswith('.fasta'):
            seqs = read_fasta(cafa_root + 'Targets/' + fl)
            data += seqs
    with open('data/cafa3/targets.txt', 'w') as f:
        for line in data:
            f.write(line + '\n')

Expand All @@ -84,39 +84,66 @@ def sprot2tabs():
f.write(line + '\n')


def get_data():
targets = set()
with open('data/cafa2/targets.txt', 'r') as f:
def cafa3():
    """Build ``uniprot_sprot.tab`` from the CAFA3 training data.

    Reads the experimental-annotation mapping
    (``uniprot_sprot_exp.txt``: ``protein<TAB>go_id`` per line) into a
    dict of GO-term sets, then writes every FASTA record (from
    ``read_fasta``) whose sequence passes ``is_ok`` and that has at least
    one annotation, as ``id<TAB>sequence<TAB>go1; go2; ...`` lines.
    """
    root = 'data/cafa3/CAFA3_training_data/'
    filename = root + 'uniprot_sprot_exp.fasta'
    data = read_fasta(filename)
    annots = dict()
    with open(root + 'uniprot_sprot_exp.txt') as f:
        for line in f:
            items = line.strip().split('\t')
            # setdefault replaces the manual "create set on first sight" dance
            annots.setdefault(items[0], set()).add(items[1])
    # 'with' guarantees the output file is closed; the original opened it
    # and never called close().
    with open(root + 'uniprot_sprot.tab', 'w') as fl:
        for line in data:
            items = line.split('\t')
            if is_ok(items[1]) and items[0] in annots:
                fl.write(line + '\t')
                # join the GO ids with '; ' exactly as the manual loop did
                fl.write('; '.join(annots[items[0]]))
                fl.write('\n')


def get_data():
    """Join CAFA3 target sequences with their GO annotations.

    Loads ``data/cafa3/targets.txt`` (``id<TAB>sequence``), keeping only
    sequences that pass ``is_ok``, then reads ``uniprot-go.tab`` — whose
    second column is the protein id and third column the annotation
    string — and writes ``id<TAB>sequence<TAB>annotations`` lines to
    ``data/cafa3/data.txt``. Target proteins with no annotation are
    printed to stdout so missing coverage is visible.
    """
    seqs = dict()
    with open('data/cafa3/targets.txt', 'r') as f:
        for line in f:
            items = line.strip().split('\t')
            if is_ok(items[1]):
                # targets.txt ids are already plain accessions; no need to
                # split a 'db|acc|name' header as the old CAFA2 path did
                seqs[items[0]] = items[1]

    annots = dict()
    with open('data/cafa3/uniprot-go.tab', 'r') as f:
        for line in f:
            items = line.strip().split('\t')
            # column 1 is the protein id, column 2 the annotation string
            if items[1] in seqs:
                annots[items[1]] = items[2]

    # 'with' closes the output file even if a write fails; the original
    # relied on an explicit close() at the end.
    with open('data/cafa3/data.txt', 'w') as fl:
        for prot_id in seqs:
            if prot_id in annots:
                fl.write(prot_id + '\t' + seqs[prot_id] + '\t' + annots[prot_id])
                fl.write('\n')
            else:
                # surface targets that have no annotation rather than
                # silently dropping them
                print(prot_id)


def main(*args, **kwargs):
    """Entry point: run the currently selected preprocessing step.

    Only one pipeline step is active at a time; the others are kept
    commented out and enabled by hand as needed.
    """
    get_data()
    # cafa3()
    # fasta2tabs()


if __name__ == '__main__':
    main(*sys.argv)
19 changes: 11 additions & 8 deletions get_data.py
Expand Up @@ -13,13 +13,16 @@

FUNCTION = 'bp'
ORG = ''
TT = 'test'
TT = 'train'

args = sys.argv
if len(args) == 4:
print args
TT = args[1]
ORG = '-' + args[2]
if args[2]:
ORG = '-' + args[2]
else:
ORG = ''
FUNCTION = args[3]

FUNC_DICT = {
Expand All @@ -29,7 +32,7 @@

GO_ID = FUNC_DICT[FUNCTION]

DATA_ROOT = 'data/network/'
DATA_ROOT = 'data/cafa3/'
FILENAME = TT + '.txt'

go = get_gene_ontology('go.obo')
Expand Down Expand Up @@ -123,11 +126,11 @@ def main(*args, **kwargs):
'indexes': indexes,
'gos': gos,
'labels': labels}
rep = load_rep()
rep_list = list()
for prot_id in proteins:
rep_list.append(rep[prot_id])
data['rep'] = rep_list
# rep = load_rep()
# rep_list = list()
# for prot_id in proteins:
# rep_list.append(rep[prot_id])
# data['rep'] = rep_list
df = pd.DataFrame(data)
df.to_pickle(DATA_ROOT + TT + ORG + '-' + FUNCTION + '.pkl')

Expand Down
16 changes: 9 additions & 7 deletions get_functions.py
Expand Up @@ -9,19 +9,21 @@
get_anchestors,
BIOLOGICAL_PROCESS,
MOLECULAR_FUNCTION,
CELLULAR_COMPONENT)
from aaindex import AAINDEX
CELLULAR_COMPONENT,
FUNC_DICT)
from multiprocessing import Pool


DATA_ROOT = 'data/cafa2/'
DATA_ROOT = 'data/cafa3/'
ORG = ''
FILENAME = 'train' + ORG + '.txt'
ANNOT_NUM = 10
GO_ID = CELLULAR_COMPONENT
FUNCTION = 'cc' + ORG
ANNOT_NUM = 250
FUNCTION = 'bp'

go = get_gene_ontology('go_cafa2.obo')
GO_ID = FUNC_DICT[FUNCTION]
FUNCTION += ORG

go = get_gene_ontology('go.obo')
# functions = get_go_sets(
# go, [MOLECULAR_FUNCTION, BIOLOGICAL_PROCESS, CELLULAR_COMPONENT])

Expand Down
2 changes: 1 addition & 1 deletion nn_hierarchical_swiss_bp.py
Expand Up @@ -37,7 +37,7 @@
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
sys.setrecursionlimit(100000)

DATA_ROOT = 'data/cafa2/'
DATA_ROOT = 'data/cafa3/'
MAXLEN = 1000
GO_ID = BIOLOGICAL_PROCESS
go = get_gene_ontology('go.obo')
Expand Down
6 changes: 3 additions & 3 deletions nn_hierarchical_swiss_cc.py
Expand Up @@ -37,7 +37,7 @@
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
sys.setrecursionlimit(100000)

DATA_ROOT = 'data/network/'
DATA_ROOT = 'data/cafa3/'
MAXLEN = 1000
GO_ID = CELLULAR_COMPONENT
go = get_gene_ontology('go.obo')
Expand Down Expand Up @@ -133,7 +133,7 @@ def model():
# set parameters:
batch_size = 512
nb_epoch = 100
output_dim = 128
output_dim = 256
nb_classes = len(functions)
start_time = time.time()
logging.info("Loading Data")
Expand Down Expand Up @@ -188,7 +188,7 @@ def model():
model_path = DATA_ROOT + 'hierarchical_cc' + ORG + '.hdf5'
checkpointer = ModelCheckpoint(
filepath=model_path, verbose=1, save_best_only=True)
earlystopper = EarlyStopping(monitor='val_loss', patience=10, verbose=1)
earlystopper = EarlyStopping(monitor='val_loss', patience=20, verbose=1)
logging.info(
'Compilation finished in %d sec' % (time.time() - start_time))
logging.info('Starting training the model')
Expand Down
2 changes: 1 addition & 1 deletion nn_hierarchical_swiss_mf.py
Expand Up @@ -37,7 +37,7 @@
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
sys.setrecursionlimit(100000)

DATA_ROOT = 'data/network/'
DATA_ROOT = 'data/cafa3/'
MAXLEN = 1000
GO_ID = MOLECULAR_FUNCTION
go = get_gene_ontology('go.obo')
Expand Down

0 comments on commit aa0f6bb

Please sign in to comment.