Skip to content

Commit

Permalink
cafa3 targets
Browse files Browse the repository at this point in the history
  • Loading branch information
coolmaksat committed Nov 24, 2016
1 parent 3e7d997 commit aa0f6bb
Show file tree
Hide file tree
Showing 11 changed files with 120 additions and 163 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,3 +1,5 @@
data/
venv/
*.pyc
*.out
*.pdf
4 changes: 4 additions & 0 deletions README.md
Expand Up @@ -63,3 +63,7 @@ viruses
# Experiments with embeddings
80/20 with - 0.785514 34121, 0.855387 35816, 0.798833 31559
80/20 without - 0.745071 34121


# CAFA 2 Challenge experiments
0.380500 66123, 0.498349 66311, 0.560893 71437
85 changes: 56 additions & 29 deletions cafa.py
Expand Up @@ -27,7 +27,7 @@ def read_fasta(filename):
if line.startswith('>'):
if seq != '':
data.append(seq)
line = line.split()[0]
line = line.split()[1]
seq = line + '\t'
else:
seq += line
Expand Down Expand Up @@ -64,15 +64,15 @@ def get_annotations():


def fasta2tabs():
    """Flatten the CAFA3 target FASTA files into one tab-separated file.

    Reads every ``*.fasta`` file directly under ``Targets/`` in the CAFA3
    targets directory, converts each record to ``id<TAB>sequence`` lines via
    ``read_fasta``, and writes all records to ``data/cafa3/targets.txt``,
    one record per line.

    NOTE(review): reconstructed from a flattened diff — the old CAFA2
    recursive ``.tfa`` scan was removed in favour of a flat scan of the
    ``Targets/`` directory.
    """
    cafa_root = 'data/cafa3/CAFA3_targets/'
    data = list()
    # Flat scan: CAFA3 keeps all target FASTA files in one Targets/ folder.
    for fl in os.listdir(cafa_root + 'Targets/'):
        if fl.endswith('.fasta'):
            seqs = read_fasta(cafa_root + 'Targets/' + fl)
            data += seqs
    with open('data/cafa3/targets.txt', 'w') as f:
        for line in data:
            f.write(line + '\n')

Expand All @@ -84,39 +84,66 @@ def sprot2tabs():
f.write(line + '\n')


def get_data():
targets = set()
with open('data/cafa2/targets.txt', 'r') as f:
def cafa3():
    """Build ``uniprot_sprot.tab`` from the CAFA3 training data.

    Reads the experimental-annotation mapping
    (``uniprot_sprot_exp.txt``: ``protein<TAB>go_id`` per line) into a
    dict of GO-term sets, then writes every FASTA record (from
    ``read_fasta``) whose sequence passes ``is_ok`` and that has at least
    one annotation, as ``id<TAB>sequence<TAB>go1; go2; ...`` lines.
    """
    root = 'data/cafa3/CAFA3_training_data/'
    filename = root + 'uniprot_sprot_exp.fasta'
    data = read_fasta(filename)
    annots = dict()
    with open(root + 'uniprot_sprot_exp.txt') as f:
        for line in f:
            items = line.strip().split('\t')
            # setdefault replaces the manual "create set on first sight" dance
            annots.setdefault(items[0], set()).add(items[1])
    # 'with' guarantees the output file is closed; the original opened it
    # and never called close().
    with open(root + 'uniprot_sprot.tab', 'w') as fl:
        for line in data:
            items = line.split('\t')
            if is_ok(items[1]) and items[0] in annots:
                fl.write(line + '\t')
                # join the GO ids with '; ' exactly as the manual loop did
                fl.write('; '.join(annots[items[0]]))
                fl.write('\n')


def get_data():
    """Join CAFA3 target sequences with their GO annotations.

    Loads ``data/cafa3/targets.txt`` (``id<TAB>sequence``), keeping only
    sequences that pass ``is_ok``, then reads ``uniprot-go.tab`` — whose
    second column is the protein id and third column the annotation
    string — and writes ``id<TAB>sequence<TAB>annotations`` lines to
    ``data/cafa3/data.txt``. Target proteins with no annotation are
    printed to stdout so missing coverage is visible.
    """
    seqs = dict()
    with open('data/cafa3/targets.txt', 'r') as f:
        for line in f:
            items = line.strip().split('\t')
            if is_ok(items[1]):
                # targets.txt ids are already plain accessions; no need to
                # split a 'db|acc|name' header as the old CAFA2 path did
                seqs[items[0]] = items[1]

    annots = dict()
    with open('data/cafa3/uniprot-go.tab', 'r') as f:
        for line in f:
            items = line.strip().split('\t')
            # column 1 is the protein id, column 2 the annotation string
            if items[1] in seqs:
                annots[items[1]] = items[2]

    # 'with' closes the output file even if a write fails; the original
    # relied on an explicit close() at the end.
    with open('data/cafa3/data.txt', 'w') as fl:
        for prot_id in seqs:
            if prot_id in annots:
                fl.write(prot_id + '\t' + seqs[prot_id] + '\t' + annots[prot_id])
                fl.write('\n')
            else:
                # surface targets that have no annotation rather than
                # silently dropping them
                print(prot_id)


def main(*args, **kwargs):
    """Entry point: run the currently selected preprocessing step.

    Only one pipeline step is active at a time; the others are kept
    commented out and enabled by hand as needed.
    """
    get_data()
    # cafa3()
    # fasta2tabs()


if __name__ == '__main__':
    main(*sys.argv)
19 changes: 11 additions & 8 deletions get_data.py
Expand Up @@ -13,13 +13,16 @@

FUNCTION = 'bp'
ORG = ''
TT = 'test'
TT = 'train'

args = sys.argv
if len(args) == 4:
print args
TT = args[1]
ORG = '-' + args[2]
if args[2]:
ORG = '-' + args[2]
else:
ORG = ''
FUNCTION = args[3]

FUNC_DICT = {
Expand All @@ -29,7 +32,7 @@

GO_ID = FUNC_DICT[FUNCTION]

DATA_ROOT = 'data/network/'
DATA_ROOT = 'data/cafa3/'
FILENAME = TT + '.txt'

go = get_gene_ontology('go.obo')
Expand Down Expand Up @@ -123,11 +126,11 @@ def main(*args, **kwargs):
'indexes': indexes,
'gos': gos,
'labels': labels}
rep = load_rep()
rep_list = list()
for prot_id in proteins:
rep_list.append(rep[prot_id])
data['rep'] = rep_list
# rep = load_rep()
# rep_list = list()
# for prot_id in proteins:
# rep_list.append(rep[prot_id])
# data['rep'] = rep_list
df = pd.DataFrame(data)
df.to_pickle(DATA_ROOT + TT + ORG + '-' + FUNCTION + '.pkl')

Expand Down
16 changes: 9 additions & 7 deletions get_functions.py
Expand Up @@ -9,19 +9,21 @@
get_anchestors,
BIOLOGICAL_PROCESS,
MOLECULAR_FUNCTION,
CELLULAR_COMPONENT)
from aaindex import AAINDEX
CELLULAR_COMPONENT,
FUNC_DICT)
from multiprocessing import Pool


DATA_ROOT = 'data/cafa2/'
DATA_ROOT = 'data/cafa3/'
ORG = ''
FILENAME = 'train' + ORG + '.txt'
ANNOT_NUM = 10
GO_ID = CELLULAR_COMPONENT
FUNCTION = 'cc' + ORG
ANNOT_NUM = 250
FUNCTION = 'bp'

go = get_gene_ontology('go_cafa2.obo')
GO_ID = FUNC_DICT[FUNCTION]
FUNCTION += ORG

go = get_gene_ontology('go.obo')
# functions = get_go_sets(
# go, [MOLECULAR_FUNCTION, BIOLOGICAL_PROCESS, CELLULAR_COMPONENT])

Expand Down
2 changes: 1 addition & 1 deletion nn_hierarchical_swiss_bp.py
Expand Up @@ -37,7 +37,7 @@
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
sys.setrecursionlimit(100000)

DATA_ROOT = 'data/cafa2/'
DATA_ROOT = 'data/cafa3/'
MAXLEN = 1000
GO_ID = BIOLOGICAL_PROCESS
go = get_gene_ontology('go.obo')
Expand Down
6 changes: 3 additions & 3 deletions nn_hierarchical_swiss_cc.py
Expand Up @@ -37,7 +37,7 @@
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
sys.setrecursionlimit(100000)

DATA_ROOT = 'data/network/'
DATA_ROOT = 'data/cafa3/'
MAXLEN = 1000
GO_ID = CELLULAR_COMPONENT
go = get_gene_ontology('go.obo')
Expand Down Expand Up @@ -133,7 +133,7 @@ def model():
# set parameters:
batch_size = 512
nb_epoch = 100
output_dim = 128
output_dim = 256
nb_classes = len(functions)
start_time = time.time()
logging.info("Loading Data")
Expand Down Expand Up @@ -188,7 +188,7 @@ def model():
model_path = DATA_ROOT + 'hierarchical_cc' + ORG + '.hdf5'
checkpointer = ModelCheckpoint(
filepath=model_path, verbose=1, save_best_only=True)
earlystopper = EarlyStopping(monitor='val_loss', patience=10, verbose=1)
earlystopper = EarlyStopping(monitor='val_loss', patience=20, verbose=1)
logging.info(
'Compilation finished in %d sec' % (time.time() - start_time))
logging.info('Starting training the model')
Expand Down
2 changes: 1 addition & 1 deletion nn_hierarchical_swiss_mf.py
Expand Up @@ -37,7 +37,7 @@
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.INFO)
sys.setrecursionlimit(100000)

DATA_ROOT = 'data/network/'
DATA_ROOT = 'data/cafa3/'
MAXLEN = 1000
GO_ID = MOLECULAR_FUNCTION
go = get_gene_ontology('go.obo')
Expand Down

0 comments on commit aa0f6bb

Please sign in to comment.