In [30]:
import csv
from sys import exit 


def list_syn(label2key, lc, syn_labels):
    """Register the labels of one concept in a label -> concept-key lookup.

    Parameters
    ----------
    label2key : dict
        Lookup that is updated in place; maps a label to the key of the
        concept that owns it.
    lc : hashable
        Key of the concept the labels belong to.
    syn_labels : list of str
        Labels of the concept: the preferred label first (index 0),
        followed by its synonyms.

    Returns
    -------
    dict
        The same ``label2key`` object that was passed in.

    Notes
    -----
    * The preferred label always claims its entry, overwriting whichever
      concept held that label before.
    * A synonym is only stored when the label is still unclaimed, so it
      never displaces an existing entry.
    * Empty labels are never stored.
    """
    for idx, syn in enumerate(syn_labels):
        if syn == '':
            continue
        # Test membership on the dict itself: `label2key.keys()` builds a
        # full list on Python 2, turning every check into a linear scan.
        if idx == 0 or syn not in label2key:
            label2key[syn] = lc

    return label2key


# One lookup per EDAM branch (topic / operation / data / format); each one
# holds keys into the concept dictionary below:
t_label2key = {}
o_label2key = {}
d_label2key = {}
f_label2key = {}

# Concept dictionary: line number -> list of information about the term.
concept = {}

# Path of the EDAM CSV file to read.
edam_file = 'EDAM.csv'

def fill_label2key(edam_file, t_label2key, o_label2key, d_label2key, f_label2key, concept):
    """Read an EDAM CSV file and fill the label lookups and the concept table.

    The first row of the file is skipped as a header. Every following row
    that is not flagged obsolete is classified by its URL as topic,
    operation, data or format (checked in that order, first match wins).
    For a classified row:

    * ``concept[lc]`` is set to ``[url, branch, preferred_label, obsolete]``
      where ``lc`` is the 0-based index of the row after the header;
    * the lower-cased preferred label and synonyms are registered in the
      lookup of that branch through ``list_syn``.

    Rows whose URL matches none of the four branches are skipped silently.
    Blank rows are reported with a printed message and skipped.

    Parameters
    ----------
    edam_file : str
        Path of the CSV file.
    t_label2key, o_label2key, d_label2key, f_label2key : dict
        Label lookups for topics, operations, data and formats.
    concept : dict
        Concept table, updated in place.

    Returns
    -------
    tuple
        ``(t_label2key, o_label2key, d_label2key, f_label2key, concept)``.
    """
    # Column positions inside a CSV row.
    # NOTE(review): the header row is skipped without being read, so these
    # positions are not validated against the file -- confirm them against
    # the export in use.
    url_col, pref_col, obs_col, syn_col = 0, 1, 4, 49

    # Branches in the order they are tested against the URL.
    # NOTE(review): this is a plain substring test, so any URL that merely
    # contains one of these words is classified as that branch -- confirm
    # the file only holds URLs for which that is intended.
    branch_order = ('topic', 'operation', 'data', 'format')
    lookups = {
        'topic': t_label2key,
        'operation': o_label2key,
        'data': d_label2key,
        'format': f_label2key,
    }

    # Read the EDAM CSV file and extract all its relevant data:
    with open(edam_file) as EDAM_fh:
        EDAMcsv = csv.reader(EDAM_fh, delimiter=',', quotechar='"')
        # Skip first line:
        next(EDAMcsv, None)

        for lc, line in enumerate(EDAMcsv):
            # csv.reader returns a blank line as an empty list; catch it
            # before indexing into the row.
            if not line:
                print('Remove newlines please.')
                continue

            # We skip obsolete entries:
            obs = line[obs_col].lower()
            if obs == 'true':
                continue

            # Extract the url and the labels; the preferred label goes
            # first in the list, followed by the synonyms:
            url = line[url_col]
            pref_label = line[pref_col]
            syn_labels = [pref_label.lower()]
            syn_labels.extend(x.lower() for x in line[syn_col].split('|'))

            # Now divide into topic/operation/data/format:
            for branch in branch_order:
                if branch in url:
                    concept[lc] = [url, branch, pref_label, obs]
                    lookups[branch] = list_syn(lookups[branch], lc, syn_labels)
                    break

    return (lookups['topic'], lookups['operation'], lookups['data'],
            lookups['format'], concept)

# Fill the four label lookups and the concept dictionary from the EDAM file.
(t_label2key,
 o_label2key,
 d_label2key,
 f_label2key,
 concept) = fill_label2key(
    edam_file, t_label2key, o_label2key, d_label2key, f_label2key, concept)


In [31]:
# Number of labels per lookup, in the order topic, operation, data, format.
for lookup in (t_label2key, o_label2key, d_label2key, f_label2key):
    print(len(lookup))

370
905
1433
547


In [32]:
# Display every label stored in the topic lookup.
t_label2key.keys()

['protein interactions',
 'laboratory animal science',
 'protein analysis',
 'transmissable disease',
 'eukaryote',
 'drug formulation and delivery',
 'quality assurance',
 'drug discovery',
 'microbial collection',
 'audiovestibular medicine',
 'protein secondary structure',
 'health informatics',
 'metabolomics',
 'molecular dynamics',
 'statistics and probability',
 'database management',
 'cell culture collection',
 'synthetic organic chemistry',
 'functional, regulatory and non-coding rna',
 'healthcare informatics',
 'gene structure',
 'structure prediction',
 'exome sequence analysis',
 'biological chemistry',
 'mrna features',
 'crystallography',
 'pcr experiment',
 'data submission, annotation and curation',
 'organisms',
 'microbiology',
 'protein 3d motifs',
 'dna mutation',
 'adme',
 'physiology',
 'software development',
 'biodiversity',
 'safety sciences',
 'communicable disease',
 'studies',
 'chemical biology',
 'population genetics',
 'reproductive health',
 'microarra

In [33]:
# Lower-cased label lists, one per lookup.
all_topics = [label.lower() for label in t_label2key]
all_operations = [label.lower() for label in o_label2key]
all_data = [label.lower() for label in d_label2key]
all_formats = [label.lower() for label in f_label2key]
# Is the lower-cased label among the topic labels?
'mDIP'.lower() in all_topics

True

In [42]:
# Print the concept entry for the label, if the topic lookup knows it.
tool = 'mDIP'.lower()
try:
    concept_key = t_label2key[tool]
except KeyError:
    # Unknown label: nothing to print.
    pass
else:
    print(concept[concept_key])

['http://edamontology.org/topic_3674', 'topic', 'Methylated DNA immunoprecipitation', 'false']


In [40]:
# Resolve the concept through its label instead of a hard-coded key: the key
# is a row position in the CSV file and changes whenever the file does.
concept[t_label2key['mDIP'.lower()]]

['http://edamontology.org/topic_3674',
 'topic',
 'Methylated DNA immunoprecipitation',
 'false']

In [None]:
# Labels are stored lower-cased in the lookups, so lower-case the query too.
tool = 'Sequence analysis'.lower()

In [36]:
# Labels that occur both among the topic labels and the operation labels.
topic_operation_overlap = list(set(all_topics).intersection(set(all_operations)))
topic_operation_overlap

['text mining',
 'protein secondary structure',
 'structure analysis',
 'dna mapping',
 'mapping',
 'structure prediction',
 'exome sequence analysis',
 'protein analysis',
 'sequence analysis',
 'nucleic acid analysis',
 'gene expression analysis',
 'sequence assembly',
 'protein structure analysis',
 'nucleic acid structure analysis',
 'data handling']

In [37]:
# Is the lower-cased label among the topic labels?
'K-mer counting'.lower() in all_topics

False

In [38]:
# Is the lower-cased label among the operation labels?
'K-mer counting'.lower() in all_operations

True

In [43]:
# Display the complete list of data labels.
all_data

['protein id (embl/genbank/ddbj)',
 'arrayexpress accession number',
 'data index',
 'cell type name',
 'sequence alignment',
 'atom identifier',
 'sequence report',
 'enzyme commission number',
 'restriction digest',
 'feature table (nucleic acid)',
 'cath node id',
 'gramene primary identifier',
 'pathway id (panther)',
 'feature table (protein)',
 'dbd id',
 'kegg compound id',
 'strain name',
 'dna sequence',
 'emboss usa',
 'upi',
 'protein structures',
 'uniparc accession',
 'tool identifier',
 'ensembl id (protein)',
 'cgd id',
 'pathway id (kegg)',
 'ensembl ids',
 'amino acid index',
 'vectorbase id',
 'protein sequence (raw)',
 'peptide immunogenicity data',
 'protein structure report',
 'sequence identifier',
 'peptide molecular weight hits',
 'reference sample report',
 'protein complex',
 'emage id',
 'polypeptide chain id',
 'clone or est (report)',
 'disease report',
 'nucleotide substitution matrix',
 'orf name',
 'unii',
 'directory name',
 'database entry metadata',
 