In [64]:
from ddot import Ontology
import ddot
ont2 = Ontology.from_table('/home/hermuba/data0118/network1122/candidate_ontologies')

In [4]:
from ddot.Ontology import parse_obo
help(parse_obo)


Help on function parse_obo in module ddot.Ontology:

parse_obo(obo, output_file=None, id2name_file=None, id2namespace_file=None, alt_id_file=None)
    Parses an OBO file and writes the results into several tables.
    
    Parameters
    ----------
    obo : str
    
        Filename of OBO file
    
    output_file : str
    
        Filename to write table that describes the ontology's
        hierarchical structure. The table has four columns: (1) parent
        term, (2) child term, (3) relation type (e.g. "is_a" or
        "part_of"), (4) namespace of relation
        (e.g. "biological_process" or "cellular component")
    
    id2name_file : str
    
        Filename to write table of term descriptions.  The table has
        two columns: (1) Ontology term (e.g. "GO:0000030"), (2)
        description (e.g. "mannosyltransferase activity")
    
    id2namespace_file : str
    
        Filename to write table of term namespaces.  The table has two
        columns: (1) Ontology term 

In [11]:
import io
import pandas as pd
def parse_obo(obo,
              output_file=None,
              id2name_file=None,
              id2namespace_file=None,
              alt_id_file=None):
    """Parses an OBO file and writes the results into several tables.

    Parameters
    ----------
    obo : str

        Filename of OBO file

    output_file : str

        Filename to write table that describes the ontology's
        hierarchical structure. The table has four columns: (1) parent
        term, (2) child term, (3) relation type (e.g. "is_a" or
        "part_of"), (4) namespace of relation
        (e.g. "biological_process" or "cellular component")

    id2name_file : str

        Filename to write table of term descriptions.  The table has
        two columns: (1) Ontology term (e.g. "GO:0000030"), (2)
        description (e.g. "mannosyltransferase activity")

    id2namespace_file : str
    
        Filename to write table of term namespaces.  The table has two
        columns: (1) Ontology term (e.g. "GO:0000030"), (2) namespace
        of the term (e.g. "biological_process")

    alt_id_file : str
    
        Filename to write table of alternative Term IDs that are
        synonyms and refer to the same term. The table has two
        columns: (1) Primary Term ID, (2) Alternative Term ID

    """

    ## Keywords that screw up parsing:
    # import, is_anonymous, intersection_of, union_of

    ## Relations
    # 'is_a:'
    # 'relationship: has_part'  # Not in filtered GO
    # 'relationship: occurs_in' # Not in filtered GO
    # 'relationship: part_of'   
    # 'relationship: positively_regulates' 
    # 'relationship: negatively_regulates'
    # 'relationship: regulates'
    # 'relationship: results_in' # Not in filtered GO

    stanza, edges = [], []
    id2name = dict()
    id2namespace = dict()
    alt_id = dict()
    in_term_stanza = False
    default_namespace_exists = False
    for line in io.open(obo).read().splitlines():

        line = line.split('!')[0].strip()  # Remove comments

        if len(line)>0 and line[0]=='[' and line[-1]==']':
            # Add last stanza if it was a term stanza.  Include namespace.
            if in_term_stanza:
                edges.extend(x+(namespace, ) for x in stanza)

            # Start new term stanza
            stanza = []
            
            # Set the default namespace, if it exists
            if default_namespace_exists:
                namespace = default_namespace
            
            # In a term stanzo or not
            in_term_stanza = line =='[Term]'

            name = None
                
        #if 'alt_id:' in line: assert False

        if 'id:' == line[:3]:
            curr_term = line.split('id:')[1].strip()
        elif 'alt_id:' in line:
            alt_term = line.split('alt_id:')[1].strip()
            if curr_term in alt_id:  alt_id[curr_term].append(alt_term)
            else:                          alt_id[curr_term] = [alt_term]
            id2name[alt_term] = name
        elif 'name:' in line:
            name = line.split('name:')[1].strip()
            #assert not curr_term in id2name
            id2name[curr_term] = name
        elif 'is_a:' in line:
            parent = line.split('is_a:')[1].strip()
            stanza.append((parent, curr_term, 'is_a'))
        elif 'relationship:' in line:
            line = line.split('relationship:')[1].strip().split()
            if len(line)!=2: print(line)
            assert len(line)==2
            relation, parent = line
            stanza.append((parent, curr_term, relation))
        elif 'namespace:' == line[:10]:
            namespace = line.split('namespace:')[1].strip()
            assert not curr_term in id2namespace
            id2namespace[curr_term] = namespace
        elif 'default-namespace:' == line[:18]:
            namespace = line.split('default-namespace:')[1].strip()
            default_namespace_exists = True
            default_namespace = namespace

    pd.DataFrame(edges).to_csv(output_file, header=False, index=False, sep='\t')
    pd.Series(id2name).to_csv(id2name_file, sep='\t')
    pd.Series(id2namespace).to_csv(id2namespace_file, sep='\t')
    pd.Series(dict([(a, c) for a, b in alt_id.items() for c in b])).to_csv(alt_id_file, sep='\t')

In [12]:
parse_obo('/home/hermuba/data0118/ontologies/aro.obo', '/home/hermuba/data0118/ontologies/aro_output')



In [62]:
aro_ontology = Ontology.from_table('/home/hermuba/data0118/ontologies/aro_output')

In [63]:
aro_ontology

0 genes, 4094 terms, 0 gene-term relations, 7810 term-term relations
node_attributes: []
edge_attributes: ['is_a', 'antibiotic_resistance']

In [58]:
aro_hier = aro_ontology.to_table()

In [15]:
import pandas as pd
gold_anno = pd.read_pickle('/home/hermuba/data0118/goldstandard/ec_rmplasmid_node_anno_df')
new_res_genes = gold_anno.loc[ont2.genes] 

In [28]:
new_res_genes.head()

Unnamed: 0,cog_ID,cog_category,cluster,count,core,loose_best_ARO,loose_ARO,strict_best_ARO,strict_ARO,is_card,resfam,aclame_title,is_aclame,nr,hypo_nr,drug_target,is_drug_target,GO,pathway,domain
562.10576.con.0008_20|562.10576,COG2271,G,Cluster 4469,7.0,False,,,,,False,,protein:plasmid:25502 Length: 467 # NCBI annot...,True,WP_000053689.1 MFS transporter [Escherichia coli],,,False,"{GO:0016021, GO:0055085, GO:0022857}",,"{IPR020846, IPR036259, IPR011701, IPR000849}"
562.10576.con.0029_46|562.10576,COG2194,R,Cluster 2710,4.0,False,,,MCR-1,ARO:3003689,True,,protein:plasmid:113751 Length: 580 # NCBI anno...,True,WP_049589868.1 MULTISPECIES: phosphoethanolami...,,,False,"{GO:0003824, GO:0016021, GO:0008484}",,"{IPR012549, IPR017850, IPR000917}"
562.10576.con.0045_7|562.10576,COG3173,R,Cluster 7872,2.0,False,,,APH(4)-Ia,ARO:3002655,True,"{APH3, APH3'}",,False,AEG42736.1 HygR [Cloning vector pPLV03],,,False,,,"{IPR002575, IPR011009}"
562.10576.con.0045_9|562.10576,COG2746,V,Cluster 11944,2.0,False,,,AAC(3)-IV,ARO:3002539,True,{AAC3},,False,"6MN3_A Chain A, Crystal structure of aminoglyc...",,,False,"{GO:0046353, GO:0046677}",,"{IPR003679, IPR028345}"
562.10576.con.0047_4|562.10576,COG2814,G,Cluster 5537,6.0,False,,,floR,ARO:3002705,True,"{Chlor_Efflux_Pump, emrB, MFS_efflux, TetA, Te...",protein:plasmid:116340 Length: 404 # NCBI anno...,True,WP_000214122.1 MULTISPECIES: chloramphenicol/f...,,,False,"{GO:0006855, GO:0016021, GO:0015238, GO:0055085}",,"{IPR020846, IPR004812, IPR036259, IPR011701}"


In [26]:
def gene_term_table(new_res_genes, term):
    term_table = pd.DataFrame()
    term_index = 0
    
        

In [36]:
aro = new_res_genes['strict_ARO'].dropna().append(new_res_genes['loose_ARO'].dropna())
aro_list = []

    

In [44]:
# generate gene_term list: list of types
for gene in aro.index:
    aro_many = aro[gene].split(', ')
    for aro_term in aro_many:
        aro_list.append((gene, aro_term))

In [45]:
aro_list

[('562.10576.con.0029_46|562.10576', 'ARO:3003689'),
 ('562.10576.con.0045_7|562.10576', 'ARO:3002655'),
 ('562.10576.con.0045_9|562.10576', 'ARO:3002539'),
 ('562.10576.con.0047_4|562.10576', 'ARO:3002705'),
 ('562.22437.con.0041_3|562.22437', 'ARO:3002858'),
 ('562.22437.con.0041_7|562.22437', 'ARO:3003836'),
 ('562.22437.con.0041_9|562.22437', 'ARO:3000413'),
 ('562.22471.con.0130_3|562.22471', 'ARO:3000168'),
 ('562.22520.con.0181_2|562.22520', 'ARO:3002863'),
 ('562.22521.con.0055_2|562.22521', 'ARO:3002781'),
 ('562.22521.con.0055_2|562.22521', 'ARO:3002770'),
 ('562.22521.con.0055_2|562.22521', 'ARO:3002758'),
 ('562.22521.con.0055_2|562.22521', 'ARO:3002761'),
 ('562.22521.con.0055_2|562.22521', 'ARO:3002746'),
 ('562.22521.con.0055_2|562.22521', 'ARO:3002727'),
 ('562.22521.con.0055_2|562.22521', 'ARO:3002719'),
 ('562.22521.con.0055_2|562.22521', 'ARO:3002771'),
 ('562.22521.con.0055_2|562.22521', 'ARO:3002716'),
 ('562.22521.con.0055_2|562.22521', 'ARO:3002752'),
 ('562.2252

In [59]:
ont = Ontology(aro_hier, aro_list) # mapping genes to aro_ontology

In [69]:
ont # card ontology with genes inside

334 genes, 1517 terms, 9539 gene-term relations, 3 term-term relations
node_attributes: []
edge_attributes: []

In [75]:
ont.to_table('/home/hermuba/data0118/ontologies/ARO_with_candidates')

Unnamed: 0,Parent,Child,EdgeType
0,a,P,Child-Parent
1,d,E,Child-Parent
2,h,C,Child-Parent
3,ARO:3003689,562.10576.con.0029_46|562.10576,Gene-Term
4,ARO:3002655,562.10576.con.0045_7|562.10576,Gene-Term
5,ARO:3002539,562.10576.con.0045_9|562.10576,Gene-Term
6,ARO:3002705,562.10576.con.0047_4|562.10576,Gene-Term
7,ARO:3001329,562.22429.con.0059_8|562.22429,Gene-Term
8,ARO:3001328,562.22429.con.0059_8|562.22429,Gene-Term
9,ARO:3000174,562.22429.con.0059_8|562.22429,Gene-Term


In [66]:
# align CliXO to ARO
align_aro = ont2.align(ont)

collapse command: /home/hermuba/miniconda3/envs/ddot/lib/python3.6/site-packages/ddot/alignOntology/collapseRedundantNodes /tmp/tmp6hfc1c5k
collapse command: /home/hermuba/miniconda3/envs/ddot/lib/python3.6/site-packages/ddot/alignOntology/collapseRedundantNodes /tmp/tmpzru14ell
Alignment command: /home/hermuba/miniconda3/envs/ddot/lib/python3.6/site-packages/ddot/alignOntology/calculateFDRs /tmp/tmp7iox9q3k /tmp/tmpwye2qjlx 0.05 criss_cross /tmp/tmp95s08hm8 100 40 gene


In [74]:
align_aro

Unnamed: 0_level_0,Term,Similarity,FDR
Term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
S:630,ARO:3004126,0.875000,0.0
S:621,ARO:3002986,0.875000,0.0
S:592,ARO:3000676,0.875000,0.0
S:590,ARO:3002576,0.875000,0.0
S:589,ARO:3002660,0.875000,0.0
S:584,ARO:3003577,0.875000,0.0
S:582,ARO:3002804,0.875000,0.0
S:772,ARO:3004139,0.676694,0.0
S:791,ARO:3002704,0.648008,0.0
S:755,ARO:3000796,0.643380,0.0


In [68]:
align_aro.to_csv('/home/hermuba/data0118/network1122/ARO_ontology_align.csv')