###This is the README for all information downloadable from orthoDB
Ortho DB v9 data dump consists of 9 files:

odb9_levels.tab.gz:      NCBI taxonomy nodes where Ortho DB orthologous groups (OGs) are calculated
odb9_species.tab.gz:     NCBI taxonomy nodes (mostly species level) participating in OGs
odb9_genes.tab.gz:       genes with some info
odb9_OGs.tab.gz:         Ortho DB orthologous groups
odb9_OG2genes.tab.gz:    OGs to genes correspondence
odb9v1_OG_xrefs.tab.gz:  OGs association with some external references, e.g. GO, COG and InterPro
odb_v8_v9_1.tab:         mappings between v8 and v9.1 orthologous groups

odb9_fasta_<root>.tgz:   tar-ball with one fasta file per taxon id in the given root (bacteria,metazoa,fungi,plants)


The non-fasta files are in tab-separated format without column headers.

The fasta files have headers with orthodb internal gene id as well as a public id.

-----------------------------------------------------------------

odb9_levels.tab:
1.	NCBI tax_id
2.	scientific name
3.	total non-redundant count of genes in all underneath clustered species
4.	total count of OGs built on it
5.	total non-redundant count of species underneath

odb9_species.tab
1.	NCBI tax_id
2.	scientific name
3.	total count of clustered genes in this species
4.	total count of OGs it participates in
5.	mapping type, clustered(C) or mapped(M)

odb9_genes.tab
1.	Ortho DB unique gene id (not stable between releases)
2.	organism tax id
3.	protein original sequence id, as downloaded together with the sequence
4.	Uniprot id, evaluated by mapping
5.	ENSEMBL gene name, evaluated by mapping
6.	NCBI gid, evaluated by mapping
7.	description, evaluated by mapping

odb9_OGs.tab
1.	OG unique id (not stable between releases)
2.	level tax_id on which the cluster was built
3.	OG name (the group's most common gene name)

odb9_OG2genes.tab
1.	OG unique id
2.	Ortho DB unique gene id

odb9v1_OG_xrefs.tab
1.	OG unique id
2.	external DB or DB section
3.	external identifier
4.	number of genes in the OG associated with the identifier

odb_v8_v9_1.tab
1.  level tax_id on which both v8 and v9.1 OGs were built
2.  v8   OG id
3.  v9.1 OG id
4.  distance between the two OGs (0.0 - identical, 1.0 - totally unrelated)



In [None]:
## This code takes a Uniprot ID as its query and returns the matching odb ID

#!/usr/bin/env python3

import sys

# UniProt ID to look up, taken from the first command-line argument.
query = sys.argv[1]
ID_dict = dict()

# Build a UniProt ID -> Ortho DB gene ID lookup from the genes table
# (column 1 is the Ortho DB gene id, column 4 the Uniprot id).
with open('odb9v1_genes.tab', 'r') as file_obj:
    for line in file_obj:
        # The table is tab-separated, so split on tabs only: splitting on
        # arbitrary whitespace would shift the columns if any earlier field
        # were empty or contained a space.
        line_list = line.rstrip('\r\n').split('\t')
        if len(line_list) < 4 or not line_list[3]:
            # Short row or no Uniprot id in column 4: nothing to index.
            continue
        ID_dict[line_list[3]] = line_list[0]

# Report an unknown ID with a readable message instead of a KeyError traceback.
if query not in ID_dict:
    sys.exit('Uniprot ID not found in odb9v1_genes.tab: ' + query)
print(ID_dict[query])
## input of O14802    
## returns 9606:00232f

In [None]:
## This code will return an OG for a provided odb ID
# Ortho DB gene id (column 2) -> OG id (column 1). A gene id that appears on
# several rows keeps only the OG from the last row read.
OG_dict = {}

with open('odb9v1_OG2genes.tab', 'r') as og2genes_file:
    for row in og2genes_file:
        columns = row.split()
        OG_dict[columns[1]] = columns[0]
    
## input of 9606:00232f
## returns EOG093700ID
    

In [None]:
## This code will return the taxonomy ID for a provided OG

# OG id (column 1) -> tax_id of the level the OG was built on (column 2).
tax_dict = {}

with open('odb9v1_OGs.tab', 'r') as ogs_file:
    for row in ogs_file:
        columns = row.split()
        tax_dict[columns[0]] = columns[1]
        
## returns 2759

In [None]:
#!/usr/bin/env python3

import sys

# Path of the query file (one UniProt ID per line), from the command line.
query_file_name = sys.argv[1]

query_list = list()

# Lookup tables filled in by the later sections of this script.
ID_to_odb = dict()            # UniProt ID -> {'odb': gene id, 'tax': tax id}
odb_to_OGset = dict()         # odb gene id -> set of OG ids
OG_to_odbSet = dict()         # OG id -> set of odb gene ids
tax_dict = dict()             # initialised once here (was initialised twice)
uniprot_to_homologs = dict()  # UniProt ID -> set of homologous odb gene ids

# the query file is read in and a list of UniProt IDs is generated;
# blank lines are skipped so that they do not become empty-string queries

with open(query_file_name, 'r') as file_obj:
    for line in file_obj:
        line = line.strip()
        if line:
            query_list.append(line)

# the genes.tab file is parsed into a two-level dictionary where the odb ID and tax ID are saved as the value for a uniprot ID key
            
with open('odb9v1_genes.tab', 'r') as file_obj:
    for line in file_obj:
        # The table is tab-separated, so split on tabs only: splitting on
        # arbitrary whitespace would shift the columns if any earlier field
        # were empty or contained a space.
        line_list = line.rstrip('\r\n').split('\t')
        if len(line_list) < 4 or not line_list[3]:
            # Short row or no Uniprot id in column 4: nothing to index.
            continue
        # Column 1 is the odb gene id, column 2 the tax id, column 4 the Uniprot id.
        ID_to_odb[line_list[3]] = {'odb': line_list[0], 'tax': line_list[1]}

# Sanity check on a known human entry.
print('output of ID_to_odb:',ID_to_odb['O14802']) 
# this returns: output of ID_to_odb: {'odb': '9606:00232f', 'tax': '9606'}


# the OG2genes.tab file is parsed into a dictionary where a key of odb ID will return the set of OGs it belongs to

        
# Collect, for every odb gene id (column 2), the set of OG ids (column 1)
# it is listed under. The sets are built directly, so no list-to-set
# conversion pass is needed afterwards.
with open('odb9v1_OG2genes.tab', 'r') as og2genes_file:
    for row in og2genes_file:
        columns = row.split()
        gene_id = columns[1]
        og_id = columns[0]
        odb_to_OGset.setdefault(gene_id, set()).add(og_id)

# Sanity check on a known human gene.
print('output of odb_to_OGset:',odb_to_OGset['9606:00232f'])
# this returns: output of odb_to_OGset: {'EOG093700ID', 'EOG090B00IL', 'EOG090M00IS', 'EOG090700I0', 'EOG090O017L', 'EOG090A00I5', 'EOG090600QN', 'EOG0903023G', 'EOG091G0FNO'}

# odb_to_OGset is inverted such that a key of an OG will return the set of odb IDs that are in that group
        
# Invert the gene -> OGs mapping: every OG id gets the set of odb gene ids
# that list it.
for gene_id, og_ids in odb_to_OGset.items():
    for og_id in og_ids:
        OG_to_odbSet.setdefault(og_id, set()).add(gene_id)

# Sanity check on a known OG.
print('output of OG_to_odb_list is:',OG_to_odbSet['EOG0903023G'])
# this returns: output of OG_to_odb_list is: {'379532:0044ce', '9986:002416', '9601:000893', '34839:003337', '10020:0000a2', '1026970:004272', '37293:003aeb', '10042:004777', '9544:00113a', '29078:0013f4', '43179:0021f6', '54131:0007ca', '9598:00074b', '9568:00165b', '9478:0017aa', '60711:00233d', '30608:001525', '9595:004e3a', '30611:0020f0', '9483:000337', '10090:001347', '9978:00069f', '51337:002648', '37347:0033ce', '9555:003d10', '885580:004a23', '9531:0044ee', '10141:003b88', '9545:00068d', '9597:00030d', '9402:0024ed', '246437:0024f7', '10036:0018bc', '482537:001036', '10160:004262', '9606:00232f', '10116:001418', '61853:002648', '61622:00181b'} 

# a list of OGs associated with an odb will be collected, the list of odbs associated with those OGs are added to a set
# so that a list of odbs is returned for a provided odb

# For each queried UniProt ID, gather every odb gene id that shares at least
# one OG with the query's own gene. A query ID missing from ID_to_odb, or a
# gene missing from odb_to_OGset, raises KeyError.
for uniprot_id in query_list:
    # The gene record is keyed by the string 'odb'. The bare name `odb` was
    # undefined at this point and raised NameError on the first query.
    odb = ID_to_odb[uniprot_id]['odb']

    homologs = set()
    for OG in odb_to_OGset[odb]:
        # Union in all members of this OG (the query gene itself included).
        homologs.update(OG_to_odbSet[OG])

    uniprot_to_homologs[uniprot_id] = homologs

print(uniprot_to_homologs)
    
# one line is written per query: the UniProt ID, a tab, then a bracketed,
# comma-separated list of tax IDs, where each tax ID is the part of a
# homolog's odb ID before the ':'

with open('output.tab', 'w') as outputFile:
    for uniprot_ID, mapped_odbs in uniprot_to_homologs.items():
        # Each tax id is followed by a comma; trailing commas are stripped
        # before the list is bracketed.
        tax_fields = ''.join(
            gene_id.split(':')[0] + ',' for gene_id in mapped_odbs
        )
        tax_fields = tax_fields.rstrip(',')

        outputFile.write(uniprot_ID + '\t' + '[' + tax_fields + ']' + '\n')
            