## Extracting taxonomic groups from NCBITaxon

In [None]:
# Index the tab-separated records of categories.dmp: bucket taxon IRIs by
# their category code (column 1) and record, for every taxon in column 3,
# the IRI built from column 2.
def _taxon_iri(taxid):
    # Wrap a raw taxid string in the bracketed NCBITaxon PURL form.
    return '<http://purl.obolibrary.org/obo/NCBITaxon_' + taxid + '>'


bacteria, virus, fungi = set(), set(), set()
species_dict = {}
# Category code -> set that collects the taxa carrying that code.
# NOTE(review): in NCBI's categories.dmp the 'E' code presumably covers all
# of Eukaryota, so `fungi` may hold more than fungi — confirm.
_category_sets = {'B': bacteria, 'V': virus, 'E': fungi}
with open('categories.dmp', 'r') as dump:
    for record in dump:
        fields = record.strip().split('\t')
        taxon_iri = _taxon_iri(fields[2])
        # presumably column 2 is the species-level taxid of the taxon in
        # column 3 — confirm against the taxdump documentation
        species_dict[taxon_iri] = _taxon_iri(fields[1])
        bucket = _category_sets.get(fields[0])
        if bucket is not None:  # other category codes are only indexed
            bucket.add(taxon_iri)
print(len(bacteria), len(virus), len(fungi), len(species_dict))

## Generating a dict of pathogen phenotypes from PathoPhenoDB

In [None]:
# Parse the PathoPhenoDB dump into four lookup tables. Each line is split on
# whitespace and its first three tokens are read as subject, predicate and
# object of one triple.
patho_input = 'patho_pheno.nt'

dis_pato = {}   # 'dis_pato' association node -> subject linked to it via SIO_000255
dis_pheno = {}  # subject -> set of 'dis_pheno' association nodes (SIO_000255)
patho_dis = {}  # object of RO_0002556 -> set of 'dis_pato' association nodes
pheno_dis = {}  # 'dis_pheno' association node -> object of RO_0002200
with open(patho_input, 'r') as f:
    for line in f:
        items = line.strip().split()
        if len(items) < 3:
            # Blank or truncated line: there is no full triple to index.
            continue
        subj, pred, obj = items[0], items[1], items[2]
        if 'SIO_000255' in pred:
            if 'dis_pato' in obj:
                # The table is keyed by the association node (the object), so
                # the duplicate guard must test that key; testing the subject
                # could never detect a repeated association node.
                if obj in dis_pato:
                    print('wrong!')
                dis_pato[obj] = subj
            if 'dis_pheno' in obj:
                dis_pheno.setdefault(subj, set()).add(obj)
        if 'RO_0002556' in pred:
            if 'dis_pato' in subj:
                patho_dis.setdefault(obj, set()).add(subj)
        if 'RO_0002200' in pred:
            if 'dis_pheno' in subj:
                # Each phenotype association node is expected to appear once.
                if subj in pheno_dis:
                    print('wrong!')
                pheno_dis[subj] = obj
print(len(dis_pato), len(dis_pheno), len(patho_dis), len(pheno_dis))

In [None]:
# Join the parsed tables: for every key of patho_dis, follow each of its
# association nodes through dis_pato, then through dis_pheno (when present),
# and collect the pheno_dis value of every phenotype association reached.
pheno_dict = {}
for pathogen, associations in patho_dis.items():
    collected = pheno_dict.setdefault(pathogen, set())
    for association in associations:
        linked_disease = dis_pato[association]
        # Diseases without any phenotype association contribute nothing.
        collected.update(
            pheno_dis[pheno_node]
            for pheno_node in dis_pheno.get(linked_disease, ())
        )

## Saving the dict into a file for OPA2Vec

In [None]:
# Write one "<subject> <phenotype>" line per association. Each pathogen's
# phenotypes are written under the pathogen itself and, when species_dict has
# an entry for it, repeated under the mapped taxon.
with open('patho_pheno_asso.txt', 'w') as out:
    for pathogen, phenotypes in pheno_dict.items():
        subjects = [pathogen]
        if pathogen in species_dict:  # has species level
            subjects.append(species_dict[pathogen])
        for subject in subjects:
            out.writelines(
                '%s %s\n' % (subject, phenotype) for phenotype in phenotypes
            )