## Extracting Entrez IDs from HPO and MGI annotations for mapping to Uniprot IDs

In [None]:
# Gather the gene IDs referenced by the two phenotype sources into a single
# text file, one ID per line (duplicates are kept as-is).
#
# The output handle is managed by `with` so it is flushed and closed even if
# reading one of the input files raises part-way through.
with open('phenotype_entrez.txt', 'w') as output:
    with open('HMD_HumanPhenotype.rpt', 'r') as f:
        for line in f:
            items = line.strip().split('\t')
            # Second tab-separated column -- presumably the human Entrez
            # gene ID (TODO confirm against the report's column layout).
            output.write(items[1].strip() + '\n')
    with open('ALL_SOURCES_ALL_FREQUENCIES_genes_to_phenotype.txt', 'r') as f:
        next(f)  # the first line is skipped (presumably a header row)
        for line in f:
            items = line.strip().split('\t')
            # First tab-separated column -- presumably the Entrez gene ID.
            output.write(items[0] + '\n')
# Then convert entrez to uniprot online

In [None]:
# Build `entrez2uniprot`: a dict from each ID found in the first column of
# the mapping file to the set of second-column values listed for it.  Only
# rows whose fourth column equals "reviewed" are kept.
entrez2uniprot = {}
with open('phenotype_entrez2uniprot.txt', 'r') as mapping_file:
    next(mapping_file)  # the first line is skipped (presumably a header row)
    for row in mapping_file:
        fields = row.strip().split('\t')
        if fields[3] != "reviewed":
            continue
        accession = fields[1]
        # The first column may hold several comma-separated IDs.  split(',')
        # returns a one-element list when there is no comma, so single-ID
        # and multi-ID rows share the same code path.
        for entrez in fields[0].split(','):
            entrez2uniprot.setdefault(entrez, set()).add(accession)
print(len(entrez2uniprot))

## Mapping mouse gene phenotypes to their human orthologs

In [None]:
# Paths of the two tab-separated reports read by the cells below.
mouse_input = 'MGI_GenePheno.rpt'
human_input = 'HMD_HumanPhenotype.rpt'

# Accumulators filled in by later cells:
#   mouse_dict -- seventh-column value of `mouse_input` -> set of its
#                 fifth-column values
#   human_dict -- protein ID (a value taken from entrez2uniprot) -> set of
#                 phenotype term IDs
mouse_dict = dict()
human_dict = dict()

In [None]:
# Group the mouse report by gene: for every row, add the fifth-column value
# to the set stored under the seventh-column value in `mouse_dict`.
# NOTE(review): presumably column 5 is the MP phenotype ID and column 7 the
# MGI marker accession -- confirm against the report's column layout.
with open(mouse_input, 'r') as report:
    for record in report:
        columns = record.strip().split('\t')
        pheno_id, gene_id = columns[4], columns[6]
        mouse_dict.setdefault(gene_id, set()).add(pheno_id)
print(len(mouse_dict))

In [None]:
# Transfer mouse phenotypes onto human proteins.
#
# Each row of `human_input` pairs a human gene (second column, looked up in
# entrez2uniprot) with a mouse gene (sixth column, looked up in mouse_dict).
# For every protein mapped to the human gene, the mouse gene's phenotype set
# is merged into human_dict[protein], and the protein is recorded in
# `mouse_genes`.
#
# `mouse_genes` is updated whenever mouse phenotypes are merged, not only
# when the protein is first inserted into `human_dict`.  Tying it to first
# insertion left `mouse_genes` empty whenever this cell was re-run, or run
# after `human_dict` had already been populated by another cell.
mouse_genes = set()
with open(human_input, 'r') as f:
    for line in f:
        items = line.strip().split('\t')
        mouse_gene = items[5].strip()
        entrez = items[1].strip()
        # Rows that cannot be mapped on either side contribute nothing.
        if entrez not in entrez2uniprot or mouse_gene not in mouse_dict:
            continue
        mouse_phenos = mouse_dict[mouse_gene]
        for human_gene in entrez2uniprot[entrez]:
            # setdefault creates a fresh set per protein, so the in-place
            # update never aliases a set owned by mouse_dict.
            human_dict.setdefault(human_gene, set()).update(mouse_phenos)
            mouse_genes.add(human_gene)
print(len(human_dict))

## Getting HPO annotations of human proteins

In [None]:
# Attach the gene-to-phenotype table's terms to human proteins.
#
# For each data row whose first column is a key of entrez2uniprot, the
# fourth-column value is added to human_dict[protein] for every protein
# mapped to that key; those proteins are also collected in `human_genes`.
# NOTE(review): presumably column 1 is the Entrez gene ID and column 4 the
# HPO term ID -- confirm against the table's header.
human_genes = set()
with open('ALL_SOURCES_ALL_FREQUENCIES_genes_to_phenotype.txt', 'r') as hpo_file:
    next(hpo_file)  # the first line is skipped (presumably a header row)
    for row in hpo_file:
        fields = row.strip().split('\t')
        gene_id = fields[0]
        if gene_id in entrez2uniprot:
            for protein in entrez2uniprot[gene_id]:
                human_genes.add(protein)
                term = fields[3]
                human_dict.setdefault(protein, set()).add(term)
print(len(human_genes))

## Getting GO annotations of human proteins

In [None]:
# Read GO annotations from the GAF file.
#
# Lines starting with '!' are skipped, as are rows whose seventh column is
# 'IEA'.  For the remaining rows, the part of the fifth column after the
# first ':' is turned into an OBO PURL and added to go_dict[<second column>].
# `go_genes` collects the same second-column values that key `go_dict`.
go_dict = {}
go_genes = set()
with open('goa_human.gaf', 'r') as gaf:
    for record in gaf:
        if record.startswith('!'):
            continue
        cols = record.strip().split('\t')
        if cols[6] == 'IEA':
            continue
        protein = cols[1]
        go_number = cols[4].split(':')[1]
        go_iri = "<http://purl.obolibrary.org/obo/GO_" + go_number + '>'
        go_dict.setdefault(protein, set()).add(go_iri)
        go_genes.add(protein)
print(len(go_dict), len(go_genes))

In [None]:
# Proteins present in all three sets: mouse_genes, human_genes and go_genes.
intersect = mouse_genes.intersection(human_genes, go_genes)
print(len(intersect))
# Phenotype IDs to leave out of the export.
# NOTE(review): MP:0002169 is presumably "no abnormal phenotype detected"
# -- confirm against the MP ontology.
removal = {'MP:0002169'}

In [None]:
# Write the association file: one "<protein> <IRI>" pair per line, limited
# to proteins in `intersect`.  Phenotype associations from human_dict come
# first, followed by the GO associations from go_dict.
output_file = 'human_pheno_asso_HPiMPiGO.txt'
with open(output_file, 'w') as out:
    for protein, terms in human_dict.items():
        if protein in intersect:
            for term in terms:
                if term not in removal:
                    # "PREFIX:NUMBER" becomes an OBO PURL "PREFIX_NUMBER".
                    pieces = term.split(':')
                    iri = "<http://purl.obolibrary.org/obo/" + pieces[0] + '_' + pieces[1] + '>'
                    out.write('%s %s\n' % (protein, iri))
    for protein, go_iris in go_dict.items():
        if protein in intersect:
            # The values of go_dict are already formatted as IRIs.
            out.writelines('%s %s\n' % (protein, go_iri) for go_iri in go_iris)