This notebook uses a Gene Ontology (GO) OBO parser to parse the HPO OBO ontology, and then works out the changes made between two versions of HPO.

In [None]:
from __future__ import with_statement
from collections import defaultdict
import json
import difflib
__author__    = "Uli Koehler"
__copyright__ = "Copyright 2013 Uli Koehler"
__license__   = "Apache v2.0"                                       #this was the original GO parser

In [None]:
def processGOTerm(goTerm):
    """
    Flatten the single-valued fields of a parsed GO term.

    The input (normally a defaultdict of lists) is copied into a plain dict.
    Any value whose length is exactly 1 is replaced by its only element;
    every other value is kept unchanged.
    Returns the new dictionary; the input object itself is not modified.
    """
    flattened = {}
    for tag, entries in dict(goTerm).items():
        # Unwrap one-element containers, keep everything else as it is
        flattened[tag] = entries[0] if len(entries) == 1 else entries
    return flattened

In [None]:
def parseGOOBO(filename):
    """
    Parses a Gene Ontology dump in OBO v1.2 format.
    Yields each [Term] stanza as a dictionary (see processGOTerm): tag names
    map to their value, or to a list of values when a tag occurs repeatedly.
    Header lines before the first stanza and the contents of non-[Term]
    stanzas (e.g. [Typedef]) are skipped. A [Term] stanza without any tag
    lines is not yielded.
    Keyword arguments:
        filename: The filename to read
    """
    with open(filename, "r", encoding='utf-8') as infile:
        currentGOTerm = None
        for line in infile:
            line = line.strip()
            if not line:
                continue #Skip empty
            if line.startswith("[") and line.endswith("]"):
                #A stanza header closes whatever term was being collected.
                #The pending term must be yielded here for every header, not
                #only for [Term], otherwise the term directly before a
                #[Typedef] section is silently lost.
                if currentGOTerm:
                    yield processGOTerm(currentGOTerm)
                #Only [Term] stanzas are collected; anything else is skipped
                currentGOTerm = defaultdict(list) if line == "[Term]" else None
            elif currentGOTerm is not None:
                #Tag line inside a [Term] stanza: "key: value"
                key, _, val = line.partition(":")
                currentGOTerm[key].append(val.strip())
        #Add last term (if the file ended inside a non-empty [Term] stanza)
        if currentGOTerm:
            yield processGOTerm(currentGOTerm)

In [None]:
def getTerm(searchTerm, HPO):  #look up an id in the parsed ontology HPO
    """
    Return the first term in HPO whose 'id' equals searchTerm.

    HPO is an iterable of term dictionaries; None is returned when no term
    matches.
    """
    matches = (node for node in HPO if node['id'] == searchTerm)
    return next(matches, None)

In [None]:
def getJson(filename):     #load the json data models
    """Read the UTF-8 JSON file of phenotype models and return the decoded data"""
    data_file = open(filename, encoding='utf-8')
    with data_file:
        models = json.load(data_file)
    return models

In [None]:
def getParents(term, HPO, HPOtree):         #get the parent tree of a term
    """
    Recursively collect the ancestors of an HPO term.

    Every 'is_a' entry of the term is appended to HPOtree exactly as it is
    stored in the parsed ontology, and is then followed upwards using its
    first 10 characters as the parent id. A branch ends once the entry for
    HP:0000118 (treated as the top of the tree) has been appended, whether
    the term has one parent or several.

    Keyword arguments:
        term: id of the term whose parents are wanted
        HPO: the parsed ontology, searched with getTerm
        HPOtree: list that is extended in place with the parent entries
    Returns None. Nothing is appended when the term is not found in HPO or
    has no 'is_a' field.
    """
    rootId = 'HP:0000118'
    try:
        parents = getTerm(term, HPO)['is_a']
    except (KeyError, TypeError):
        #TypeError: getTerm returned None (unknown id)
        #KeyError: the term has no 'is_a' field (or a term lacks an 'id')
        return
    if isinstance(parents, str):
        #A single parent is stored as a bare string; treat it as a 1-item list
        parents = [parents]
    for parent in parents:
        HPOtree.append(parent)
        parentId = parent[0:10]             #ids are the first 10 characters
        if parentId != rootId:              #do not climb above the top of the tree
            getParents(parentId, HPO, HPOtree)

In [None]:
# Location of the two OBO releases that are compared
directory = 'C:/Users/Andrew Devereau/Documents/Python Scripts/'
oldFile = f'{directory}hpoDMC.obo'     # current DMC HPO version
newFile = f'{directory}hp060916.obo'   # new HPO version

# Materialise every parsed term of each release into a list
oldHPO = list(parseGOOBO(oldFile))
newHPO = list(parseGOOBO(newFile))

# Report how many terms each release contains
print(len(oldHPO))
print(len(newHPO))

In [None]:
# Collect every key (tag name) used by any term. Not all keys occur in each
# term, and a set removes the duplicates. Both releases are scanned so that
# tags which only exist in the new version are not missed when the terms are
# compared key by key.
keySet = set()
for ontology in (oldHPO, newHPO):
    for node in ontology:
        keySet.update(node)                        #adds the term's keys
keySet

In [None]:
# Load the catalogue of rare-disease phenotype models
modelsFile = directory + 'Rare Disease Conditions Phenotypes and Clinical Tests - v1.5.1 - FINAL.json'
modelsJson = getJson(modelsFile)


Go through the disease models and find every HPO term whose fields have changed, been added or been removed between the old and new versions of HPO, as well as terms missing from either version. The results are written to a tab-separated text file.

In [None]:
# Write one tab-separated row to change.txt for every difference between the
# old and new HPO versions of each phenotype term used by a disease model.
# The file is opened in a with-block so it is closed even if a lookup fails.
with open('change.txt', 'w', encoding='utf-8') as outfile:
    for level2 in modelsJson['DiseaseGroups']:     #run through each model in the json catalogue
        for level3 in level2['subGroups']:
            for level4 in level3['specificDisorders']:
                for pheno in level4['shallowPhenotypes']:  #go through each HPO term
                    #leading columns shared by every row for this phenotype
                    row = [level4['id'], level4['name'], pheno['id']]
                    #for each HPO code get a node from the old and new HPO versions
                    oldTerm = getTerm(pheno['id'], oldHPO)
                    newTerm = getTerm(pheno['id'], newHPO)
                    #report terms that are missing from the old or new HPO sets
                    if oldTerm is None:
                        outfile.write('\t'.join(row + ['Not found in original set']) + '\n')
                        continue
                    if newTerm is None:
                        outfile.write('\t'.join(row + ['Not found in new set']) + '\n')
                        continue
                    #Compare every key present in either version of the term.
                    #Sorting makes the row order reproducible between runs.
                    for k in sorted(set(oldTerm) | set(newTerm)):
                        if k not in oldTerm:                #field has been added
                            change = ['Added', k, str(newTerm[k])]
                        elif k not in newTerm:              #field has been removed
                            change = ['Removed', k, str(oldTerm[k])]
                        elif newTerm[k] != oldTerm[k]:      #field has changed
                            change = ['Changed', k, str(oldTerm[k]), str(newTerm[k])]
                        else:
                            continue                        #field is identical
                        #the fourth column is the term name from the new version
                        outfile.write('\t'.join(row + [str(newTerm['name'])] + change) + '\n')

In [None]:
# Look for changes to the parent trees of each term used by a disease model
HPOtree1, HPOtree2 = [], []
for level2 in modelsJson['DiseaseGroups']:     # every model in the json catalogue
    for level3 in level2['subGroups']:
        for level4 in level3['specificDisorders']:
            print(level4['name'])              # name of the disease
            for pheno in level4['shallowPhenotypes']:   # every HPO term of the disease
                # Start from fresh lists for each term
                HPOtree1 = []
                getParents(pheno['id'], newHPO, HPOtree1)   # parent tree in the new HPO
                HPOtree2 = []
                getParents(pheno['id'], oldHPO, HPOtree2)   # parent tree in the old HPO
                if HPOtree1 == HPOtree2:
                    continue                                # trees agree, nothing to report
                print(pheno['id'], pheno['name'])

In [None]:
# Build the parent trees of one example term in both HPO versions
HPOtree1, HPOtree2 = [], []
getParents('HP:0000252', newHPO, HPOtree1)   # new HPO version
getParents('HP:0000252', oldHPO, HPOtree2)   # old HPO version

In [None]:
# Display the parent entries collected from the new HPO version
HPOtree1

In [None]:
# Display the parent entries collected from the old HPO version
HPOtree2

Go through all of the HPO terms in the old and new files to find every term that is new or has been removed.

In [None]:
# Unique term ids of each HPO version
oldSet = set(node['id'] for node in oldHPO)
newSet = set(node['id'] for node in newHPO)
# Show how many distinct ids each version has
len(oldSet), len(newSet)

TODO: the new HPO terms should also be checked against the disease names. Terms that match are likely to be core terms, so they may need to be added to the core terms list.

In [None]:
# Ids that only exist in the new version, and ids that only exist in the old one
added = newSet.difference(oldSet)
removed = oldSet.difference(newSet)
# Show how many terms were added and removed
len(added), len(removed)

In [None]:
# List every removed term with the name it had in the old HPO version
for term in removed:
    removedNode = getTerm(term, oldHPO)
    print(term, removedNode['name'])

In [None]:
# List every added term with its name in the new HPO version
for term in added:
    addedNode = getTerm(term, newHPO)
    print(term, addedNode['name'])