In [1]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
from fuzzywuzzy import process



In [2]:
# Load the tab-separated change table. The file carries no header row, so the
# eight column names are supplied explicitly.
change_columns = [
    'DiseaseID',
    'Disease',
    'HPOId',
    'HPOName',
    'ChangeType',
    'ChangedField',
    'oldValue',
    'newValue',
]
changes = pd.read_csv('change.txt', sep='\t', header=None, names=change_columns)

In [3]:
changes.info()   # summary of the table produced by the comparison of HPO versions

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3409 entries, 0 to 3408
Data columns (total 8 columns):
DiseaseID       3409 non-null int64
Disease         3409 non-null object
HPOId           3409 non-null object
HPOName         3409 non-null object
ChangeType      3409 non-null object
ChangedField    3409 non-null object
oldValue        3409 non-null object
newValue        2589 non-null object
dtypes: int64(1), object(7)
memory usage: 213.1+ KB


In [4]:
# Preview the first rows of the change table.
changes.head()

Unnamed: 0,DiseaseID,Disease,HPOId,HPOName,ChangeType,ChangedField,oldValue,newValue
0,33666,Familial Hypercholesterolaemia,HP:0003141,Hyperbetalipoproteinemia,Changed,synonym,"['""Increased beta-lipoproteins"" EXACT []', '""I...","['""Increased beta-lipoproteins"" EXACT []', '""I..."
1,33666,Familial Hypercholesterolaemia,HP:0001658,Myocardial infarction,Added,synonym,Heart attack EXACT [orcid.org/0000-0001-5208-3...,
2,36469,Familial cerebral small vessel disease,HP:0001297,Stroke,Changed,synonym,"['""Cerebral vascular events"" EXACT []', '""Cere...","['""Cerebral vascular events"" EXACT []', '""Cere..."
3,36469,Familial cerebral small vessel disease,HP:0002326,Transient ischemic attack,Changed,synonym,"['""TIA"" EXACT []', '""Transient ischemic attack...","['""Mini stroke"" EXACT [orcid.org/0000-0001-520..."
4,36469,Familial cerebral small vessel disease,HP:0001342,Cerebral hemorrhage,Changed,synonym,Intracerebral hemorrhage EXACT [],"['""Hemorrhagic stroke"" RELATED [http://orcid.o..."


In [5]:
# Number of distinct HPO terms with changes in our models
# (dropna=False keeps the count identical to len(...unique())).
changes['HPOId'].nunique(dropna=False)

762

In [6]:
def match(matchList):
    """Apply a simple fuzzy match to an (old value, new value) pair.

    Parameters
    ----------
    matchList : sequence or pandas.Series
        Holds the old value at position 0 and the new value at position 1.
        Both entries are converted with ``str()`` before comparison, so a
        missing value (NaN) is compared as the text ``'nan'``.

    Returns
    -------
    The score returned by ``fuzz.ratio`` for the two strings (an integer
    from 0 to 100 according to the fuzzywuzzy documentation).
    """
    # A DataFrame row arrives as a Series labelled by column name; integer keys
    # on such a Series rely on a deprecated positional fallback, so go through
    # .iloc when it is available. Plain lists/tuples are indexed directly.
    values = matchList.iloc if hasattr(matchList, 'iloc') else matchList
    return fuzz.ratio(str(values[0]), str(values[1]))

In [7]:
# Apply a fuzzy match to the old and new value of every row. The two columns
# are selected by name rather than by position (6:8), so the cell does not
# depend on column order, and each pair is handed to match() as a plain tuple.
# Scores close to 100 indicate only minor changes (spacing etc.) which can be
# ignored.
# NOTE(review): this note originally gave >=99 as the "ignore" cut-off, but the
# name-change filter later in the notebook keeps scores <=99 -- confirm which
# boundary is intended.
changes['match'] = [
    match(pair) for pair in zip(changes['oldValue'], changes['newValue'])
]

In [8]:
# Terms which have been replaced are Red class changes.
replaced = changes.loc[changes['ChangedField'] == 'replaced_by']

In [9]:
# Terms made obsolete are red class.
obsolete = changes.loc[changes['ChangedField'] == 'is_obsolete']

In [10]:
# Obsolete and replaced should cover the same HPO terms; list the obsolete
# terms that have no matching 'replaced_by' row.
obsolete_has_replacement = obsolete['HPOId'].isin(replaced['HPOId'])
diff1 = obsolete[~obsolete_has_replacement]
diff1   # there are two obsolete terms which do not seem to have been replaced

Unnamed: 0,DiseaseID,Disease,HPOId,HPOName,ChangeType,ChangedField,oldValue,newValue,match
1299,36852,Disorders of sex development,HP:0000057,obsolete Clitoromegaly,Added,is_obsolete,True,,0
2723,11115,Anophthalmia or microphthamia,HP:0000611,obsolete Choroid coloboma,Added,is_obsolete,True,,0


In [11]:
# Reverse check: replaced terms that are not also flagged obsolete.
replaced_is_obsolete = replaced['HPOId'].isin(obsolete['HPOId'])
diff2 = replaced[~replaced_is_obsolete]
diff2  # there are no terms in replaced that are not in obsolete

Unnamed: 0,DiseaseID,Disease,HPOId,HPOName,ChangeType,ChangedField,oldValue,newValue,match


In [12]:
# Output both sets to Excel, ordered by HPO number and without the index column.
for frame, workbook in ((replaced, 'replaced.xlsx'), (obsolete, 'obsolete.xlsx')):
    frame.sort_values(by='HPOId', ascending=True).to_excel(workbook, index=False)

In [13]:
# Changes to names are a red class change. Obsolete terms also show a name
# change, so they are excluded, as are rows with a match score above 99.
is_name_change = changes['ChangedField'] == 'name'
is_obsolete_term = changes['HPOId'].isin(obsolete['HPOId'])
within_score_cutoff = changes['match'] <= 99
nameChange = changes[is_name_change & ~is_obsolete_term & within_score_cutoff]
nameChange.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 139 entries, 310 to 3383
Data columns (total 9 columns):
DiseaseID       139 non-null int64
Disease         139 non-null object
HPOId           139 non-null object
HPOName         139 non-null object
ChangeType      139 non-null object
ChangedField    139 non-null object
oldValue        139 non-null object
newValue        139 non-null object
match           139 non-null int64
dtypes: int64(2), object(7)
memory usage: 10.9+ KB


In [14]:
# Write the changed names to an Excel sheet, ordered by HPO number.
nameChange_sorted = nameChange.sort_values(by='HPOId', ascending=True)
nameChange_sorted.to_excel('nameChange.xlsx', index=False)

In [15]:
# Terms with definition changes may be a red class. The cut-off of 90 was
# chosen following inspection of all definition changes - only minor changes
# were seen above 90.
is_definition_change = changes['ChangedField'] == 'def'
definition_score_low = changes['match'] < 90
definitionChange = changes[is_definition_change & definition_score_low]
definitionChange_sorted = definitionChange.sort_values(by='HPOId', ascending=True)
definitionChange_sorted.to_excel('defChange.xlsx', index=False)

In [16]:
# Get all changes to parent (is_a) terms: rows of type 'Changed' whose
# old/new match score is below 98.
is_parent_field = changes['ChangedField'] == 'is_a'
is_changed_row = changes['ChangeType'] == 'Changed'
parent_score_low = changes['match'] < 98
parentChange = changes[is_parent_field & is_changed_row & parent_score_low]

In [17]:
# Column and non-null summary of the filtered parent (is_a) changes.
parentChange.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 406 entries, 15 to 3406
Data columns (total 9 columns):
DiseaseID       406 non-null int64
Disease         406 non-null object
HPOId           406 non-null object
HPOName         406 non-null object
ChangeType      406 non-null object
ChangedField    406 non-null object
oldValue        406 non-null object
newValue        406 non-null object
match           406 non-null int64
dtypes: int64(2), object(7)
memory usage: 31.7+ KB


In [18]:
# Write the parent changes to an Excel sheet, ordered by HPO number.
parentChange_sorted = parentChange.sort_values(by='HPOId', ascending=True)
parentChange_sorted.to_excel('parentChange.xlsx', index=False)

In [19]:
# Distinct HPO IDs that appear among the parent changes.
hpoChanges = parentChange.loc[:, 'HPOId'].unique()

In [20]:
# Save the set of HPO IDs that are affected by parent changes, one ID per line.
with open('parentChanges.txt', 'w') as outfile:
    outfile.writelines(hpo_id + '\n' for hpo_id in hpoChanges)