## Take the morphology and topography codes and determine text match scores to snomed code names - revision of ICD codes

In [1]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
from openpyxl import load_workbook



In [198]:
# Load the ICD-O-3 morphology reference table; the first CSV row supplies the column names.
ICDO3M = pd.read_csv('ICD-O-3M.csv', header=0)   #get ICD-O-3 codes
# Keep the 'Code' column as its own Series: lookupM searches it to locate the row for a code.
Mseries = ICDO3M['Code']

In [50]:
def fuzzyMatch(str1, str2):
    """Return the fuzzywuzzy token-set ratio of two strings.

    Underscores are turned into spaces in both inputs before scoring,
    because disease subtypes are written with '_' instead of spaces.
    """
    left = str1.replace('_', ' ')
    right = str2.replace('_', ' ')
    return fuzz.token_set_ratio(left, right)

In [51]:
def splitMatch(target, sentence):
    """Return the best fuzzy score between target and any single word of sentence.

    The sentence is split on whitespace and every word is scored against
    target with fuzzyMatch; only the highest score is returned.

    Returns 0 when the sentence contains no words (empty or blank string),
    so that one empty name does not raise and abort a whole-frame apply.
    """
    words = str.split(sentence)
    scores = [fuzzyMatch(target, word) for word in words]
    if not scores:
        return 0  # nothing to compare against: report the lowest score
    return max(scores)

In [None]:
def synMatch(target, ID):
    """Return the best fuzzy score between target and any synonym of a SNOMED ID.

    The synonyms come from getTerms(str(ID)); each one is scored against
    target with fuzzyMatch and the highest score is returned.

    Returns 0 when the ID has no synonyms, instead of raising ValueError
    from max() on an empty list.
    """
    synonyms = getTerms(str(ID))
    scores = [fuzzyMatch(target, term) for term in synonyms]
    if not scores:
        return 0  # no synonym available: report the lowest score
    return max(scores)

In [None]:
def getParents(ID):
    """Return every synonym of every 'is-a' parent of a SNOMED concept.

    ID may be a CT identifier (anything int() accepts) or an RT identifier,
    which is first converted to its CT identifier with RT2CT because only
    the concept table carries RT identifiers.

    Returns a flat list of terms (possibly empty), or the string
    'not found' if the lookup fails for any reason.
    """
    try:
        try:
            # A CT identifier converts to int. The converted value is used for
            # the comparison, as getTerms does, so a CT id passed as a string
            # still matches.
            # NOTE(review): assumes CONCEPTID1 holds integer ids - confirm.
            concept_id = int(ID)
        except (TypeError, ValueError):
            # Not an integer, so treat it as an RT identifier.
            concept_id = RT2CT(ID)
        # Relationship type 116680003 is 'is-a'.
        is_a_rows = relationships[(relationships['CONCEPTID1'] == concept_id) & (relationships['RELATIONSHIPTYPE'] == 116680003)]
        parent_ids = is_a_rows['CONCEPTID2'].values
        terms = []
        for parent_id in parent_ids:
            terms += getTerms(parent_id)   # pool the synonyms of all parents into one list
        return terms
    except Exception:
        # Best effort: callers receive the sentinel string rather than an exception.
        return 'not found'

In [None]:
def BTMatch(target, ID):
    """Return the best fuzzy score between target and any parent-term synonym.

    getParents(ID) supplies the synonyms of all immediate 'is-a' parents;
    each is scored against target with fuzzyMatch and the highest score
    is returned.

    Returns 0 when no parent terms are available. That covers an empty
    list and also the 'not found' sentinel string that getParents returns
    on failure, which must not be iterated character by character.
    """
    terms = getParents(ID)
    if isinstance(terms, str):
        return 0  # getParents signalled failure with its sentinel string
    scores = [fuzzyMatch(target, term) for term in terms]
    if not scores:
        return 0  # concept has no parents / parents have no synonyms
    return max(scores)

In [208]:
def lookupM(ID):
    """Look up the name of an ICD-O-3 morphology code in the ICDO3M table.

    Accepted forms of ID (compared via str(ID)):
      * a bare number such as 81403 or '81403' -> searched as '8140/3'
      * 'M' + 5 characters such as 'M85003'    -> searched as '8500/3'
      * 'M' + any other length such as 'M8500' -> searched as '8500'
      * a string with '/' as fifth character   -> searched as given

    The search is a "contains" match against the 'Code' column (Mseries)
    and the first matching row's 'Name1' is used.

    Returns '<searched code>_<name>' on success, otherwise
    '<str(ID)>_not found'. Never raises for an unrecognised or non-string
    ID; previously an integer ID that was too short or not in the table
    raised TypeError while building the "not found" message.
    """
    code = str(ID)
    not_found = code + '_not found'

    try:
        int(code)
    except ValueError:
        # Not a bare number: either an 'M'-prefixed code or one already containing '/'.
        if code[:1] == 'M':
            if len(code) == 6:
                pattern = code[1:5] + '/' + code[5]
            else:
                pattern = code[1:5]
        elif code[4:5] == '/':
            pattern = code
        else:
            return not_found
    else:
        # Bare number: the fifth character is the behaviour digit after the slash.
        if len(code) < 5:
            return not_found
        pattern = code[0:4] + '/' + code[4]

    try:
        names = ICDO3M[Mseries.str.contains(pattern)]['Name1'].values
    except Exception:
        # Best effort, as before: any failure of the table search counts as "not found".
        return not_found
    if len(names) == 0:
        return not_found
    return pattern + '_' + str(names[0])
    

In [211]:
# Spot-check lookupM with a code that already contains the slash; the cell output follows.
lookupM('8313/0')

'8313/0_Clear cell adenofibroma (C56.9) '

In [236]:
def annotateMorph(codes, col):
    """Annotate a frame of ICD-O-3 morphology codes with names and match scores.

    Parameters:
        codes: DataFrame containing column `col` and 'Disease Subtype Id'.
               It is modified in place.
        col:   name of the column holding the morphology codes.

    Columns added:
        'term' - the code as searched by lookupM (text before the first '_')
        'fsn'  - the name found by lookupM (text after the first '_')
        'Disease subtype match'       - fuzzyMatch of subtype against the whole fsn
        'Disease subtype match words' - splitMatch of subtype against words of the fsn

    Rows with a missing value in any column are left unscored (NaN).

    Returns the same frame, or None (after emitting a warning) if any step
    fails; in that case the frame may be left partially annotated.
    """
    import warnings

    try:
        codes['result'] = codes[col].dropna().apply(lookupM)   # 'code_name' string per non-null row
        # Split on the first '_' only, because names may contain underscores.
        # n= is passed by keyword and the pieces are taken with .str.get(),
        # since recent pandas rejects both the positional n and unpacking
        # of the .str accessor.
        pieces = codes['result'].dropna().str.split('_', n=1)
        codes['term'] = pieces.str.get(0)
        codes['fsn'] = pieces.str.get(1)
        complete_rows = codes.dropna()   # only rows with every column filled are scored
        codes['Disease subtype match'] = complete_rows.apply(lambda x: fuzzyMatch(x['Disease Subtype Id'], x['fsn']), axis=1)
        codes['Disease subtype match words'] = complete_rows.apply(lambda x: splitMatch(x['Disease Subtype Id'], x['fsn']), axis=1)
        codes.drop('result', axis=1, inplace=True)   # the intermediate column is no longer needed
        return codes
    except Exception as err:
        # Keep the original best-effort contract (return None) but say why.
        warnings.warn('annotateMorph failed: %r' % (err,))
        return None

In [None]:
def annotateTop(codes, col):
    """Annotate a frame of SNOMED codes with names and disease-type match scores.

    Parameters:
        codes: DataFrame containing column `col` and 'Disease Type Id'.
               It is modified in place.
        col:   name of the column holding the codes.

    Columns added:
        'term'   - text before the first '_' of the lookup result
        'fsn'    - text after the first '_' of the lookup result
        'semtag' - text captured from brackets in the fsn
        'Disease match'         - fuzzyMatch of disease type against the whole fsn
        'Disease match words'   - splitMatch against the words of the fsn
        'Synonym disease match' - synMatch against synonyms of the term
        'Parent term match'     - BTMatch against synonyms of the parent terms

    Rows with a missing value in any column are left unscored (NaN).

    Returns the same frame, or None (after emitting a warning) if any step
    fails; in that case the frame may be left partially annotated.

    NOTE(review): `lookup` is not defined in the visible part of this
    notebook - it is presumably the SNOMED counterpart of lookupM; confirm.
    """
    import warnings

    try:
        codes['result'] = codes[col].dropna().apply(lookup)   # 'code_name' string per non-null row
        # Split on the first '_' only (as annotateMorph does) so that names
        # containing underscores stay intact. n= is passed by keyword and
        # the pieces are taken with .str.get(), since recent pandas rejects
        # both the positional n and unpacking of the .str accessor.
        pieces = codes['result'].dropna().str.split('_', n=1)
        codes['term'] = pieces.str.get(0)
        codes['fsn'] = pieces.str.get(1)
        # Raw string: same pattern as before, without invalid-escape warnings.
        codes['semtag'] = codes['fsn'].dropna().str.extract(r'.*\((.*)\).*', expand=False)
        complete_rows = codes.dropna()   # only rows with every column filled are scored
        codes['Disease match'] = complete_rows.apply(lambda x: fuzzyMatch(x['Disease Type Id'], x['fsn']), axis=1)
        codes['Disease match words'] = complete_rows.apply(lambda x: splitMatch(x['Disease Type Id'], x['fsn']), axis=1)
        codes['Synonym disease match'] = complete_rows.apply(lambda x: synMatch(x['Disease Type Id'], x['term']), axis=1)
        codes['Parent term match'] = complete_rows.apply(lambda x: BTMatch(x['Disease Type Id'], x['term']), axis=1)
        codes.drop('result', axis=1, inplace=True)   # the intermediate column is no longer needed
        return codes
    except Exception as err:
        # Keep the original best-effort contract (return None) but say why.
        warnings.warn('annotateTop failed: %r' % (err,))
        return None

In [None]:
def getTerms(ID):
    """Return the list of terms (synonyms) recorded for a SNOMED ID.

    An ID that converts to int is used directly as a CT concept id; any
    other ID is treated as an RT identifier and converted with RT2CT.
    The terms are read from the 'TERM' column of the descriptions table
    for rows whose 'CONCEPTID' equals the concept id. The list is empty
    when no row matches.

    Errors raised by RT2CT (e.g. unknown RT identifier) propagate.
    """
    try:
        concept_id = int(ID)   # succeeds for CT identifiers
    except (TypeError, ValueError, OverflowError):
        concept_id = RT2CT(ID)   # not an integer: RT identifier
    # The descriptions table includes the synonyms of each concept.
    return list(descriptions[descriptions['CONCEPTID'] == concept_id]['TERM'])

In [None]:
def RT2CT(RTID):
    """Convert a SNOMED RT identifier to its SNOMED CT concept id.

    In the concepts table 'SNOMEDID' is the RT term and 'CONCEPTID' the
    CT term; the CONCEPTID of the first row matching RTID is returned.
    """
    matching_rows = concepts[concepts['SNOMEDID'] == RTID]
    ct_ids = matching_rows['CONCEPTID'].values
    return ct_ids[0]

In [76]:
# Open the workbook holding the previous annotations; its sheets are parsed in the next cell.
xl = pd.ExcelFile('ICD RT CT codes 15Nov2016x.xlsx')   #open the excel file with previous annotations

In [241]:
# Sheets are picked by position, so this depends on the sheet order inside the workbook.
sheets = xl.sheet_names   #get the names of all sheets in the workbook
frameM = xl.parse(sheets[0])   #frame m is ICD M (first sheet)
frameT = xl.parse(sheets[3])   #frame t is ICD T (fourth sheet)

In [242]:
# Remove the columns at positions 6-12 of the morphology sheet, in place.
# NOTE(review): presumably these hold earlier annotations whose empty cells would make
# the later frameM.dropna() calls discard rows - confirm against the workbook.
frameM.drop(frameM.columns[[6,7,8,9,10,11,12]], axis=1, inplace=True)

In [243]:
# Look up every non-null morphology code; 'result' holds lookupM's 'code_name' string
# (or '<code>_not found'), and stays NaN where the code is missing.
frameM['result'] = frameM['Sample Morphology ICD'].dropna().apply(lookupM)

In [249]:
# Display the frame. The execution counter (249) is later than that of the annotation
# cell further down (245), so the output below already shows the term/fsn/match columns.
frameM

Unnamed: 0,Participant Identifiers Id,Sample ID,CONCATENATE,Disease Type Id,Disease Subtype Id,Sample Morphology ICD,term,fsn,Disease subtype match,Disease subtype match words
0,211000010,PS16-3280 FF,211000010-PS16-3280 FF,Ovarian,endometrioid_adenocarcinoma,81403,8140/3,"Adenocarcinoma, NOS",88.0,100.0
1,211000011,PS16.03377 FF,211000011-PS16.03377 FF,Ovarian,endometrioid_adenocarcinoma,84413,8441/3,"Serous cystadenocarcinoma, NOS (C56.9)",55.0,62.0
2,211000011,PS16.03377 FFPE,211000011-PS16.03377 FFPE,Ovarian,endometrioid_adenocarcinoma,84413,8441/3,"Serous cystadenocarcinoma, NOS (C56.9)",55.0,62.0
3,211000012,PS16-04200 FFPE,211000012-PS16-04200 FFPE,Breast,ductal,M85003,8500/3,"Infiltrating duct carcinoma, NOS (C50._)",24.0,80.0
4,211000014,PS16-04513 FF,211000014-PS16-04513 FF,Ovarian,endometrioid_adenocarcinoma,80203,8020/3,"Carcinoma, undifferentiated, NOS",46.0,50.0
5,211000015,PS16-04627 FFPE,211000015-PS16-04627 FFPE,Ovarian,endometrioid_adenocarcinoma,84413,8441/3,"Serous cystadenocarcinoma, NOS (C56.9)",55.0,62.0
6,211000016,PS16-04698 FFPE,211000016-PS16-04698 FFPE,Breast,ductal,M85002,8500/2,"Intraductal carcinoma, noninfiltrating, NOS",26.0,71.0
7,211000018,pS16-05851 FF,211000018-pS16-05851 FF,Ovarian,endometrioid_adenocarcinoma,83803,8380/3,"Endometrioid adenocarcinoma, NOS",100.0,100.0
8,211000019,Ps16-06096 ff,211000019-Ps16-06096 ff,Ovarian,unknown,83103,8310/3,"Clear cell adenocarcinoma, NOS",17.0,40.0
9,211000020,Ps16-06682 ff,211000020-Ps16-06682 ff,Sarcoma,unknown,89333,8933/3,Adenosarcoma,21.0,21.0


In [245]:
# Split each 'code_name' lookup result on the first '_' only: 'term' is the code,
# 'fsn' the name. n= is passed by keyword and the pieces are taken with .str.get(),
# since recent pandas rejects both the positional n and unpacking of the .str accessor.
frameM['term'] = frameM['result'].dropna().str.split('_', n=1).str.get(0)
frameM['fsn'] = frameM['result'].dropna().str.split('_', n=1).str.get(1)
# Score the disease subtype against the whole name and against its individual words;
# rows with a missing value in any column are left unscored (NaN).
frameM['Disease subtype match'] = frameM.dropna().apply(lambda x: fuzzyMatch(x['Disease Subtype Id'], x['fsn']), axis=1)
frameM['Disease subtype match words'] = frameM.dropna().apply(lambda x: splitMatch(x['Disease Subtype Id'], x['fsn']), axis=1)
# The intermediate lookup column is no longer needed.
frameM.drop('result', axis=1, inplace=True)

In [254]:
def triage(resultList):
    """Classify one annotated row as 'pass', 'review' or 'fail'.

    resultList is a row of the annotated frame (anything offering .get()
    and [] access by column name). Rules, applied in order:

      1. 'fsn' equal to 'not found' (the code lookup failed)   -> 'fail'
      2. 'Disease Type Id' or 'Disease Subtype Id' is missing  -> 'review'
      3. either match score is missing or NaN                  -> 'fail'
      4. best of the two match scores < 50                     -> 'fail'
      5. best score from 50 up to, but not including, 100      -> 'review'
      6. best score of 100                                     -> 'pass'

    Any row that cannot be evaluated is a 'fail'. Every row therefore gets
    a verdict; previously a best score of exactly 99, or NaN scores, fell
    through all branches and produced None.
    """
    try:
        if resultList.get('fsn') == 'not found':
            return 'fail'
        if pd.isnull(resultList['Disease Type Id']) or pd.isnull(resultList['Disease Subtype Id']):
            return 'review'

        scores = [resultList.get('Disease subtype match'),
                  resultList.get('Disease subtype match words')]
        # NaN compares False against everything and would slip past every
        # threshold below, so missing scores are rejected explicitly.
        if any(pd.isnull(score) for score in scores):
            return 'fail'

        best = max(scores)
        if best < 50:
            return 'fail'
        if best < 100:
            return 'review'
        return 'pass'
    except (AttributeError, KeyError, TypeError, ValueError):
        return 'fail'   #fail if values do not compute

In [255]:
# Run triage on every row (axis=1) and store its verdict in a new 'validation' column.
frameM['validation'] = frameM.apply(triage, axis=1)

In [256]:
# Display the frame again, now including the 'validation' column (output below).
frameM

Unnamed: 0,Participant Identifiers Id,Sample ID,CONCATENATE,Disease Type Id,Disease Subtype Id,Sample Morphology ICD,term,fsn,Disease subtype match,Disease subtype match words,validation
0,211000010,PS16-3280 FF,211000010-PS16-3280 FF,Ovarian,endometrioid_adenocarcinoma,81403,8140/3,"Adenocarcinoma, NOS",88.0,100.0,pass
1,211000011,PS16.03377 FF,211000011-PS16.03377 FF,Ovarian,endometrioid_adenocarcinoma,84413,8441/3,"Serous cystadenocarcinoma, NOS (C56.9)",55.0,62.0,review
2,211000011,PS16.03377 FFPE,211000011-PS16.03377 FFPE,Ovarian,endometrioid_adenocarcinoma,84413,8441/3,"Serous cystadenocarcinoma, NOS (C56.9)",55.0,62.0,review
3,211000012,PS16-04200 FFPE,211000012-PS16-04200 FFPE,Breast,ductal,M85003,8500/3,"Infiltrating duct carcinoma, NOS (C50._)",24.0,80.0,review
4,211000014,PS16-04513 FF,211000014-PS16-04513 FF,Ovarian,endometrioid_adenocarcinoma,80203,8020/3,"Carcinoma, undifferentiated, NOS",46.0,50.0,review
5,211000015,PS16-04627 FFPE,211000015-PS16-04627 FFPE,Ovarian,endometrioid_adenocarcinoma,84413,8441/3,"Serous cystadenocarcinoma, NOS (C56.9)",55.0,62.0,review
6,211000016,PS16-04698 FFPE,211000016-PS16-04698 FFPE,Breast,ductal,M85002,8500/2,"Intraductal carcinoma, noninfiltrating, NOS",26.0,71.0,review
7,211000018,pS16-05851 FF,211000018-pS16-05851 FF,Ovarian,endometrioid_adenocarcinoma,83803,8380/3,"Endometrioid adenocarcinoma, NOS",100.0,100.0,pass
8,211000019,Ps16-06096 ff,211000019-Ps16-06096 ff,Ovarian,unknown,83103,8310/3,"Clear cell adenocarcinoma, NOS",17.0,40.0,fail
9,211000020,Ps16-06682 ff,211000020-Ps16-06682 ff,Sarcoma,unknown,89333,8933/3,Adenosarcoma,21.0,21.0,fail


In [257]:
# Write the annotated morphology frame to a new workbook, leaving out the row index.
frameM.to_excel('ICDM.xlsx', index=False)