## Take the morphology and topography codes and compute text-match scores against SNOMED code names

In [None]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
from openpyxl import load_workbook

In [None]:
# Folder and file name of the SNOMED CT concepts table (international release 31/7/16).
location = 'C:/Users/Andrew Devereau/Downloads/SnomedCT_RF1Release_INT_20160731/Terminology/Content/'
filename = 'sct1_Concepts_Core_INT_20160731.txt'
# The file is tab-separated; load the SNOMED concepts into a dataframe.
concepts = pd.read_csv(location + filename, delimiter='\t')

In [None]:
# Descriptions table of the same release - these rows include the synonyms.
filename = 'sct1_Descriptions_en_INT_20160731.txt'
# NOTE(review): default quote handling is in effect here; if any term contains
# an unbalanced double quote, rows could be merged - confirm the row count.
descriptions = pd.read_csv(location + filename, delimiter='\t')

In [None]:
# Relationships table of the same release - these rows include the is-a links.
filename = 'sct1_Relationships_Core_INT_20160731.txt'
relationships = pd.read_csv(location + filename, delimiter='\t')

In [None]:
def fuzzyMatch (str1, str2):
    """Return the token-set fuzzy match score of two strings.

    Underscores are replaced by spaces in both strings before scoring,
    because disease subtypes are written with '_' instead of spaces.
    """
    left = str1.replace('_', ' ')
    right = str2.replace('_', ' ')
    return fuzz.token_set_ratio(left, right)

In [None]:
def splitMatch (target, sentence):
    """Return the best fuzzy score of any single word of sentence against target.

    sentence is split on whitespace and every word is scored with fuzzyMatch;
    the highest score is returned.

    Returns 0 when sentence contains no words (empty or whitespace only).
    Previously max() was called on an empty list in that case and raised
    ValueError.
    """
    scores = [fuzzyMatch(target, word) for word in sentence.split()]
    return max(scores) if scores else 0

In [None]:
def synMatch (target, ID):
    """Return the best fuzzy score of any synonym of a snomed ID against target.

    The synonyms are fetched with getTerms(str(ID)) and each one is scored with
    fuzzyMatch; the highest score is returned.

    Returns 0 when getTerms yields no synonyms. Previously max() was called on
    an empty list in that case and raised ValueError.
    """
    scores = [fuzzyMatch(target, term) for term in getTerms(str(ID))]
    return max(scores) if scores else 0

In [None]:
def getParents(ID):
    """Return all synonyms of the immediate 'is-a' parents of a snomed ID.

    ID may be a CT code (anything int() accepts, including a string of digits)
    or an RT code, which is converted with RT2CT first because the
    relationships table is filtered by CT concept ID.

    Returns a list with the terms of every parent (empty when the concept has
    no 'is-a' rows), or the string 'not found' when an RT code cannot be
    converted.
    """
    try:
        try:
            conceptID = int(ID)   # CT codes convert to int
        except (TypeError, ValueError):
            conceptID = RT2CT(ID)   # anything else is treated as an RT code
        # Compare against the converted integer: the raw ID can be a string of
        # digits, which never equals the integer values in CONCEPTID1.
        isParentLink = relationships['RELATIONSHIPTYPE'] == 116680003   # 116680003 is 'is-a'
        fromConcept = relationships['CONCEPTID1'] == conceptID
        parents = relationships[fromConcept & isParentLink]['CONCEPTID2'].values
        terms = []
        for parent in parents:
            terms += getTerms(parent)   # pool the synonyms of every parent
        return terms
    except IndexError:   # raised by RT2CT when the RT code is unknown
        return 'not found'

In [None]:
def BTMatch(target, ID):
    """Return the best fuzzy score of any parent-term synonym against target.

    The synonyms of all immediate parents come from getParents(ID) and each is
    scored with fuzzyMatch; the highest score is returned.

    Returns 0 when there is nothing to score: either getParents returned its
    'not found' string (previously that string was iterated character by
    character and each character was scored), or it returned an empty list
    (previously max() raised ValueError).
    """
    terms = getParents(ID)
    if isinstance(terms, str):   # the 'not found' sentinel, not a list of terms
        return 0
    scores = [fuzzyMatch(target, term) for term in terms]
    return max(scores) if scores else 0

In [None]:
def lookup(ID):
    """Find the fully specified name for an ID using the SNOMED concept frame.

    An ID whose string form parses as an integer is looked up as a CT code in
    the CONCEPTID column. Anything else is treated as an RT code: a dash is
    inserted after the first character and the result is looked up in the
    SNOMEDID column.

    Returns '<id>_<fully specified name>' when a row is found and
    '<id>_not found' otherwise. The not-found form is always built from
    str(ID), so IDs that are not strings (None, NaN, ...) no longer raise
    TypeError in the fallback.
    """
    # Some CT codes arrive as floats after loading - convert them to int.
    # Non-finite floats (NaN, inf) cannot be converted and fall through.
    if isinstance(ID, (float, np.floating)) and np.isfinite(ID):
        ID = int(ID)

    try:
        IDint = int(str(ID))   # an integer is probably a snomed CT code
    except ValueError:
        IDint = None   # not an integer - probably a snomed RT code

    if IDint is not None:
        try:
            result = concepts[concepts['CONCEPTID'] == IDint]['FULLYSPECIFIEDNAME'].values[0]
            return str(ID) + '_' + result   # found name plus original ID
        except (IndexError, TypeError):   # no matching row, or a non-string name
            return str(ID) + '_not found'

    try:
        ID2 = ID[0] + '-' + ID[1:]   # snomed RT - add a dash after the first letter
        result = concepts[concepts['SNOMEDID'] == ID2]['FULLYSPECIFIEDNAME'].values[0]
        return ID2 + '_' + result
    except (IndexError, TypeError):   # empty or non-string ID, or no matching row
        return str(ID) + '_not found'

In [None]:
def annotateMorph(codes, col):
    """Annotate morphology codes with snomed names and disease-subtype match scores.

    The snomed codes in column col of the dataframe codes are looked up, and
    these columns are added to codes in place:

    - 'term' and 'fsn': the two halves of the lookup result, split at the
      first '_' only, so an '_' inside the name no longer breaks the split
    - 'semtag': the text inside the brackets in 'fsn'
    - 'Disease subtype match': fuzzy match of 'Disease Subtype Id' to the whole fsn
    - 'Disease subtype match words': best match to any single word of the fsn
    - 'Synonym subtype match': best match to any synonym of the term

    Scores are only computed for rows with no missing value in any column.

    Returns codes, or None if any step fails. On failure the error is printed
    and codes may be left partially annotated.
    """
    try:
        codes['result'] = codes[col].dropna().apply(lookup)   # lookup only the non-null codes
        # expand=True gives one column per part; reindex guarantees both
        # columns exist even when there is nothing to split.
        parts = codes['result'].dropna().str.split('_', n=1, expand=True).reindex(columns=[0, 1])
        codes['term'] = parts[0]
        codes['fsn'] = parts[1]
        codes['semtag'] = codes['fsn'].dropna().str.extract(r'.*\((.*)\).*', expand=False)
        codes['Disease subtype match'] = codes.dropna().apply(
            lambda row: fuzzyMatch(row['Disease Subtype Id'], row['fsn']), axis=1)
        codes['Disease subtype match words'] = codes.dropna().apply(
            lambda row: splitMatch(row['Disease Subtype Id'], row['fsn']), axis=1)
        codes['Synonym subtype match'] = codes.dropna().apply(
            lambda row: synMatch(row['Disease Subtype Id'], row['term']), axis=1)
        codes.drop('result', axis=1, inplace=True)   # the intermediate column is not wanted in the output
        return codes
    except Exception as error:
        # Keep the original "return None on failure" contract, but say why.
        print('annotateMorph failed: %r' % (error,))
        return

In [None]:
def annotateTop(codes, col):
    """Annotate topography codes with snomed names and disease-type match scores.

    The snomed codes in column col of the dataframe codes are looked up, and
    these columns are added to codes in place:

    - 'term' and 'fsn': the two halves of the lookup result, split at the
      first '_' only, so an '_' inside the name no longer breaks the split
    - 'semtag': the text inside the brackets in 'fsn'
    - 'Disease match': fuzzy match of 'Disease Type Id' to the whole fsn
    - 'Disease match words': best match to any single word of the fsn
    - 'Synonym disease match': best match to any synonym of the term
    - 'Parent term match': best match to any synonym of the parent terms

    Scores are only computed for rows with no missing value in any column.

    Returns codes, or None if any step fails. On failure the error is printed
    and codes may be left partially annotated.
    """
    try:
        codes['result'] = codes[col].dropna().apply(lookup)   # lookup only the non-null codes
        # expand=True gives one column per part; reindex guarantees both
        # columns exist even when there is nothing to split.
        parts = codes['result'].dropna().str.split('_', n=1, expand=True).reindex(columns=[0, 1])
        codes['term'] = parts[0]
        codes['fsn'] = parts[1]
        codes['semtag'] = codes['fsn'].dropna().str.extract(r'.*\((.*)\).*', expand=False)
        codes['Disease match'] = codes.dropna().apply(
            lambda row: fuzzyMatch(row['Disease Type Id'], row['fsn']), axis=1)
        codes['Disease match words'] = codes.dropna().apply(
            lambda row: splitMatch(row['Disease Type Id'], row['fsn']), axis=1)
        codes['Synonym disease match'] = codes.dropna().apply(
            lambda row: synMatch(row['Disease Type Id'], row['term']), axis=1)
        codes['Parent term match'] = codes.dropna().apply(
            lambda row: BTMatch(row['Disease Type Id'], row['term']), axis=1)
        codes.drop('result', axis=1, inplace=True)   # the intermediate column is not wanted in the output
        return codes
    except Exception as error:
        # Keep the original "return None on failure" contract, but say why.
        print('annotateTop failed: %r' % (error,))
        return

In [None]:
def getTerms(ID):
    """Return the list of terms (synonyms) recorded for a snomed ID.

    An ID that int() accepts is used as a CT concept ID directly; anything
    else is treated as an RT code and converted with RT2CT first. Only the
    int() conversion is guarded, so an error in the description lookup itself
    is no longer mistaken for "this is an RT code".

    The terms come from the TERM column of the descriptions table for rows
    whose CONCEPTID matches. Errors raised by RT2CT are not caught here.
    """
    try:
        conceptID = int(ID)   # an integer is a CT term
    except (TypeError, ValueError):
        conceptID = RT2CT(ID)   # not an integer - an RT term
    return list(descriptions[descriptions['CONCEPTID'] == conceptID]['TERM'])

In [None]:
def RT2CT (RTID):
    """Convert a snomed RT ID to a snomed CT ID.

    SNOMEDID holds the RT term and CONCEPTID the CT term. The CONCEPTID of the
    first row whose SNOMEDID equals RTID is returned; indexing the empty
    result raises IndexError when no row matches.
    """
    isMatch = concepts['SNOMEDID'] == RTID
    conceptIDs = concepts[isMatch]['CONCEPTID']
    return conceptIDs.values[0]

In [None]:
# Name and folder of the excel file with the snomed codes.
filename = 'ICD RT CT codes 15Nov2016v2.xlsx'
location = 'C:/Users/Andrew Devereau/Documents/GeL/secondary data/Data applications/HES return October 2016/'

In [None]:
# Open the excel file found at location + filename.
xl = pd.ExcelFile('{0}{1}'.format(location, filename))

In [None]:
frameList = []
sheets = xl.sheet_names   # names of all sheets in the workbook
# Only sheets 2 to 13 (12 sheets) hold the morph and topo data.
for sheetNo in range(2, 14):
    sheetName = sheets[sheetNo]
    print (sheetName)
    frame = xl.parse(sheetName)   # one dataframe per sheet
    frame.name = sheetName        # remember which sheet the frame came from
    frameList.append(frame)

In [None]:
# The third character of the sheet name selects the annotation: 'M' means
# morphology data, anything else is treated as a topography sheet.
# The codes are taken from the last column of each sheet.
for sheet in frameList:
    sheet = (
        annotateMorph(sheet, sheet.columns.values[-1])
        if sheet.name[2] == 'M'
        else annotateTop(sheet, sheet.columns.values[-1])
    )

In [None]:
# Output workbook for the annotated sheets, written with the xlsxwriter engine.
writer = pd.ExcelWriter(path='ICD RT CT codes 15Nov2016 v 2 annotated.xlsx', engine='xlsxwriter')

In [None]:
# Write every frame to its own worksheet, named after the sheet it came from.
for sheet in frameList:
    sheet.to_excel(writer, index=False, sheet_name=sheet.name)
# close() writes the workbook to disk; save() is not available in current pandas.
writer.close()