In [None]:
import pandas as pd
import numpy as np
from matplotlib_venn import venn3
from matplotlib_venn import venn2
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET
from cycler import cycler
%matplotlib inline

## First load the HES datasets

In [None]:
# Directory holding the input files; the read calls in this notebook build their paths as location + filename.
# NOTE(review): absolute, machine-specific path -- change it before running on another machine.
location = 'C:/Users/Andrew Devereau/Documents/GeL/secondary data/Data applications/HES return October 2016/'
filename = 'NIC12784_AE.txt'

In [None]:
# Read the pipe-delimited file currently named by `filename` into a DataFrame.
dataAE = pd.read_csv(location+filename, sep='|')  #this is the A&E data set

In [None]:
# Re-point the shared `filename` variable at the next extract, then read it (pipe-delimited).
filename = 'NIC12784_CC.txt'  #Critical care dataset
dataCC = pd.read_csv(location+filename, sep='|')

In [None]:
# Re-point the shared `filename` variable at the next extract, then read it (pipe-delimited).
filename = 'NIC12784_APC.txt'   #this is the admitted patient care dataset
dataAPC = pd.read_csv(location+filename, sep='|')

In [None]:
# Re-point the shared `filename` variable at the next extract, then read it (pipe-delimited).
filename = 'NIC12784_OP.txt'   #this is the outpatient dataset
dataOP = pd.read_csv(location+filename, sep='|')

### Get the cancer participants

In [None]:
# Comma-separated list of participants; its 'Participant Id' column is renamed to STUDY_ID in the next cell.
filename = 'Cancer.csv'
cancerIDs = pd.read_csv(location+filename, sep=',')  #get the set of cancer patients sent to HES

In [None]:
# Use the same key name as the HES frames so that later isin()/merge calls can match on STUDY_ID.
cancerIDs = cancerIDs.rename(columns = {'Participant Id':'STUDY_ID'})  #rename the participant Id to STUDY_ID to match results

### Load the ICD, AE and specialty code lookup files and define lookup functions

In [None]:
# Parse the XML code list once; getICD() searches the resulting root element on every lookup.
filename = 'Tabular.xml'       #get ICD10 code list to convert codes to names
ICDtree = ET.parse(location+filename)
ICDroot = ICDtree.getroot()

In [None]:
def getICD(search, root=None):  #look up ICD disease name from a search code
    """Return '<code> <description>' for an ICD code, or None if it is not found.

    Parameters
    ----------
    search : str
        Code to look for; compared for exact equality with the text of each
        <diag>/<name> element.
    root : xml.etree.ElementTree.Element, optional
        Element whose descendants are searched. Defaults to the module-level
        ``ICDroot``.

    Returns
    -------
    str or None
        ``search + ' ' + description`` for the first matching <diag> in
        document order; just ``search`` when that <diag> carries no <desc>
        text; None when no <diag> matches.

    Notes
    -----
    Every call is a linear scan over all <diag> elements.
    """
    if root is None:
        root = ICDroot
    for diag in root.iter('diag'):
        # findtext() returns None instead of raising when the child element is
        # absent, so a <diag> without a <name> is skipped rather than aborting
        # the whole lookup with AttributeError.
        if diag.findtext('name') != search:
            continue
        desc = diag.findtext('desc')
        if not desc:
            return search
        return search + ' ' + desc
    return None

In [None]:
# Build the lookup used by getAE(): integer code -> text of the second tab-separated field.
filename = 'ae.txt'
aecode = {}
with open(location+filename) as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing empty line) instead of failing to unpack
        # Split on the first tab only, so a description that itself contains a tab is kept whole.
        # A non-blank line with no tab still raises ValueError, which flags a malformed file.
        key, val = line.split('\t', 1)
        aecode[int(key)] = val.strip()

In [None]:
def getAE(search, codes=None): #look up diag code in AE data
    """Return the text stored for an A&E code, or the string 'None' if there is none.

    Parameters
    ----------
    search : int, float or str
        Code to look up; it is converted with ``int()`` before the lookup.
    codes : mapping, optional
        Table keyed by integer code. Defaults to the module-level ``aecode``.

    Returns
    -------
    str
        The mapped value, or the literal string 'None' when ``search`` cannot
        be converted to an integer (missing value, NaN, non-numeric text) or
        is not a key of the table.
    """
    if codes is None:
        codes = aecode
    try:
        return codes[int(search)]
    # Only the expected "no such code" failures are swallowed; anything else
    # (e.g. NameError because the table was never loaded) now surfaces.
    except (KeyError, TypeError, ValueError, OverflowError):
        return 'None'

In [None]:
# Build the lookup used by getSpef(): code (kept as a string) -> text of the second tab-separated field.
filename = 'spefCode.txt'
spefCode = {}
with open(location+filename) as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines (e.g. a trailing empty line) instead of failing to unpack
        # Split on the first tab only, so a description that itself contains a tab is kept whole.
        # A non-blank line with no tab still raises ValueError, which flags a malformed file.
        key, val = line.split('\t', 1)
        spefCode[key] = val.strip()

In [None]:
def getSpef(search, codes=None): #look up speciality codes
    """Return the text stored for a specialty code, or the string 'None' if there is none.

    Parameters
    ----------
    search : hashable
        Code to look up; used as the key exactly as given (no conversion).
    codes : mapping, optional
        Lookup table. Defaults to the module-level ``spefCode``.

    Returns
    -------
    str
        The mapped value, or the literal string 'None' when ``search`` is not
        a key of the table or cannot be used as a key at all.
    """
    if codes is None:
        codes = spefCode
    try:
        return codes[search]
    # Only lookup failures are swallowed (missing key, unhashable key);
    # unrelated errors such as an undefined table now surface.
    except (KeyError, TypeError):
        return 'None'

In [None]:
def getNo(prefix, max):
    """Yield ``prefix`` followed by each serial number from 1 to ``max`` inclusive.

    Numbers below 10 are left-padded with one zero, e.g.
    getNo('OPERTN_', 24) yields 'OPERTN_01' ... 'OPERTN_24'.
    """
    for serial in range(1, max + 1):
        # Minimum width 2, zero-filled: 1 -> '01', 10 -> '10', 100 -> '100'.
        yield prefix + '{:02d}'.format(serial)

### Merge CC with APC on SUSRECID so that critical-care records can be matched to a STUDY_ID

In [None]:
# Left join: every critical-care row is kept, with APC columns attached where SUSRECID matches.
CCmerge = dataCC.merge(dataAPC, how='left', on='SUSRECID')

### Make cancer subsets of the HES data sets

In [None]:
# Keep only the merged critical-care rows whose STUDY_ID appears in the cancer participant list.
cancerCC = CCmerge.loc[CCmerge['STUDY_ID'].isin(cancerIDs['STUDY_ID'])]

In [None]:
# Column-by-column dtype and non-null count for the critical-care cancer subset.
# `show_counts` replaces the `null_counts` keyword, which pandas deprecated and then removed in 2.0.
cancerCC.info(verbose=True, show_counts=True)

In [None]:
# Keep only the A&E rows whose STUDY_ID appears in the cancer participant list.
cancerAE = dataAE.loc[dataAE['STUDY_ID'].isin(cancerIDs['STUDY_ID'])]

In [None]:
# Summary (row count, columns, dtypes, memory) of the A&E cancer subset.
cancerAE.info()

In [None]:
# Number of distinct (non-null) participants with at least one A&E record.
cancerAE['STUDY_ID'].nunique()

In [None]:
# Column-by-column dtype and non-null count for the A&E cancer subset.
# `show_counts` replaces the `null_counts` keyword, which pandas deprecated and then removed in 2.0.
cancerAE.info(verbose=True, show_counts=True)

In [None]:
# Keep only the admitted-patient-care rows whose STUDY_ID appears in the cancer participant list.
cancerAPC = dataAPC.loc[dataAPC['STUDY_ID'].isin(cancerIDs['STUDY_ID'])]

In [None]:
# Summary (row count, columns, dtypes, memory) of the APC cancer subset.
cancerAPC.info()

In [None]:
# Number of distinct (non-null) participants with at least one APC record.
cancerAPC['STUDY_ID'].nunique()

In [None]:
# Column-by-column dtype and non-null count for the APC cancer subset.
# `show_counts` replaces the `null_counts` keyword, which pandas deprecated and then removed in 2.0.
cancerAPC.info(verbose=True, show_counts=True)

In [None]:
# Keep only the outpatient rows whose STUDY_ID appears in the cancer participant list.
cancerOP = dataOP.loc[dataOP['STUDY_ID'].isin(cancerIDs['STUDY_ID'])]

In [None]:
# Column-by-column dtype and non-null count for the outpatient cancer subset.
# `show_counts` replaces the `null_counts` keyword, which pandas deprecated and then removed in 2.0.
cancerOP.info(verbose=True, show_counts=True)

### Add consent dates to the cancer data sets

In [None]:
# Excel export of consent records; the cells below de-duplicate it and cut it down to participant ID + date.
filename = 'cancer_consent_2016-11-28_16-34-24.xlsx'
consentDates = pd.read_excel(location+filename)  #get consent dates

In [None]:
# One row per participant: where an ID occurs more than once, only its first row is kept.
consentDates = consentDates.drop_duplicates(
    subset='Participant Identifiers Id',
    keep='first',
)

In [None]:
# Discard the columns that are not needed downstream (leaves the participant ID and the consent date).
consentDates = consentDates.drop(
    ['Metadata Date', 'Event Date', 'Consent Given Id'],
    axis=1,
)

In [None]:
# Align the key column name with the HES frames so the merges below can join on STUDY_ID.
consentDates = consentDates.rename(columns={'Participant Identifiers Id': 'STUDY_ID'})

In [None]:
# Preview the first rows of the trimmed consent table.
consentDates.head()

In [None]:
# Participants with no consent record; an empty result means every cancer ID has a consent date row.
cancerIDs.loc[~cancerIDs['STUDY_ID'].isin(consentDates['STUDY_ID'])]

In [None]:
# Attach the consent columns to each critical-care row; rows without a match are kept (left join).
cancerCC = cancerCC.merge(consentDates, how='left', on='STUDY_ID')

In [None]:
# Same left join for the remaining three subsets: each row gains the consent columns for its STUDY_ID.
cancerAE = cancerAE.merge(consentDates, how='left', on='STUDY_ID')
cancerAPC = cancerAPC.merge(consentDates, how='left', on='STUDY_ID')
cancerOP = cancerOP.merge(consentDates, how='left', on='STUDY_ID')

### Look for radiotherapy using Specialty codes 370 and 800 in APC and OP

In [None]:
def convertSpef(spef):  #some spef codes are integers or floats- convert to strings
    """Normalise a specialty code to its string form.

    Parameters
    ----------
    spef : int, float, str or other
        Code as read from the data; may be a Python or numpy numeric scalar.

    Returns
    -------
    str or object
        Integers become their decimal string (370 -> '370'); finite floats are
        truncated to an integer first (370.0 -> '370'). Everything else --
        strings, None, booleans, NaN and infinities -- is returned unchanged.
    """
    if isinstance(spef, bool):
        return spef  # bool is a subclass of int; leave it untouched
    if isinstance(spef, (int, np.integer)):
        return str(int(spef))
    if isinstance(spef, (float, np.floating)):
        # A missing code arrives as NaN; int(NaN) raises ValueError, so pass
        # non-finite values through instead of crashing the whole .apply().
        if not np.isfinite(spef):
            return spef
        return str(int(spef))
    return spef

In [None]:
# Codes passed to isin() on MAINSPEF/TRETSPEF below; each code is listed as both str and int
# because those columns hold a mixture of the two.
spefList = ['370', '800', 370, 800]  #list of speciality codes to indicate radiotherapy. Both int and string values are present

In [None]:
# APC rows where either specialty column carries one of the listed codes
# AND the admission date is strictly before the participant's consent date.
oncologyAPCPatients = cancerAPC[
    (cancerAPC['MAINSPEF'].isin(spefList) | cancerAPC['TRETSPEF'].isin(spefList))
    & (pd.to_datetime(cancerAPC['ADMIDATE']) < cancerAPC['Date Of Consent'])
]

In [None]:
# Number of distinct participants among the selected APC rows (a missing ID would count as one value).
oncologyAPCPatients['STUDY_ID'].nunique(dropna=False)

In [None]:
# Outpatient rows where either specialty column carries one of the listed codes
# AND the appointment date is strictly before the participant's consent date.
oncologyOPPatients = cancerOP[
    (cancerOP['MAINSPEF'].isin(spefList) | cancerOP['TRETSPEF'].isin(spefList))
    & (pd.to_datetime(cancerOP['APPTDATE']) < cancerOP['Date Of Consent'])
]

In [None]:
# Number of distinct participants among the selected outpatient rows (a missing ID would count as one value).
oncologyOPPatients['STUDY_ID'].nunique(dropna=False)

In [None]:
# Columns to carry forward; each frame keeps whichever of these it actually has.
columnVals = ['STUDY_ID', 'ADMIDATE', 'MAINSPEF', 'TRETSPEF', 'APPTDATE', 'Date Of Consent']
# Trim each frame to those columns and give its date field the common name 'DATE'.
APCsubset = (
    oncologyAPCPatients
    .loc[:, oncologyAPCPatients.columns.isin(columnVals)]
    .rename(columns={'ADMIDATE': 'DATE'})
)
OPsubset = (
    oncologyOPPatients
    .loc[:, oncologyOPPatients.columns.isin(columnVals)]
    .rename(columns={'APPTDATE': 'DATE'})
)

In [None]:
# Stack the APC rows on top of the OP rows; the original row indexes are kept, so index labels may repeat.
oncologyHES = pd.concat([APCsubset, OPsubset])  #make a single subset of participants

In [None]:
# Inspect the distinct TRETSPEF values before they are normalised to strings in the next cell.
oncologyHES['TRETSPEF'].value_counts().index

In [None]:
# Run both specialty columns through convertSpef so that groupby/value_counts see one type per code.
oncologyHES = oncologyHES.assign(
    MAINSPEF=lambda frame: frame['MAINSPEF'].apply(convertSpef),
    TRETSPEF=lambda frame: frame['TRETSPEF'].apply(convertSpef),
)

In [None]:
# Distinct participants across APC and OP combined.
oncologyHES['STUDY_ID'].nunique(dropna=False)  #189 participants have an oncology specialty history

In [None]:
def getNewID(ID, IDDict):
    """Return the replacement ID that ``IDDict`` holds for ``ID`` (used for plotting).

    The lookup goes through ``IDDict.get``, so an ``ID`` with no entry gives None.
    """
    new_id = IDDict.get(ID)
    return new_id

In [None]:
# Map each STUDY_ID to a small sequential number (1, 2, 3, ...) in sorted STUDY_ID order.
IDs = list(np.sort(oncologyHES['STUDY_ID'].unique()))
# The range must run to len(IDs) inclusive: stopping at len(IDs) leaves one ID too few, zip() then
# silently drops the last participant, who gets no new ID and disappears from the plot.
newIDs = list(range(1, len(IDs) + 1))
zipped = zip(IDs, newIDs)
IDDict = dict(zip(IDs, newIDs))   #create a dictionary mapping STUDY_ID to a new smaller ID
assert len(IDDict) == len(IDs), 'every STUDY_ID must receive a new ID'

In [None]:
# New 'ID' column: the small sequential ID looked up for each row's STUDY_ID.
oncologyHES['ID'] = oncologyHES['STUDY_ID'].apply(lambda study_id: getNewID(study_id, IDDict))

In [None]:
# Time-series scatter: y is the participant's 'ID', x is the record's 'DATE' parsed to datetime.
# Groups whose key is '370' or '800' are drawn as red circles; all other groups as blue dots.
groups1 = oncologyHES.groupby('MAINSPEF')
groups2 = oncologyHES.groupby('TRETSPEF')
fig, ax = plt.subplots()
fig.set_size_inches(15, 10)
ax.set_ylabel('Patient number')
# TRETSPEF groups are drawn first, then MAINSPEF groups.
for name, group in list(groups2) + list(groups1):
    highlight = name in ['370', '800']
    ax.plot(
        pd.to_datetime(group['DATE']),
        group['ID'],
        label=name,
        linestyle='none',
        marker='o' if highlight else '.',
        color='r' if highlight else 'b',
        alpha=1,
    )
#ax.legend(loc='best')

plt.show()

In [None]:
# Frequency of each MAINSPEF code, most frequent first, as a one-column DataFrame indexed by code.
mainSpef = (
    oncologyHES['MAINSPEF']
    .value_counts()
    .sort_values(ascending=False)
    .to_frame()
)

In [None]:
# Add the looked-up text for each code in the index; codes that getSpef cannot find get the string 'None'.
mainSpef['Specialty'] = [getSpef(code) for code in mainSpef.index]

In [None]:
# Display the MAINSPEF frequency table built above.
mainSpef  #these are the specialties with frequencies

In [None]:
# Same frequency table for TRETSPEF: counts per code, most frequent first, plus the looked-up text.
tretSpef = (
    oncologyHES['TRETSPEF']
    .value_counts()
    .sort_values(ascending=False)
    .to_frame()
)
tretSpef['Specialty'] = [getSpef(code) for code in tretSpef.index]

In [None]:
# Display the TRETSPEF frequency table built above.
tretSpef

In [None]:
# One row per distinct STUDY_ID, taken from the index of value_counts() (i.e. ordered by record count).
oncPatients = pd.DataFrame(oncologyHES.STUDY_ID.value_counts().index)

In [None]:
# Written to the current working directory (relative path), not to `location`; the row index is omitted.
oncPatients.to_excel('oncologyHES.xlsx', index=False)  #save a list of the oncology patients