In [None]:
import os
import xml.etree.ElementTree as ET

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from cycler import cycler
from matplotlib_venn import venn2
from matplotlib_venn import venn3
%matplotlib inline

In [None]:
# Directory holding the HES return files. Set the HES_DATA_DIR environment
# variable to point at another copy of the data; the default is the original
# author's location. os.path.join(..., '') guarantees a trailing separator
# because later cells build paths with plain `location+filename`.
location = os.path.join(
    os.environ.get(
        'HES_DATA_DIR',
        'C:/Users/Andrew Devereau/Documents/GeL/secondary data/Data applications/HES return October 2016/',
    ),
    '',
)
filename = 'NIC12784_AE.txt'  # A&E extract, read in the next cell

In [None]:
dataAE = pd.read_csv(location+filename, sep='|')  #this is the A&E data set

In [None]:
filename = 'NIC12784_CC.txt'  #Critical care dataset
dataCC = pd.read_csv(location+filename, sep='|')

In [None]:
filename = 'NIC12784_APC.txt'   #this is the admitted patient care dataset
dataAPC = pd.read_csv(location+filename, sep='|')

In [None]:
filename = 'NIC12784_OP.txt'   #this is the outpatient dataset
dataOP = pd.read_csv(location+filename, sep='|')

In [None]:
filename = 'Cancer.csv'
# Cancer participants that were sent to HES. The ID column is renamed to
# STUDY_ID so it matches the column name used in the returned data sets.
cancerIDs = (
    pd.read_csv(location + filename, delimiter=',')
    .rename(columns={'Participant Id': 'STUDY_ID'})
)

In [None]:
# ICD-10 code list, used by getICD() to convert codes to names.
filename = 'Tabular.xml'
ICDtree = ET.parse(source=location + filename)
ICDroot = ICDtree.getroot()

In [None]:
def getICD(search, root=None):
    """Look up the ICD-10 disease name for a code.

    Parameters
    ----------
    search : str
        Code to look for; compared for equality with the text of each
        ``<diag>`` element's ``<name>`` child.
    root : xml.etree.ElementTree.Element, optional
        Tree to search. Defaults to the notebook-level ``ICDroot``.

    Returns
    -------
    '<code> <description>' when the code is found and has a description.
    Otherwise `search` is returned unchanged, so unknown codes keep a
    distinct, readable label instead of turning into None when this
    function is used to rename an index.
    """
    if root is None:
        root = ICDroot
    for diag in root.iter('diag'):
        if diag.findtext('name') == search:
            desc = diag.findtext('desc')
            # A <diag> with a missing or empty <desc> falls back to the bare code.
            if not desc:
                return search
            return search + ' ' + desc
    return search

In [None]:
# Build the A&E diagnosis lookup {integer code: description} from a
# tab-separated file with one "code<TAB>description" entry per line.
filename = 'ae.txt'
aecode = {}
with open(location + filename) as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline at the end of the file),
        # which would otherwise fail to unpack into (key, val).
        if not line.strip():
            continue
        # Split on the first tab only so a description containing a tab is kept whole.
        key, val = line.split('\t', 1)
        aecode[int(key)] = val.strip()

In [None]:
def getAE(search, codes=None):
    """Look up the description of an A&E diagnosis code.

    Parameters
    ----------
    search : value convertible with int()
        The diagnosis code from the A&E data.
    codes : dict, optional
        Mapping of integer code to description. Defaults to the
        notebook-level ``aecode`` table.

    Returns
    -------
    The description, or the string 'None' when `search` cannot be converted
    to an integer (missing/NaN/non-numeric) or is not in the table.
    """
    if codes is None:
        codes = aecode
    try:
        return codes[int(search)]
    # Only lookup/conversion failures mean "no description"; anything else
    # (e.g. the table never having been loaded) should surface as an error.
    except (KeyError, TypeError, ValueError, OverflowError):
        return 'None'

In [None]:
CCmerge = pd.merge(dataCC, dataAPC, on='SUSRECID', how = 'left')   #merge CC with APC to match study id with susrecid

In [None]:
cancerCC = CCmerge[(CCmerge['STUDY_ID'].isin(cancerIDs['STUDY_ID']))] 

In [None]:
cancerCC.info()

In [None]:
cancerAE = dataAE[(dataAE['STUDY_ID'].isin(cancerIDs['STUDY_ID']))] 

In [None]:
cancerAE.info()

In [None]:
len(cancerAE['STUDY_ID'].value_counts())

In [None]:
cancerAPC = dataAPC[(dataAPC['STUDY_ID'].isin(cancerIDs['STUDY_ID']))]   #get cancer patients from APC results

In [None]:
cancerAPC.info()

In [None]:
len(cancerAPC['STUDY_ID'].value_counts())

In [None]:
cancerOP = dataOP[(dataOP['STUDY_ID'].isin(cancerIDs['STUDY_ID']))]   #get cancer patients from OP results

In [None]:
cancerOP.info()

In [None]:
len(cancerOP['STUDY_ID'].value_counts())

This is to check the gender of the cancer participants using the HES data. 

In [None]:
grouped = cancerAPC.groupby(['STUDY_ID'])   #group by participant
patientSexAPC = grouped['SEX'].mean()    #take a mean of all the SEX codes - inconsistency causes a fractional value
APCSex = pd.DataFrame(patientSexAPC)   #turn the grouped SEX data into a dataframe

In [None]:
grouped = cancerOP.groupby(['STUDY_ID'])
patientSexOP = grouped['SEX'].mean()
OPSex = pd.DataFrame(patientSexOP)

In [None]:
grouped = cancerAE.groupby(['STUDY_ID'])
patientSexAE = grouped['SEX'].mean()
AESex = pd.DataFrame(patientSexAE)

In [None]:
Sex = pd.merge(APCSex, OPSex, left_index=True, right_index=True, how='outer')  #merge all the dataframes

In [None]:
Sex = pd.merge(Sex, AESex, left_index=True, right_index=True, how='outer')

In [None]:
Sex['Mean'] = Sex[['SEX_x', 'SEX_y', 'SEX']].mean(axis=1)     #add a new column which is a mean of all the mean SEX scores

In [None]:
Sex.to_excel('sex.xlsx')  #sent to excel

Check the gender of the RD patients

In [None]:
grouped = dataAPC.groupby(['STUDY_ID'])
patientSexAPC = grouped['SEX'].mean()
APCSex = pd.DataFrame(patientSexAPC)

In [None]:
grouped = dataOP.groupby(['STUDY_ID'])
patientSexOP = grouped['SEX'].mean()
OPSex = pd.DataFrame(patientSexOP)

In [None]:
grouped = dataAE.groupby(['STUDY_ID'])
patientSexAE = grouped['SEX'].mean()
AESex = pd.DataFrame(patientSexAE)

In [None]:
Sex = pd.merge(APCSex, OPSex, left_index=True, right_index=True, how='outer')

In [None]:
Sex = pd.merge(Sex, AESex, left_index=True, right_index=True, how='outer')

In [None]:
Sex['Mean'] = Sex[['SEX_x', 'SEX_y', 'SEX']].mean(axis=1)

In [None]:
# Restrict the sex table to rare-disease (RD) participants in two steps.
SexRD = Sex[~(Sex.index.isin(cancerIDs['STUDY_ID']))]  #remove cancer patients
# Filter the already-filtered SexRD (not Sex): filtering Sex again would
# discard the cancer-patient exclusion made on the line above.
SexRD = SexRD[(SexRD.index > 10000)]  #remove the temporary pilot IDs which are less than 10000 - study IDs are 9 figures long

In [None]:
odds = SexRD[((SexRD['Mean'] % 1) != 0)]   #find those with disagreeing sex values

In [None]:
odds

In [None]:
SexRD.to_excel('SexRD.xlsx')

In [None]:
cancerAEID = set(cancerAE['STUDY_ID'])   #make sets of the study_ids to find out how many patients are included

In [None]:
cancerAPCID = set(cancerAPC['STUDY_ID'])

In [None]:
cancerOPID = set(cancerOP['STUDY_ID'])

In [None]:
cancerHES = set(cancerIDs['STUDY_ID'])  #these are the IDs sent to HES

In [None]:
cancerCCID = set(cancerCC['STUDY_ID'])   #there are 172 unique cancer participants in the CC set

In [None]:
len(cancerHES)  #this is the number of submissions to HES

In [None]:
venn3([cancerAPCID, cancerCCID, cancerAEID], ('APC', 'CC', 'A&E'))

In [None]:
venn3([cancerOPID, cancerCCID, cancerAEID], ('OP', 'CC', 'A&E'))

In [None]:
overallSet = cancerAEID.union(cancerAPCID, cancerOPID, cancerCCID)  #this is the overall union of all HES participant IDs

In [None]:
len(overallSet)  #this is how many participants were returned from HES

The conclusion is that 1020 out of the 1025 participants sent to HES were returned with at least one record in one of the four datasets; most have records in two datasets, and many in three or four.

In [None]:
cancerAE.MATCH_RANK.value_counts().plot(kind='pie')
plt.axis('equal')

In [None]:
cancerAPC.MATCH_RANK.value_counts().plot(kind='pie')
plt.axis('equal')

In [None]:
cancerOP.MATCH_RANK.value_counts().plot(kind='pie')
plt.axis('equal')

In [None]:
sortAE = cancerAE.sort_values(by='ARRIVALDATE')
sortAE['ARRIVALDATE']

In [None]:
APCsort=cancerAPC.sort_values(by='EPIEND')
APCsort['EPIEND']

In [None]:
OPsort = cancerOP.sort_values(by='APPTDATE')
OPsort['APPTDATE']

In [None]:
sortCC = cancerCC.sort_values(by='EPIEND')
sortCC['EPIEND']

In [None]:
fullFrame = pd.DataFrame(pd.concat([cancerAE['STUDY_ID'], cancerAPC['STUDY_ID'], cancerOP['STUDY_ID'], cancerCC['STUDY_ID']], ignore_index=True))

In [None]:
len(fullFrame)  #50006 records for the cancer patients

In [None]:
fullFrame

In [None]:
fullFrame['STUDY_ID'].value_counts()

In [None]:
fullFrame.STUDY_ID.value_counts().plot(kind='hist',bins=[0,1,2,5,10,20,50,100,200,500])
plt.xscale('log')
plt.xlabel('No. of records')

## Create subsets of each data set in order to analyse data in detail

In [None]:
# A&E events reduced to participant, date and primary diagnosis, with a
# parsed DATE column and a label naming the data set they came from.
AE_subset = (
    cancerAE.reindex(columns=['STUDY_ID', 'ARRIVALDATE', 'DIAG_01'])
    .assign(DATE=lambda frame: pd.to_datetime(frame['ARRIVALDATE']))
    .assign(source='A&E')
)

In [None]:
# In-patient (APC) events reduced to participant, episode end date and
# primary diagnosis, with a parsed DATE column and a source label.
APC_subset = (
    cancerAPC.reindex(columns=['STUDY_ID', 'EPIEND', 'DIAG_01'])
    .assign(DATE=lambda frame: pd.to_datetime(frame['EPIEND']))
    # Drop only the rows without a usable date (NaT). A bare dropna() would
    # also silently discard every episode whose DIAG_01 is missing.
    .dropna(subset=['DATE'])
    .assign(source='In patient')
)

In [None]:
# Out-patient events reduced to participant, appointment date and primary
# diagnosis, with a parsed DATE column and a source label.
OP_subset = (
    cancerOP.reindex(columns=['STUDY_ID', 'APPTDATE', 'DIAG_01'])
    .assign(DATE=lambda frame: pd.to_datetime(frame['APPTDATE']))
    # Drop only the rows without a usable date (NaT). A bare dropna() would
    # also silently discard every appointment whose DIAG_01 is missing.
    .dropna(subset=['DATE'])
    .assign(source='Out patient')
)

In [None]:
# Critical care events reduced to participant and episode end date (this
# subset carries no diagnosis column), with a parsed DATE and a source label.
CC_subset = (
    cancerCC.reindex(columns=['STUDY_ID', 'EPIEND'])
    .assign(DATE=lambda frame: pd.to_datetime(frame['EPIEND']))
    # State the intent explicitly: only rows without a usable date are dropped.
    .dropna(subset=['DATE'])
    .assign(source='Critical Care')
)

In [None]:
subsets = pd.concat([APC_subset, AE_subset, OP_subset, CC_subset])   #add all the subsets together

In [None]:
groups = subsets.groupby('source')   #create a timeseries plot grouped by the data source
fig, ax = plt.subplots()
fig.set_size_inches(15, 10)
ax.set_ylim(218000020,218000100)
ax.set_ylabel('Patient number')
ax.set_prop_cycle(cycler('color', ['r', 'y', 'b', 'g']))
for name, group in groups:
    ax.plot(group['DATE'], group['STUDY_ID'], label=name, linestyle='none', marker='o', alpha=1)
ax.legend(loc='best')

plt.show()

## Look at primary diagnosis codes in the data sets - these are by report, not by patient. Critical care does not have a diagnosis code. OP and APC use ICD-10 codes, AE uses its own list of values

In [None]:
cancerDiagFreqAPC = cancerAPC['Diag_3_01'].value_counts()

In [None]:
len(cancerDiagFreqAPC)

In [None]:
cancerDiagFreqAPC = cancerDiagFreqAPC.rename(lambda x: getICD(x))  #replace ICD codes with disease names

In [None]:
cancerDiagFreqAPC[cancerDiagFreqAPC > 25].plot(kind='bar', figsize = (20,5), title = 'APC primary diagnosis frequency for cancer patients')

In [None]:
cancerDiagFreqOP = cancerOP['DIAG_3_01'].value_counts()

In [None]:
cancerDiagFreqOP = cancerDiagFreqOP.rename(lambda x: getICD(x))

In [None]:
cancerDiagFreqOP[(cancerDiagFreqOP < 5000) & (cancerDiagFreqOP > 5)].plot(kind='bar', figsize = (20,5), title = 'OP primary diagnosis frequency for cancer patients')

**Note** that the most frequent value was 'R69 Illness, unspecified' with 36436 reports

In [None]:
cancerDiagFreqAE = cancerAE['DIAG2_01'].value_counts()
cancerDiagFreqAE = cancerDiagFreqAE.rename(lambda x: getAE(x))

In [None]:
cancerDiagFreqAE

In [None]:
cancerDiagFreqAE[cancerDiagFreqAE > 4].plot(kind='bar', figsize = (20,5), title = 'A&E diagnosis frequency for cancer patients')

In [None]:
cancerAPC1ry = cancerAPC.loc[cancerAPC['DIAG_01'].str.contains('C|D1|D2|D3|D4',na=False)]  #get just ICD = C or D1-D4 primarys

In [None]:
cancerOP1ry = cancerOP.loc[cancerOP['DIAG_01'].str.contains('C|D1|D2|D3|D4',na=False)]  #get just ICD = C or D1-D4 primarys

In [None]:
cancerAE1ry = cancerAE.loc[cancerAE['DIAG_01'].str.contains('C|D1|D2|D3|D4',na=False)]  #get just ICD = C or D1-D4 primarys

In [None]:
cancer1ryset = set(cancerAPC1ry['STUDY_ID'].unique()).union(set(cancerOP1ry['STUDY_ID'].unique()), set(cancerAE1ry['STUDY_ID'].unique()))

In [None]:
len(cancer1ryset)  #this is the set of all particpant IDs that have a primary cancer diagnosis

In [None]:
APC1ry = cancerAPC[cancerAPC['STUDY_ID'].isin(cancer1ryset)]['Diag_3_01'].value_counts()

In [None]:
APC1ry[APC1ry > 25].plot(kind='bar', figsize = (20,5), title = 'APC primary diagnosis frequency for cancer patients')

In [None]:
APC1rysubset = cancerAPC[cancerAPC['STUDY_ID'].isin(cancer1ryset)]


In [None]:
len(subsets)

In [None]:
subsets.head()

In [None]:
subsetsCa = subsets[subsets.STUDY_ID.isin(cancer1ryset)]

In [None]:
len(subsetsCa)

In [None]:
subsetCa1 = subsetsCa[subsetsCa['DIAG_01'].str.contains('C|D1|D2|D3|D4',na=False)]

In [None]:
subsetCa2 = subsetsCa[~subsetsCa['DIAG_01'].str.contains('C|D1|D2|D3|D4',na=False)]

In [None]:
# Time-series plot grouped by data source: diamonds are events with a cancer
# primary diagnosis (subsetCa1), plus signs are all other events (subsetCa2).
groups1 = subsetCa1.groupby('source')
groups2 = subsetCa2.groupby('source')
# Fixed colour per source. Relying on the colour cycle alone ties the colour
# to plotting order, so the same source could get a different colour in each
# loop whenever a source is absent from one of the two groupings. The mapping
# follows the cycle order used for the alphabetically sorted sources.
source_colours = {'A&E': 'r', 'Critical Care': 'y', 'In patient': 'b', 'Out patient': 'g'}
fig, ax = plt.subplots()
fig.set_size_inches(15, 10)
ax.set_ylim(218000000,218000200)
ax.set_xlim(pd.Timestamp('2014-01-01'), pd.Timestamp('2016-07-31'))
ax.set_xlabel('Date')
ax.set_ylabel('Patient number')
# The cycle is kept only as a fallback for a source missing from the mapping.
ax.set_prop_cycle(cycler('color', ['r', 'y', 'b', 'g']))
for name, group in groups1:
    # Distinct legend text so the two marker styles can be told apart.
    ax.plot(group['DATE'], group['STUDY_ID'], label=name + ' (cancer diagnosis)',
            color=source_colours.get(name), linestyle='none', marker='D', alpha=1)
for name, group in groups2:
    ax.plot(group['DATE'], group['STUDY_ID'], label=name + ' (other)',
            color=source_colours.get(name), linestyle='none', marker='+', alpha=1)
ax.legend(loc='best')

plt.show()