In [1]:
import pandas as pd
import cudf
import pyreadr
import numpy as np
import torch
import pickle
from os import listdir
from os.path import isfile, join


In [2]:
clinical_specific_readcode = pd.read_csv('../FinalData/pivotClinicalCodesbyCountry_specific.csv')
therapy_specific_readcode = pd.read_csv('../FinalData/pivotTherapyCodesbyCountry_specific.csv')
read2term = pd.read_csv('../FinalData/Read2Term.csv')
term2desc = pd.read_csv('../FinalData/Term2Desc.csv')

In [3]:
read2desc = read2term[['CC', 'TERMID']].merge(term2desc[['TERMID', 'TERM30', 'TERM60']], on='TERMID', how='left')

In [4]:
read2desc.to_csv('../FinalData/Read2Desc.csv', index_label=False, index=False)

In [19]:
clinical = pd.read_feather('../SeqModel/all_data_clinical_specific.feather')
therapy = pd.read_feather('../SeqModel/all_data_therapy_specific.feather')

In [20]:
code2idx_clin = pickle.load(open('../SeqModel/all_vocab_clinical_specific.sav', 'rb'))
code2idx_ther = pickle.load(open('../SeqModel/all_vocab_therapy_specific.sav', 'rb'))

In [21]:
clinical_vocab = pd.DataFrame(code2idx_clin, columns=['CC'])
clinical_vocab = clinical_vocab.merge(read2desc, on='CC', how='left')
clinical_vocab.drop_duplicates(subset='CC')
clinical_vocab = clinical_vocab.dropna()
clinical_vocab.reset_index(inplace=True, drop=True)

In [22]:
clinical_vocab

Unnamed: 0,CC,TERMID,TERM30,TERM60
0,663K.,Y7A02,Airways obstructn irreversible,Airways obstructn irreversible
1,ZV725,Ya0oq,[V]Radiological examinatn. NEC,[V]Radiological examination NEC
2,ZV725,Ya0op,[V]Routine chest X-ray,[V]Routine chest X-ray
3,X506J,Y50GT,Disseminated secondary eczema,Disseminated secondary eczema
4,j2...,y02vt,Non-steroid anti-inflam. drug,Non-steroidal anti-inflammatory drug
...,...,...,...,...
1775,XaIIW,Yakgo,Asth A&E attend since last vis,Asthma accident and emergency attendance since...
1776,H22yz,Y101v,Pneumonia due to bacteria NOS,Pneumonia due to bacteria NOS
1777,XaFwz,Yah7Z,Asian - ethnic group,Asian - ethnic group
1778,X00lA,Y02SF,Perennial allergic rhinitis,Perennial allergic rhinitis


In [23]:
#vocab and code2idx generation
# vocab_all = all_raw_data.code_id.unique().tolist()
vocab_all = code2idx_clin
vocab_all = list(set(vocab_all))
idx_all = range(1, len(vocab_all)+1)

code2idx_all = dict(zip(vocab_all, idx_all))
idx2code_all = dict(zip(idx_all, vocab_all))

code2idx_all['PAD'] = 0
# code2idx_all['start_visit'] = 1
# code2idx_all['end_visit'] = 2
idx2code_all[0] = 'PAD'
# idx2code_all[1] = 'start_visit'
# idx2code_all[2] = 'end_visit'
VOCAB_SIZE = len(code2idx_all)
print('code2idx Size: {}'.format(len(code2idx_all)))
print('idx2code Size: {}'.format(len(idx2code_all)))

code2idx Size: 1316
idx2code Size: 1316


In [24]:
clinical['read_code_seq'] = clinical['read_code_seq_padded_idx'].apply(lambda x: [idx2code_all.get(key) for key in x])

In [25]:
def getDescfromCode(key):
    desc = read2desc[read2desc.CC == key]['TERM60']
    if desc.shape[0] > 0:
        return desc.values[0]

In [26]:
clinical[:2]['read_code_seq'].apply(lambda x: [getDescfromCode(key) for key in x])

0    [Recurrent bronchiectasis, [D]Respiratory symp...
1    [[D]Respiratory symptom, unspecified, Lung fun...
Name: read_code_seq, dtype: object

In [27]:
clinical[:2]['read_code_seq']

0    [H340., R0600, c13G., M1610, XaFwH, e927., XaL...
1    [R0600, 337Z., Ua1qg, c13G., XaMGo, XaLMx, X50...
Name: read_code_seq, dtype: object

In [None]:
clinical_specific_readcode = clinical_specific_readcode.merge(read2term[['CC', 'TERMID']], left_on='code_id', right_on='CC', how='left')
clinical_specific_readcode = clinical_specific_readcode.merge(term2desc[['TERMID', 'TERM30']], on = 'TERMID', how='left')

In [None]:
clinical_specific_readcode[clinical_specific_readcode['%England'] == 100]

In [None]:
clinical = pd.read_feather('../SeqModel/all_data_clinical.feather')

In [None]:
clinical.patid.unique().shape

In [None]:
#patientData
patient = pyreadr.read_r('../ServerData_13Oct2020/d_patient_overall.Rdata')
practice = pyreadr.read_r('../ServerData_13Oct2020/d_practice.Rdata')
patient = patient['d_patient_overall']
practice = practice['d_practice']
patient = patient[['patid', 'practice_id']].merge(practice[['practice_id', 'Country']], how='left', on='practice_id')


#load clinical information
path = '../ServerData_13Oct2020/'
clinical_files = [join(path, f) for f in listdir(path) if (isfile(join(path, f))) & ('f_clinical_part' in f)]
clinical = pyreadr.read_r('../ServerData_13Oct2020/f_clinical_part1.Rdata')
clinical = clinical['f_clinical_part']
clinical = clinical.dropna(subset=['code_id'])
clinical['event_date'] = pd.to_datetime(clinical['event_date'])
clinical = clinical.loc[(clinical['event_date'] >= '2016-01-01') & (clinical['event_date'] < '2018-01-01')]
# clinical = clinical[clinical.code_id.isin(target_readcode.readcodes.values)]
clinical = clinical.merge(patient[['patid', 'Country']], on='patid', how='left')
clinical = clinical[['patid', 'code_id', 'Country']]

In [None]:
clinical = pd.pivot_table(data=clinical, values='patid', index='code_id', columns='Country', aggfunc=pd.Series.nunique)
clinical = clinical[['England','Scotland','Wales']]
clinical = clinical.fillna(0)
clinical['%England'] = clinical.apply(lambda x: x.England/sum([x.England, x.Scotland, x.Wales])*100, axis=1)
clinical['%Scotland'] = clinical.apply(lambda x: x.Scotland/sum([x.England, x.Scotland, x.Wales])*100, axis=1)
clinical['%Wales'] = clinical.apply(lambda x: x.Wales/sum([x.England, x.Scotland, x.Wales])*100, axis=1)
clinical['%patient'] = clinical.apply(lambda x: (sum([x.England, x.Scotland, x.Wales])/675260)*100, axis=1)
clinical.sort_values('%England', ascending=False)


In [None]:
clinical.sort_values('%patient', ascending=False)