In [1]:
#!pip install icd10-cm

In [1]:
import pandas as pd
import numpy as np
import re
import warnings
import datetime as dt
import icd10
# NOTE(review): this silences every warning for the rest of the notebook, which may
# hide pandas warnings raised by the cells below — confirm this is intended.
warnings.filterwarnings('ignore')
# Relative directory prefix prepended to every parquet file name read or written below.
path="../../Data/"

'''
Diseases to map
'''


# Reference date (year, month, day) used below when computing 'Age_Today' and 'Age_disease'.
year=2021
month=1
day=1

# File name (under `path`) of the parquet table loaded into `inpatient_update`.
inpatient_update_file='ukb_ICD10.parquet'


def returndesc(string):
    '''
    Look up an ICD-10 code and return its description.

    The argument is converted with ``str`` and passed to ``icd10.find``.
    When the lookup yields a truthy result its ``description`` is returned;
    otherwise the original ``string`` argument is handed back unchanged.
    '''
    match = icd10.find(str(string))
    return match.description if match else string

def returndescblock(string):
    '''
    Look up an ICD-10 code and return the description of its block.

    The argument is converted with ``str`` and passed to ``icd10.find``.

    Returns
    -------
    str or None
        ``str(code.block_description)`` when the code is found, ``None``
        when the lookup returns nothing.
    '''
    code = icd10.find(str(string))
    # An unknown code is the only expected failure; handle it explicitly
    # instead of a bare ``except`` that would also swallow KeyboardInterrupt
    # and genuine programming errors.
    if not code:
        return None
    return str(code.block_description)

In [2]:
def findcols(df,string):
    '''
    Return, in iteration order, the items of ``df`` that contain ``string``.

    Iterating a DataFrame yields its column labels, so for a DataFrame this
    is the list of column names that include the given substring.
    '''
    matches = []
    for name in df:
        if string in name:
            matches.append(name)
    return matches

In [3]:
# Read the parquet table located at `path` + `inpatient_update_file`.
inpatient_update = pd.read_parquet(path + inpatient_update_file)

In [21]:
# NOTE(review): this cell's execution count (21) is later than that of the next cell (20),
# which is the one that assigns 'Age_Today'. On a fresh top-to-bottom run this lookup
# comes before that assignment, so it only works if the loaded file already has the column.
inpatient_update['Age_Today']

0         72.442847
1         54.357974
2         74.490760
3         76.931554
4         63.162218
            ...    
502443    80.468172
502444    74.169062
502445    52.683094
502446    54.029432
502447          NaN
Name: Age_Today, Length: 502448, dtype: float64

In [20]:
# Age on the reference date = age recorded at the assessment visit plus the
# years (days / 365.25) elapsed between that visit and the reference date.
reference_date = dt.datetime(year, month, day)
assessment_date = pd.to_datetime(
    inpatient_update['date_of_attending_assessment_centre_f53_0_0']
)
inpatient_update['Age_Today'] = (
    inpatient_update['age_when_attended_assessment_centre_f21003_0_0']
    + (reference_date - assessment_date).dt.days / 365.25
)

In [22]:

# Store the assessment date as datetime and the participant id as a string.
assess_col = 'date_of_attending_assessment_centre_f53_0_0'
inpatient_update[assess_col] = pd.to_datetime(inpatient_update[assess_col])
inpatient_update['eid'] = inpatient_update['eid'].astype(str)

In [23]:
%%time
# Build one row per (eid, diagnosis slot) with the diagnosis code, its date,
# the participant's age at that date and the ICD-10 name / block of the code.

# Diagnosis columns (name contains '41270') and their date columns (name
# contains '41280'); 'eid' is kept in both selections as the join key.
cols1 = [col for col in inpatient_update.columns if '41270' in col or 'eid' in col]
cols2 = [col for col in inpatient_update.columns if '41280' in col or 'eid' in col]

# The original cell removed every "b" and every "'" from the values, which
# suggests they are stored as the text of a bytes literal (b'...') — TODO
# confirm. Strip only that leading b' and trailing ' so that a 'b' or quote
# inside a value is never altered.
bytes_wrapper = r"^b'|'$"

# Wide -> long for the diagnosis codes, dropping empty slots.
df_dis = pd.melt(inpatient_update[cols1], id_vars='eid',
                 var_name='variable', value_name='disease')
df_dis = df_dis[df_dis['disease'].notna()].copy()
df_dis['disease'] = df_dis['disease'].str.replace(bytes_wrapper, '', regex=True)
# Reduce the column name to its field suffix so it can be matched with the date table.
df_dis['variable'] = df_dis['variable'].str.replace('diagnoses_icd10_', '', regex=False)

# Wide -> long for the diagnosis dates, dropping empty slots.
df_date = pd.melt(inpatient_update[cols2], id_vars='eid',
                  var_name='variable', value_name='dis_date')
df_date = df_date[df_date['dis_date'].notna()].copy()
# Rewrite the date field id to the diagnosis field id and strip the prefix,
# so both long tables share the same 'variable' values.
df_date['variable'] = (
    df_date['variable']
    .str.replace('41280', '41270', regex=False)
    .str.replace('date_of_first_inpatient_diagnosis_icd10_', '', regex=False)
)
df_date['dis_date'] = pd.to_datetime(
    df_date['dis_date'].str.replace(bytes_wrapper, '', regex=True)
)

# Attach each diagnosis to its date (diagnoses without a date are kept).
df_dis_date = pd.merge(df_dis, df_date, on=['eid', 'variable'], how='left')

# Add per-participant attributes; join key and join type are spelled out
# (they were previously the implicit defaults: shared column 'eid', inner).
df_dis_date = pd.merge(
    df_dis_date,
    inpatient_update[['eid', 'Age_Today', 'date_of_attending_assessment_centre_f53_0_0']],
    on='eid',
    how='inner',
)

# Age at diagnosis = age on the reference date minus the years between the
# diagnosis date and the reference date.
df_dis_date['Age_disease'] = (
    df_dis_date['Age_Today']
    - (dt.datetime(year, month, day) - df_dis_date['dis_date']).dt.days / 365.25
)

# Look each distinct code up once and broadcast the result, instead of
# repeating the same lookup for every row.
unique_codes = df_dis_date['disease'].unique()
df_dis_date['disease_name'] = df_dis_date['disease'].map(
    {code: returndesc(code) for code in unique_codes}
)
df_dis_date['disease_block'] = df_dis_date['disease'].map(
    {code: returndescblock(code) for code in unique_codes}
)

CPU times: user 1min 58s, sys: 13.6 s, total: 2min 11s
Wall time: 2min 18s


In [24]:
# Shorter names for the assessment date and the diagnosis date columns.
df_dis_date = df_dis_date.rename(
    columns={
        'date_of_attending_assessment_centre_f53_0_0': 'date_assess',
        'dis_date': 'disease_date',
    }
)

In [25]:
# Flag each diagnosis by when it occurred relative to the assessment date.
# Rows with a missing disease date compare as False and therefore get 0 in every flag.
window_start = df_dis_date['date_assess'] + pd.offsets.DateOffset(years=2)
window_end = df_dis_date['date_assess'] + pd.offsets.DateOffset(years=10)

# Diagnosed before the assessment visit.
mask_bef = df_dis_date['disease_date'] < df_dis_date['date_assess']

# Diagnosed between 2 and 10 years (inclusive) after the assessment visit.
mask_aft = (
    (df_dis_date['disease_date'] >= window_start)
    & (df_dis_date['disease_date'] <= window_end)
)

# Diagnosed more than 10 years after the assessment visit.
mask_10y = df_dis_date['disease_date'] > window_end

# Build the 0/1 columns directly from the masks. The previous chained
# assignment (df[col][mask] = 1) writes through a temporary object, which
# pandas does not guarantee to propagate and which is a no-op under
# Copy-on-Write.
df_dis_date['dis_bef'] = mask_bef.astype('int64')
df_dis_date['dis_aft'] = mask_aft.astype('int64')
df_dis_date['dis_exc'] = (mask_10y | mask_bef).astype('int64')

# Per-code totals of the "before" and "after" flags, repeated on every row of that code.
df_dis_date['total_bef'] = df_dis_date.groupby('disease')['dis_bef'].transform('sum')
df_dis_date['total_aft'] = df_dis_date.groupby('disease')['dis_aft'].transform('sum')

In [26]:
# Number of distinct participants with at least one diagnosis whose name contains 'dementia'.
# NOTE(review): the match is case-sensitive, so names containing 'Dementia' are not counted — confirm intended.
has_dementia = df_dis_date['disease_name'].str.contains('dementia')
df_dis_date.loc[has_dementia, 'eid'].nunique()

3429

In [27]:
# Number of distinct 'eid' values in the loaded table.
inpatient_update['eid'].nunique()

502448

In [28]:
# Keep only rows whose code has more than 200 "before assessment" diagnoses overall.
frequent_enough = df_dis_date['total_bef'].gt(200)
df_dis_date = df_dis_date.loc[frequent_enough]

In [29]:
# Persist the filtered long-format diagnosis table under `path`.
df_dis_date.to_parquet(f'{path}df_dis_date.parquet')

In [None]:
# Preview of the filtered table. The previous expression, df_dis_date[()],
# looked up an empty-tuple column label and raised KeyError, which stopped
# a Restart-and-Run-All at this cell.
df_dis_date.head()

In [31]:
%%time
# One row per eid, one column per disease name, holding the sum of 'dis_bef'
# for that pair (NaN where the participant has no row for that name).
dis_ohe_icd10 = (
    df_dis_date
    .groupby(['eid', 'disease_name'])['dis_bef']
    .sum()
    .unstack('disease_name')
    .reset_index()
)

CPU times: user 8.68 s, sys: 12.3 s, total: 21 s
Wall time: 25 s


In [32]:
# Left-join onto every eid of the loaded table so participants without any
# diagnosis row are present, then replace the resulting gaps with 0.
dis_ohe_icd10 = pd.merge(
    inpatient_update['eid'], dis_ohe_icd10, how='left', on='eid'
).fillna(0)

In [33]:
 #dis_ohe_icd10['total_dis']=dis_ohe_icd10[[col for col in dis_ohe_icd10.columns]].sum(axis=1)

In [34]:
# Per-eid sum of the 'dis_bef' flag, as a two-column frame ('eid', 'total_dis').
totaldis = (
    df_dis_date
    .groupby('eid')['dis_bef']
    .sum()
    .rename('total_dis')
    .reset_index()
)

# One row per eid, one column per disease block, holding the maximum of
# 'dis_bef' within that block; missing combinations become 0.
disblock = (
    df_dis_date
    .groupby(['eid', 'disease_block'])['dis_bef']
    .max()
    .unstack('disease_block')
    .reset_index()
    .fillna(0)
)

In [35]:
# Display the per-eid 'total_dis' table built in the previous cell.
totaldis

Unnamed: 0,eid,total_dis
0,1000014,1
1,1000023,0
2,1000030,1
3,1000041,5
4,1000062,9
...,...,...
432690,6025185,7
432691,6025194,5
432692,6025203,1
432693,6025211,1


In [36]:
# Append the per-block columns and the per-eid total to the per-name table;
# participants missing from either side get 0.
dis_ohe_icd10 = (
    dis_ohe_icd10
    .merge(disblock, how='left', on='eid')
    .merge(totaldis, how='left', on='eid')
    .fillna(0)
)

In [37]:
# List the columns of the merged table whose name contains 'total_dis'.
findcols(dis_ohe_icd10,'total_dis')

['total_dis']

In [38]:
# Persist the wide per-participant table under `path`.
dis_ohe_icd10.to_parquet(f'{path}dis_ohe_icd10_20210924.parquet')