In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
# Importing tables with necessary columns
tables = {}
tables['patients'] = pd.read_csv('patients.csv', index_col=0).filter(['GENDER'])
tables['allergies'] = pd.read_csv('allergies.csv').filter(['PATIENT', 'CODE', 'DESCRIPTION'])
tables['observations'] = pd.read_csv('observations.csv').filter(['PATIENT', 'CODE', 'DESCRIPTION', 'VALUE', 'UNITS', 'TYPE'])
tables['conditions'] = pd.read_csv('conditions.csv').filter(['PATIENT', 'CODE', 'DESCRIPTION'])
tables['careplans'] = pd.read_csv('careplans.csv').filter(['PATIENT', 'CODE', 'DESCRIPTION', 'REASONCODE', 'REASONDESCRIPTION'])
for k, v in tables.items():
    print('\n', k.upper(), v.shape)
    display(v.head())


 PATIENTS (12352, 1)


Unnamed: 0_level_0,GENDER
Id,Unnamed: 1_level_1
f0f3bc8d-ef38-49ce-a2bd-dfdda982b271,M
067318a4-db8f-447f-8b6e-f2f61e9baaa5,F
ae9efba3-ddc4-43f9-a781-f72019388548,M
199c586f-af16-4091-9998-ee4cfc02ee7a,F
353016ea-a0ff-4154-85bb-1cf8b6cedf20,M



 ALLERGIES (5417, 3)


Unnamed: 0,PATIENT,CODE,DESCRIPTION
0,df6b563d-1ff4-4833-9af8-84431e641e9c,424213003,Allergy to bee venom
1,df6b563d-1ff4-4833-9af8-84431e641e9c,418689008,Allergy to grass pollen
2,df6b563d-1ff4-4833-9af8-84431e641e9c,419263009,Allergy to tree pollen
3,df6b563d-1ff4-4833-9af8-84431e641e9c,417532002,Allergy to fish
4,ff7b040b-aa96-4003-8926-3dac8ca8eb05,91934008,Allergy to nut



 OBSERVATIONS (1590980, 6)


Unnamed: 0,PATIENT,CODE,DESCRIPTION,VALUE,UNITS,TYPE
0,f0f3bc8d-ef38-49ce-a2bd-dfdda982b271,8302-2,Body Height,82.7,cm,numeric
1,f0f3bc8d-ef38-49ce-a2bd-dfdda982b271,72514-3,Pain severity - 0-10 verbal numeric rating [Sc...,2.0,{score},numeric
2,f0f3bc8d-ef38-49ce-a2bd-dfdda982b271,29463-7,Body Weight,12.6,kg,numeric
3,f0f3bc8d-ef38-49ce-a2bd-dfdda982b271,77606-2,Weight-for-length Per age and sex,86.1,%,numeric
4,f0f3bc8d-ef38-49ce-a2bd-dfdda982b271,9843-4,Head Occipital-frontal circumference,46.9,cm,numeric



 CONDITIONS (114544, 3)


Unnamed: 0,PATIENT,CODE,DESCRIPTION
0,f0f3bc8d-ef38-49ce-a2bd-dfdda982b271,65363002,Otitis media
1,f0f3bc8d-ef38-49ce-a2bd-dfdda982b271,65363002,Otitis media
2,f0f3bc8d-ef38-49ce-a2bd-dfdda982b271,386661006,Fever (finding)
3,f0f3bc8d-ef38-49ce-a2bd-dfdda982b271,840544004,Suspected COVID-19
4,f0f3bc8d-ef38-49ce-a2bd-dfdda982b271,840539006,COVID-19



 CAREPLANS (37715, 5)


Unnamed: 0,PATIENT,CODE,DESCRIPTION,REASONCODE,REASONDESCRIPTION
0,f0f3bc8d-ef38-49ce-a2bd-dfdda982b271,736376001,Infectious disease care plan (record artifact),840544004.0,Suspected COVID-19
1,f0f3bc8d-ef38-49ce-a2bd-dfdda982b271,736376001,Infectious disease care plan (record artifact),840539006.0,COVID-19
2,067318a4-db8f-447f-8b6e-f2f61e9baaa5,91251008,Physical therapy procedure,44465007.0,Sprain of ankle
3,067318a4-db8f-447f-8b6e-f2f61e9baaa5,736376001,Infectious disease care plan (record artifact),840544004.0,Suspected COVID-19
4,067318a4-db8f-447f-8b6e-f2f61e9baaa5,736376001,Infectious disease care plan (record artifact),840539006.0,COVID-19


In [3]:
# Transforming tables
encoded_tables = {}

alle_table = tables['allergies']
alle_codes = alle_table['CODE'].unique()
pat_alle = {}
alle_desc = {}
for i in range(alle_table.shape[0]):
    j = alle_table['PATIENT'][i]
    if j not in pat_alle:
        pat_alle[j] = {k:0 for k in alle_codes}
    c = alle_table['CODE'][i]
    pat_alle[j][c] = 1
    if c not in alle_desc:
        alle_desc[c] = alle_table['DESCRIPTION'][i]
encoded_tables['allergies'] = pd.DataFrame(pat_alle).transpose()
alle_desc = list((k, alle_desc[k]) for k in encoded_tables['allergies'].columns)
display(encoded_tables['allergies'].head())

obs_table = tables['observations']
obs_codes = obs_table['CODE'].unique()
pat_obs = {}
obs_desc = {}
for i in range(obs_table.shape[0]):
    j = obs_table['PATIENT'][i]
    if j not in pat_obs:
        pat_obs[j] = {k:-1 for k in obs_codes}
    c = obs_table['CODE'][i]
    pat_obs[j][c] = obs_table['VALUE'][i]
    if c not in obs_desc:
        obs_desc[c] = (obs_table['DESCRIPTION'][i], obs_table['UNITS'][i], obs_table['TYPE'][i])
encoded_tables['observations'] = pd.DataFrame(pat_obs).transpose()
obs_desc = list((k, *obs_desc[k]) for k in encoded_tables['observations'].columns)
display(encoded_tables['observations'].head())

cond_table = tables['conditions']
cond_codes = cond_table['CODE'].unique()
pat_cond = {}
cond_desc = {}
for i in range(cond_table.shape[0]):
    j = cond_table['PATIENT'][i]
    if j not in pat_cond:
        pat_cond[j] = {k:0 for k in cond_codes}
    c = cond_table['CODE'][i]
    pat_cond[j][c] = 1
    if c not in cond_desc:
        cond_desc[c] = cond_table['DESCRIPTION'][i]
encoded_tables['conditions'] = pd.DataFrame(pat_cond).transpose()
cond_desc = list((k, cond_desc[k]) for k in encoded_tables['conditions'].columns)
display(encoded_tables['conditions'].head())

crp_table = tables['careplans']
crp_codes = crp_table['CODE'].unique()
pat_crp = {}
target_desc = {}
for i in range(crp_table.shape[0]):
    j = crp_table['PATIENT'][i]
    if j not in pat_crp:
        pat_crp[j] = {k:0 for k in crp_codes}
    c = crp_table['CODE'][i]
    pat_crp[j][c] = 1
    if c not in target_desc:
        target_desc[c] = crp_table['DESCRIPTION'][i]
encoded_tables['careplans'] = pd.DataFrame(pat_crp).transpose()
target_desc = list((k, target_desc[k]) for k in encoded_tables['careplans'].columns)
display(encoded_tables['careplans'].head())

Unnamed: 0,424213003,418689008,419263009,417532002,91934008,419474003,232347008,300913006,232350006,425525006,91930004,420174000,300916003,714035009,91935009
df6b563d-1ff4-4833-9af8-84431e641e9c,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0
ff7b040b-aa96-4003-8926-3dac8ca8eb05,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
0bccccae-0961-4ee0-896a-d80729b22e6c,0,1,0,0,0,1,1,1,0,0,0,0,0,0,0
034d0449-81a4-498c-88f6-74822b92c7ce,0,1,1,0,0,1,1,0,1,1,1,1,0,0,0
55c70d2a-03ad-4f4d-8599-c967121434f1,1,1,1,0,0,1,1,0,1,0,0,0,1,0,0


Unnamed: 0,8302-2,72514-3,29463-7,77606-2,9843-4,8462-4,8480-6,8867-4,9279-1,72166-2,...,26453-1,30428-7,30385-9,26515-7,42719-5,33037-3,80271-0,71970-8,71972-4,NaN
f0f3bc8d-ef38-49ce-a2bd-dfdda982b271,86.3,4.0,13.5,87.5,47.4,75.0,130.0,164.2,32.9,Never smoker,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
067318a4-db8f-447f-8b6e-f2f61e9baaa5,97.7,2.0,15.8,66.7,48.9,79.0,118.0,172.4,20.0,Never smoker,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
ae9efba3-ddc4-43f9-a781-f72019388548,165.7,1.0,68.2,-1.0,-1.0,96.0,187.0,186.9,32.5,Never smoker,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
199c586f-af16-4091-9998-ee4cfc02ee7a,149.3,2.0,45.3,-1.0,-1.0,89.0,120.0,97.5,22.3,Never smoker,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
353016ea-a0ff-4154-85bb-1cf8b6cedf20,-1.0,-1.0,91.2,-1.0,-1.0,80.0,116.0,51.0,18.6,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


Unnamed: 0,65363002,386661006,840544004,840539006,44465007,49727002,248595008,267060006,43878008,59621000,...,403192003,239720000,47505003,707577004,190905008,427089005,60951000119105,698423002,65275009,235919008
f0f3bc8d-ef38-49ce-a2bd-dfdda982b271,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
067318a4-db8f-447f-8b6e-f2f61e9baaa5,0,1,1,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
ae9efba3-ddc4-43f9-a781-f72019388548,0,1,1,1,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
199c586f-af16-4091-9998-ee4cfc02ee7a,0,1,1,1,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
353016ea-a0ff-4154-85bb-1cf8b6cedf20,0,1,1,1,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,736376001,91251008,443402002,698360004,385691007,134435003,384758001,412776001,699728000,734163000,...,395082007,781831000000109,869761000000107,718347000,737434004,736254008,736690008,183401008,703040004,133899007
f0f3bc8d-ef38-49ce-a2bd-dfdda982b271,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
067318a4-db8f-447f-8b6e-f2f61e9baaa5,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ae9efba3-ddc4-43f9-a781-f72019388548,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
199c586f-af16-4091-9998-ee4cfc02ee7a,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
353016ea-a0ff-4154-85bb-1cf8b6cedf20,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Joining tables
df = pd.merge(tables['patients'], encoded_tables['allergies'], left_index=True, right_index=True, how='outer')
df.fillna(0, inplace=True)
df = pd.merge(df, encoded_tables['observations'], left_index=True, right_index=True, how='outer')
df.fillna(-1, inplace=True)
df = pd.merge(df, encoded_tables['conditions'], left_index=True, right_index=True, how='outer')
df.fillna(0, inplace=True)
df = pd.merge(df, encoded_tables['careplans'], left_index=True, right_index=True, how='inner')
df.fillna(0, inplace=True)
display(df.head())

Unnamed: 0,GENDER,424213003,418689008,419263009,417532002,91934008,419474003,232347008,300913006,232350006,...,395082007,781831000000109,869761000000107,718347000,737434004,736254008,736690008,183401008,703040004,133899007
0000b247-1def-417a-a783-41c8682be022,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
00049ee8-5953-4edd-a277-b9c1b1a7f16b,M,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
000769a6-23a7-426e-a264-cb0e509b2da2,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
00079a57-24a8-430f-b4f8-a1cf34f90060,F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
0008a63c-c95c-46c2-9ef3-831d68892019,M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Encoding labelled data
obs_table = tables['observations']
cat_cols_to_encode = np.concatenate((obs_table[obs_table['TYPE']=='text']['CODE'].unique(), ['GENDER']))
encoders = {}
for col in cat_cols_to_encode:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col].astype(str))
    encoders[col] = encoder
df.head()

Unnamed: 0,GENDER,424213003,418689008,419263009,417532002,91934008,419474003,232347008,300913006,232350006,...,395082007,781831000000109,869761000000107,718347000,737434004,736254008,736690008,183401008,703040004,133899007
0000b247-1def-417a-a783-41c8682be022,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
00049ee8-5953-4edd-a277-b9c1b1a7f16b,1,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0,0,0,0,0,0,0,0,0,0
000769a6-23a7-426e-a264-cb0e509b2da2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
00079a57-24a8-430f-b4f8-a1cf34f90060,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
0008a63c-c95c-46c2-9ef3-831d68892019,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Preparing dataset
target_labels = encoded_tables['careplans'].columns
X = df[df.columns.drop(target_labels)]
y = df[target_labels]
display(X)
display(y)

Unnamed: 0,GENDER,424213003,418689008,419263009,417532002,91934008,419474003,232347008,300913006,232350006,...,403192003,239720000,47505003,707577004,190905008,427089005,60951000119105,698423002,65275009,235919008
0000b247-1def-417a-a783-41c8682be022,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00049ee8-5953-4edd-a277-b9c1b1a7f16b,1,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
000769a6-23a7-426e-a264-cb0e509b2da2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00079a57-24a8-430f-b4f8-a1cf34f90060,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0008a63c-c95c-46c2-9ef3-831d68892019,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ffd3d544-1fcd-4a87-9514-fa6c37409cbc,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ffd86fda-ebb9-400e-9fe3-ea1a1037dbad,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ffdbbb1b-745e-4e38-ade2-a19d6e778fee,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ffdf0900-bc4b-4f81-b95b-1ea57da21e07,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,736376001,91251008,443402002,698360004,385691007,134435003,384758001,412776001,699728000,734163000,...,395082007,781831000000109,869761000000107,718347000,737434004,736254008,736690008,183401008,703040004,133899007
0000b247-1def-417a-a783-41c8682be022,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00049ee8-5953-4edd-a277-b9c1b1a7f16b,1,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
000769a6-23a7-426e-a264-cb0e509b2da2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00079a57-24a8-430f-b4f8-a1cf34f90060,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0008a63c-c95c-46c2-9ef3-831d68892019,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ffd3d544-1fcd-4a87-9514-fa6c37409cbc,0,0,1,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ffd86fda-ebb9-400e-9fe3-ea1a1037dbad,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ffdbbb1b-745e-4e38-ade2-a19d6e778fee,1,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ffdf0900-bc4b-4f81-b95b-1ea57da21e07,1,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Scaling dataset, preparing train and test datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.decomposition import PCA
# This case requires multi-label classification
from sklearn.multioutput import MultiOutputClassifier

scaler = StandardScaler()
X_s = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_s, y, test_size=0.25, random_state=123)
labels_t = list(str(v[1]) for v in target_desc)



In [8]:
# Using Decision Tree
from sklearn.tree import DecisionTreeClassifier

model = MultiOutputClassifier(DecisionTreeClassifier(), n_jobs=3)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f'Test Accuracy: {round(accuracy_score(y_test, y_pred)*100, 2)}%')
print(classification_report(y_test, y_pred, target_names = labels_t))

Test Accuracy: 75.09%
                                                                precision    recall  f1-score   support

                Infectious disease care plan (record artifact)       1.00      1.00      1.00      2260
                                    Physical therapy procedure       1.00      1.00      1.00        52
                    Lifestyle education regarding hypertension       1.00      1.00      1.00       790
                                 Diabetes self management plan       1.00      1.00      1.00      1055
                                                 Fracture care       1.00      1.00      1.00        61
                                        Routine antenatal care       0.68      0.69      0.69       166
                           Self-care interventions (procedure)       0.78      0.78      0.78       563
Chronic obstructive pulmonary disease clinical management plan       1.00      1.00      1.00       101
                                        A

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
# Using Support Vector Classifer
from sklearn.svm import SVC

model = MultiOutputClassifier(SVC(), n_jobs=3)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f'Test Accuracy: {round(accuracy_score(y_test, y_pred)*100, 2)}%')
print(classification_report(y_test, y_pred, target_names = labels_t))

Test Accuracy: 78.46%
                                                                precision    recall  f1-score   support

                Infectious disease care plan (record artifact)       0.99      1.00      0.99      2260
                                    Physical therapy procedure       1.00      0.81      0.89        52
                    Lifestyle education regarding hypertension       0.99      0.94      0.96       790
                                 Diabetes self management plan       0.99      0.97      0.98      1055
                                                 Fracture care       1.00      0.87      0.93        61
                                        Routine antenatal care       0.89      0.72      0.79       166
                           Self-care interventions (procedure)       1.00      0.63      0.77       563
Chronic obstructive pulmonary disease clinical management plan       1.00      0.90      0.95       101
                                        A

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
# Using Logistic Regression
from sklearn.linear_model import LogisticRegression

model = MultiOutputClassifier(LogisticRegression(max_iter=1000), n_jobs=3)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f'Test Accuracy: {round(accuracy_score(y_test, y_pred)*100, 2)}%')
print(classification_report(y_test, y_pred, target_names = labels_t))

Test Accuracy: 83.18%
                                                                precision    recall  f1-score   support

                Infectious disease care plan (record artifact)       1.00      1.00      1.00      2260
                                    Physical therapy procedure       1.00      1.00      1.00        52
                    Lifestyle education regarding hypertension       1.00      1.00      1.00       790
                                 Diabetes self management plan       1.00      1.00      1.00      1055
                                                 Fracture care       1.00      1.00      1.00        61
                                        Routine antenatal care       0.85      0.70      0.77       166
                           Self-care interventions (procedure)       0.99      0.76      0.86       563
Chronic obstructive pulmonary disease clinical management plan       1.00      1.00      1.00       101
                                        A

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
