# Revised case normalization for St. Anna 2018

Notes:

- 17 of 17 revised DtoD cases were joined with the BFS cases from the DB.



In [1]:
import sys
import itertools
import pandas as pd
import numpy as np

from loguru import logger

sys.path.insert(0, '/home/jovyan/work')

from src.revised_case_normalization.py.global_configs import *
from src.revised_case_normalization.py.normalize import normalize, remove_leading_zeros
from src.service import bfs_cases_db_service as bfs_db
from src.service.bfs_cases_db_service import session, get_sociodemographics_for_hospital_year, get_earliest_revisions_for_aimedic_ids, get_codes, apply_revisions, get_primary_procedures_codes, get_secondary_procedures_codes
from src.revised_case_normalization.py.format_for_grouper import format_for_grouper

In [2]:
file_info = FileInfo(
        os.path.join(ROOT_DIR, 'raw_data/HI_Aarau_Birshof_ST. Anna.xlsx'),
        'Hirslanden St. Anna',
        '2018',
        ['KOPIE_Änderungen_ST. Anna_2018'])

print(file_info)

FileInfo(path='/home/jovyan/work/src/revised_case_normalization/raw_data/HI_Aarau_Birshof_ST. Anna.xlsx', hospital_name_db='Hirslanden St. Anna', year='2018', sheets=['KOPIE_Änderungen_ST. Anna_2018'])


In [3]:
df_revised_case_d2d = normalize(file_info, 0)


2022-11-01 11:17:48.680 | INFO     | src.revised_case_normalization.py.normalize:normalize:35 - Read 51 cases for Hirslanden St. Anna 2018
2022-11-01 11:17:48.684 | INFO     | src.revised_case_normalization.py.normalize:normalize:57 - TYPES:
tranche                   string
datum                     string
case_id                   object
patient_id                object
gender                    string
age_years                  int64
bfs_code                  string
duration_of_stay           int64
pflegetage neu            string
pccl                       int64
pccl neu                  string
old_pd                    string
primary_diagnosis         string
added_icds                string
removed_icds              string
added_chops               string
removed_chops             string
drg                       string
drg neu                   string
cw alt                    string
cw neu                    string
cw-änderung möglich       string
cw änderung akzeptiert    string

In [4]:
df_revised_case_d2d.head()

Unnamed: 0,case_id,patient_id,gender,age_years,duration_of_stay,case_id_norm,old_pd,primary_diagnosis,bfs_code,pccl,drg,added_icds,removed_icds,added_chops,removed_chops
0,9358410,EB395DA315D5B285,W,79,9,9358410,M169,M169,M200,3,I46C,[D62],[D648],[],[]
1,9364697,6586D083924C0E99,W,68,16,9364697,M4806,M4806,M200,3,I09C,[E1191],[E1190],[],[]
2,9366989,9FBF3AE4CEA4A950,M,27,16,9366989,M511,M511,M200,3,I10C,[B956],[],[],[]
3,9371704,FDE409238A9BEC4F,M,65,13,9371704,I5001,I5001,M100,3,F49C,[N183],[],[],[]
4,9375929,036F0863E37E3493,M,74,18,9375929,M4806,M4806,M200,3,I27C,[T846],[T844],[],[]


# Match revised DtoD data with the BfS data from the database

In [5]:
cases_in_db = get_sociodemographics_for_hospital_year(file_info.hospital_name_db, int(file_info.year))
cases_in_db[NORM_CASE_ID_COL] = cases_in_db[CASE_ID_COL].apply(remove_leading_zeros)
print(cases_in_db.head())

2022-11-01 11:17:49.658 | INFO     | src.service.bfs_cases_db_service:get_sociodemographics_for_hospital_year:125 - Read 12990 rows from the DB, for the hospital 'Hirslanden St. Anna' in 2018


   aimedic_id     case_id        patient_id  age_years  age_days  admission_weight  gestation_age gender admission_date grouper_admission_type discharge_date grouper_discharge_type  duration_of_stay  ventilation_hours case_id_norm
0      138498  0009264434  E9E14AD826B70BFA         59         0                 0              0      M     2018-03-14                     01     2018-03-17                     00                 3                  0      9264434
1      138499  0009268866  0EE461E7ED376814         58         0                 0              0      M     2018-01-11                     01     2018-01-16                     00                 5                  0      9268866
2      138500  0009286976  AD0FB688816E4930         49         0                 0              0      W     2018-01-10                     01     2018-01-11                     00                 1                  0      9286976
3      138501  0009306256  8AA6481F39077A85         29         0            

In [6]:
cols_to_join = list(VALIDATION_COLS)
cols_to_join.remove(CASE_ID_COL)
cols_to_join.append(NORM_CASE_ID_COL)

cols_to_join.remove(PATIENT_ID_COL)

# Merge cases in db with the revised cases

joined = pd.merge(df_revised_case_d2d, cases_in_db,
                  how='left',
                  on=cols_to_join,
                  suffixes=('', '_db'))

joined

Unnamed: 0,case_id,patient_id,gender,age_years,duration_of_stay,case_id_norm,old_pd,primary_diagnosis,bfs_code,pccl,drg,added_icds,removed_icds,added_chops,removed_chops,aimedic_id,case_id_db,patient_id_db,age_days,admission_weight,gestation_age,admission_date,grouper_admission_type,discharge_date,grouper_discharge_type,ventilation_hours
0,9358410,EB395DA315D5B285,W,79,9,9358410,M169,M169,M200,3,I46C,[D62],[D648],[],[],138741,9358410,EB395DA315D5B285,0,0,0,2018-02-01,1,2018-02-10,0,0
1,9364697,6586D083924C0E99,W,68,16,9364697,M4806,M4806,M200,3,I09C,[E1191],[E1190],[],[],138969,9364697,6586D083924C0E99,0,0,0,2018-01-14,1,2018-01-30,0,0
2,9366989,9FBF3AE4CEA4A950,M,27,16,9366989,M511,M511,M200,3,I10C,[B956],[],[],[],139157,9366989,9FBF3AE4CEA4A950,0,0,0,2017-12-25,1,2018-01-10,0,0
3,9371704,FDE409238A9BEC4F,M,65,13,9371704,I5001,I5001,M100,3,F49C,[N183],[],[],[],139683,9371704,FDE409238A9BEC4F,0,0,0,2018-01-12,1,2018-01-25,0,0
4,9375929,036F0863E37E3493,M,74,18,9375929,M4806,M4806,M200,3,I27C,[T846],[T844],[],[],140167,9375929,036F0863E37E3493,0,0,0,2018-03-27,1,2018-04-28,0,0
5,9377457,B172FAC0D995D8A0,M,77,5,9377457,I5000,I5000,M100,3,F62C,"[K746, E870]",[],[],[],140323,9377457,B172FAC0D995D8A0,0,0,0,2018-01-27,1,2018-02-01,0,0
6,9381325,E3D0603E52140F32,W,55,15,9381325,J4410,J4410,M100,3,E65B,[J9610],[],[],[],140730,9381325,E3D0603E52140F32,0,0,0,2018-02-07,6,2018-02-22,0,0
7,9382082,4ED15568FB7BDE9E,M,82,8,9382082,J100,J100,M100,3,E77E,[E870],[],[],[],140838,9382082,4ED15568FB7BDE9E,0,0,0,2018-02-12,1,2018-02-20,0,0
8,9384055,44197C8BC8569771,W,87,14,9384055,S8180,S8180,M200,3,X06B,[E871],[],[],[],141112,9384055,44197C8BC8569771,0,0,0,2018-02-19,1,2018-03-05,0,0
9,9391002,8BD23E59AF760596,M,64,14,9391002,J128,J128,M100,3,E77E,[C9000],[],[],[],141794,9391002,8BD23E59AF760596,0,0,0,2018-03-08,1,2018-03-22,0,0


In [7]:
# Print out how many rows could not be matched
unmatched = joined[joined[AIMEDIC_ID_COL].isna()]
num_unmatched = unmatched.shape[0]
logger.info(f'{num_unmatched} rows could not be matched, given {sorted(cols_to_join)}')

2022-11-01 11:17:49.718 | INFO     | __main__:<module>:4 - 0 rows could not be matched, given ['age_years', 'case_id_norm', 'duration_of_stay', 'gender']


In [8]:
original_revision_ids = get_earliest_revisions_for_aimedic_ids(joined[AIMEDIC_ID_COL].values.tolist())

original_cases = get_codes(original_revision_ids)
original_cases

Unnamed: 0,aimedic_id,revision_id,old_pd,secondary_diagnoses,primary_procedure,side,date,secondary_procedures
0,138741,138741,M169,"[I2513, Z950, Z955, I420, E210, ...]",815111,L,2018-02-01,"[009940, 009960, 9900, 948X40, 990410]"
1,138969,138969,M4806,"[G553, T844, M4786, M513, G961, ...]",8138,,2018-01-15,"[816399, 030935, 036, 030991, 845110, ...]"
2,139157,139157,M511,"[G9780, T814, G9788, T813, G474, ...]",862A2E,,2017-12-25,"[035913, 030934, 848199, 99221A, 99B812, ...]"
3,139683,139683,I5001,"[I5019, I2519, Z955, I2522, I340, ...]",8856,,2018-01-22,"[887221, 3723, 8854, 393013]"
4,140167,140167,M4806,"[G553, I1090, T844, Y828, B957]",848010,,2018-03-27,"[030991, 030934, 009920, 862A2E, 832121, ...]"
5,140323,140323,I5000,"[I425, Z950, Z921, N183, J4489]",893909,,2018-01-27,[]
6,140730,140730,J4410,"[J101, F328, K219, D508]",998426,,2018-02-07,"[948X40, 992909]"
7,140838,140838,J100,"[J91, J9600, B023, I2519, I489, ...]",992218,,2018-02-12,[998425]
8,141112,141112,S8180,"[X599, L908, E440, F328, I1090]",867A9E,L,2018-02-19,"[9938, 890A32]"
9,141794,141794,J128,"[J9600, K528, Y579, E871, G6288, ...]",998425,,2018-03-08,"[992217, 890A11]"


In [9]:
revised_cases = apply_revisions(original_cases, joined)
revised_cases.head()

Unnamed: 0,aimedic_id,primary_diagnosis,secondary_diagnoses,primary_procedure,secondary_procedures
0,138741,M169,"[I2513, Z950, Z955, I420, E210, ...]",815111,"[009940, 009960, 9900, 948X40, 990410]"
1,138969,M4806,"[G553, T844, M4786, M513, G961, ...]",8138,"[816399, 030935, 036, 030991, 845110, ...]"
2,139157,M511,"[G9780, T814, G9788, T813, G474, ...]",862A2E,"[035913, 030934, 848199, 99221A, 99B812, ...]"
3,139683,I5001,"[I5019, I2519, Z955, I2522, I340, ...]",8856,"[887221, 3723, 8854, 393013]"
4,140167,M4806,"[G553, I1090, Y828, B957, T846]",848010,"[030991, 030934, 009920, 862A2E, 832121, ...]"


In [10]:
# PRIMARY Procedures: Retrieve Side and DATE

original_cases_primary_procedures = get_primary_procedures_codes(original_revision_ids)

# Formatting Date
original_cases_primary_procedures['date'] = original_cases_primary_procedures['date'].astype(str).str.replace("-","")

# Merge columns to grouper format
original_cases_primary_procedures['primary_procedure_grouper_bfs'] = original_cases_primary_procedures['primary_procedure'].map(str) + ":" + original_cases_primary_procedures['side'].map(str) + ":" + original_cases_primary_procedures['date'].map(str)
   

In [11]:

primary_procedures_revised = revised_cases[["aimedic_id", "primary_procedure"]]

primary_procedures_revised = pd.merge(primary_procedures_revised, original_cases_primary_procedures, on=["aimedic_id","primary_procedure"], how="left", suffixes=("","_original"))
primary_procedures_revised['primary_procedure_grouper_bfs'].fillna(primary_procedures_revised['primary_procedure'], inplace=True)
primary_procedures_revised

Unnamed: 0,aimedic_id,primary_procedure,revision_id,side,date,primary_procedure_grouper_bfs
0,138741,815111,138741,L,20180201,815111:L:20180201
1,138969,8138,138969,,20180115,8138: :20180115
2,139157,862A2E,139157,,20171225,862A2E: :20171225
3,139683,8856,139683,,20180122,8856: :20180122
4,140167,848010,140167,,20180327,848010: :20180327
5,140323,893909,140323,,20180127,893909: :20180127
6,140730,998426,140730,,20180207,998426: :20180207
7,140838,992218,140838,,20180212,992218: :20180212
8,141112,867A9E,141112,L,20180219,867A9E:L:20180219
9,141794,998425,141794,,20180308,998425: :20180308


In [12]:
# SECONDARY PROCEDURES

In [13]:
original_cases_secondary_procedures = get_secondary_procedures_codes(original_revision_ids)

In [14]:
original_cases_secondary_procedures['date'] = original_cases_secondary_procedures['date'].astype(str).str.replace("-","")

original_cases_secondary_procedures['secondary_procedures_grouper_bfs'] = original_cases_secondary_procedures['secondary_procedures'].map(str) + ":" + \
                                                      original_cases_secondary_procedures['side'].map(str) + ":" + original_cases_secondary_procedures['date'].map(str)

original_cases_secondary_procedures = original_cases_secondary_procedures.explode('secondary_procedures')
# Group by to list
original_cases_secondary_procedures = pd.DataFrame(original_cases_secondary_procedures.groupby(['aimedic_id', 'secondary_procedures'], group_keys=True)['secondary_procedures_grouper_bfs'].apply(list))

original_cases_secondary_procedures['secondary_procedures_grouper_bfs'] = original_cases_secondary_procedures['secondary_procedures_grouper_bfs'].astype(str).str.replace(" ", "").str.replace(",", ";")
original_cases_secondary_procedures['secondary_procedures_grouper_bfs'] = original_cases_secondary_procedures['secondary_procedures_grouper_bfs'].astype(str).str.replace("'", "")
original_cases_secondary_procedures['secondary_procedures_grouper_bfs'] = original_cases_secondary_procedures['secondary_procedures_grouper_bfs'].str.strip("[]")

original_cases_secondary_procedures




Unnamed: 0_level_0,Unnamed: 1_level_0,secondary_procedures_grouper_bfs
aimedic_id,secondary_procedures,Unnamed: 2_level_1
138741,009940,009940::20180201
138741,009960,009960::20180201
138741,948X40,948X40::20180202
138741,9900,9900::20180201
138741,990410,990410::20180207
...,...,...
151124,992909,992909::20181221
151124,99B811,99B811::20181221
151435,887211,887211::20181222
151435,991016,991016::20181222


In [15]:
original_cases = get_codes(original_revision_ids)
original_cases

Unnamed: 0,aimedic_id,revision_id,old_pd,secondary_diagnoses,primary_procedure,side,date,secondary_procedures
0,138741,138741,M169,"[I2513, Z950, Z955, I420, E210, ...]",815111,L,2018-02-01,"[009940, 009960, 9900, 948X40, 990410]"
1,138969,138969,M4806,"[G553, T844, M4786, M513, G961, ...]",8138,,2018-01-15,"[816399, 030935, 036, 030991, 845110, ...]"
2,139157,139157,M511,"[G9780, T814, G9788, T813, G474, ...]",862A2E,,2017-12-25,"[035913, 030934, 848199, 99221A, 99B812, ...]"
3,139683,139683,I5001,"[I5019, I2519, Z955, I2522, I340, ...]",8856,,2018-01-22,"[887221, 3723, 8854, 393013]"
4,140167,140167,M4806,"[G553, I1090, T844, Y828, B957]",848010,,2018-03-27,"[030991, 030934, 009920, 862A2E, 832121, ...]"
5,140323,140323,I5000,"[I425, Z950, Z921, N183, J4489]",893909,,2018-01-27,[]
6,140730,140730,J4410,"[J101, F328, K219, D508]",998426,,2018-02-07,"[948X40, 992909]"
7,140838,140838,J100,"[J91, J9600, B023, I2519, I489, ...]",992218,,2018-02-12,[998425]
8,141112,141112,S8180,"[X599, L908, E440, F328, I1090]",867A9E,L,2018-02-19,"[9938, 890A32]"
9,141794,141794,J128,"[J9600, K528, Y579, E871, G6288, ...]",998425,,2018-03-08,"[992217, 890A11]"


In [16]:
secondary_procedures_revised = revised_cases[["aimedic_id", "secondary_procedures"]]
secondary_procedures_revised = secondary_procedures_revised.explode('secondary_procedures')
secondary_procedures_revised

Unnamed: 0,aimedic_id,secondary_procedures
0,138741,009940
0,138741,009960
0,138741,9900
0,138741,948X40
0,138741,990410
...,...,...
49,151124,992909
49,151124,99B811
50,151435,887211
50,151435,991016


In [17]:
# merge information on sideness and date with secondary procedures of revised cases (left join to still keep what doesn't get merged)

secondary_procedures_revised = pd.merge(secondary_procedures_revised, original_cases_secondary_procedures, on=["aimedic_id","secondary_procedures"], how="left", suffixes=("","_original"))

secondary_procedures_revised['secondary_procedures_grouper_bfs'].fillna(secondary_procedures_revised['secondary_procedures'], inplace=True)

secondary_procedures_revised = pd.DataFrame(secondary_procedures_revised.groupby(['aimedic_id'], group_keys=True)['secondary_procedures_grouper_bfs'].apply(list))

secondary_procedures_revised.secondary_procedures_grouper_bfs = secondary_procedures_revised.secondary_procedures_grouper_bfs.fillna('')
secondary_procedures_revised

Unnamed: 0_level_0,secondary_procedures_grouper_bfs
aimedic_id,Unnamed: 1_level_1
138741,"[009940::20180201, 009960::20180201, 9900::20180201, 948X40::20180202, 990410::20180207]"
138969,"[816399::20180115, 030935::20180115, 036::20180115, 030991::20180115, 845110::20180115, ...]"
139157,"[035913::20171225, 030934::20171225, 848199::20171225, 99221A::20171225, 99B812::20171225, ...]"
139683,"[887221::20180118, 3723::20180122, 8854::20180122, 393013:R:20180122]"
140167,"[030991::20180327, 030934::20180327, 009920::20180327, 862A2E::20180413, 832121::20180413, ...]"
140323,[nan]
140730,"[948X40::20180212, 992909::20180212]"
140838,[998425::20180212]
141112,"[9938::20180219, 890A32::20180223]"
141794,"[992217::20180309, 890A11::20180314]"


In [None]:
revised_cases["primary_procedure"] = [procedure + "::" for procedure in revised_cases["primary_procedure"]]
revised_cases['primary_procedure'] = revised_cases.primary_procedure.apply(lambda case: case + "::" if len(case) <= 6 else case)

In [18]:
sys.exit()

SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:

revised_cases_join = pd.merge(revised_cases, original_cases, on="aimedic_id", how="left", suffixes=("","_y"))              
revised_cases_join = revised_cases_join.loc[:, ~revised_cases_join.columns.str.contains("_y")]
revised_cases_join.head()

In [None]:
# Select primary Procedures_Grouper based on "primary_procedure"


In [None]:
# Format joined dataset to the SwissDRG Batchgrouper 2017 Format 

grouper_input_data_string = format_for_grouper(joined)

grouper_input_data_string

grouper_input_data_string[17]

In [None]:
import subprocess


example_batch_line = """0044007489;57;0;0|0;W;20190101;01;20190101;00;1;0;C20|Z432|N40|I440|I493;465110::20190101|4823::20190101|009A13::20190101;"""

# add date to procedures
# change ; with | at one spot
test_df_line = """17722;93;;;M;20180315;01;20180317;00;2;0;I7024|Z9588|N184|Z922|I743;395011::|397511::|395021::|397510::|0042::|004B18::|004B1A::|884911::|005599::|0046::|393012::;"""

grouper_result = subprocess.check_output(["java",
                                "-cp",
                                "/home/jovyan/work/resources/aimedic-grouper-assembly-0.0.0-SNAPSHOT.jar",
                                "ch.aimedic.grouper.BatchGroupeOne",
                                grouper_input_data_string[6]
                                #test_df_line,
                                ]).decode("utf-8")


grouperResults = subprocess.check_output(["java",
                                "-cp",
                                "/home/jovyan/work/resources/aimedic-grouper-assembly-0.0.0-SNAPSHOT.jar",
                                "ch.aimedic.grouper.BatchGroupMany",
                                f'{grouper_input_data_string[1]}#{grouper_input_data_string[6]}#{grouper_input_data_string[16]}',
                                "#"
                                #test_df_line,
                                ]).decode("utf-8")

print(grouper_result)

In [None]:
# TODO:
# - Add medication 
# - Add and compare CHOP Codes for sideness and procedure date