# Analysis of the normalize function to retrieve the entire CHOP codes (St. Anna)

Notes:

- 17 of 17 revised DtoD cases were joined with the BFS cases from the DB.



In [1]:
import sys

import pandas as pd
import numpy as np

from loguru import logger

sys.path.insert(0, '/home/jovyan/work')

from src.revised_case_normalization.py.global_configs import *
from src.revised_case_normalization.py.normalize import normalize, remove_leading_zeros
from src.service import bfs_cases_db_service as bfs_db
from src.service.bfs_cases_db_service import session, get_sociodemographics_for_hospital_year, get_earliest_revisions_for_aimedic_ids, get_codes, apply_revisions
from src.revised_case_normalization.py.format_for_grouper import format_for_grouper

from src.utils.dataframe_utils import validate_icd_codes, validate_chop_codes, remove_duplicated_chops, validate_pd_revised_sd
 


In [2]:
# Describe the revised-cases workbook under analysis: its path, the hospital
# name and year used for the DB lookup, and the sheet(s) to read.
raw_file_path = os.path.join(ROOT_DIR, 'raw_data/HI_Aarau_Birshof_ST. Anna.xlsx')
sheets_to_read = ['KOPIE_Änderungen_ST. Anna_2018']

file_info = FileInfo(raw_file_path, 'Hirslanden St. Anna', '2018', sheets_to_read)

print(file_info)

FileInfo(path='/home/jovyan/work/src/revised_case_normalization/raw_data/HI_Aarau_Birshof_ST. Anna.xlsx', hospital_name_db='Hirslanden St. Anna', year='2018', sheets=['KOPIE_Änderungen_ST. Anna_2018'])


In [3]:

# Load the first configured sheet; every column is read with the
# 'string[pyarrow]' dtype.
excel_sheet_idx = 0
sheet_name = file_info.sheets[excel_sheet_idx]
df = pd.read_excel(
    file_info.path,
    sheet_name=sheet_name,
    dtype='string[pyarrow]',
)

In [4]:
# Normalise all column headers to lower case.
df.columns = list(map(str.lower, df.columns))

In [5]:
columns_mapper: dict = COLUMNS_TO_RENAME

# Row count before any rows are dropped.
n_all_rows = df.shape[0]

# Every column that is about to be renamed must exist in the sheet. Name the
# absent ones in the failure message instead of raising a bare AssertionError.
missing_columns = set(columns_mapper.keys()).difference(df.columns)
assert not missing_columns, f'Columns to rename are missing from the sheet: {missing_columns}'

In [6]:
# Apply the configured column renames.
df = df.rename(columns=columns_mapper)

In [7]:
# Fix unavailable duration of stay
df[DURATION_OF_STAY_COL] = df[DURATION_OF_STAY_COL].replace('n.ü.', np.nan)

columns_to_lstrip: set = COLUMNS_TO_LSTRIP
columns_to_cast: dict = COLUMNS_TO_CAST

# Discard rows where any value on any validation col is empty.
# This runs BEFORE the clean-up and casts below: the 'n.ü.' replacement above
# introduces missing values, and a missing value cannot be cast to a plain
# integer dtype, so such rows must be gone before `astype` is called.
# NOTE(review): this only helps for cast columns that are also listed in
# VALIDATION_COLS — confirm against the configuration.
missing_validation_cols = set(VALIDATION_COLS).difference(df.columns)
assert not missing_validation_cols, f'Validation columns are missing from the sheet: {missing_validation_cols}'
df.dropna(subset=VALIDATION_COLS, inplace=True)
n_valid_rows = df.shape[0]

#if n_valid_rows < n_all_rows:
#    logger.info(f'{n_all_rows - n_valid_rows}/{n_all_rows} rows were deleted because contained NaNs')


# Fix format of some columns
def lstrip_fun(x):
    """Strip leading apostrophes from a string cell.

    Anything that is not a string (e.g. a missing value) is returned
    unchanged instead of raising AttributeError.
    """
    return x.lstrip("'") if isinstance(x, str) else x


for col_name in columns_to_lstrip:
    df[col_name] = df[col_name].apply(lstrip_fun)

# Duplicate the case ID column which does not have leading zeros
df[NORM_CASE_ID_COL] = df[CASE_ID_COL].apply(remove_leading_zeros)

# Cast the configured columns to their target dtypes and log the result.
for col_name, col_type in columns_to_cast.items():
    df[col_name] = df[col_name].astype(col_type)
logger.info(f'TYPES:\n{df.dtypes}')

# Split ICD and CHOP columns into list[str]; missing cells become [''].
for code_col_to_fix in (ADDED_ICD_CODES, REMOVED_ICD_CODES, ADDED_CHOP_CODES, REMOVED_CHOP_CODES):
    df[code_col_to_fix] = df[code_col_to_fix].fillna('').str.split(',')

2022-10-28 15:22:38.207 | INFO     | __main__:<module>:18 - TYPES:
tranche                   string
datum                     string
case_id                   object
patient_id                object
gender                    string
age_years                  int64
bfs_code                  string
duration_of_stay           int64
pflegetage neu            string
pccl                       int64
pccl neu                  string
old_pd                    string
primary_diagnosis         string
added_icds                string
removed_icds              string
added_chops               string
removed_chops             string
drg                       string
drg neu                   string
cw alt                    string
cw neu                    string
cw-änderung möglich       string
cw änderung akzeptiert    string
kommentar                 string
kodierer                  string
bemerkung                 string
kassenbeanstandung        string
case_id_norm              object
dtype: ob

In [8]:
# Validate the added/removed ICD codes, then the added/removed CHOP codes.
# Each validated list is written back into the column it was read from.
for icd_col in (ADDED_ICD_CODES, REMOVED_ICD_CODES):
    df = validate_icd_codes(df, icd_codes_col=icd_col, output_icd_codes_col=icd_col)

for chop_col in (ADDED_CHOP_CODES, REMOVED_CHOP_CODES):
    df = validate_chop_codes(df, chop_codes_col=chop_col, output_chop_codes_col=chop_col)

2022-10-28 15:22:38.220 | INFO     | src.utils.dataframe_utils:validate_icd_codes:68 - Validated ICD codes in 'added_icds' and stored them into 'added_icds': All rows contain valid codes
2022-10-28 15:22:38.225 | INFO     | src.utils.dataframe_utils:validate_icd_codes:68 - Validated ICD codes in 'removed_icds' and stored them into 'removed_icds': All rows contain valid codes
2022-10-28 15:22:38.229 | INFO     | src.utils.dataframe_utils:validate_chop_codes:106 - Validated CHOP codes in 'added_chops' and stored them into 'added_chops': All rows contain valid codes
2022-10-28 15:22:38.234 | INFO     | src.utils.dataframe_utils:validate_chop_codes:106 - Validated CHOP codes in 'removed_chops' and stored them into 'removed_chops': All rows contain valid codes


In [9]:
# Drop CHOP codes that are listed as both added and removed, writing the
# cleaned lists back into the same two columns.
df = remove_duplicated_chops(
    df,
    added_chops_col=ADDED_CHOP_CODES,
    cleaned_added_chops_col=ADDED_CHOP_CODES,
    removed_chops_col=REMOVED_CHOP_CODES,
    cleaned_removed_chops_col=REMOVED_CHOP_CODES,
)

# Inspect a single example row (position 17).
df.iloc[[17]]

2022-10-28 15:22:38.238 | INFO     | src.utils.dataframe_utils:remove_duplicated_chops:32 - Removing duplicated CHOP codes due to different casing ...


Unnamed: 0,tranche,datum,case_id,patient_id,gender,age_years,bfs_code,duration_of_stay,pflegetage neu,pccl,pccl neu,old_pd,primary_diagnosis,added_icds,removed_icds,added_chops,removed_chops,drg,drg neu,cw alt,cw neu,cw-änderung möglich,cw änderung akzeptiert,kommentar,kodierer,bemerkung,kassenbeanstandung,case_id_norm
17,,,9409623,D12521E1C91607CA,M,85,M200,8,8,3,4,S722,S722,[D62],[D649],[990410:B:20180429],[],I08D,I08C,1.55,2.32,0.77,0.7699999999999998,Arztanfrage neue ND/Vorschlag: D62 statt D64.9|+ CHOP 99.04.10|Siehe P040537689 Bericht Austritt...,Atti,Storno,,9409623


In [10]:
# Stops the run here: sys.exit() raises SystemExit (see the recorded output).
# NOTE(review): the cells below have no execution count in this saved run;
# remove this line to execute the notebook end to end.
sys.exit()

SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
# Run the packaged `normalize` on the same file. The second argument (0) is
# presumably the index into file_info.sheets — TODO confirm against normalize().
df_revised_case_d2d = normalize(file_info, 0)


In [None]:
# Display the frame returned by normalize().
df_revised_case_d2d

# Match revised DtoD data with the BfS data from the database

In [None]:
# Fetch the sociodemographic records for this hospital and year, then add a
# case-ID column without leading zeros so it can be joined on.
db_year = int(file_info.year)
cases_in_db = get_sociodemographics_for_hospital_year(file_info.hospital_name_db, db_year)

normalized_case_ids = cases_in_db[CASE_ID_COL].apply(remove_leading_zeros)
cases_in_db[NORM_CASE_ID_COL] = normalized_case_ids

print(cases_in_db.head())

In [None]:
# Join keys: the validation columns, with the raw case ID swapped for its
# leading-zero-free counterpart and the patient ID left out.
cols_to_join = list(VALIDATION_COLS)
for excluded_col in (CASE_ID_COL, PATIENT_ID_COL):
    cols_to_join.remove(excluded_col)
cols_to_join.append(NORM_CASE_ID_COL)

# Merge cases in db with the revised cases. A left join keeps every revised
# case; columns that exist on both sides get a '_db' suffix on the DB side.
joined = df_revised_case_d2d.merge(
    cases_in_db,
    how='left',
    on=cols_to_join,
    suffixes=('', '_db'),
)

joined

In [None]:
# Report how many revised cases found no DB counterpart: after the left join
# those rows have no value in the aimedic ID column.
has_no_db_match = joined[AIMEDIC_ID_COL].isna()
unmatched = joined[has_no_db_match]
num_unmatched = len(unmatched)
logger.info(f'{num_unmatched} rows could not be matched, given {sorted(cols_to_join)}')

In [None]:
# Convert the joined dataset to the SwissDRG Batchgrouper 2017 format.
# NOTE(review): the result is indexed by integer position below and in the
# grouper cell; presumably one formatted line per case — TODO confirm.

grouper_input_data_string = format_for_grouper(joined)

# Inspect one formatted entry (index 17).
grouper_input_data_string[17]

In [None]:
import subprocess

# Path of the grouper assembly jar used by every invocation below.
GROUPER_JAR_PATH = "/home/jovyan/work/resources/aimedic-grouper-assembly-0.0.0-SNAPSHOT.jar"


def _run_grouper(main_class, *args):
    """Run one Java entry point from the grouper jar and return its stdout.

    Args:
        main_class: Fully qualified name of the Java class to launch.
        *args: Command-line arguments passed to that class, in order.

    Returns:
        The standard output of the process, decoded as UTF-8.

    Raises:
        subprocess.CalledProcessError: If the process exits with a non-zero status.
    """
    command = ["java", "-cp", GROUPER_JAR_PATH, main_class, *args]
    return subprocess.check_output(command).decode("utf-8")


example_batch_line = """0044007489;57;0;0|0;W;20190101;01;20190101;00;1;0;C20|Z432|N40|I440|I493;465110::20190101|4823::20190101|009A13::20190101;"""

# add date to procedures
# change ; with | at one spot
test_df_line = """17722;93;;;M;20180315;01;20180317;00;2;0;I7024|Z9588|N184|Z922|I743;395011::|397511::|395021::|397510::|0042::|004B18::|004B1A::|884911::|005599::|0046::|393012::;"""

# Group a single case (pass test_df_line instead to try the hand-written example).
# NOTE(review): "BatchGroupeOne" is spelled differently from "BatchGroupMany";
# kept exactly as in the original — confirm it matches the class name in the jar.
grouper_result = _run_grouper("ch.aimedic.grouper.BatchGroupeOne", grouper_input_data_string[6])

# Group several cases in one call: the cases are concatenated with a
# delimiter, and the same delimiter is passed as the final argument.
batch_delimiter = "#"
batch_cases = batch_delimiter.join(str(grouper_input_data_string[i]) for i in (1, 6, 16))
grouperResults = _run_grouper("ch.aimedic.grouper.BatchGroupMany", batch_cases, batch_delimiter)

# NOTE(review): only the single-case result is printed; `grouperResults` is
# computed but never displayed in this cell.
print(grouper_result)

In [None]:
# TODO:
# - Add medication 
# - Add and compare CHOP codes for sidedness (laterality) and procedure date