# Revised case normalization for Hirslanden Aarau 2018

Notes:

- 17 of 17 revised DtoD cases were joined with the BFS cases from the DB.



In [1]:
import sys

import pandas as pd
import numpy as np

from loguru import logger

sys.path.insert(0, '/home/jovyan/work')

from src.revised_case_normalization.py.global_configs import *
from src.revised_case_normalization.py.normalize import normalize
from src.service import bfs_cases_db_service as bfs_db
from src.service.bfs_cases_db_service import session, get_hospital_year_cases

In [2]:
file_info = FileInfo(
        os.path.join(ROOT_DIR, 'raw_data/HI_Aarau_Birshof_ST. Anna.xlsx'),
        'Hirslanden Aarau',
        '2018',
        ['Aarau 2018'])

print(file_info)

FileInfo(path='/home/jovyan/work/src/revised_case_normalization/raw_data/HI_Aarau_Birshof_ST. Anna.xlsx', hospital_name_db='Hirslanden Aarau', year='2018', sheets=['Aarau 2018'])


In [3]:
cols_to_rename = dict(COLUMNS_TO_RENAME)
# Replace 'admno' with 'fall nummer'
cols_to_rename.pop('admno')
cols_to_rename['fall nummer'] = CASE_ID_COL

df_revised_case_d2d = normalize(file_info, 0)

2022-10-19 15:59:31.742 | INFO     | src.revised_case_normalization.py.normalize:normalize:46 - Read 17 cases for Hirslanden Aarau 2018
2022-10-19 15:59:31.745 | INFO     | src.revised_case_normalization.py.normalize:normalize:69 - TYPES:
case_id              int64
patient_id          object
gender              string
age_years            int64
duration_of_stay     int64
pccl                 int64
drg                 string
old_pd              string
new_pd              string
bfs_code            string
added_icds          string
removed_icds        string
added_chops         string
removed_chops       string
dtype: object
2022-10-19 15:59:31.749 | INFO     | src.utils.dataframe_utils:validate_icd_codes:68 - Validated ICD codes in 'added_icds' and stored them into 'added_icds': All rows contain valid codes
2022-10-19 15:59:31.751 | INFO     | src.utils.dataframe_utils:validate_icd_codes:68 - Validated ICD codes in 'removed_icds' and stored them into 'removed_icds': All rows contain val

In [4]:
df_revised_case_d2d.head()

Unnamed: 0,case_id,patient_id,gender,age_years,duration_of_stay,pccl,drg,old_pd,new_pd,bfs_code,added_icds,removed_icds,added_chops,removed_chops
0,41623461,1059029,W,77,2,3,F59E,I7024,I7024,M100,[J4481],[J4483],[],[]
1,41665678,2043896,M,82,17,3,F62C,I5001,I5001,M100,[R18],[],[],[]
2,41698586,22177655,M,93,2,3,F59E,I7024,I7024,M100,[I743],[],[],[]
3,41719219,1339311,M,91,4,3,F71B,I495,I495,M100,[N182],[],[],[]
4,41777951,22177655,M,93,2,0,F59E,I7024,I7024,M100,"[N184, T828]",[],[],[]


# Match revised DtoD data with the BfS data from the database

In [5]:
cases_in_db = get_hospital_year_cases(file_info.hospital_name_db, file_info.year)[[AIMEDIC_ID] + VALIDATION_COLS]
cases_in_db.head()


Unnamed: 0,aimedic_id,case_id,patient_id,gender,age_years,duration_of_stay,pccl,drg
0,115452,41415230,A0E36E42C141D541,M,29,2,0,I18B
1,115453,41453333,10CA4E4A4D6A6648,W,34,1,0,J24A
2,115454,41473239,E7A902368D82844A,M,63,4,0,I46C
3,115455,41506268,5FCEADE16E4FD6EC,M,44,3,0,D06B
4,115456,41511524,A2B2DB76985BA881,M,43,2,0,G09D


In [6]:
num_cases_in_db = cases_in_db.shape[0]
if num_cases_in_db == 0:
    raise ValueError(f"There is no data for the hospital '{file_info.hospital_name_db}' in {file_info.year}")
else:
    logger.info(f"Read {num_cases_in_db} rows from the DB, for the hospital '{file_info.hospital_name_db}' in {file_info.year}")    

2022-10-19 15:59:34.076 | INFO     | __main__:<module>:5 - Read 10112 rows from the DB, for the hospital 'Hirslanden Aarau' in 2018


In [7]:
# Remove columns with empty 'case_id' in DB data

cases_in_db['case_id'].replace('', np.nan, inplace=True)
num_na_cases_in_db = cases_in_db['case_id'].isna().sum()
cases_in_db.dropna(subset=['case_id'], inplace=True)
logger.info(f"{num_na_cases_in_db} rows with empty 'case_id's were discovered and removed from the DB, for the hospital '{file_info.hospital_name_db}' in {file_info.year}")   

2022-10-19 15:59:34.090 | INFO     | __main__:<module>:6 - 0 rows with empty 'case_id's were discovered and removed from the DB, for the hospital 'Hirslanden Aarau' in 2018


In [8]:
# get the case_id from revised_case 
df_revised_case_d2d['case_id_num'] = df_revised_case_d2d[CASE_ID_COL].astype(int)
cases_in_db['case_id_num'] = cases_in_db[CASE_ID_COL].astype(int)

In [9]:
# get case ids in revised case data from DtoD
case_id_d2d = df_revised_case_d2d['case_id_num'].values

In [10]:
# Check for duplicates in the case ids of DtoD

case_id_d2d_set = set(case_id_d2d)
if len(case_id_d2d) > len(case_id_d2d_set):
    logger.info('There are duplicated case_ids in the revised cases.' )
else:
    logger.info('There are no duplicated case_ids in the revised cases.' )

len(case_id_d2d_set)

2022-10-19 15:59:34.111 | INFO     | __main__:<module>:7 - There are no duplicated case_ids in the revised cases.


17

In [11]:
# Search for case_ids in the BFS DB data with the same case_ids as in revised case dataset of DtoD 
# Reason: check whether case ids of the revised data appear more than once in BFS DB data

revised_case_bfs_db = cases_in_db[cases_in_db['case_id_num'].isin(case_id_d2d)]
revised_case_bfs_db.shape[0]

17

In [12]:
# Join revised cases from DtoD with BFS Cases from Database

if revised_case_bfs_db.shape[0] <= len(case_id_d2d):
    revised_case_d2d_subset = df_revised_case_d2d[COL_SUBSET_FROM_REVISED_CASES + ['case_id_num', 'gender', 'age_years', 'duration_of_stay', 'pccl', 'drg']]
    revised_case_normalized = revised_case_d2d_subset.merge(revised_case_bfs_db, how ='inner', on=['case_id_num', 'gender', 'age_years', 'duration_of_stay', 'pccl', 'drg'], suffixes=None)
    num_match_case_id = revised_case_normalized.shape[0]
else:
    revised_case_d2d_subset = df_revised_case_d2d[COL_SUBSET_FROM_REVISED_CASES + ['case_id_num', 'patient_id', 'gender', 'age_years', 'duration_of_stay', 'pccl', 'drg']]
    revised_case_normalized = revised_case_d2d_subset.merge(revised_case_bfs_db, how ='inner', on=['case_id_num', 'patient_id', 'gender', 'age_years', 'duration_of_stay', 'pccl', 'drg'])
    num_match_case_patient_id = revised_case_normalized.shape[0]
print(revised_case_normalized.shape)
# revised_case_normalized.head()

(17, 16)


  revised_case_normalized = revised_case_d2d_subset.merge(revised_case_bfs_db, how ='inner', on=['case_id_num', 'gender', 'age_years', 'duration_of_stay', 'pccl', 'drg'], suffixes=None)


Unnamed: 0,old_pd,new_pd,bfs_code,added_icds,removed_icds,added_chops,removed_chops,case_id_num,gender,age_years,duration_of_stay,pccl,drg,aimedic_id,case_id,patient_id
0,I7024,I7024,M100,[J4481],[J4483],[],[],41623461,W,77,2,3,F59E,115875,41623461,A9832A23F251C511
1,I5001,I5001,M100,[R18],[],[],[],41665678,M,82,17,3,F62C,116851,41665678,2C4C8942514B875C
2,I7024,I7024,M100,[I743],[],[],[],41698586,M,93,2,3,F59E,117722,41698586,296ABD4577F82DE5
3,I495,I495,M100,[N182],[],[],[],41719219,M,91,4,3,F71B,118215,41719219,F99B9F90EF929909
4,I7024,I7024,M100,"[N184, T828]",[],[],[],41777951,M,93,2,0,F59E,120279,41777951,296ABD4577F82DE5


In [14]:
# Join the revised cases of DtoD with the BFS data (alternative solution to cross check upper join)

#if revised_case_bfs_db.shape[0] <= len(case_id_d2d):
#    revised_case_normalized = pd.merge(df_revised_case_d2d, cases_in_db, suffixes=('_drop', ''), on=['case_id_num', 'gender', 'age_years', 'duration_of_stay', 'pccl', 'drg'])
#    revised_case_normalized.drop([i for i in revised_case_normalized.columns if '_drop' in i], axis=1, inplace=True)
#    revised_case_normalized.head()
#else:
#    revised_case_normalized = pd.merge(df_revised_case_d2d, cases_in_db, suffixes=('_drop', ''), on=['case_id_num','patient_id', 'gender', 'age_years', 'duration_of_stay', 'pccl', 'drg'])
#    revised_case_normalized.drop([i for i in revised_case_normalized.columns if '_drop' in i], axis=1, inplace=True)
#    revised_case_normalized.head()
#print(revised_case_normalized.shape)
#revised_case_normalized.head()

In [16]:
num_revised_case_normalized = revised_case_normalized.shape[0]
num_df_revised_case_d2d = df_revised_case_d2d.shape[0] 

if num_revised_case_normalized == 0:
    raise ValueError(f"No cases between the revised cases for the hospital '{file_info.hospital_name_db}' in {file_info.year} were joined.")
else:
    logger.info(f"{num_revised_case_normalized} of {num_df_revised_case_d2d} rows were joined with the cases from the DB, for the hospital '{file_info.hospital_name_db}' in {file_info.year}") 

2022-10-19 15:59:34.163 | INFO     | __main__:<module>:7 - 17 of 17 rows were joined with the cases from the DB, for the hospital 'Hirslanden Aarau' in 2018


In [None]:
df_revised_case_normalized = revised_case_normalized[COLS_TO_OUTPUT]

In [17]:
# Export normalized revision data as a csv: 

hospital = file_info.hospital_name_db.replace(' ', '_') # replace spaces with '_' for the file name
hospital_year = f'{hospital}_{file_info.year}'

output_folder = os.path.join(ROOT_DIR, 'normalized_revision_data/')

if not os.path.exists(output_folder):
    os.mkdir(output_folder)

NEW = '_NEW'
    
output_path = os.path.join(ROOT_DIR, 'normalized_revision_data/') + hospital_year + NEW + '.csv'

revised_case_normalized[COLS_TO_OUTPUT].sort_values('aimedic_id').to_csv(output_path)


