# Revised case normalization for Hirslanden Aarau 2017

Notes:

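- There is no data for the hospital 'Hirslanden Aarau' in 2017 in the BFS DB data, so the notebook stops with a `ValueError` at the DB check (cell 6) and no normalized file is exported.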



In [1]:
import sys

import pandas as pd
import numpy as np

from loguru import logger

sys.path.insert(0, '/home/jovyan/work')

from src.revised_case_normalization.py.global_configs import *
from src.revised_case_normalization.py.normalize import normalize
from src.service import bfs_cases_db_service as bfs_db
from src.service.bfs_cases_db_service import session, get_hospital_year_cases

In [2]:
# Describe the input workbook: its path, the hospital name used in the DB,
# the year, and the sheet(s) to read.
file_info = FileInfo(
    path=os.path.join(ROOT_DIR, 'raw_data/HI_Aarau_Birshof_ST. Anna.xlsx'),
    hospital_name_db='Hirslanden Aarau',
    year='2017',
    sheets=['Aarau_2017'],
)

print(file_info)

FileInfo(path='/home/jovyan/work/src/revised_case_normalization/raw_data/HI_Aarau_Birshof_ST. Anna.xlsx', hospital_name_db='Hirslanden Aarau', year='2017', sheets=['Aarau_2017'])


In [3]:
# Work on a private copy so the shared COLUMNS_TO_RENAME mapping is not modified.
cols_to_rename = dict(COLUMNS_TO_RENAME)
# Map 'fall nummer' (instead of 'admno') onto the case id column.
del cols_to_rename['admno']
cols_to_rename['fall nummer'] = CASE_ID_COL

# NOTE(review): cols_to_rename is built above but never passed to normalize() --
# confirm whether normalize() is expected to receive it.
df_revised_case_d2d = normalize(file_info, 0)

2022-10-19 16:03:07.671 | INFO     | src.revised_case_normalization.py.normalize:normalize:46 - Read 14 cases for Hirslanden Aarau 2017
2022-10-19 16:03:07.674 | INFO     | src.revised_case_normalization.py.normalize:normalize:69 - TYPES:
case_id              int64
patient_id          object
gender              string
age_years            int64
duration_of_stay     int64
pccl                 int64
drg                 string
old_pd              string
new_pd              string
bfs_code            string
added_icds          string
removed_icds        string
added_chops         string
removed_chops       string
dtype: object
2022-10-19 16:03:07.679 | INFO     | src.utils.dataframe_utils:validate_icd_codes:68 - Validated ICD codes in 'added_icds' and stored them into 'added_icds': All rows contain valid codes
2022-10-19 16:03:07.681 | INFO     | src.utils.dataframe_utils:validate_icd_codes:68 - Validated ICD codes in 'removed_icds' and stored them into 'removed_icds': All rows contain val

In [4]:
# Preview the first five rows of the normalized revised cases.
df_revised_case_d2d.head(n=5)

Unnamed: 0,case_id,patient_id,gender,age_years,duration_of_stay,pccl,drg,old_pd,new_pd,bfs_code,added_icds,removed_icds,added_chops,removed_chops
0,41206174,1176423,W,72,5,3,I21C,T840,T840,M200,[E1140],[],"[80051D, 815255]",[009031::20170106]
1,41230776,1659346,W,67,19,3,B72C,G001,G001,M100,[B370],[],[],[]
2,41281042,1823347,M,67,6,3,F14B,E1150,E1150,M100,[F101],[],[],[]
3,41386032,1051316,M,74,10,3,I08C,S7211,S7211,M200,[L8914],[],[],[]
4,41418378,22136564,M,86,8,3,G18B,C182,C182,M200,[E1191],[E1190],[],[]


# Match revised DtoD data with the BFS data from the database

In [5]:
# Load the DB cases for this hospital and year, restricted to the DB id column
# plus the columns listed in VALIDATION_COLS.
db_columns = [AIMEDIC_ID] + VALIDATION_COLS
# Take an explicit copy: later cells add and overwrite columns on cases_in_db,
# and doing that on a bare column selection triggers SettingWithCopyWarning.
cases_in_db = get_hospital_year_cases(file_info.hospital_name_db, file_info.year)[db_columns].copy()
cases_in_db.head()


Unnamed: 0,aimedic_id,case_id,patient_id,gender,age_years,duration_of_stay,pccl,drg


In [6]:
# Abort early when the DB returned no rows for this hospital and year;
# otherwise log how many rows were read.
num_cases_in_db = len(cases_in_db.index)
if not num_cases_in_db:
    raise ValueError(f"There is no data for the hospital '{file_info.hospital_name_db}' in {file_info.year}")

logger.info(f"Read {num_cases_in_db} rows from the DB, for the hospital '{file_info.hospital_name_db}' in {file_info.year}")

ValueError: There is no data for the hospital 'Hirslanden Aarau' in 2017

In [None]:
# Remove rows with an empty 'case_id' in the DB data

# Assign the results back instead of calling inplace=True on a column selection:
# the inplace form is chained assignment, which pandas warns about and which
# does not update the parent frame once Copy-on-Write is enabled.
cases_in_db['case_id'] = cases_in_db['case_id'].replace('', np.nan)
# Count the missing ids before dropping them so the log reports what was removed.
num_na_cases_in_db = cases_in_db['case_id'].isna().sum()
cases_in_db = cases_in_db.dropna(subset=['case_id'])
logger.info(f"{num_na_cases_in_db} rows with empty 'case_id's were discovered and removed from the DB, for the hospital '{file_info.hospital_name_db}' in {file_info.year}")

In [None]:
# Add an integer version of the case id ('case_id_num') to both the revised
# cases and the DB cases; the later filter and join use this column.
for frame in (df_revised_case_d2d, cases_in_db):
    frame['case_id_num'] = frame[CASE_ID_COL].astype(int)

In [None]:
# Extract the numeric case ids of the revised DtoD cases as an array
revised_case_id_column = df_revised_case_d2d['case_id_num']
case_id_d2d = revised_case_id_column.values

In [None]:
# Detect duplicated DtoD case ids: the set is smaller than the array
# exactly when at least one case id occurs more than once.

case_id_d2d_set = set(case_id_d2d)
has_duplicates = len(case_id_d2d_set) < len(case_id_d2d)
duplicates_message = (
    'There are duplicated case_ids in the revised cases.'
    if has_duplicates
    else 'There are no duplicated case_ids in the revised cases.'
)
logger.info(duplicates_message)

# Show the number of distinct case ids as the cell output.
len(case_id_d2d_set)

In [None]:
# Keep only the BFS DB rows whose case id also occurs in the revised DtoD data.
# Reason: the row count shows whether revised case ids appear more than once in the BFS DB data.

is_revised_case = cases_in_db['case_id_num'].isin(case_id_d2d)
revised_case_bfs_db = cases_in_db[is_revised_case]
len(revised_case_bfs_db.index)

In [None]:
# Join revised cases from DtoD with BFS Cases from Database

# Columns both frames must agree on for a row to count as a match.
base_join_cols = ['case_id_num', 'gender', 'age_years', 'duration_of_stay', 'pccl', 'drg']

# When the DB holds more candidate rows than there are revised cases, the join
# additionally requires 'patient_id' to match.
needs_patient_id = revised_case_bfs_db.shape[0] > len(case_id_d2d)
if needs_patient_id:
    join_cols = ['case_id_num', 'patient_id'] + base_join_cols[1:]
else:
    join_cols = base_join_cols

revised_case_d2d_subset = df_revised_case_d2d[COL_SUBSET_FROM_REVISED_CASES + join_cols]
# `suffixes` must be a list-like of two entries; the previous `suffixes=None`
# is not a valid value, so the pandas default ('_x', '_y') is used instead.
revised_case_normalized = revised_case_d2d_subset.merge(revised_case_bfs_db, how='inner', on=join_cols)

# Keep the per-branch counters under their original names.
if needs_patient_id:
    num_match_case_patient_id = revised_case_normalized.shape[0]
else:
    num_match_case_id = revised_case_normalized.shape[0]
print(revised_case_normalized.shape)

In [None]:
# Fail when the join produced no rows; otherwise log how many of the revised
# cases were matched against the DB.
num_df_revised_case_d2d = len(df_revised_case_d2d.index)
num_revised_case_normalized = len(revised_case_normalized.index)

if not num_revised_case_normalized:
    raise ValueError(f"No cases between the revised cases for the hospital '{file_info.hospital_name_db}' in {file_info.year} were joined.")

logger.info(f"{num_revised_case_normalized} of {num_df_revised_case_d2d} rows were joined with the cases from the DB, for the hospital '{file_info.hospital_name_db}' in {file_info.year}")

In [None]:
# Restrict the joined data to the columns listed in COLS_TO_OUTPUT.
df_revised_case_normalized = revised_case_normalized[COLS_TO_OUTPUT]

In [None]:
# Export normalized revision data as a csv:

hospital = file_info.hospital_name_db.replace(' ', '_')  # replace spaces with '_' for the file name
hospital_year = f'{hospital}_{file_info.year}'

output_folder = os.path.join(ROOT_DIR, 'normalized_revision_data/')

# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it has no
# check-then-create race and also creates missing parent folders.
os.makedirs(output_folder, exist_ok=True)

NEW = '_NEW'

# Build the file path from output_folder rather than recomputing the folder.
output_path = os.path.join(output_folder, f'{hospital_year}{NEW}.csv')

# Rows are written sorted by 'aimedic_id'; the DataFrame index is written too (to_csv default).
df_revised_case_normalized.sort_values('aimedic_id').to_csv(output_path)


