# Revised case normalization for KSSG Linth 2019
Notes:

- 25 of 25 revised DtoD cases were joined with the BFS cases from the DB.


In [1]:
import sys

import pandas as pd
import numpy as np

from loguru import logger

sys.path.insert(0, '/home/jovyan/work')

from src.revised_case_normalization.py.global_configs import *
from src.revised_case_normalization.py.normalize import normalize
from src.service import bfs_cases_db_service as bfs_db
from src.service.bfs_cases_db_service import session, get_hospital_year_cases

In [2]:
file_info = FileInfo(
        os.path.join(ROOT_DIR, 'raw_data/Linth_Toggenburg_SRRWS_2019.xlsx'),
        'KSSG Linth',
        '2019',
        ['Änderungen_Spital_Linth_2019'])

print(file_info)

FileInfo(path='/home/jovyan/work/src/revised_case_normalization/raw_data/Linth_Toggenburg_SRRWS_2019.xlsx', hospital_name_db='KSSG Linth', year='2019', sheets=['Änderungen_Spital_Linth_2019'])


In [3]:
cols_to_rename = dict(COLUMNS_TO_RENAME)
# Replace 'admno' with 'fall nummer'
cols_to_rename.pop('admno')
cols_to_rename['fall nummer'] = CASE_ID_COL

df_revised_case_d2d = normalize(file_info, 0)

2022-10-19 16:06:21.829 | INFO     | src.revised_case_normalization.py.normalize:normalize:46 - Read 15 cases for KSSG Linth 2019
2022-10-19 16:06:21.833 | INFO     | src.revised_case_normalization.py.normalize:normalize:69 - TYPES:
case_id              int64
patient_id          object
gender              string
age_years            int64
duration_of_stay     int64
pccl                 int64
drg                 string
old_pd              string
new_pd              string
bfs_code            string
added_icds          string
removed_icds        string
added_chops         string
removed_chops       string
dtype: object
2022-10-19 16:06:21.836 | INFO     | src.utils.dataframe_utils:validate_icd_codes:68 - Validated ICD codes in 'added_icds' and stored them into 'added_icds': All rows contain valid codes
2022-10-19 16:06:21.838 | INFO     | src.utils.dataframe_utils:validate_icd_codes:68 - Validated ICD codes in 'removed_icds' and stored them into 'removed_icds': All rows contain valid cod

In [4]:
df_revised_case_d2d.head()

Unnamed: 0,case_id,patient_id,gender,age_years,duration_of_stay,pccl,drg,old_pd,new_pd,bfs_code,added_icds,removed_icds,added_chops,removed_chops
0,3108134918,DDB3BDB39DE3F695,M,49,7,3,J64C,A46,A46,M100,[K746],[],[],[]
1,3108136936,1C6A9D7FAA9A7548,W,89,5,2,G46C,K226,K226,M100,[N184],[D6833],[],[]
2,3108141419,69231F9C61534D42,M,69,10,2,G60B,C183,C183,M100,[E43],[],[],[]
3,3150158985,29F13DDB90E1976D,W,70,2,3,G48C,K922,K922,M100,"[D62, D684]",[D500],[],[]
4,3150164388,11721C6EB40D5BB5,M,78,5,2,G46C,K250,K250,M100,[D62],[],[],[]


# Match revised DtoD data with the BfS data from the database

In [5]:
cases_in_db = get_hospital_year_cases(file_info.hospital_name_db, file_info.year)[[AIMEDIC_ID] + VALIDATION_COLS]
cases_in_db.head()


Unnamed: 0,aimedic_id,case_id,patient_id,gender,age_years,duration_of_stay,pccl,drg
0,249041,3108130426,FA4FEB649DF59BF5,W,64,6,2,I13E
1,249042,3150153245,6DEC4D4214A30CBC,W,18,2,0,O62Z
2,249043,3108130687,7E86E2BF62F384CD,M,20,1,2,B85C
3,249044,3108131372,6BD39AF8128666A9,W,62,9,3,F60B
4,249045,3150154687,33E8D523439BF4E5,W,0,3,0,P67D


In [6]:
num_cases_in_db = cases_in_db.shape[0]
if num_cases_in_db == 0:
    raise ValueError(f"There is no data for the hospital '{file_info.hospital_name_db}' in {file_info.year}")
else:
    logger.info(f"Read {num_cases_in_db} rows from the DB, for the hospital '{file_info.hospital_name_db}' in {file_info.year}")    

2022-10-19 16:06:25.394 | INFO     | __main__:<module>:5 - Read 5110 rows from the DB, for the hospital 'KSSG Linth' in 2019


In [7]:
# Remove columns with empty 'case_id' in DB data

cases_in_db['case_id'].replace('', np.nan, inplace=True)
num_na_cases_in_db = cases_in_db['case_id'].isna().sum()
cases_in_db.dropna(subset=['case_id'], inplace=True)
logger.info(f"{num_na_cases_in_db} rows with empty 'case_id's were discovered and removed from the DB, for the hospital '{file_info.hospital_name_db}' in {file_info.year}")   

2022-10-19 16:06:25.406 | INFO     | __main__:<module>:6 - 0 rows with empty 'case_id's were discovered and removed from the DB, for the hospital 'KSSG Linth' in 2019


In [8]:
# get the case_id from revised_case 
df_revised_case_d2d['case_id_num'] = df_revised_case_d2d[CASE_ID_COL].astype(int)
cases_in_db['case_id_num'] = cases_in_db[CASE_ID_COL].astype(int)

In [9]:
# get case ids in revised case data from DtoD
case_id_d2d = df_revised_case_d2d['case_id_num'].values

In [10]:
# Check for duplicates in the case ids of DtoD

case_id_d2d_set = set(case_id_d2d)
if len(case_id_d2d) > len(case_id_d2d_set):
    logger.info('There are duplicated case_ids in the revised cases.' )
else:
    logger.info('There are no duplicated case_ids in the revised cases.' )

len(case_id_d2d_set)

2022-10-19 16:06:25.426 | INFO     | __main__:<module>:7 - There are no duplicated case_ids in the revised cases.


15

In [11]:
# Search for case_ids in the BFS DB data with the same case_ids as in revised case dataset of DtoD 
# Reason: check whether case ids of the revised data appear more than once in BFS DB data

revised_case_bfs_db = cases_in_db[cases_in_db['case_id_num'].isin(case_id_d2d)]
revised_case_bfs_db.shape[0]

15

In [12]:
# Join revised cases from DtoD with BFS Cases from Database

if revised_case_bfs_db.shape[0] <= len(case_id_d2d):
    revised_case_d2d_subset = df_revised_case_d2d[COL_SUBSET_FROM_REVISED_CASES + ['case_id_num', 'gender', 'age_years', 'duration_of_stay', 'pccl', 'drg']]
    revised_case_normalized = revised_case_d2d_subset.merge(revised_case_bfs_db, how ='inner', on=['case_id_num', 'gender', 'age_years', 'duration_of_stay', 'pccl', 'drg'], suffixes=None)
    num_match_case_id = revised_case_normalized.shape[0]
else:
    revised_case_d2d_subset = df_revised_case_d2d[COL_SUBSET_FROM_REVISED_CASES + ['case_id_num', 'patient_id', 'gender', 'age_years', 'duration_of_stay', 'pccl', 'drg']]
    revised_case_normalized = revised_case_d2d_subset.merge(revised_case_bfs_db, how ='inner', on=['case_id_num', 'patient_id', 'gender', 'age_years', 'duration_of_stay', 'pccl', 'drg'])
    num_match_case_patient_id = revised_case_normalized.shape[0]
print(revised_case_normalized.shape)
# revised_case_normalized.head()

(15, 16)


  revised_case_normalized = revised_case_d2d_subset.merge(revised_case_bfs_db, how ='inner', on=['case_id_num', 'gender', 'age_years', 'duration_of_stay', 'pccl', 'drg'], suffixes=None)


Unnamed: 0,old_pd,new_pd,bfs_code,added_icds,removed_icds,added_chops,removed_chops,case_id_num,gender,age_years,duration_of_stay,pccl,drg,aimedic_id,case_id,patient_id
0,A46,A46,M100,[K746],[],[],[],3108134918,M,49,7,3,J64C,251228,3108134918,DDB3BDB39DE3F695
1,K226,K226,M100,[N184],[D6833],[],[],3108136936,W,89,5,2,G46C,251287,3108136936,1C6A9D7FAA9A7548
2,C183,C183,M100,[E43],[],[],[],3108141419,M,69,10,2,G60B,254034,3108141419,69231F9C61534D42
3,K922,K922,M100,"[D62, D684]",[D500],[],[],3150158985,W,70,2,3,G48C,253110,3150158985,29F13DDB90E1976D
4,K250,K250,M100,[D62],[],[],[],3150164388,M,78,5,2,G46C,250871,3150164388,11721C6EB40D5BB5


In [14]:
# Join the revised cases of DtoD with the BFS data (alternative solution to cross check results)

#if revised_case_bfs_db.shape[0] <= len(case_id_d2d):
#    revised_case_normalized = pd.merge(df_revised_case_d2d, cases_in_db, suffixes=('_drop', ''), on=['case_id_num', 'gender', 'age_years', 'duration_of_stay', 'pccl', 'drg'])
#    revised_case_normalized.drop([i for i in revised_case_normalized.columns if '_drop' in i], axis=1, inplace=True)
#    revised_case_normalized.head()
#else:
#    revised_case_normalized = pd.merge(df_revised_case_d2d, cases_in_db, suffixes=('_drop', ''), on=['case_id_num','patient_id', 'gender', 'age_years', 'duration_of_stay', 'pccl', 'drg'])
#    revised_case_normalized.drop([i for i in revised_case_normalized.columns if '_drop' in i], axis=1, inplace=True)
#    revised_case_normalized.head()
#print(revised_case_normalized.shape)
#revised_case_normalized.head()

In [16]:
num_revised_case_normalized = revised_case_normalized.shape[0]
num_df_revised_case_d2d = df_revised_case_d2d.shape[0] 

if num_revised_case_normalized == 0:
    raise ValueError(f"No cases between the revised cases for the hospital '{file_info.hospital_name_db}' in {file_info.year} were joined.")
else:
    logger.info(f"{num_revised_case_normalized} of {num_df_revised_case_d2d} rows were joined with the cases from the DB, for the hospital '{file_info.hospital_name_db}' in {file_info.year}") 

2022-10-19 16:06:25.477 | INFO     | __main__:<module>:7 - 15 of 15 rows were joined with the cases from the DB, for the hospital 'KSSG Linth' in 2019


In [13]:
df_revised_case_normalized = revised_case_normalized[COLS_TO_OUTPUT]

In [17]:
# Export normalized revision data as a csv: 

hospital = file_info.hospital_name_db.replace(' ', '_') # replace spaces with '_' for the file name
hospital_year = f'{hospital}_{file_info.year}'

output_folder = os.path.join(ROOT_DIR, 'normalized_revision_data/')

if not os.path.exists(output_folder):
    os.mkdir(output_folder)

NEW = '_NEW'
    
output_path = os.path.join(ROOT_DIR, 'normalized_revision_data/') + hospital_year + NEW + '.csv'

df_revised_case_normalized.sort_values('aimedic_id').to_csv(output_path)


