# Revised case normalization for Hirslanden Salem 2017

Notes: 
- No case can be matched with the bfs database without modification. The reason is that a case_id in the revised cases from DtoD (e.g. `'41194601'`) needs to be left-padded with zeros to 10 characters (here `'00'`) to match the corresponding bfs case_id (e.g. `'0041194601'`).




In [1]:
import pandas as pd
import sys
sys.path.insert(0, '/home/jovyan/work')

import os

from py.global_configs import *
from py.normalize import normalize

from src.service import bfs_cases_db_service as bfs_db
from src.utils.dataframe_utils import remove_duplicated_chops

  class BfsCase(Base):


In [2]:
# Describe the raw Excel workbook holding the revised cases: its path, the
# hospital name and year it belongs to, and the sheet(s) to read from it.
raw_workbook_path = os.path.join(ROOT_DIR, 'raw_data/HI-Bern_Salem_Beau Site_Linde.xlsx')
sheets_to_read = ['Änderungen_SA_2017']

file_info = FileInfo(raw_workbook_path, 'Hirslanden Salem', '2017', sheets_to_read)

# Echo the descriptor so the resolved path is visible in the notebook output.
print(file_info)

FileInfo(path='/home/jovyan/work/src/revised_case_normalization/raw_data/HI-Bern_Salem_Beau Site_Linde.xlsx', hospital_name_db='Hirslanden Salem', year='2017', sheets=['Änderungen_SA_2017'])


In [3]:
# Work on a private copy so the shared COLUMNS_TO_RENAME mapping is not mutated.
cols_to_rename = dict(COLUMNS_TO_RENAME)
# In this mapping the case id is keyed by 'fall nummer' instead of 'admno'.
del cols_to_rename['admno']
cols_to_rename['fall nummer'] = CASE_ID_COL

# NOTE(review): cols_to_rename is not passed to normalize() below — confirm
# whether normalize() is supposed to receive it or whether the mapping is unused.
df_revised_case_d2d = normalize(file_info, 0)

2022-10-14 11:54:29.324 | DEBUG    | src.utils.dataframe_utils:_validate_icd_codes:56 - row 26: discarded ICDs after validation {''}
2022-10-14 11:54:29.325 | DEBUG    | src.utils.dataframe_utils:_validate_icd_codes:56 - row 34: discarded ICDs after validation {''}
2022-10-14 11:54:29.326 | DEBUG    | src.utils.dataframe_utils:_validate_icd_codes:56 - row 43: discarded ICDs after validation {''}
2022-10-14 11:54:29.327 | DEBUG    | src.utils.dataframe_utils:_validate_icd_codes:56 - row 53: discarded ICDs after validation {''}
2022-10-14 11:54:29.330 | DEBUG    | src.utils.dataframe_utils:_validate_icd_codes:56 - row 0: discarded ICDs after validation {''}
2022-10-14 11:54:29.331 | DEBUG    | src.utils.dataframe_utils:_validate_icd_codes:56 - row 1: discarded ICDs after validation {''}
2022-10-14 11:54:29.331 | DEBUG    | src.utils.dataframe_utils:_validate_icd_codes:56 - row 2: discarded ICDs after validation {''}
2022-10-14 11:54:29.332 | DEBUG    | src.utils.dataframe_utils:_validate

Read 57 cases for Hirslanden Salem 2017
TYPES:
case_id             string
patient_id          string
gender              string
age_years            int64
duration_of_stay     int64
pccl                 int64
drg                 string
old_pd              string
new_pd              string
bfs_code            string
added_icds          string
removed_icds        string
added_chops         string
removed_chops       string
dtype: object


2022-10-14 11:54:29.337 | DEBUG    | src.utils.dataframe_utils:_validate_icd_codes:56 - row 13: discarded ICDs after validation {''}
2022-10-14 11:54:29.338 | DEBUG    | src.utils.dataframe_utils:_validate_icd_codes:56 - row 14: discarded ICDs after validation {''}
2022-10-14 11:54:29.339 | DEBUG    | src.utils.dataframe_utils:_validate_icd_codes:56 - row 17: discarded ICDs after validation {''}
2022-10-14 11:54:29.339 | DEBUG    | src.utils.dataframe_utils:_validate_icd_codes:56 - row 18: discarded ICDs after validation {''}
2022-10-14 11:54:29.340 | DEBUG    | src.utils.dataframe_utils:_validate_icd_codes:56 - row 19: discarded ICDs after validation {''}
2022-10-14 11:54:29.341 | DEBUG    | src.utils.dataframe_utils:_validate_icd_codes:56 - row 20: discarded ICDs after validation {''}
2022-10-14 11:54:29.342 | DEBUG    | src.utils.dataframe_utils:_validate_icd_codes:56 - row 21: discarded ICDs after validation {''}
2022-10-14 11:54:29.343 | DEBUG    | src.utils.dataframe_utils:_valid

In [4]:
# Preview only the first rows instead of dumping the whole frame (57 cases) inline.
df_revised_case_d2d.head(10)

Unnamed: 0,case_id,patient_id,gender,age_years,duration_of_stay,pccl,drg,old_pd,new_pd,bfs_code,added_icds,removed_icds,added_chops,removed_chops
0,41194601,8A6B7DE0296BEFB8,W,73,10,3,I09C,M4806,M4806,M200,[N183],[],[],[]
1,41248206,8DAB6117ADBB2AFE,M,84,8,3,I43B,M179,M179,M200,[I420],[],[],[]
2,41302785,5865672002F07F3F,W,97,13,3,F62C,I5001,I5001,M100,"[E876, A099]",[],[],[]
3,41304308,0CD7621F494A9140,M,36,3,3,L63D,N390,N390,M100,"[G8202, Q051]",[],[],[]
4,41237114,376800437C6F8D5B,M,78,7,3,I09D,M9963,M9963,M200,"[T842, F101]",[],[],[]
5,41253912,5E937981CA697D01,W,55,8,3,E77F,J181,J181,M100,[R650],[],[],[]
6,41255088,5491B8F1614701D0,M,70,7,3,E77F,J181,J181,M100,"[J4402, B952, B953, Z223]",[J4492],[],[]
7,41292119,E300D9957309095B,W,66,9,3,I76B,T7961,T7961,M100,"[E539, J4409]",[E538],[],[]
8,41338818,A5F45686FD659A16,M,76,6,3,E65C,J4402,J4402,M100,[K564],[],[],[]
9,41349287,D440E215C612652F,W,58,5,3,I27C,M751,M751,M200,[E1191],[E1190],[],[]


# Match revised DtoD data with the bfs data from the database


In [5]:
# Collect the case ids of the revised cases as an array.
revised_case_id = df_revised_case_d2d.loc[:, 'case_id'].values
revised_case_id

array(['41194601', '41248206', '41302785', '41304308', '41237114',
       '41253912', '41255088', '41292119', '41338818', '41349287',
       '41364918', '41389850', '41389983', '41375575', '41405774',
       '41407198', '41417724', '41422903', '41429357', '41438489',
       '41442434', '41475362', '41397953', '41433914', '41437552',
       '41447097', '41230075', '41506483', '41506527', '41506722',
       '41529453', '41456079', '41470794', '41487802', '41271168',
       '41300833', '41418204', '41420840', '41579881', '41581364',
       '41608546', '41367724', '41451537', '41464045', '41490828',
       '41597280', '41400942', '41409001', '41443673', '41452311',
       '41486224', '41577457', '41478672', '41502799', '41510657',
       '41495658', '41532546'], dtype=object)

In [6]:
# bfs case ids look like '0041194601', so left-pad the revised ids with zeros
# to 10 characters to bring them into the same format.
revised_case_id_pad0 = [case_id.zfill(10) for case_id in revised_case_id]
# Write the padded ids back so the revised frame uses the same key format as bfs.
df_revised_case_d2d['case_id'] = revised_case_id_pad0

In [7]:
# Look up the zero-padded case ids in the bfs database (via get_bfs_cases_by_ids)
# and preview the first returned records.
revised_case_db = bfs_db.get_bfs_cases_by_ids(revised_case_id_pad0)
revised_case_db.head(n=5)

Unnamed: 0,drg_cost_weight,aimedic_id,hospital_id,case_id,patient_id,age_years,age_days,gender,duration_of_stay,clinic_id,ventilation_hours,admission_weight,gestation_age,admission_date,admission_type,discharge_date,discharge_type,drg,adrg,pccl
0,2.801,23138,3,41194601,8A6B7DE0296BEFB8,73,0,W,10,4,0,0,0,2017-01-18,1,2017-01-28,0,I09C,I09,3
1,0.752,23713,3,41230075,DCCEF5165C8D4983,70,0,W,2,5,0,0,0,2017-02-01,1,2017-02-03,0,J25Z,J25,2
2,2.578,23920,3,41237114,376800437C6F8D5B,78,0,M,7,4,0,0,0,2017-01-10,1,2017-01-17,0,I09D,I09,3
3,1.965,24109,3,41248206,8DAB6117ADBB2AFE,84,0,M,8,4,0,0,0,2017-02-08,1,2017-02-16,0,I43B,I43,3
4,0.83,24213,3,41253912,5E937981CA697D01,55,0,W,8,3,0,0,0,2017-01-17,1,2017-01-25,0,E77F,E77,3


In [8]:
# Report how many rows the database lookup returned versus how many revised
# DtoD cases were submitted, for this hospital and year.
match_report_template = 'Based on the column "case_id", there are {} out of {} revised cases from DtoD that can be matched with the bfs database data for {} {}'
n_db_rows = len(revised_case_db)
n_revised_cases = len(revised_case_id)
print(match_report_template.format(n_db_rows, n_revised_cases, file_info.hospital_name_db, file_info.year))

Based on the column "case_id", there are 57 out of 57 revised cases from DtoD that can be matched with the bfs database data for Hirslanden Salem 2017


In [9]:
# If matching cases are found, we still need to check that case_id, gender, year, ... agree between the two sources

In [10]:
# List the columns available in the revised-case frame.
df_revised_case_d2d.columns

Index(['case_id', 'patient_id', 'gender', 'age_years', 'duration_of_stay',
       'pccl', 'drg', 'old_pd', 'new_pd', 'bfs_code', 'added_icds',
       'removed_icds', 'added_chops', 'removed_chops'],
      dtype='object')

In [11]:
# Select case_id together with new_pd and the added/removed ICD and CHOP columns
# from the revised cases.
revision_columns = ['case_id', 'new_pd', 'added_icds', 'removed_icds', 'added_chops', 'removed_chops']
revised_case_d2d_subset = df_revised_case_d2d[revision_columns]
revised_case_d2d_subset.head(n=5)

Unnamed: 0,case_id,new_pd,added_icds,removed_icds,added_chops,removed_chops
0,41194601,M4806,[N183],[],[],[]
1,41248206,M179,[I420],[],[],[]
2,41302785,I5001,"[E876, A099]",[],[],[]
3,41304308,N390,"[G8202, Q051]",[],[],[]
4,41237114,M9963,"[T842, F101]",[],[],[]


In [12]:
# Attach the revision columns to the records returned from the bfs database,
# joining on case_id. how='right' keeps every row of revised_case_db (the
# right-hand frame).
revised_case_normalized = pd.merge(
    revised_case_d2d_subset,
    revised_case_db,
    on='case_id',
    how='right',
)
revised_case_normalized.head(n=5)

Unnamed: 0,case_id,new_pd,added_icds,removed_icds,added_chops,removed_chops,drg_cost_weight,aimedic_id,hospital_id,patient_id,...,ventilation_hours,admission_weight,gestation_age,admission_date,admission_type,discharge_date,discharge_type,drg,adrg,pccl
0,41194601,M4806,[N183],[],[],[],2.801,23138,3,8A6B7DE0296BEFB8,...,0,0,0,2017-01-18,1,2017-01-28,0,I09C,I09,3
1,41230075,C509,[],[],[],[],0.752,23713,3,DCCEF5165C8D4983,...,0,0,0,2017-02-01,1,2017-02-03,0,J25Z,J25,2
2,41237114,M9963,"[T842, F101]",[],[],[],2.578,23920,3,376800437C6F8D5B,...,0,0,0,2017-01-10,1,2017-01-17,0,I09D,I09,3
3,41248206,M179,[I420],[],[],[],1.965,24109,3,8DAB6117ADBB2AFE,...,0,0,0,2017-02-08,1,2017-02-16,0,I43B,I43,3
4,41253912,J181,[R650],[],[],[],0.83,24213,3,5E937981CA697D01,...,0,0,0,2017-01-17,1,2017-01-25,0,E77F,E77,3
