# Revised case normalization for USZ 2019

Notes: 
- Duplicated case_id were found, need to check more detail for matching
- Based on the column "case_id", there are 224 out of 150 revised cases from DtoD that can be matched with the bfs database data for USZ 2019
- Based on the column "case_id" and "patient_id", there are 152 out of 150 revised cases from DtoD that can be matched with the bfs database data for USZ 2019 (2 cases)



In [1]:
import pandas as pd
import sys
sys.path.insert(0, '/home/jovyan/work')

import os

from py.global_configs import *
from py.normalize import normalize

from src.service import bfs_cases_db_service as bfs_db
from src.utils.dataframe_utils import remove_duplicated_chops

  class BfsCase(Base):


In [2]:
file_info = FileInfo(
         os.path.join(ROOT_DIR, 'raw_data/USZ_2018-2019_20200730.xlsx'),
         'USZ',
          '2019',
         ['Gesamtauffällige_USZ_2019'])


print(file_info)

FileInfo(path='/home/jovyan/work/src/revised_case_normalization/raw_data/USZ_2018-2019_20200730.xlsx', hospital_name_db='USZ', year='2019', sheets=['Gesamtauffällige_USZ_2019'])


In [3]:
cols_to_rename = dict(COLUMNS_TO_RENAME)
# Replace 'admno' with 'fall nummer'
cols_to_rename.pop('admno')
cols_to_rename['fall nummer'] = CASE_ID_COL

df_revised_case_d2d = normalize(file_info, 0)

2022-10-17 08:01:17.505 | DEBUG    | src.utils.dataframe_utils:_validate_icd_codes:56 - row 93: discarded ICDs after validation {''}
2022-10-17 08:01:17.506 | DEBUG    | src.utils.dataframe_utils:_validate_icd_codes:56 - row 99: discarded ICDs after validation {''}
2022-10-17 08:01:17.506 | DEBUG    | src.utils.dataframe_utils:_validate_icd_codes:56 - row 105: discarded ICDs after validation {''}
2022-10-17 08:01:17.507 | DEBUG    | src.utils.dataframe_utils:_validate_icd_codes:56 - row 108: discarded ICDs after validation {''}
2022-10-17 08:01:17.509 | DEBUG    | src.utils.dataframe_utils:_validate_icd_codes:56 - row 122: discarded ICDs after validation {''}
2022-10-17 08:01:17.510 | DEBUG    | src.utils.dataframe_utils:_validate_icd_codes:56 - row 143: discarded ICDs after validation {''}
2022-10-17 08:01:17.510 | DEBUG    | src.utils.dataframe_utils:_validate_icd_codes:56 - row 145: discarded ICDs after validation {''}
2022-10-17 08:01:17.513 | DEBUG    | src.utils.dataframe_utils:_

Read 150 cases for USZ 2019
TYPES:
case_id             string
patient_id          string
gender              string
age_years            int64
duration_of_stay     int64
pccl                 int64
drg                 string
old_pd              string
new_pd              string
bfs_code            string
added_icds          string
removed_icds        string
added_chops         string
removed_chops       string
dtype: object


2022-10-17 08:01:17.698 | DEBUG    | src.utils.dataframe_utils:_validate_chop_codes:94 - row 32: discarded duplicated and invalid CHOP entries after validation {'99B813::20191209', '009A12::20191209', '99C121::20191217'}
2022-10-17 08:01:17.698 | DEBUG    | src.utils.dataframe_utils:_validate_chop_codes:94 - row 33: discarded duplicated and invalid CHOP entries after validation {'890A32::20191218'}
2022-10-17 08:01:17.699 | DEBUG    | src.utils.dataframe_utils:_validate_chop_codes:94 - row 34: discarded duplicated and invalid CHOP entries after validation {''}
2022-10-17 08:01:17.699 | DEBUG    | src.utils.dataframe_utils:_validate_chop_codes:94 - row 35: discarded duplicated and invalid CHOP entries after validation {'948X42::20191212'}
2022-10-17 08:01:17.700 | DEBUG    | src.utils.dataframe_utils:_validate_chop_codes:94 - row 36: discarded duplicated and invalid CHOP entries after validation {'99B811::20191125', '346X23:R:20191125'}
2022-10-17 08:01:17.701 | DEBUG    | src.utils.dat

In [4]:
df_revised_case_d2d

Unnamed: 0,case_id,patient_id,gender,age_years,duration_of_stay,pccl,drg,old_pd,new_pd,bfs_code,added_icds,removed_icds,added_chops,removed_chops
0,6400282213,3ED8F10117FCF5F5,M,54,13,3,E06A,J4400,J4400,M200,[F058],[],[],[]
1,6400287076,55C5A17F71DADEA0,M,62,5,3,D62C,J068,J068,M800,"[I8028, C770, C787]",[],[],[]
2,6400306817,0150AC67E3196B63,W,74,36,3,G72C,M8088,M8088,M100,[R64],[],[],[]
3,6400316927,153929A139272585,W,79,6,3,B68C,G3531,G3531,M100,[R471],[],[],[]
4,6400278895,014A89BB816D9215,W,71,44,3,F75D,T827,T827,M200,[D508],[D648],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,6400305282,B2EFB1C7F864D6BE,M,54,4,2,F67B,R522,R522,M100,[],[],[],[]
146,6400306506,2BD4038A71FFAFCE,M,51,7,2,M60B,A410,A410,M200,"[R651, C795, C774, N184]",[],[],[]
147,6400307264,45BD6CAFB4B36938,W,84,14,2,J02B,L8924,L8924,M200,[B965],[],[],[]
148,6400308115,CC100ED2DB478B7E,M,46,8,2,H64B,K830,K830,M100,[K831],[],[],[]


# Match revised DtoD data with the bfs data from the database


In [5]:
# get the case_id from revised_case

revised_case_id = df_revised_case_d2d['case_id'].values
revised_case_id

array(['6400282213', '6400287076', '6400306817', '6400316927',
       '6400278895', '6400284826', '6400285318', '6400295190',
       '6400285972', '6400296127', '6400287392', '6400305514',
       '6400315570', '6400325817', '6400325819', '6400326160',
       '6400326265', '6400326306', '6400329277', '6400331178',
       '6400331492', '6400331625', '6400332257', '6400334309',
       '6400334358', '6400334421', '6400335091', '6400338217',
       '6400338301', '6400339053', '6400339098', '6400339110',
       '6400339524', '6400343882', '6400330582', '6400330942',
       '6400330984', '6400331207', '6400335350', '6400335613',
       '6400339577', '6400343386', '6400343864', '6400330417',
       '6400330825', '6400331098', '6400342944', '6400327431',
       '6400329606', '6400330783', '6400335620', '6400335765',
       '6400339824', '6400339830', '6400339976', '6400340009',
       '6400343280', '6400343475', '6400325852', '6400325863',
       '6400330299', '6400330346', '6400335461', '64002

In [6]:
# change the case_id the same format as bfs data e.g. '0041194601'
revised_case_id_pad0 = [case_id.zfill(10) for case_id in revised_case_id]
revised_case_id_pad0[0]
df_revised_case_d2d['case_id'] = revised_case_id_pad0

In [16]:
# match to the database
revised_case_db = bfs_db.get_bfs_cases_by_ids(revised_case_id_pad0)
revised_case_db.head(10)


Unnamed: 0,drg_cost_weight,aimedic_id,hospital_id,case_id,patient_id,age_years,age_days,gender,duration_of_stay,clinic_id,ventilation_hours,admission_weight,gestation_age,admission_date,admission_type,discharge_date,discharge_type,drg,adrg,pccl
0,0.72,337849,14,6400278895,014A89BB816D9215,71,0,W,44,4,0,0,0,2018-11-30,11,2019-01-13,0,F75D,F75,3
1,0.72,337850,14,6400278895,7BCC689D46B39487,71,0,W,44,4,0,0,0,2018-11-30,11,2019-01-13,0,F75D,F75,3
2,1.759,339676,14,6400281744,9E37EB46636D9B84,86,0,W,12,3,0,0,0,2018-12-27,1,2019-01-08,0,F59C,F59,3
3,1.759,339677,14,6400281744,AAAF218127945BA0,86,0,W,12,3,0,0,0,2018-12-27,1,2019-01-08,0,F59C,F59,3
4,0.519,339976,14,6400282164,47AC5130DDC9249B,76,0,M,8,4,0,0,0,2018-12-26,6,2019-01-03,0,X60C,X60,2
5,0.519,339977,14,6400282164,9B10D343F4E3F286,76,0,M,8,4,0,0,0,2018-12-26,6,2019-01-03,0,X60C,X60,2
6,2.583,340028,14,6400282213,3ED8F10117FCF5F5,54,0,M,13,4,0,0,0,2018-12-27,11,2019-01-09,0,E06A,E06,3
7,2.583,340029,14,6400282213,31327693C8667DB6,54,0,M,13,4,0,0,0,2018-12-27,11,2019-01-09,0,E06A,E06,3
8,0.8,340250,14,6400282367,836FC4207945C567,69,0,M,11,11,0,0,0,2018-12-28,1,2019-01-09,0,D20Z,D20,2
9,0.8,340251,14,6400282367,842B90A2A8577757,69,0,M,11,11,0,0,0,2018-12-28,1,2019-01-09,0,D20Z,D20,2


In [8]:
# same the number of matched case to a string for later report

matched_number_report = 'Based on the column "case_id", there are {} out of {} revised cases from DtoD that can be matched with the bfs database data for {} {}'.format(len(revised_case_db), len(revised_case_id), file_info.hospital_name_db, file_info.year)
print(matched_number_report)

Based on the column "case_id", there are 224 out of 150 revised cases from DtoD that can be matched with the bfs database data for USZ 2019


In [9]:
df_revised_case_d2d.columns

Index(['case_id', 'patient_id', 'gender', 'age_years', 'duration_of_stay',
       'pccl', 'drg', 'old_pd', 'new_pd', 'bfs_code', 'added_icds',
       'removed_icds', 'added_chops', 'removed_chops'],
      dtype='object')

In [10]:
# select the relevent columns from revised data

revised_case_d2d_subset = df_revised_case_d2d[COL_SUBSET_FROM_REVISED_CASES]
revised_case_d2d_subset.head()

Unnamed: 0,case_id,old_pd,new_pd,bfs_code,added_icds,removed_icds,added_chops,removed_chops
0,6400282213,J4400,J4400,M200,[F058],[],[],[]
1,6400287076,J068,J068,M800,"[I8028, C770, C787]",[],[],[]
2,6400306817,M8088,M8088,M100,[R64],[],[],[]
3,6400316927,G3531,G3531,M100,[R471],[],[],[]
4,6400278895,T827,T827,M200,[D508],[D648],[],[]


Unnamed: 0_level_0,patient_id,gender,age_years,duration_of_stay,pccl,drg,old_pd,new_pd,bfs_code,added_icds,removed_icds,added_chops,removed_chops
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
6400282213,3ED8F10117FCF5F5,M,54,13,3,E06A,J4400,J4400,M200,[F058],[],[],[]
6400287076,55C5A17F71DADEA0,M,62,5,3,D62C,J068,J068,M800,"[I8028, C770, C787]",[],[],[]
6400306817,0150AC67E3196B63,W,74,36,3,G72C,M8088,M8088,M100,[R64],[],[],[]
6400316927,153929A139272585,W,79,6,3,B68C,G3531,G3531,M100,[R471],[],[],[]
6400278895,014A89BB816D9215,W,71,44,3,F75D,T827,T827,M200,[D508],[D648],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6400305282,B2EFB1C7F864D6BE,M,54,4,2,F67B,R522,R522,M100,[],[],[],[]
6400306506,2BD4038A71FFAFCE,M,51,7,2,M60B,A410,A410,M200,"[R651, C795, C774, N184]",[],[],[]
6400307264,45BD6CAFB4B36938,W,84,14,2,J02B,L8924,L8924,M200,[B965],[],[],[]
6400308115,CC100ED2DB478B7E,M,46,8,2,H64B,K830,K830,M100,[K831],[],[],[]


In [None]:
# change the case_id colum as index
for case_id in 

for col in extra_col:
    df_revised_case_d2d[col] = 

In [46]:
VALIDATION_COLS

['case_id',
 'patient_id',
 'gender',
 'age_years',
 'duration_of_stay',
 'pccl',
 'drg']

# Mulptiple cases found based on case_id, then match again with patient_id

In [39]:
# Match with patient_id
matched_case_and_patient_id = df_revised_case_d2d.merge(revised_case_db, on='patient_id')


In [41]:
from collections import Counter
Counter(matched_case_and_patient_id['patient_id'])

Counter({'3ED8F10117FCF5F5': 1,
         '55C5A17F71DADEA0': 1,
         '0150AC67E3196B63': 1,
         '153929A139272585': 1,
         '014A89BB816D9215': 1,
         'CF32C74CE1FAB68A': 1,
         '48B1ECF48E09C8FD': 1,
         '8D45AD7832912926': 1,
         'F403CAF798BDDB60': 1,
         '5EEABA6047B686D3': 1,
         'EBBADCAE914AC7AE': 4,
         '07BB3070FD46FA9B': 1,
         '05279AFD21356FB0': 1,
         '23E552CDD7FDB6A5': 1,
         '333F6B4AB3056A97': 1,
         '9AB43DF39B1567CF': 1,
         'E3BAB1B906C29079': 1,
         '7A8E561610FF29DE': 1,
         '2D4E6E929C6058C2': 1,
         '7ABF6D3669DE7B76': 1,
         '73E6700D2C0C2186': 1,
         'B975BAF0886D49EC': 1,
         '9053908FB1E9D3CD': 1,
         '8CBA4168B823B818': 1,
         '2706D0D8E45964D2': 1,
         '27FA08A9B0A486E6': 1,
         'AEA4319BDAF50A42': 1,
         '074E5FADCAE940E6': 1,
         'B27C875373E066F3': 1,
         'CF7D433871E6C9DE': 1,
         '3A9A342B91C52903': 1,
        

In [37]:
match_case_and_patient_id[match_case_and_patient_id['case_id_x'] == '6400287392']

Unnamed: 0,case_id_x,patient_id,gender_x,age_years_x,duration_of_stay_x,pccl_x,drg_x,old_pd,new_pd,bfs_code,...,ventilation_hours,admission_weight,gestation_age,admission_date,admission_type,discharge_date,discharge_type,drg_y,adrg,pccl_y
10,6400287392,EBBADCAE914AC7AE,M,69,15,3,X62B,T8578,T8578,M100,...,0,0,0,2019-01-10,1,2019-01-17,0,H60C,H60,3
11,6400287392,EBBADCAE914AC7AE,M,69,15,3,X62B,T8578,T8578,M100,...,0,0,0,2019-02-13,1,2019-02-28,0,X62B,X62,3


In [30]:
from collections import Counter
Counter(matched_case_and_patient_id['case_id_x'])

Counter({'6400282213': 1,
         '6400287076': 1,
         '6400306817': 1,
         '6400316927': 1,
         '6400278895': 1,
         '6400284826': 1,
         '6400285318': 1,
         '6400295190': 1,
         '6400285972': 1,
         '6400296127': 1,
         '6400287392': 2,
         '6400284510': 2,
         '6400305514': 1,
         '6400315570': 1,
         '6400325817': 1,
         '6400325819': 1,
         '6400326160': 1,
         '6400326265': 1,
         '6400326306': 1,
         '6400329277': 1,
         '6400331178': 1,
         '6400331492': 1,
         '6400331625': 1,
         '6400332257': 1,
         '6400334309': 1,
         '6400334358': 1,
         '6400334421': 1,
         '6400335091': 1,
         '6400338217': 1,
         '6400338301': 1,
         '6400339053': 1,
         '6400339098': 1,
         '6400339110': 1,
         '6400339524': 1,
         '6400343882': 1,
         '6400330582': 1,
         '6400330942': 1,
         '6400330984': 1,
         '64

# Forming the normalized dataframe: Merge the selected revised case data with the matched from bfs data


In [11]:

revised_case_normalized = revised_case_d2d_subset.merge(revised_case_db, how ='right', on='case_id')
revised_case_normalized.head()

Unnamed: 0,case_id,old_pd,new_pd,bfs_code,added_icds,removed_icds,added_chops,removed_chops,drg_cost_weight,aimedic_id,...,ventilation_hours,admission_weight,gestation_age,admission_date,admission_type,discharge_date,discharge_type,drg,adrg,pccl
0,6400278895,T827,T827,M200,[D508],[D648],[],[],0.72,337849,...,0,0,0,2018-11-30,11,2019-01-13,0,F75D,F75,3
1,6400278895,T827,T827,M200,[D508],[D648],[],[],0.72,337850,...,0,0,0,2018-11-30,11,2019-01-13,0,F75D,F75,3
2,6400281744,I7025,I7025,M100,[I653],[I652],[],[],1.759,339676,...,0,0,0,2018-12-27,1,2019-01-08,0,F59C,F59,3
3,6400281744,I7025,I7025,M100,[I653],[I652],[],[],1.759,339677,...,0,0,0,2018-12-27,1,2019-01-08,0,F59C,F59,3
4,6400282164,S120,S120,M200,[],[],[],[],0.519,339976,...,0,0,0,2018-12-26,6,2019-01-03,0,X60C,X60,2


In [12]:
df_revised_case_normalized = revised_case_normalized[COLS_TO_SELECT]
df_revised_case_normalized.head()

Unnamed: 0,case_id,patient_id,gender,age_years,duration_of_stay,pccl,drg,old_pd,new_pd,bfs_code,added_icds,removed_icds,added_chops,removed_chops
0,6400278895,014A89BB816D9215,W,71,44,3,F75D,T827,T827,M200,[D508],[D648],[],[]
1,6400278895,7BCC689D46B39487,W,71,44,3,F75D,T827,T827,M200,[D508],[D648],[],[]
2,6400281744,9E37EB46636D9B84,W,86,12,3,F59C,I7025,I7025,M100,[I653],[I652],[],[]
3,6400281744,AAAF218127945BA0,W,86,12,3,F59C,I7025,I7025,M100,[I653],[I652],[],[]
4,6400282164,47AC5130DDC9249B,M,76,8,2,X60C,S120,S120,M200,[],[],[],[]


# Export the normalized revised cases with the neccessary columns to a csv file


In [13]:

hospital = file_info.hospital_name_db.replace(' ', '_') # replace spaces with '_' for the file name
hospital_year = f'{hospital}_{file_info.year}'

output_folder = os.path.join(ROOT_DIR, 'normalized_revision_data/')

if not os.path.exists(output_folder):
    os.mkdir(output_folder)

output_path = os.path.join(ROOT_DIR, 'normalized_revision_data/') + hospital_year + '.csv'

df_revised_case_normalized.to_csv(output_path)


# Report

A small summary of:
- how many cases could be matched in the DB, and based on which columns
- how many cases were “correctly” revised, in terms of upcoding (still need to do after the `grouper` is available for python)

In [14]:
matched_number_report

'Based on the column "case_id", there are 224 out of 150 revised cases from DtoD that can be matched with the bfs database data for USZ 2019'

In [15]:
print('All the normalized revised cases are exported to here: {}'.format(output_path))

All the normalized revised cases are exported to here: /home/jovyan/work/src/revised_case_normalization/normalized_revision_data/USZ_2019.csv


In [34]:
print(f'Based on the column "case_id" and "patient_id", there are {matched_case_and_patient_id.shape[0]} out of 150 revised cases from DtoD that can be matched with the bfs database data for USZ 2019')

Based on the column "case_id" and "patient_id", there are 152 out of 150 revised cases from DtoD that can be matched with the bfs database data for USZ 2019
