# Revised case normalization for Winterthur 2017

Notes:

- 216 out of 218 revised cases from DtoD can be matched with the BFS DB Data of Winterthur 2017.

In [1]:
import pandas as pd
import sys
sys.path.insert(0, '/home/jovyan/work')

import os

from py.global_configs import *
from py.normalize import normalize

from src.service import bfs_cases_db_service as bfs_db

from src.utils.dataframe_utils import remove_duplicated_chops

  class BfsCase(Base):


In [2]:

# Describe the raw revision workbook: its location, the hospital name as
# stored in the DB, the revision year and the sheet(s) to read.
winterthur_xlsx_path = os.path.join(ROOT_DIR, 'raw_data/Winterthur.xlsx')
winterthur_sheets = ['Änderungen _Winterthur_2017']
file_info = FileInfo(winterthur_xlsx_path, 'KSW', '2017', winterthur_sheets)

print(file_info)


FileInfo(path='/home/jovyan/work/src/revised_case_normalization/raw_data/Winterthur.xlsx', hospital_name_db='KSW', year='2017', sheets=['Änderungen _Winterthur_2017'])


In [3]:
# Work on a copy of the shared mapping so COLUMNS_TO_RENAME itself stays untouched.
cols_to_rename = dict(COLUMNS_TO_RENAME)
# In this workbook the case ID lives in the 'fall nummer' column instead of 'admno'.
del cols_to_rename['admno']
cols_to_rename['fall nummer'] = CASE_ID_COL

# NOTE(review): cols_to_rename is not passed to normalize() below — confirm
# whether normalize() should receive it or whether this mapping is unused.
# NOTE(review): the meaning of the second argument (0) is not visible here — TODO confirm.
df_revised_case_d2d = normalize(file_info, 0)

Read 218 cases for KSW 2017

2022-10-18 12:20:25.944 | DEBUG    | src.utils.dataframe_utils:_validate_icd_codes:56 - row 17: discarded ICDs after validation {''}
2022-10-18 12:20:25.944 | DEBUG    | src.utils.dataframe_utils:_validate_icd_codes:56 - row 30: discarded ICDs after validation {''}
2022-10-18 12:20:25.945 | DEBUG    | src.utils.dataframe_utils:_validate_icd_codes:56 - row 36: discarded ICDs after validation {''}
2022-10-18 12:20:25.947 | DEBUG    | src.utils.dataframe_utils:_validate_icd_codes:56 - row 58: discarded ICDs after validation {''}
2022-10-18 12:20:25.948 | DEBUG    | src.utils.dataframe_utils:_validate_icd_codes:56 - row 81: discarded ICDs after validation {''}
2022-10-18 12:20:25.950 | DEBUG    | src.utils.dataframe_utils:_validate_icd_codes:56 - row 106: discarded ICDs after validation {''}
2022-10-18 12:20:25.951 | DEBUG    | src.utils.dataframe_utils:_validate_icd_codes:56 - row 111: discarded ICDs after validation {''}
2022-10-18 12:20:25.952 | DEBUG    | src.utils.dataframe_utils:_val


TYPES:
case_id             string
patient_id          string
gender              string
age_years            int64
duration_of_stay     int64
pccl                 int64
drg                 string
old_pd              string
new_pd              string
bfs_code            string
added_icds          string
removed_icds        string
added_chops         string
removed_chops       string
dtype: object


2022-10-18 12:20:26.138 | DEBUG    | src.utils.dataframe_utils:_validate_chop_codes:94 - row 96: discarded duplicated and invalid CHOP entries after validation {'004b11:R:20171024'}
2022-10-18 12:20:26.139 | DEBUG    | src.utils.dataframe_utils:_validate_chop_codes:94 - row 97: discarded duplicated and invalid CHOP entries after validation {'99b811::20171213', '99b714::20171208', '890a32::20171215'}
2022-10-18 12:20:26.139 | DEBUG    | src.utils.dataframe_utils:_validate_chop_codes:94 - row 98: discarded duplicated and invalid CHOP entries after validation {'990a::20170708', '570x12::20170708'}
2022-10-18 12:20:26.140 | DEBUG    | src.utils.dataframe_utils:_validate_chop_codes:94 - row 99: discarded duplicated and invalid CHOP entries after validation {'741x20::20170927'}
2022-10-18 12:20:26.141 | DEBUG    | src.utils.dataframe_utils:_validate_chop_codes:94 - row 100: discarded duplicated and invalid CHOP entries after validation {''}
2022-10-18 12:20:26.142 | DEBUG    | src.utils.data

In [4]:
# Display the dataframe of revised cases returned by normalize().
df_revised_case_d2d

Unnamed: 0,case_id,patient_id,gender,age_years,duration_of_stay,pccl,drg,old_pd,new_pd,bfs_code,added_icds,removed_icds,added_chops,removed_chops
0,20066273027,D1B4E3C92520FC01,M,35,4,3,M60B,C61,C61,M100,[C787],[],[],[]
1,20093870027,13320B741B5845D4,M,84,17,3,I46C,S7210,S7210,M200,[T840],[],[],[]
2,20158223027,BA43D16D9B8B472A,M,75,11,3,H01B,C787,C787,M200,[D684],[],[],[]
3,20219240027,7D29F200ABFD59CD,M,87,14,3,I46C,S7201,S7201,M100,[B370],[],[],[]
4,20237303027,2BDCB561504F6656,W,83,7,3,I46C,S7201,S7201,M200,[N182],[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213,21190583027,4B61EE1D52949A9D,M,59,4,2,G46C,K922,K922,M100,[I81],[],[],[]
214,21305870027,25DA94C53B280F17,W,62,4,3,E08B,C343,C343,M100,[C771],[],[],[]
215,22510853027,FA8BE33FAB475106,M,77,4,3,B72C,B022,B022,M100,[E871],[],[],[]
216,22881800027,0A54ABA2AA2F5F6D,W,64,4,3,B66C,C793,C793,M100,[C780],[],[],[]


# Match revised DtoD data with the bfs data from the database


In [5]:
# Collect the case IDs of the revised cases; they are used below to look the
# cases up in the BFS database and to report how many cases were matched.

revised_case_id = df_revised_case_d2d['case_id'].values
revised_case_id

array(['20066273027', '20093870027', '20158223027', '20219240027',
       '20237303027', '20520053027', '20521883027', '20595506027',
       '20651153027', '20652950027', '20715770027', '20027270027',
       '20184413027', '20256623027', '20330843027', '20418443027',
       '20536136027', '20603690027', '20603840027', '20722103027',
       '20728766027', '20801606027', '20049053027', '20149040027',
       '20270936027', '20353373027', '20359340027', '20492120027',
       '20498453027', '20500583027', '20113853027', '20418260027',
       '20712080027', '20830463027', '20971223027', '21008423027',
       '20549333027', '20560670027', '20574206027', '20629073027',
       '20716973027', '20863823027', '20906963027', '20912420027',
       '20919833027', '20975933027', '20992130027', '20999783027',
       '21034103027', '21050840027', '21121763027', '21213383027',
       '21246380027', '21261323027', '21768383027', '21976340027',
       '21977336027', '21086843027', '21089240027', '211172300

In [6]:
# Strip leading zeros from every revised case ID; the DB lookup below is done
# with these stripped IDs.
revised_case_id_stripped = list(map(lambda raw_id: raw_id.lstrip('0'), revised_case_id))

In [7]:
# NOTE: disabled alternative, kept for reference — zero-pad the case IDs to
# 10 characters so they have the same format as the BFS data (e.g. '0041194601'),
# instead of stripping the leading zeros as done for the lookup in this notebook.
# revised_case_id_pad0 = [case_id.zfill(10) for case_id in revised_case_id]
# revised_case_id_pad0[0]
# df_revised_case_d2d['case_id'] = revised_case_id_pad0

In [None]:
# Look up the revised cases in the BFS database using the zero-stripped case IDs.
# NOTE(review): presumably the "_no_pad0" variant compares IDs without
# leading-zero padding — confirm in bfs_cases_db_service.
revised_case_db = bfs_db.get_bfs_cases_by_ids_no_pad0(revised_case_id_stripped)
revised_case_db.head()

In [None]:
# Add 'case_id_stripped' (case_id without its leading zeros) to the DB cases;
# it serves as the join key shared with the revised cases.
revised_case_db['case_id_stripped'] = revised_case_db['case_id'].map(
    lambda db_case_id: db_case_id.lstrip('0')
)
revised_case_db.head()

In [None]:
# Add the same 'case_id_stripped' join key (case_id without leading zeros)
# to the revised cases.
df_revised_case_d2d['case_id_stripped'] = df_revised_case_d2d['case_id'].map(
    lambda revised_id: revised_id.lstrip('0')
)
df_revised_case_d2d.head()


# Forming the normalized dataframe: merge the selected revised-case data with the matched BFS data
  If multiple cases are found for the same case_id, patient_id is additionally used for matching.

In [None]:
# Decide on the join keys: when the DB returned more rows than there are
# revised case IDs, patient_id is added to the keys to match once more.
match_without_patient_id = revised_case_db.shape[0] <= len(revised_case_id)
shared_join_cols = ['gender', 'age_years', 'duration_of_stay', 'pccl', 'drg']
if match_without_patient_id:
    join_keys = ['case_id_stripped'] + shared_join_cols
else:
    join_keys = ['case_id_stripped', 'patient_id'] + shared_join_cols

# Keep only the revised-case columns needed downstream plus the join keys,
# then inner-join them with the cases found in the DB.
revised_case_d2d_subset = df_revised_case_d2d[COL_SUBSET_FROM_REVISED_CASES + join_keys]
revised_case_normalized = revised_case_d2d_subset.merge(revised_case_db, how='inner', on=join_keys)

# The report cell reads the counter that belongs to the branch taken here.
if match_without_patient_id:
    num_match_case_id = revised_case_normalized.shape[0]
else:
    num_match_case_patient_id = revised_case_normalized.shape[0]
print(revised_case_normalized.shape)
revised_case_normalized.head()

In [None]:
# Restrict the merged frame to the columns that are written to the output file.
df_revised_case_normalized = revised_case_normalized.loc[:, COLS_TO_OUTPUT]
df_revised_case_normalized.head()

# Export the normalized revised cases with the necessary columns to a CSV file


In [None]:

# Build the output file name "<hospital>_<year>.csv".
hospital = file_info.hospital_name_db.replace(' ', '_') # replace spaces with '_' for the file name
hospital_year = f'{hospital}_{file_info.year}'

output_folder = os.path.join(ROOT_DIR, 'normalized_revision_data/')

# makedirs(exist_ok=True) creates missing parent folders as well and avoids
# the race between checking for the folder and creating it.
os.makedirs(output_folder, exist_ok=True)

# Derive the file path from output_folder instead of repeating the folder literal.
output_path = os.path.join(output_folder, hospital_year + '.csv')

# Write the normalized revised cases (including the dataframe index) as CSV.
df_revised_case_normalized.to_csv(output_path)


# Report

A small summary of:
- how many cases could be matched in the DB, and based on which columns
- how many cases were “correctly” revised, in terms of upcoding (still need to do after the `grouper` is available for python)

In [None]:
# Report how many revised cases could be matched in the BFS DB and on which columns.
# The column list mirrors the merge keys used to build revised_case_normalized:
# patient_id is only part of the key when the DB returned more rows than there
# are revised case IDs.
matched_on_cols = ['case_id', 'gender', 'age_years', 'duration_of_stay', 'pccl', 'drg']
if revised_case_db.shape[0] > len(revised_case_id):
    matched_on_cols.insert(1, 'patient_id')

# Quote every column name consistently (the previous messages had an unbalanced quote).
matched_on_text = ', '.join(f'"{col}"' for col in matched_on_cols)

# Read the count from the merged frame itself, so this cell does not depend on
# which branch-specific counter variable was defined earlier.
num_matched = revised_case_normalized.shape[0]
print(f'Based on the columns {matched_on_text}, there are {num_matched} out of {len(revised_case_id)} revised cases from DtoD that can be matched with the bfs database data for {file_info.hospital_name_db} {file_info.year}')

In [None]:
# Tell the reader where the exported CSV file was written.
print(f'All the normalized revised cases are exported to here: {output_path}')