# Revised case normalization for KSSG Linth 2019

Note:

- All 15 of 15 cases from the revised data could be matched with the BFS DB data of KSSG Linth 2019. 



In [1]:
import pandas as pd
import sys
sys.path.insert(0, '/home/jovyan/work')

import os

from py.global_configs import *
from py.normalize import normalize

from src.service import bfs_cases_db_service as bfs_db

from src.utils.dataframe_utils import remove_duplicated_chops

ModuleNotFoundError: No module named 'py'

In [None]:

# Describe the raw revision workbook for this hospital/year.
# NOTE(review): the last argument is presumably the list of sheet names to read -- confirm against FileInfo.
raw_workbook_path = os.path.join(ROOT_DIR, 'raw_data/Linth_Toggenburg_SRRWS_2019.xlsx')
sheet_names = ['Änderungen_Spital_Linth_2019']
file_info = FileInfo(raw_workbook_path, 'KSSG Linth', '2019', sheet_names)

print(file_info)


In [None]:
# Work on a copy of the shared rename mapping: drop the 'admno' entry and map
# 'fall nummer' to CASE_ID_COL instead.
# NOTE(review): cols_to_rename is not passed to normalize() below and is not
# referenced in any later visible cell -- confirm whether normalize() should receive it.
cols_to_rename = dict(COLUMNS_TO_RENAME)
del cols_to_rename['admno']  # raises KeyError if 'admno' is missing
cols_to_rename.update({'fall nummer': CASE_ID_COL})

# Normalize the revised cases described by file_info.
df_revised_case_d2d = normalize(file_info, 0)

In [None]:
# Show the dataframe returned by normalize() (rendered as the cell output in a notebook).
df_revised_case_d2d

# Match revised DtoD data with the bfs data from the database


In [None]:
# Pull the case identifiers out of the revised cases as an array.
case_id_column = df_revised_case_d2d['case_id']
revised_case_id = case_id_column.values
revised_case_id

In [None]:
# Remove the leading zeros from every revised case_id.
revised_case_id_stripped = [raw_id.lstrip('0') for raw_id in revised_case_id]

### Check whether the revised cases contain duplicated case_ids after stripping the leading zeros; if so, the duplicated cases are deleted

In [None]:

# Collapse the stripped ids into a set; if the set is smaller than the list,
# at least one id occurs more than once.
revised_case_id_stripped_set = set(revised_case_id_stripped)
has_duplicates = len(revised_case_id_stripped_set) < len(revised_case_id_stripped)
print(
    'There are duplicated case_ids in the revised cases.'
    if has_duplicates
    else 'There are no duplicated case_ids in the revised cases.'
)

In [None]:
# Fetch the database cases whose ids match the unique, zero-stripped revised case ids.
unique_stripped_ids = list(revised_case_id_stripped_set)
revised_case_db = bfs_db.get_bfs_cases_by_ids_no_pad0(unique_stripped_ids)
revised_case_db.head()

In [None]:
# Add a 'case_id_stripped' column (case_id with its leading zeros removed) to
# the database cases, to be used as the join key with the revised cases.
db_case_ids = revised_case_db['case_id']
revised_case_db['case_id_stripped'] = db_case_ids.apply(lambda case_id: case_id.lstrip('0'))
revised_case_db.head()

In [None]:
# (Re)compute the zero-stripped join key on the database cases. The assignment
# is idempotent, so running it again after an earlier cell is harmless.
revised_case_db['case_id_stripped'] = revised_case_db['case_id'].apply(lambda x: x.lstrip('0'))

# Print the shape first and keep head() as the final expression: a notebook
# renders only the last expression of a cell, so a head() call placed before
# print() has its result silently discarded.
print(revised_case_db.shape)
revised_case_db.head()

In [None]:
# Add a 'case_id_stripped' column (case_id with its leading zeros removed) to
# the revised cases, to be used as the join key with the cases from the database.
df_revised_case_d2d['case_id_stripped'] = df_revised_case_d2d['case_id'].apply(lambda x: x.lstrip('0'))

# Print the shape first and keep head() as the final expression: a notebook
# renders only the last expression of a cell, so a head() call placed before
# print() has its result silently discarded.
print(df_revised_case_d2d.shape)
df_revised_case_d2d.head()

In [None]:
# Delete duplicated cases in the revised cases, keeping the last occurrence of
# each zero-stripped case_id.
df_revised_case_d2d.drop_duplicates(subset=['case_id_stripped'], keep='last', inplace=True)

# After de-duplication exactly one row per distinct stripped id must remain.
# The comparison has to use the number of *unique* ids: the raw id list still
# contains the duplicates, so comparing against its length would fail
# precisely when a duplicate was actually removed.
expected_case_count = len(set(revised_case_id_stripped))
assert df_revised_case_d2d.shape[0] == expected_case_count, (
    f'Expected {expected_case_count} unique revised cases after de-duplication, '
    f'found {df_revised_case_d2d.shape[0]}'
)


# Forming the normalized dataframe: Merge the selected revised case data with the matched BFS data
  If multiple cases are found for a case_id, patient_id is additionally used for matching.  

In [None]:
# Merge the database cases with the selected columns of the revised cases.
# If the database returned more rows than there are revised case ids, a
# case_id matched several database cases, so patient_id is added to the join
# keys to disambiguate them.
# NOTE(review): both merges are left joins from the database rows, so the row
# counts stored below include database rows that found no revised partner --
# confirm this is the intended meaning of "matched".
# NOTE(review): joining on patient_id assumes revised_case_db has a
# 'patient_id' column -- confirm against the database service.
base_match_cols = ['case_id_stripped', 'gender', 'age_years', 'duration_of_stay', 'pccl', 'drg']

if revised_case_db.shape[0] <= len(revised_case_id):
    match_cols = base_match_cols
    revised_case_d2d_subset = df_revised_case_d2d[COL_SUBSET_FROM_REVISED_CASES + match_cols]
    revised_case_normalized = revised_case_db.merge(revised_case_d2d_subset, how='left', on=match_cols)
    num_match_case_id = revised_case_normalized.shape[0]
else:
    # patient_id must be part of the join keys, not merely a selected column;
    # otherwise it plays no role in the matching.
    match_cols = ['case_id_stripped', 'patient_id'] + base_match_cols[1:]
    revised_case_d2d_subset = df_revised_case_d2d[COL_SUBSET_FROM_REVISED_CASES + match_cols]
    revised_case_normalized = revised_case_db.merge(revised_case_d2d_subset, how='left', on=match_cols)
    num_match_case_patient_id = revised_case_normalized.shape[0]
print(revised_case_normalized.shape)
revised_case_normalized.head()

In [None]:
# Keep only the columns that go into the CSV export and check that none was dropped.
export_columns = COLS_TO_OUTPUT
df_revised_case_normalized = revised_case_normalized[export_columns]
assert len(export_columns) == df_revised_case_normalized.shape[1]
df_revised_case_normalized.head()

# Export the normalized revised cases with the necessary columns to a csv file


In [None]:

# Build the output file name as '<hospital>_<year>.csv'.
hospital = file_info.hospital_name_db.replace(' ', '_')  # replace spaces with '_' for the file name
hospital_year = f'{hospital}_{file_info.year}'

output_folder = os.path.join(ROOT_DIR, 'normalized_revision_data/')

# makedirs(exist_ok=True) also creates missing parent directories and does not
# fail when the folder already exists, avoiding the check-then-create race of
# os.path.exists() followed by os.mkdir().
os.makedirs(output_folder, exist_ok=True)

# Reuse output_folder instead of rebuilding the folder path by string concatenation.
output_path = os.path.join(output_folder, f'{hospital_year}.csv')

df_revised_case_normalized.to_csv(output_path)


# Report

A small summary of:
- how many cases could be matched in the DB, and based on which columns
- how many cases were “correctly” revised, in terms of upcoding (still need to do after the `grouper` is available for python)

In [None]:
# Report how many revised cases are counted as matched, and on which columns.
# The branch mirrors the one used for the merge, so only the counter that was
# actually defined there is read.
matched_without_patient_id = revised_case_db.shape[0] <= len(revised_case_id)
if matched_without_patient_id:
    summary = f'Based on the columns "case_id", "gender, "age_years", "duration_of_stay", "pccl", "drg", there are {num_match_case_id} out of {len(revised_case_id)} revised cases from DtoD that can be matched with the bfs database data for {file_info.hospital_name_db} {file_info.year}'
else:
    summary = f'Based on the columns "case_id" and "Patient_id", "gender, "age_years", "duration_of_stay", "pccl", "drg", there are {num_match_case_patient_id} out of {len(revised_case_id)} revised cases from DtoD that can be matched with the bfs database data for {file_info.hospital_name_db} {file_info.year}'
print(summary)

In [None]:
# Tell the user where the exported CSV was written.
print(f'All the normalized revised cases are exported to here: {output_path}')