In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from google.colab import drive
# Mount Google Drive (Colab only) so the /content/drive/MyDrive/... paths used below resolve.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Lookup table of DRG codes and their descriptions.
# Source: "ICD-10 MS-DRG Definitions Manual Files V41.1 (ZIP)", downloaded from
# https://www.cms.gov/medicare/payment/prospective-payment-systems/acute-inpatient-pps/ms-drg-classifications-and-software
LU_drg = pd.read_csv('/content/drive/MyDrive/Data/Input/DRG.csv')
LU_drg = LU_drg[['DRG', 'Description']]  # keep only the code and its description

# Lookup table of ICD-10 diagnosis codes.
# Built from the 2015-2025 ICD code files (codes missing from them were looked up separately):
#   2021-2025: https://www.cms.gov/medicare/coding-billing/icd-10-codes
#   2015-2020: https://www.cms.gov/medicare/coding-billing/icd-10-codes/icd-10-cm-icd-10-pcs-gem-archive
LU_ICD = pd.read_csv('/content/drive/MyDrive/Data/icd10/ICD10Diagnosis.csv')

In [None]:
# DRG lookup: give the description a table-specific name and store the DRG code as a
# zero-padded 3-character string (a missing code becomes '000').
LU_drg = (
    LU_drg
    .rename(columns={'Description': 'DRG_Description'})
    .assign(DRG=lambda frame: frame['DRG'].fillna(0).astype(int).astype(str).str.zfill(3))
)

# ICD lookup: rename the code and description columns to the names used in the joins below.
LU_ICD = LU_ICD.rename(columns={'Code': 'ICD_DIAG_CD', 'Description': 'ICD_Description'})

In [None]:
# Read the pipe-delimited inpatient claims extract.
# low_memory=False makes pandas infer each column's dtype from the whole file instead of
# chunk by chunk; the chunked default can leave a column holding a mix of ints and strings
# (reported as a DtypeWarning), which makes the astype(str) conversions and equality
# filters applied later unreliable.
# NOTE(review): parsing without chunking needs more memory; if that becomes a problem,
# pass an explicit dtype= mapping for the affected columns instead.
inpatient = pd.read_csv('/content/drive/MyDrive/Data/Input/inpatient.csv', sep="|", low_memory=False)

# One row per column with its inferred dtype, kept for ad-hoc inspection.
col_inpatient = pd.DataFrame(inpatient.dtypes, columns=['type'])
#col_inpatient

  inpatient = pd.read_csv('/content/drive/MyDrive/Data/Input/inpatient.csv', sep="|")


In [None]:
# Identifier and code columns are labels, not quantities: store them as strings.
inpatient['BENE_ID'] = inpatient['BENE_ID'].astype(str)
inpatient['CLM_ID'] = inpatient['CLM_ID'].astype(str)
inpatient['PTNT_DSCHRG_STUS_CD'] = inpatient['PTNT_DSCHRG_STUS_CD'].astype(str)
inpatient['CLM_IP_ADMSN_TYPE_CD'] = inpatient['CLM_IP_ADMSN_TYPE_CD'].astype(str)

# Claim start / end dates are parsed with the day-abbreviated month-year format (%d-%b-%Y).
inpatient['CLM_FROM_DT'] = pd.to_datetime(inpatient['CLM_FROM_DT'], format='%d-%b-%Y')
inpatient['CLM_THRU_DT'] = pd.to_datetime(inpatient['CLM_THRU_DT'], format='%d-%b-%Y')

# Length of stay counts both end points (+1), so a same-day claim has LOS == 1.
# YR is the calendar year of the claim's through date.
inpatient['LOS'] = (inpatient['CLM_THRU_DT'] - inpatient['CLM_FROM_DT']).dt.days + 1
inpatient['YR'] = inpatient['CLM_THRU_DT'].dt.year

# Build the DRG join key: coerce CLM_DRG_CD to numeric (unparseable values become NaN),
# then format as a zero-padded 3-character string; missing values end up as '000'.
inpatient['CLM_DRG_CD'] = pd.to_numeric(inpatient['CLM_DRG_CD'], errors='coerce')
inpatient['DRG'] = inpatient['CLM_DRG_CD'].fillna(0).astype(int).astype(str).str.zfill(3)

# Drop the raw DRG column plus every column whose name contains 'POA', ends with 'UPIN',
# or starts with 'ICD_DGNS_E_CD' / 'PRCDR_DT'.
columns_to_drop = ['CLM_DRG_CD'] + [col for col in inpatient.columns if 'POA' in col or col.endswith('UPIN') or col.startswith('ICD_DGNS_E_CD') or col.startswith('PRCDR_DT')]
inpatient = inpatient.drop(columns=columns_to_drop)

# Keep only the first line of each claim (header info). Take an explicit copy so the
# column assignment below writes to an independent frame instead of a filtered slice
# (which would raise SettingWithCopyWarning).
inpatient = inpatient[inpatient['CLM_LINE_NUM'] == 1].copy()

# Flag ER admissions: revenue center 450 (the code this notebook treats as ER) with a
# one-day stay. LOS is numeric, so it must be compared with the integer 1 -- comparing it
# with the string '1' never matches and left every flag at 0. REV_CNTR is coerced to
# numeric first so the test also works when the column was read as text (e.g. '0450').
inpatient['ER_flag'] = np.where(
    (pd.to_numeric(inpatient['REV_CNTR'], errors='coerce') == 450) & (inpatient['LOS'] == 1),
    1,
    0,
)

In [None]:
# The diagnosis data is in wide form: each row carries the principal diagnosis plus a run of
# ICD_DGNS_CD* columns, so a beneficiary has several diagnoses per year on one row.
# It is needed in long form instead, i.e. one diagnosis per row per beneficiary.
(
    inpatient[['BENE_ID', 'YR', 'PRNCPAL_DGNS_CD',
               *(name for name in inpatient.columns if name.startswith('ICD_DGNS_CD'))]]
    .head()
)

Unnamed: 0,BENE_ID,YR,PRNCPAL_DGNS_CD,ICD_DGNS_CD1,ICD_DGNS_CD2,ICD_DGNS_CD3,ICD_DGNS_CD4,ICD_DGNS_CD5,ICD_DGNS_CD6,ICD_DGNS_CD7,...,ICD_DGNS_CD16,ICD_DGNS_CD17,ICD_DGNS_CD18,ICD_DGNS_CD19,ICD_DGNS_CD20,ICD_DGNS_CD21,ICD_DGNS_CD22,ICD_DGNS_CD23,ICD_DGNS_CD24,ICD_DGNS_CD25
0,-10000010254618,2015,S134XX,S134XX,R4689,E781,J329,E119,D649,E849,...,,,,,,,,,,
1,-10000010254653,2015,Z3480,T7432X,E669,C50919,,,,,...,,,,,,,,,,
2,-10000010254653,2017,T7432X,T7432X,E669,C50929,,,,,...,,,,,,,,,,
3,-10000010254656,2017,S8290X,S8290X,G40909,R569,Z8669,,,,...,,,,,,,,,,
5,-10000010254656,2018,Z3480,Z5989,Z5941,G40909,R569,Z8669,,,...,,,,,,,,,,


In [None]:
# Build a long table of the distinct diagnosis codes recorded per beneficiary and year.
diagnosis = inpatient[['BENE_ID', 'YR', 'PRNCPAL_DGNS_CD'] + [col for col in inpatient.columns if col.startswith('ICD_DGNS_CD')]]

# wide to long: every diagnosis column (principal and ICD_DGNS_CD*) becomes its own row
diagnosis = pd.melt(diagnosis, id_vars=['BENE_ID', 'YR'], var_name='ICD_DIAG_COL', value_name='ICD_DIAG_CD')

# remove rows where ICD_DIAG_CD is not populated (NaN or empty string)
diagnosis = diagnosis[diagnosis['ICD_DIAG_CD'].notna() & (diagnosis['ICD_DIAG_CD'] != '')]

# the source column name is no longer needed; after dropping it, de-duplicate so each
# (BENE_ID, YR, ICD_DIAG_CD) combination appears once
diagnosis = diagnosis.drop(columns=['ICD_DIAG_COL']).drop_duplicates()

#diagnosis[diagnosis['BENE_ID'] == '-10000010254618'] #randomly viewing a beneficiary

# Count diagnoses per beneficiary-year BEFORE attaching descriptions: at this point there is
# exactly one row per distinct code, whereas the left join below adds a row for every
# matching lookup entry, so a code listed more than once in LU_ICD would inflate NUM_DIAG.
num_diagnosis = diagnosis.groupby(['BENE_ID', 'YR']).size().reset_index(name='NUM_DIAG')

# attach the ICD lookup columns; codes without a match keep NaN in those columns
diagnosis = pd.merge(diagnosis, LU_ICD, on='ICD_DIAG_CD', how='left')

In [None]:
# Encounter-level table: the selected claim fields, enriched through three left joins.
inpatient_encounters = (
    inpatient[['BENE_ID', 'CLM_ID', 'CLM_FROM_DT', 'CLM_THRU_DT', 'YR', 'LOS',
               'DRG', 'PRNCPAL_DGNS_CD', 'PTNT_DSCHRG_STUS_CD',
               'CLM_IP_ADMSN_TYPE_CD', 'ER_flag', 'CLM_TOT_CHRG_AMT']]
    # diagnosis count for the beneficiary in the claim's year
    .merge(num_diagnosis, how='left', on=['BENE_ID', 'YR'])
    # ICD lookup columns for the principal diagnosis; the lookup's own key column is redundant
    .merge(LU_ICD, how='left', left_on='PRNCPAL_DGNS_CD', right_on='ICD_DIAG_CD')
    .drop(columns=['ICD_DIAG_CD'])
    # DRG description
    .merge(LU_drg, how='left', on='DRG')
)
#inpatient_encounters

In [None]:
# Write both result tables to the Drive output folder as CSV, without the DataFrame index.
diagnosis.to_csv('/content/drive/MyDrive/Data/Output/diagnosis.csv', index=False)
inpatient_encounters.to_csv('/content/drive/MyDrive/Data/Output/inpatient_encounters.csv', index=False)