In [None]:
# Imports: standard library first, then third-party, then Colab-specific.
from datetime import datetime

import numpy as np
import pandas as pd
from google.colab import drive

# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Mount Google Drive at /content/drive; every path read or written below lives under it.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# ICD-10 diagnosis lookup table. Its columns are renamed so that the code column
# carries the 'ICD_DIAG_CD' name used as the join key against the claims data.
LU_ICD = pd.read_csv('/content/drive/MyDrive/Data/icd10/ICD10Diagnosis.csv')
LU_ICD = LU_ICD.rename(columns={'Code': 'ICD_DIAG_CD', 'Description': 'ICD_Description'})

# Pipe-delimited outpatient claims file.
# Identifier / code columns are read as text up front: letting pandas infer a numeric
# dtype and casting to str afterwards would drop leading zeros and would render values
# as e.g. '1.0' whenever the column contains missing entries (float inference).
outpatient = pd.read_csv(
    '/content/drive/MyDrive/Data/Input/outpatient.csv',
    sep='|',
    dtype={'BENE_ID': str, 'CLM_ID': str, 'PTNT_DSCHRG_STUS_CD': str},
)

# One row per column of `outpatient` with its dtype, for quick schema inspection.
col_outpatient = pd.DataFrame(outpatient.dtypes, columns=['type'])
#col_outpatient

In [None]:
# Convert identifier / code columns to text.
# A bare astype(str) turns missing values into the literal string 'nan', which would
# then be written to the output files and treated as a real key when grouping.
# The where() keeps genuinely missing entries missing.
for id_col in ('BENE_ID', 'CLM_ID', 'PTNT_DSCHRG_STUS_CD'):
    outpatient[id_col] = outpatient[id_col].astype(str).where(outpatient[id_col].notna())

# Parse claim start / end dates (e.g. '05-Jan-2020') and derive the claim year
# from the claim-through date.
for date_col in ('CLM_FROM_DT', 'CLM_THRU_DT'):
    outpatient[date_col] = pd.to_datetime(outpatient[date_col], format='%d-%b-%Y')
outpatient['YR'] = outpatient['CLM_THRU_DT'].dt.year

# Drop columns that are not used downstream: anything ending in 'UPIN', and anything
# starting with 'ICD_DGNS_E_CD' or 'PRCDR_DT'.
columns_to_drop = [
    col for col in outpatient.columns
    if col.endswith('UPIN') or col.startswith(('ICD_DGNS_E_CD', 'PRCDR_DT'))
]
outpatient = outpatient.drop(columns=columns_to_drop)

In [None]:
# REV_CNTR (revenue center) 450 means ER, so no ER claims exist in this data
# (but expect lots of 450s in the inpatient file).
# REV_CNTR 780 and 789 relate to telemedicine, so they can be ignored.
# The counts are taken BEFORE filtering, and kept in a variable so they can be
# shown as the cell's last expression (a bare value_counts() in the middle of a
# cell is computed and then silently discarded).
rev_cntr_counts = outpatient['REV_CNTR'].value_counts()

# Keep only rows with CLM_LINE_NUM == 1 (presumably one row per claim — TODO confirm).
# .copy() makes the result an independent frame, so adding columns to it later does
# not raise SettingWithCopyWarning.
outpatient = outpatient[outpatient['CLM_LINE_NUM'] == 1].copy()

rev_cntr_counts

In [None]:
# Compute LOS (length of stay) in days, counting both the first and the last day
# of the claim, hence the +1.
# (ND: usually 1 for all outpatient data, but could be more than 1 day for patients
# under observation/recovery.)
# assign() builds a new frame instead of writing a column into `outpatient` in place;
# when `outpatient` is a filtered selection of another frame, in-place assignment can
# raise SettingWithCopyWarning.
outpatient = outpatient.assign(
    LOS=(outpatient['CLM_THRU_DT'] - outpatient['CLM_FROM_DT']).dt.days + 1
)
outpatient['LOS'].value_counts()

Unnamed: 0_level_0,count
LOS,Unnamed: 1_level_1
1,371460
2,31087
3,81
8,22
4,2
48,1


In [None]:
# Diagnosis columns: the principal diagnosis plus every column starting with 'ICD_DGNS_CD'.
dx_cols = ['PRNCPAL_DGNS_CD'] + [col for col in outpatient.columns if col.startswith('ICD_DGNS_CD')]

# Wide to long: one row per (beneficiary, year, source column, code); rows where the
# code is missing or empty are removed.
diagnosis = outpatient[['BENE_ID', 'YR'] + dx_cols].melt(
    id_vars=['BENE_ID', 'YR'], var_name='ICD_DIAG_COL', value_name='ICD_DIAG_CD'
)
diagnosis = diagnosis[diagnosis['ICD_DIAG_CD'].notna() & (diagnosis['ICD_DIAG_CD'] != '')]

# The source column name is no longer needed; after dropping it, de-duplicate so each
# code appears once per beneficiary-year.
diagnosis = diagnosis.drop(columns=['ICD_DIAG_COL']).drop_duplicates()

# Count distinct codes per beneficiary-year BEFORE joining the lookup: the join could
# multiply rows if the lookup ever lists a code more than once, which would inflate
# NUM_DIAG.
num_diagnosis = diagnosis.groupby(['BENE_ID', 'YR']).size().reset_index(name='NUM_DIAG')

# Attach ICD descriptions. The lookup is de-duplicated on its key so the left join
# cannot add rows.
icd_lookup = LU_ICD.drop_duplicates(subset='ICD_DIAG_CD')
diagnosis = pd.merge(diagnosis, icd_lookup, on='ICD_DIAG_CD', how='left')
#num_diagnosis

In [None]:
# Claim-level columns carried into the encounters table ('LOS' is currently commented out).
encounter_cols = ['BENE_ID', 'CLM_ID', 'CLM_FROM_DT', 'CLM_THRU_DT', 'YR', #'LOS',
                  'PRNCPAL_DGNS_CD', 'PTNT_DSCHRG_STUS_CD', 'CLM_TOT_CHRG_AMT']

# De-duplicate the ICD lookup on its key so the description join below cannot
# duplicate claim rows if a code is ever listed more than once.
icd_lookup = LU_ICD.drop_duplicates(subset='ICD_DIAG_CD')

# Left joins:
#   1. number of distinct diagnoses for the beneficiary in that year (NUM_DIAG)
#   2. description of the principal diagnosis; the lookup's own key column is
#      dropped afterwards because it repeats PRNCPAL_DGNS_CD.
outpatient_encounters = (
    outpatient[encounter_cols]
    .merge(num_diagnosis, on=['BENE_ID', 'YR'], how='left')
    .merge(icd_lookup, left_on='PRNCPAL_DGNS_CD', right_on='ICD_DIAG_CD', how='left')
    .drop(columns=['ICD_DIAG_CD'])
)

# Both right-hand tables are unique on their join keys, so the row count must be unchanged.
assert len(outpatient_encounters) == len(outpatient), "joins changed the number of claim rows"

In [None]:
# Write both result tables to Drive as CSV, without the pandas row index.
for frame, target in (
    (outpatient_encounters, '/content/drive/MyDrive/Data/Output/outpatient_encounters.csv'),
    (diagnosis, '/content/drive/MyDrive/Data/Output/outpatient_diagnosis.csv'),
):
    frame.to_csv(target, index=False)