In [None]:
# --- Imports: standard library, third-party, then Colab-specific ---
from datetime import datetime

import numpy as np
import pandas as pd
from google.colab import drive

# --- Notebook configuration ---
# Display every column of a DataFrame instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

# Mount Google Drive at /content/drive; all data paths below live under it.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the DRG and ICD-10 lookup files and the carrier (professional) claims file.
LU_drg = pd.read_csv('/content/drive/MyDrive/Data/Input/DRG.csv')[['DRG', 'Description']]
LU_ICD = pd.read_csv('/content/drive/MyDrive/Data/icd10/ICD10Diagnosis.csv')

# Organize the lookups:
#  - give the two 'Description' columns distinct names so they cannot collide after merges
#  - store DRG as a zero-padded 3-character string (a missing DRG becomes '000')
#  - rename the ICD code column to the key name used by the diagnosis table
# Reassignment is used instead of inplace=True so re-running the cell is side-effect free.
LU_drg = LU_drg.rename(columns={'Description': 'DRG_Description'})
LU_drg['DRG'] = LU_drg['DRG'].fillna(0).astype(int).astype(str).str.zfill(3)
LU_ICD = LU_ICD.rename(columns={'Code': 'ICD_DIAG_CD', 'Description': 'ICD_Description'})

# low_memory=False makes pandas parse the pipe-delimited file in a single pass, so every
# column gets one consistent dtype. The default chunked parsing infers dtypes per chunk and
# can leave columns holding a mix of numbers and strings (reported as a DtypeWarning).
professional = pd.read_csv('/content/drive/MyDrive/Data/Input/carrier.csv', sep="|", low_memory=False)

# One row per column with its inferred dtype; handy for eyeballing the schema.
col_professional = pd.DataFrame(professional.dtypes, columns=['type'])
#col_professional

  professional = pd.read_csv('/content/drive/MyDrive/Data/Input/carrier.csv', sep="|")


In [None]:
# Show how the claim lines are distributed across place-of-service codes.
# An explicit print is required: a bare expression is only displayed when it is the
# last statement of the cell, so the original call computed the counts and discarded them.
print(professional['LINE_PLACE_OF_SRVC_CD'].value_counts())

# count only encounters from office visits, so as not double-count inpatients, outpatients, etc.
## ...(this assumes that in(out)patient encounters have already been counted in other analysis files)
# .copy() detaches the filtered rows from the original frame, so columns assigned later
# are written to an independent DataFrame rather than to a slice (no SettingWithCopyWarning).
professional = professional[professional['LINE_PLACE_OF_SRVC_CD'] == 11].copy()

In [None]:
# Convert columns to appropriate data types and derive CLAIM_DAYS / YR in one step.
# .assign() returns a new DataFrame, so nothing is written into a filtered slice of the
# previous frame. Keyword arguments are evaluated in order and each lambda receives the
# frame as updated so far, which is why CLAIM_DAYS and YR can use the parsed dates.
professional = professional.assign(
    # identifiers are labels, not quantities: keep them as strings
    BENE_ID=lambda df: df['BENE_ID'].astype(str),
    CLM_ID=lambda df: df['CLM_ID'].astype(str),
    # claim dates are parsed with the day-abbreviated month-year format '%d-%b-%Y'
    CLM_FROM_DT=lambda df: pd.to_datetime(df['CLM_FROM_DT'], format='%d-%b-%Y'),
    CLM_THRU_DT=lambda df: pd.to_datetime(df['CLM_THRU_DT'], format='%d-%b-%Y'),
    # claim_days (similar to LOS): inclusive day count, so a same-day claim is 1
    CLAIM_DAYS=lambda df: (df['CLM_THRU_DT'] - df['CLM_FROM_DT']).dt.days + 1,
    # YR: calendar year of the claim's through date
    YR=lambda df: df['CLM_THRU_DT'].dt.year,
)

In [None]:
# Show the distribution of claim lengths before filtering.
# Printed explicitly: as a bare expression that is not the cell's last statement,
# the original call produced no output.
print(professional['CLAIM_DAYS'].value_counts())

# filter professional for only CLAIM_DAYS=1 (ignoring billings of 2 or more days);
# the helper column is constant after the filter, so it is dropped in the same step.
professional = professional[professional['CLAIM_DAYS'] == 1].drop(columns=['CLAIM_DAYS'])

In [None]:
# Keep only the first line of each claim (LINE_NUM == 1), i.e. header-level information.
## Why? Multiple rows per CLM_ID represent a single encounter with the same physician.
# Row counts are printed before and after so the effect of the filter is visible.
print(professional.shape[0])
is_first_line = professional['LINE_NUM'].eq(1)
professional = professional.loc[is_first_line]
print(professional.shape[0])

70159


In [None]:
# Collect the beneficiary, year and every diagnosis-code column
# (the principal diagnosis plus all columns whose name starts with 'ICD_DGNS_CD').
dx_columns = ['PRNCPAL_DGNS_CD'] + [col for col in professional.columns if col.startswith('ICD_DGNS_CD')]
diagnosis = professional[['BENE_ID', 'YR'] + dx_columns]

# wide to long format data: one row per (BENE_ID, YR, source column, code)
diagnosis = pd.melt(diagnosis, id_vars=['BENE_ID', 'YR'], var_name='ICD_DIAG_COL', value_name='ICD_DIAG_CD')

# remove rows where ICD_DIAG_CD is not populated (missing or empty string)
diagnosis = diagnosis[diagnosis['ICD_DIAG_CD'].notna() & (diagnosis['ICD_DIAG_CD'] != '')]

# drop the source-column label, then remove duplicates so each code appears
# at most once per beneficiary and year
diagnosis = diagnosis.drop(columns=['ICD_DIAG_COL']).drop_duplicates()

# Count distinct diagnosis codes per beneficiary-year BEFORE attaching descriptions:
# a left join adds one row per matching lookup row, so a code listed more than once in
# the ICD lookup would otherwise inflate NUM_DIAG. With a unique lookup the result is
# identical to counting after the merge.
num_diagnosis = diagnosis.groupby(['BENE_ID', 'YR']).size().reset_index(name='NUM_DIAG')

# merge with ICD data file to attach the code descriptions (unmatched codes are kept)
diagnosis = pd.merge(diagnosis, LU_ICD, on='ICD_DIAG_CD', how='left')

In [None]:
# Columns that describe one encounter (one claim) at header level.
encounter_cols = ['BENE_ID', 'CLM_ID', 'CLM_FROM_DT', 'CLM_THRU_DT', 'YR',
                  'PRNCPAL_DGNS_CD', 'CLM_PMT_AMT']

# Build the encounter table as one pipeline:
#   1. select the encounter columns and drop exact duplicate rows
#   2. attach NUM_DIAG for the encounter's beneficiary and year
#   3. attach the description of the principal diagnosis from the ICD lookup
#   4. drop the lookup's key column, which duplicates PRNCPAL_DGNS_CD
professional_encounters = (
    professional[encounter_cols]
    .drop_duplicates()
    .merge(num_diagnosis, on=['BENE_ID', 'YR'], how='left')
    .merge(LU_ICD, left_on='PRNCPAL_DGNS_CD', right_on='ICD_DIAG_CD', how='left')
    .drop(columns=['ICD_DIAG_CD'])
)
professional_encounters

Unnamed: 0,BENE_ID,CLM_ID,CLM_FROM_DT,CLM_THRU_DT,YR,PRNCPAL_DGNS_CD,CLM_PMT_AMT,NUM_DIAG,ICD_Description
0,-10000010254618,-10000930037915,2015-09-28,2015-09-28,2015,R4689,932.69,9,Other symptoms and signs involving appearance ...
1,-10000010254618,-10000930037916,2016-10-03,2016-10-03,2016,R4689,745.12,9,Other symptoms and signs involving appearance ...
2,-10000010254618,-10000930037917,2017-10-09,2017-10-09,2017,R4689,1040.54,11,Other symptoms and signs involving appearance ...
3,-10000010254618,-10000930037918,2018-10-15,2018-10-15,2018,R4689,1131.18,11,Other symptoms and signs involving appearance ...
4,-10000010254618,-10000930037919,2019-10-21,2019-10-21,2019,R4689,1382.27,13,Other symptoms and signs involving appearance ...
...,...,...,...,...,...,...,...,...,...
70154,-10000010288007,-10000931486358,2018-06-12,2018-06-12,2018,Z608,1953.80,12,Other problems related to social environment
70155,-10000010288007,-10000931486359,2019-06-18,2019-06-18,2019,Z608,1677.17,12,Other problems related to social environment
70156,-10000010288007,-10000931486360,2020-06-23,2020-06-23,2020,Z608,273.51,11,Other problems related to social environment
70157,-10000010288007,-10000931486361,2021-06-29,2021-06-29,2021,Z608,1323.33,11,Other problems related to social environment


In [None]:
# Write both result tables to the Drive output folder as CSV, without the row index.
for frame, destination in (
    (professional_encounters, '/content/drive/MyDrive/Data/Output/professional_encounters.csv'),
    (diagnosis, '/content/drive/MyDrive/Data/Output/professional_diagnosis.csv'),
):
    frame.to_csv(destination, index=False)