In [None]:
# Notebook setup: libraries, display options, and Google Drive access.
import pandas as pd
import numpy as np
# NOTE(review): datetime is not referenced in the cells visible here — confirm it is still needed.
from datetime import datetime
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)
# Colab-only: mount Google Drive so the /content/drive/MyDrive/... paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the three diagnosis files produced earlier in the pipeline.
inpatient_diagnosis = pd.read_csv('/content/drive/MyDrive/Data/Output/diagnosis.csv')
outpatient_diagnosis = pd.read_csv('/content/drive/MyDrive/Data/Output/outpatient_diagnosis.csv')
professional_diagnosis = pd.read_csv('/content/drive/MyDrive/Data/Output/professional_diagnosis.csv')

# CCSR lookup file for ICD10 regrouping.
# Every field in this file is text (codes and descriptions, many wrapped in single quotes),
# so read all columns as strings instead of letting pandas infer a dtype per column.
# NOTE(review): the previous run of this cell emitted a warning on this read — presumably a
# mixed-type DtypeWarning from the sparsely filled category columns; confirm it no longer appears.
LU_CCSR = pd.read_csv('/content/drive/MyDrive/Data/Input/DXCCSR_v2025-1.csv', dtype=str)
LU_CCSR.head()

  LU_CCSR = pd.read_csv('/content/drive/MyDrive/Data/Input/DXCCSR_v2025-1.csv')


Unnamed: 0,'ICD-10-CM CODE','ICD-10-CM CODE DESCRIPTION','Default CCSR CATEGORY IP','Default CCSR CATEGORY DESCRIPTION IP','Default CCSR CATEGORY OP','Default CCSR CATEGORY DESCRIPTION OP','CCSR CATEGORY 1','CCSR CATEGORY 1 DESCRIPTION','CCSR CATEGORY 2','CCSR CATEGORY 2 DESCRIPTION','CCSR CATEGORY 3','CCSR CATEGORY 3 DESCRIPTION','CCSR CATEGORY 4','CCSR CATEGORY 4 DESCRIPTION','CCSR CATEGORY 5','CCSR CATEGORY 5 DESCRIPTION','CCSR CATEGORY 6','CCSR CATEGORY 6 DESCRIPTION','Rationale for Default Assignment'
0,'A000',"Cholera due to Vibrio cholerae 01, biovar chol...",'DIG001',Intestinal infection,'DIG001',Intestinal infection,'DIG001',Intestinal infection,'INF003',Bacterial infections,' ',,' ',,' ',,' ',,06 Infectious conditions
1,'A001',"Cholera due to Vibrio cholerae 01, biovar eltor",'DIG001',Intestinal infection,'DIG001',Intestinal infection,'DIG001',Intestinal infection,'INF003',Bacterial infections,' ',,' ',,' ',,' ',,06 Infectious conditions
2,'A009',"Cholera, unspecified",'DIG001',Intestinal infection,'DIG001',Intestinal infection,'DIG001',Intestinal infection,'INF003',Bacterial infections,' ',,' ',,' ',,' ',,06 Infectious conditions
3,'A0100',"Typhoid fever, unspecified",'DIG001',Intestinal infection,'DIG001',Intestinal infection,'DIG001',Intestinal infection,'INF003',Bacterial infections,' ',,' ',,' ',,' ',,06 Infectious conditions
4,'A0101',Typhoid meningitis,'NVS001',Meningitis,'NVS001',Meningitis,'INF003',Bacterial infections,'NVS001',Meningitis,' ',,' ',,' ',,' ',,06 Infectious conditions


In [None]:
# Reduce the CCSR lookup to the three columns this notebook needs, remove the single quotes
# that wrap column names and cell values in the raw file, and rename to the notebook's names.
#
# Built as one chain that returns a new frame: the earlier version called
# rename(..., inplace=True) on a column slice, which raised SettingWithCopyWarning.
ccsr_column_names = {
  'ICD-10-CM CODE': 'ICD_CD',
  'CCSR CATEGORY 1': 'CCSR_CATEGORY_CD',
  'CCSR CATEGORY 1 DESCRIPTION': 'CCSR_CATEGORY_DESCRIPTION',
}

LU_CCSR = (
  LU_CCSR
  # header names arrive as 'ICD-10-CM CODE' (quotes included) -> strip them first
  .rename(columns=lambda name: name.strip("'"))
  .loc[:, list(ccsr_column_names)]
  # the three selected columns hold text, so the quotes can be stripped directly
  # (missing values stay missing)
  .apply(lambda col: col.str.strip("'"))
  .rename(columns=ccsr_column_names)
)
LU_CCSR

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  LU_CCSR.rename(columns={'ICD-10-CM CODE':'ICD_CD', 'CCSR CATEGORY 1':'CCSR_CATEGORY_CD','CCSR CATEGORY 1 DESCRIPTION': 'CCSR_CATEGORY_DESCRIPTION'}, inplace=True)


Unnamed: 0,ICD_CD,CCSR_CATEGORY_CD,CCSR_CATEGORY_DESCRIPTION
0,A000,DIG001,Intestinal infection
1,A001,DIG001,Intestinal infection
2,A009,DIG001,Intestinal infection
3,A0100,DIG001,Intestinal infection
4,A0101,INF003,Bacterial infections
...,...,...,...
75233,Z9912,FAC012,Other specified encounters and counseling
75234,Z992,FAC025,Other specified status
75235,Z993,FAC025,Other specified status
75236,Z9981,FAC025,Other specified status


In [None]:
# Add ICD_CD_TRUNC — the first 3 characters of ICD_DIAG_CD — to each of the three
# diagnosis frames (the frames are modified in place).
for diagnosis_frame in (inpatient_diagnosis, outpatient_diagnosis, professional_diagnosis):
  diagnosis_frame['ICD_CD_TRUNC'] = diagnosis_frame['ICD_DIAG_CD'].str.slice(stop=3)

In [None]:
# Stack the three diagnosis frames into one, then keep each distinct ICD_DIAG_CD once
# (in order of first appearance) as a single-column frame.
diagnosis_frames = [inpatient_diagnosis, outpatient_diagnosis, professional_diagnosis]
all_diagnosis = pd.concat(diagnosis_frames, ignore_index=True)
unique_icd_diag_cd = (
  all_diagnosis['ICD_DIAG_CD']
  .drop_duplicates()
  .reset_index(drop=True)
  .to_frame()
)
unique_icd_diag_cd

Unnamed: 0,ICD_DIAG_CD
0,S134XX
1,Z3480
2,T7432X
3,S8290X
4,Z3400
...,...
381,P269
382,K8020
383,R042
384,Z20822


In [None]:
# Left-join every distinct diagnosis code to the CCSR lookup (ICD_DIAG_CD = ICD_CD), so codes
# that are absent from the lookup stay in the result with empty CCSR fields.
merged_df = unique_icd_diag_cd.merge(LU_CCSR, how='left', left_on='ICD_DIAG_CD', right_on='ICD_CD')

# How many distinct codes found no CCSR description?
rows_without_match = merged_df.loc[merged_df['CCSR_CATEGORY_DESCRIPTION'].isna()]
print(len(rows_without_match))

# Label the unmatched codes 'UNK' (unknown) in both CCSR columns.
merged_df = merged_df.fillna({'CCSR_CATEGORY_CD': 'UNK', 'CCSR_CATEGORY_DESCRIPTION': 'UNK'})

# Keep only ICD_DIAG_CD -> CCSR_CATEGORY_DESCRIPTION; ICD_CD and CCSR_CATEGORY_CD are dropped.
ccsr_helper_columns = ['ICD_CD', 'CCSR_CATEGORY_CD']
LU_CCSR_final = merged_df.drop(ccsr_helper_columns, axis=1)

50


Unnamed: 0,ICD_DIAG_CD,ICD_CD,CCSR_CATEGORY_CD,CCSR_CATEGORY_DESCRIPTION
0,S134XX,,UNK,UNK
1,Z3480,Z3480,PRG029,"Uncomplicated pregnancy, delivery or puerperium"
2,T7432X,,UNK,UNK
3,S8290X,,UNK,UNK
4,Z3400,Z3400,PRG029,"Uncomplicated pregnancy, delivery or puerperium"
...,...,...,...,...
381,P269,P269,PNL006,Respiratory perinatal condition
382,K8020,K8020,DIG017,Biliary tract disease
383,R042,R042,SYM013,Respiratory signs and symptoms
384,Z20822,Z20822,FAC016,"Exposure, encounters, screening or contact wit..."


In [None]:
# Attach CCSR_CATEGORY_DESCRIPTION to each diagnosis file by joining on ICD_DIAG_CD.
#
# A CCSR_CATEGORY_DESCRIPTION column left over from an earlier run of this cell is dropped
# before merging. Without that, re-running the cell would produce _x/_y suffixed duplicates
# and the null check below would fail with a KeyError.
inpatient_diagnosis, outpatient_diagnosis, professional_diagnosis = (
  frame.drop(columns='CCSR_CATEGORY_DESCRIPTION', errors='ignore')
       .merge(LU_CCSR_final, on='ICD_DIAG_CD', how='left')
  for frame in (inpatient_diagnosis, outpatient_diagnosis, professional_diagnosis)
)

# Rows left without a CCSR description, per file. All three counts are shown together:
# a notebook cell only displays its last expression, so separate checks for the first two
# files would be computed and silently discarded.
pd.Series(
  {
    'inpatient': inpatient_diagnosis['CCSR_CATEGORY_DESCRIPTION'].isnull().sum(),
    'outpatient': outpatient_diagnosis['CCSR_CATEGORY_DESCRIPTION'].isnull().sum(),
    'professional': professional_diagnosis['CCSR_CATEGORY_DESCRIPTION'].isnull().sum(),
  },
  name='rows_missing_CCSR_CATEGORY_DESCRIPTION',
)

np.int64(0)

In [None]:
# drop ICD_DIAG_CD and ICD_Description, then drop duplicates
inpatient_categories = inpatient_diagnosis.drop(columns=['ICD_DIAG_CD', 'ICD_Description']).drop_duplicates()
outpatient_categories = outpatient_diagnosis.drop(columns=['ICD_DIAG_CD', 'ICD_Description']).drop_duplicates()
professional_categories = professional_diagnosis.drop(columns=['ICD_DIAG_CD']).drop_duplicates()

In [None]:
# create flags to indicate which file categorization/regrouping comes from (useful in subsequent merger)
# Tag each category frame with a 'Y' flag naming its source file, so the origin of every
# row can still be told apart after the three frames are merged.
for categories, flag_column in (
  (inpatient_categories, 'IP_FLAG'),
  (outpatient_categories, 'OP_FLAG'),
  (professional_categories, 'PR_FLAG'),
):
  categories[flag_column] = 'Y'

In [None]:
# Outer-join the three category frames on beneficiary, year, truncated code and CCSR
# description, so a combination seen in any one file is kept.
merge_keys = ['BENE_ID', 'YR', 'ICD_CD_TRUNC', 'CCSR_CATEGORY_DESCRIPTION']
master_categories = (
  inpatient_categories
  .merge(outpatient_categories, on=merge_keys, how='outer')
  .merge(professional_categories, on=merge_keys, how='outer')
)

# A flag left empty by the outer join means the row did not come from that file: mark it 'N'.
flag_columns = ['IP_FLAG', 'OP_FLAG', 'PR_FLAG']
master_categories[flag_columns] = master_categories[flag_columns].fillna('N')

np.int64(0)

In [None]:
# Count rows (non-null BENE_ID values) for every CCSR_CATEGORY_DESCRIPTION / ICD_CD_TRUNC pair.
row_counts = (
  master_categories
  .groupby(['CCSR_CATEGORY_DESCRIPTION', 'ICD_CD_TRUNC'], as_index=False)
  .agg(Row_Count=('BENE_ID', 'count'))
)
row_counts

Unnamed: 0,CCSR_CATEGORY_DESCRIPTION,ICD_CD_TRUNC,Row_Count
0,Abdominal pain and other digestive/abdomen sig...,R13,194
1,Abdominal pain and other digestive/abdomen sig...,R19,130
2,Abnormal findings without diagnosis,R73,3750
3,Abnormal findings without diagnosis,R76,17
4,Abnormal findings without diagnosis,R93,22134
...,...,...,...
242,Urinary tract infections,N30,681
243,Urinary tract infections,N39,527
244,Viral infection,B08,1784
245,Viral infection,B34,3507


Too many unknowns (i.e., `UNK`; $n=32631$).
Most of the 3-character truncated codes behind these unknowns fall between S00 and T88, which mainly correspond to injury or poisoning, among others. Recoding the unknowns in that range (next cell) reduces the number of UNKs.

In [None]:
# Recode still-unknown rows whose truncated code lies in S00-T88 (both bounds included)
# as "Injury or poisoning and others".
in_injury_range = master_categories['ICD_CD_TRUNC'].between('S00', 'T88', inclusive="both")
is_unknown = master_categories['CCSR_CATEGORY_DESCRIPTION'].eq('UNK')
mask = in_injury_range & is_unknown

# Overwrite the description only where the mask holds; every other row keeps its value.
master_categories['CCSR_CATEGORY_DESCRIPTION'] = (
  master_categories['CCSR_CATEGORY_DESCRIPTION'].mask(mask, 'Injury or poisoning and others')
)

In [None]:
# Re-count rows (non-null BENE_ID values) per CCSR_CATEGORY_DESCRIPTION after the recode.
row_counts = (
  master_categories
  .groupby(['CCSR_CATEGORY_DESCRIPTION'], as_index=False)
  .agg(Row_Count=('BENE_ID', 'count'))
)
row_counts

Unnamed: 0,CCSR_CATEGORY_DESCRIPTION,Row_Count
0,Abdominal pain and other digestive/abdomen sig...,324
1,Abnormal findings without diagnosis,25901
2,Acquired absence of limb or organ,3595
3,Acute and chronic tonsillitis,2120
4,Acute bronchitis,4270
...,...,...
135,Trauma- and stressor-related disorders,11
136,UNK,631
137,"Uncomplicated pregnancy, delivery or puerperium",2056
138,Urinary tract infections,1632


In [None]:
# Write the merged category table to the Drive output folder as CSV; index=False leaves the
# DataFrame's row index out of the file.
master_categories.to_csv('/content/drive/MyDrive/Data/Output/master_categories.csv', index=False)