In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
dtype_mapping = {
    "DiagnosisGroupCode": str,
    "DOD": str,
    "ClmAdmitDiagnosisCode": str,
    **{f"ClmDiagnosisCode_{i}": str for i in range(1, 11)}
}

date_columns = ["ClaimStartDt", "ClaimEndDt", "DOB", "DOD", "AdmissionDt", "DischargeDt"]

train_data = pd.read_csv("data/train_data.csv", dtype=dtype_mapping, parse_dates=date_columns)
unlabeled_data = pd.read_csv("data/unlabeled_data.csv", dtype=dtype_mapping, parse_dates=date_columns)

date_format = "%Y-%m-%d"

for col in ["AdmissionDt", "DischargeDt", "DOD"]:
    if col in train_data.columns:
        train_data[col] = pd.to_datetime(train_data[col], format=date_format, errors="coerce")
    if col in unlabeled_data.columns:
        unlabeled_data[col] = pd.to_datetime(unlabeled_data[col], format=date_format, errors="coerce")
print("Data loaded successfully.")

Data loaded successfully.


In [3]:
train_data.replace({"nan": pd.NA, "0nan": pd.NA, "NaN": pd.NA, "": pd.NA}, inplace=True)

In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 558211 entries, 0 to 558210
Data columns (total 59 columns):
 #   Column                           Non-Null Count   Dtype         
---  ------                           --------------   -----         
 0   BeneID                           558211 non-null  object        
 1   ClaimID                          558211 non-null  object        
 2   ClaimStartDt                     558211 non-null  datetime64[ns]
 3   ClaimEndDt                       558211 non-null  datetime64[ns]
 4   Provider                         558211 non-null  object        
 5   InscClaimAmtReimbursed           558211 non-null  int64         
 6   AttendingPhysician               556703 non-null  object        
 7   OperatingPhysician               114447 non-null  object        
 8   OtherPhysician                   199736 non-null  object        
 9   AdmissionDt                      40474 non-null   datetime64[ns]
 10  ClmAdmitDiagnosisCode            145899 non-

In [5]:
train_data.head()

Unnamed: 0,BeneID,ClaimID,ClaimStartDt,ClaimEndDt,Provider,InscClaimAmtReimbursed,AttendingPhysician,OperatingPhysician,OtherPhysician,AdmissionDt,...,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,IPAnnualReimbursementAmt,IPAnnualDeductibleAmt,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt,Age,ChronicCount,PotentialFraud,ClaimType
0,BENE11001,CLM46614,2009-04-12,2009-04-18,PRV55912,26000,PHY390922,,,2009-04-12,...,1,1,36000,3204,60,70,82,7,Yes,Inpatient
1,BENE11001,CLM66048,2009-08-31,2009-09-02,PRV55907,5000,PHY318495,PHY318495,,2009-08-31,...,1,1,36000,3204,60,70,82,7,No,Inpatient
2,BENE11001,CLM68358,2009-09-17,2009-09-20,PRV56046,5000,PHY372395,,PHY324689,2009-09-17,...,1,1,36000,3204,60,70,82,7,No,Inpatient
3,BENE11011,CLM38412,2009-02-14,2009-02-22,PRV52405,5000,PHY369659,PHY392961,PHY349768,2009-02-14,...,1,1,5000,1068,250,320,111,6,No,Inpatient
4,BENE11014,CLM63689,2009-08-13,2009-08-30,PRV56614,10000,PHY379376,PHY398258,,2009-08-13,...,2,2,21260,2136,120,100,87,5,No,Inpatient


In [6]:
def map_category(code, category_dict):
    """Maps an ICD-9 code to a category based on provided ranges."""
    if pd.isna(code):
        return "Unknown"

    code = code.strip().upper()

    if code.startswith("V"):
        return "V-Codes"
    if code.startswith("E"):
        return "E-Codes"

    try:
        category_code = int(code.split(".")[0])
        for (low, high), category in category_dict.items():
            if low <= category_code <= high:
                return category
    except ValueError:
        return "Unknown"

    return "Unknown"

In [7]:
diagnosis_category_map = {
    (1, 139): "Infectious & Parasitic Diseases",
    (140, 239): "Neoplasms",
    (240, 279): "Endocrine, Nutritional, and Metabolic Diseases",
    (280, 289): "Diseases of the Blood",
    (290, 319): "Mental Disorders",
    (320, 389): "Diseases of the Nervous System and Sense Organs",
    (390, 459): "Diseases of the Circulatory System",
    (460, 519): "Diseases of the Respiratory System",
    (520, 579): "Diseases of the Digestive System",
    (580, 629): "Diseases of the Genitourinary System",
    (630, 679): "Pregnancy, Childbirth, and the Puerperium",
    (680, 709): "Diseases of the Skin and Subcutaneous Tissue",
    (710, 739): "Diseases of the Musculoskeletal System",
    (740, 759): "Congenital Anomalies",
    (780, 799): "Symptoms, Signs, and Ill-Defined Conditions",
    (800, 999): "Injury and Poisoning",
}

diagnosis_columns = [f"ClmDiagnosisCode_{i}" for i in range(1, 11)]
for col in diagnosis_columns:
    train_data[col] = train_data[col].astype(str).str.strip().str.upper()
    train_data[f"{col}_Category"] = train_data[col].apply(lambda x: map_category(x, diagnosis_category_map))

In [8]:
procedure_category_map = {
    (0, 0): "Miscellaneous Diagnostic & Therapeutic Procedures",
    (1, 5): "Procedures on the Nervous System",
    (6, 7): "Procedures on the Endocrine System",
    (8, 16): "Procedures on the Eye & Ear",
    (17, 20): "Operations on the Cardiovascular System",
    (21, 29): "Operations on the Respiratory System",
    (30, 34): "Operations on the Digestive System",
    (35, 39): "Cardiovascular Procedures",
    (40, 41): "Procedures on the Lymphatic & Hemic System",
    (42, 54): "Procedures on the Digestive System",
    (55, 59): "Procedures on the Urinary System",
    (60, 64): "Procedures on the Male Genital Organs",
    (65, 71): "Procedures on the Female Genital Organs",
    (72, 75): "Obstetric & Gynecological Procedures",
    (76, 84): "Orthopedic Procedures",
    (85, 86): "Operations on the Breast and Skin",
    (87, 99): "Radiology, Physical Therapy, and Other Miscellaneous Procedures",
}

procedure_columns = [f"ClmProcedureCode_{i}" for i in range(1, 7)]
for col in procedure_columns:
    train_data[col] = train_data[col].astype(str).str.zfill(4)
    train_data[f"{col}_Category"] = train_data[col].apply(lambda x: map_category(x, procedure_category_map))

In [9]:
train_data["NumProcedures"] = train_data[procedure_columns].notna().sum(axis=1)

In [10]:
proc_desc = pd.read_excel("medical_codes/CMS32_DESC_LONG_SHORT_SG.xlsx", dtype=str)
proc_desc.rename(columns={"PROCEDURE CODE": "ProcedureCode"}, inplace=True)
proc_desc["ProcedureCode"] = proc_desc["ProcedureCode"].astype(str).str.zfill(4)

train_data = train_data.merge(proc_desc, left_on="ClmProcedureCode_1", right_on="ProcedureCode", how="left")
train_data.drop(columns=["ProcedureCode"], inplace=True, errors="ignore")

print("Merged procedure descriptions.")

Merged procedure descriptions.


In [11]:
dx_desc = pd.read_excel("medical_codes/CMS32_DESC_LONG_SHORT_DX.xlsx", dtype=str)
dx_desc.rename(columns={"DIAGNOSIS CODE": "DiagnosisCode"}, inplace=True)
dx_desc["DiagnosisCode"] = dx_desc["DiagnosisCode"].astype(str).str.zfill(4)

for col in diagnosis_columns:
    train_data = train_data.merge(dx_desc, left_on=col, right_on="DiagnosisCode", how="left", suffixes=("", f"_{col}"))

print("Merged diagnosis descriptions.")

Merged diagnosis descriptions.


In [12]:
columns_to_drop = [
    "DiagnosisGroupCode",
    "PrimaryProcedure",
    *diagnosis_columns,
    *procedure_columns,
    *[f"DiagnosisCode_{col}" for col in diagnosis_columns]
]
train_data.drop(columns=columns_to_drop, inplace=True, errors="ignore")

In [13]:
for i in range(1, 11):
    train_data.rename(columns={
        f"LONG DESCRIPTION_ClmDiagnosisCode_{i}": f"ClmDiagnosisCode_{i}_LongDesc",
        f"SHORT DESCRIPTION_ClmDiagnosisCode_{i}": f"ClmDiagnosisCode_{i}_ShortDesc"
    }, inplace=True)

In [14]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 558683 entries, 0 to 558682
Data columns (total 82 columns):
 #   Column                           Non-Null Count   Dtype         
---  ------                           --------------   -----         
 0   BeneID                           558683 non-null  object        
 1   ClaimID                          558683 non-null  object        
 2   ClaimStartDt                     558683 non-null  datetime64[ns]
 3   ClaimEndDt                       558683 non-null  datetime64[ns]
 4   Provider                         558683 non-null  object        
 5   InscClaimAmtReimbursed           558683 non-null  int64         
 6   AttendingPhysician               557174 non-null  object        
 7   OperatingPhysician               114543 non-null  object        
 8   OtherPhysician                   199904 non-null  object        
 9   AdmissionDt                      40498 non-null   datetime64[ns]
 10  ClmAdmitDiagnosisCode            146000 non-

In [15]:
train_data.head()

Unnamed: 0,BeneID,ClaimID,ClaimStartDt,ClaimEndDt,Provider,InscClaimAmtReimbursed,AttendingPhysician,OperatingPhysician,OtherPhysician,AdmissionDt,...,ClmDiagnosisCode_6_LongDesc,ClmDiagnosisCode_6_ShortDesc,ClmDiagnosisCode_7_LongDesc,ClmDiagnosisCode_7_ShortDesc,ClmDiagnosisCode_8_LongDesc,ClmDiagnosisCode_8_ShortDesc,ClmDiagnosisCode_9_LongDesc,ClmDiagnosisCode_9_ShortDesc,ClmDiagnosisCode_10_LongDesc,ClmDiagnosisCode_10_ShortDesc
0,BENE11001,CLM46614,2009-04-12,2009-04-18,PRV55912,26000,PHY390922,,,2009-04-12,...,"Osteoarthrosis, unspecified whether generalize...",Osteoarthros NOS-unspec,Other and unspecified hyperlipidemia,Hyperlipidemia NEC/NOS,Secondary malignant neoplasm of other specifie...,Secondary malig neo NEC,"Acute kidney failure, unspecified",Acute kidney failure NOS,,
1,BENE11001,CLM66048,2009-08-31,2009-09-02,PRV55907,5000,PHY318495,PHY318495,,2009-08-31,...,,,,,,,,,,
2,BENE11001,CLM68358,2009-09-17,2009-09-20,PRV56046,5000,PHY372395,,PHY324689,2009-09-17,...,Obstructive sleep apnea (adult)(pediatric),Obstructive sleep apnea,,,,,,,,
3,BENE11011,CLM38412,2009-02-14,2009-02-22,PRV52405,5000,PHY369659,PHY392961,PHY349768,2009-02-14,...,Other primary cardiomyopathies,Prim cardiomyopathy NEC,"Diabetes with neurological manifestations, typ...",DMII neuro uncntrld,"Hypertensive chronic kidney disease, unspecifi...",Hy kid NOS w cr kid I-IV,Unspecified essential hypertension,Hypertension NOS,,
4,BENE11014,CLM63689,2009-08-13,2009-08-30,PRV56614,10000,PHY379376,PHY398258,,2009-08-13,...,,,Unspecified pleural effusion,Pleural effusion NOS,"Major depressive affective disorder, single ep...",Depress psychosis-unspec,"Multiple myeloma, without mention of having ac...",Mult mye w/o achv rmson,,


In [16]:
train_data.to_csv("data/train_data_cleaned.csv", index=False)