In [1]:
import pandas as pd

In [2]:
raw_data_location = "data/raw/"

train_inpatient = pd.read_csv(raw_data_location + "Train_Inpatientdata-1542865627584.csv", dtype={"DiagnosisGroupCode": str})
train_outpatient = pd.read_csv(raw_data_location + "Train_Outpatientdata-1542865627584.csv")
train_beneficiaries = pd.read_csv(raw_data_location + "Train_Beneficiarydata-1542865627584.csv", dtype={"DOD": str})
train_providers = pd.read_csv(raw_data_location + "Train-1542865627584.csv")

unlabeled_inpatient = pd.read_csv(raw_data_location + "Test_Inpatientdata-1542969243754.csv", dtype={"DiagnosisGroupCode": str})
unlabeled_outpatient = pd.read_csv(raw_data_location + "Test_Outpatientdata-1542969243754.csv")
unlabeled_beneficiaries = pd.read_csv(raw_data_location + "Test_Beneficiarydata-1542969243754.csv", dtype={"DOD": str})
unlabeled_providers = pd.read_csv(raw_data_location + "Test-1542969243754.csv")

In [3]:
train_beneficiaries.head()

Unnamed: 0,BeneID,DOB,DOD,Gender,Race,RenalDiseaseIndicator,State,County,NoOfMonths_PartACov,NoOfMonths_PartBCov,...,ChronicCond_Depression,ChronicCond_Diabetes,ChronicCond_IschemicHeart,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,IPAnnualReimbursementAmt,IPAnnualDeductibleAmt,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt
0,BENE11001,1943-01-01,,1,1,0,39,230,12,12,...,1,1,1,2,1,1,36000,3204,60,70
1,BENE11002,1936-09-01,,2,1,0,39,280,12,12,...,2,2,2,2,2,2,0,0,30,50
2,BENE11003,1936-08-01,,1,1,0,52,590,12,12,...,2,2,1,2,2,2,0,0,90,40
3,BENE11004,1922-07-01,,1,1,0,39,270,12,12,...,2,1,1,1,1,2,0,0,1810,760
4,BENE11005,1935-09-01,,1,1,0,24,680,12,12,...,2,1,2,2,2,2,0,0,1790,1200


In [4]:
date_columns = ["ClaimStartDt", "ClaimEndDt", "AdmissionDt", "DischargeDt", "DOB", "DOD"]
for col in date_columns:
    for df in [train_inpatient, train_outpatient, train_beneficiaries, unlabeled_inpatient, unlabeled_outpatient, unlabeled_beneficiaries]:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors="coerce")

In [5]:
train_beneficiaries.head()

Unnamed: 0,BeneID,DOB,DOD,Gender,Race,RenalDiseaseIndicator,State,County,NoOfMonths_PartACov,NoOfMonths_PartBCov,...,ChronicCond_Depression,ChronicCond_Diabetes,ChronicCond_IschemicHeart,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,IPAnnualReimbursementAmt,IPAnnualDeductibleAmt,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt
0,BENE11001,1943-01-01,NaT,1,1,0,39,230,12,12,...,1,1,1,2,1,1,36000,3204,60,70
1,BENE11002,1936-09-01,NaT,2,1,0,39,280,12,12,...,2,2,2,2,2,2,0,0,30,50
2,BENE11003,1936-08-01,NaT,1,1,0,52,590,12,12,...,2,2,1,2,2,2,0,0,90,40
3,BENE11004,1922-07-01,NaT,1,1,0,39,270,12,12,...,2,1,1,1,1,2,0,0,1810,760
4,BENE11005,1935-09-01,NaT,1,1,0,24,680,12,12,...,2,1,2,2,2,2,0,0,1790,1200


In [6]:
from datetime import datetime

current_year = datetime.now().year
train_beneficiaries["Age"] = current_year - train_beneficiaries["DOB"].dt.year
unlabeled_beneficiaries["Age"] = current_year - unlabeled_beneficiaries["DOB"].dt.year

chronic_cols = [col for col in train_beneficiaries.columns if "ChronicCond" in col]
train_beneficiaries["ChronicCount"] = train_beneficiaries[chronic_cols].apply(lambda x: sum(x == 1), axis=1)
unlabeled_beneficiaries["ChronicCount"] = unlabeled_beneficiaries[chronic_cols].apply(lambda x: sum(x == 1), axis=1)

train_inpatient["TotalClaims"] = train_inpatient.groupby("Provider")["ClaimID"].transform("count")
train_outpatient["TotalClaims"] = train_outpatient.groupby("Provider")["ClaimID"].transform("count")

In [14]:
train_data = train_inpatient.merge(train_beneficiaries, on="BeneID", how="left")
train_data = train_data.merge(train_providers, on="Provider", how="left")

train_data_outpatient = train_outpatient.merge(train_beneficiaries, on="BeneID", how="left")
train_data_outpatient = train_data_outpatient.merge(train_providers, on="Provider", how="left")

train_data["ClaimType"] = "Inpatient"
train_data_outpatient["ClaimType"] = "Outpatient"

train_data = pd.concat([train_data, train_data_outpatient], axis=0)

print(train_data["ClaimType"].value_counts())
print(train_data["ClaimStartDt"].notna().sum())
print(train_data["AdmissionDt"].notna().sum())

ClaimType
Outpatient    517737
Inpatient      40474
Name: count, dtype: int64
558211
40474


In [17]:
print(train_data["AdmissionDt"].isna().sum())
print(train_data["AdmissionDt"].notna().sum())
print(train_data["AdmissionDt"].dtype)
print(train_data["AdmissionDt"].head())

517737
40474
datetime64[ns]
0   2009-04-12
1   2009-08-31
2   2009-09-17
3   2009-02-14
4   2009-08-13
Name: AdmissionDt, dtype: datetime64[ns]


In [16]:
unlabeled_data = unlabeled_inpatient.merge(unlabeled_beneficiaries, on="BeneID", how="left")
unlabeled_data = unlabeled_data.merge(unlabeled_providers, on="Provider", how="left")

unlabeled_data_outpatient = unlabeled_outpatient.merge(unlabeled_beneficiaries, on="BeneID", how="left")
unlabeled_data_outpatient = unlabeled_data_outpatient.merge(unlabeled_providers, on="Provider", how="left")

unlabeled_data["ClaimType"] = "Inpatient"
unlabeled_data_outpatient["ClaimType"] = "Outpatient"

unlabeled_data = pd.concat([unlabeled_data, unlabeled_data_outpatient], axis=0)

print(unlabeled_data["ClaimType"].value_counts())
print(unlabeled_data["ClaimStartDt"].notna().sum())
print(unlabeled_data["AdmissionDt"].notna().sum())

ClaimType
Outpatient    125841
Inpatient       9551
Name: count, dtype: int64
135392
9551


In [9]:
import os

os.makedirs("data", exist_ok=True)

train_data.to_csv("data/train_data.csv", index=False)
unlabeled_data.to_csv("data/unlabeled_data.csv", index=False)

In [10]:
train_data.head()

Unnamed: 0,BeneID,ClaimID,ClaimStartDt,ClaimEndDt,Provider,InscClaimAmtReimbursed,AttendingPhysician,OperatingPhysician,OtherPhysician,AdmissionDt,...,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,IPAnnualReimbursementAmt,IPAnnualDeductibleAmt,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt,Age,ChronicCount,PotentialFraud,ClaimType
0,BENE11001,CLM46614,2009-04-12,2009-04-18,PRV55912,26000,PHY390922,,,2009-04-12,...,1,1,36000,3204,60,70,82,7,Yes,Inpatient
1,BENE11001,CLM66048,2009-08-31,2009-09-02,PRV55907,5000,PHY318495,PHY318495,,2009-08-31,...,1,1,36000,3204,60,70,82,7,No,Inpatient
2,BENE11001,CLM68358,2009-09-17,2009-09-20,PRV56046,5000,PHY372395,,PHY324689,2009-09-17,...,1,1,36000,3204,60,70,82,7,No,Inpatient
3,BENE11011,CLM38412,2009-02-14,2009-02-22,PRV52405,5000,PHY369659,PHY392961,PHY349768,2009-02-14,...,1,1,5000,1068,250,320,111,6,No,Inpatient
4,BENE11014,CLM63689,2009-08-13,2009-08-30,PRV56614,10000,PHY379376,PHY398258,,2009-08-13,...,2,2,21260,2136,120,100,87,5,No,Inpatient


In [11]:
unlabeled_data.head()

Unnamed: 0,BeneID,ClaimID,ClaimStartDt,ClaimEndDt,Provider,InscClaimAmtReimbursed,AttendingPhysician,OperatingPhysician,OtherPhysician,AdmissionDt,...,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,IPAnnualReimbursementAmt,IPAnnualDeductibleAmt,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt,Age,ChronicCount,ClaimType
0,BENE11014,CLM67387,2009-09-09,2009-09-16,PRV57070,9000,PHY317786,PHY427017,,2009-09-09,...,2,2,2,21260,2136,120,100,87,5,Inpatient
1,BENE11017,CLM31237,2008-12-25,2009-01-08,PRV54750,14000,PHY314656,PHY426644,,2008-12-25,...,2,1,1,22000,2136,1400,840,85,9,Inpatient
2,BENE11026,CLM78930,2009-12-09,2009-12-13,PRV53758,2000,PHY349495,,,2009-12-09,...,2,2,2,2000,1068,0,0,87,1,Inpatient
3,BENE11031,CLM56810,2009-06-23,2009-07-06,PRV55825,16000,PHY429538,PHY371893,,2009-06-23,...,1,2,2,23650,2136,40,0,81,4,Inpatient
4,BENE11085,CLM34625,2009-01-20,2009-01-31,PRV52338,19000,PHY397161,,,2009-01-20,...,1,2,2,19000,1068,1670,520,62,9,Inpatient


In [12]:
train_data.describe()

Unnamed: 0,ClaimStartDt,ClaimEndDt,InscClaimAmtReimbursed,AdmissionDt,DeductibleAmtPaid,DischargeDt,ClmProcedureCode_1,ClmProcedureCode_2,ClmProcedureCode_3,ClmProcedureCode_4,...,ChronicCond_IschemicHeart,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,IPAnnualReimbursementAmt,IPAnnualDeductibleAmt,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt,Age,ChronicCount
count,558211,558211,558211.0,40474,557312.0,40474,23310.0,5490.0,969.0,118.0,...,558211.0,558211.0,558211.0,558211.0,558211.0,558211.0,558211.0,558211.0,558211.0,558211.0
mean,2009-06-24 23:39:21.603766528,2009-06-26 17:07:35.601913600,997.012133,2009-06-19 17:38:12.493946880,78.421085,2009-06-25 09:36:02.988585216,5896.154612,4106.358106,4221.123839,4070.262712,...,1.240735,1.682353,1.688829,1.89828,5227.971466,568.756807,2278.225348,649.698745,89.27682,4.498616
min,2008-11-27 00:00:00,2008-12-28 00:00:00,0.0,2008-11-27 00:00:00,0.0,2009-01-01 00:00:00,11.0,42.0,42.0,42.0,...,1.0,1.0,1.0,1.0,-8000.0,0.0,-70.0,0.0,42.0,0.0
25%,2009-03-27 00:00:00,2009-03-29 00:00:00,40.0,2009-03-20 00:00:00,0.0,2009-03-26 00:00:00,3848.0,2724.0,2724.0,2754.25,...,1.0,1.0,1.0,2.0,0.0,0.0,460.0,120.0,84.0,3.0
50%,2009-06-23 00:00:00,2009-06-24 00:00:00,80.0,2009-06-16 00:00:00,0.0,2009-06-22 00:00:00,5363.0,4019.0,4019.0,4019.0,...,1.0,2.0,2.0,2.0,0.0,0.0,1170.0,340.0,90.0,5.0
75%,2009-09-22 00:00:00,2009-09-23 00:00:00,300.0,2009-09-17 00:00:00,0.0,2009-09-23 00:00:00,8669.0,4439.0,5185.0,4439.0,...,1.0,2.0,2.0,2.0,6000.0,1068.0,2590.0,790.0,98.0,6.0
max,2009-12-31 00:00:00,2009-12-31 00:00:00,125000.0,2009-12-31 00:00:00,1068.0,2009-12-31 00:00:00,9999.0,9999.0,9999.0,9986.0,...,2.0,2.0,2.0,2.0,161470.0,38272.0,102960.0,13840.0,116.0,11.0
std,,,3821.534891,,274.016812,,3050.489933,2031.640878,2281.849885,2037.62699,...,0.42753,0.465562,0.462973,0.302279,11786.274732,1179.172616,3881.846386,1002.020811,13.011761,2.332301


In [13]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 558211 entries, 0 to 517736
Data columns (total 59 columns):
 #   Column                           Non-Null Count   Dtype         
---  ------                           --------------   -----         
 0   BeneID                           558211 non-null  object        
 1   ClaimID                          558211 non-null  object        
 2   ClaimStartDt                     558211 non-null  datetime64[ns]
 3   ClaimEndDt                       558211 non-null  datetime64[ns]
 4   Provider                         558211 non-null  object        
 5   InscClaimAmtReimbursed           558211 non-null  int64         
 6   AttendingPhysician               556703 non-null  object        
 7   OperatingPhysician               114447 non-null  object        
 8   OtherPhysician                   199736 non-null  object        
 9   AdmissionDt                      40474 non-null   datetime64[ns]
 10  ClmAdmitDiagnosisCode            145899 non-null 