In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

# Importing Data

In [2]:
partd_2018 = pd.read_csv('/Volumes/ML_projects/Medicare_Fraud_Datasets/PARTD/PartD_Prescriber_PUF_NPI_Drug_18.txt', sep='\t')
partb_2018 = pd.read_csv('/Volumes/ML_projects/Medicare_Fraud_Datasets/PARTB/Medicare_Provider_Util_Payment_PUF_CY2018/Medicare_Provider_Util_Payment_PUF_CY2018.txt', sep='\t')
dmepos_2018 = pd.read_csv('/Volumes/ML_projects/Medicare_Fraud_Datasets/DMEPOS/Medicare_Referring_Provider_DMEPOS_PUF_CY2018/Medicare_Referring_Provider_DMEPOS_PUF_CY2018.csv')
partd_2017 = pd.read_csv('/Volumes/ML_projects/Medicare_Fraud_Datasets/PARTD/PartD_Prescriber_PUF_NPI_DRUG_17/PartD_Prescriber_PUF_NPI_Drug_17.txt', sep='\t')
partb_2017 = pd.read_csv('/Volumes/ML_projects/Medicare_Fraud_Datasets/PARTB/Medicare_Provider_Util_Payment_PUF_CY2017/Medicare_Provider_Util_Payment_PUF_CY2017.txt', sep='\t')
dmepos_2017 = pd.read_csv('/Volumes/ML_projects/Medicare_Fraud_Datasets/DMEPOS/medicare_referring_provider_dmepos_puf_cy2017/Medicare_Referring_Provider_DMEPOS_PUF_CY2017.tab', sep='\t')
partd_2016 = pd.read_csv('/Volumes/ML_projects/Medicare_Fraud_Datasets/PARTD/PartD_Prescriber_PUF_NPI_DRUG_16/PartD_Prescriber_PUF_NPI_Drug_16.txt', sep='\t')
partb_2016 = pd.read_csv('/Volumes/ML_projects/Medicare_Fraud_Datasets/PARTB/Medicare-Physician-and-Other-Supplier-PUF/Medicare_Provider_Util_Payment_PUF_CY2016.txt', sep='\t')
dmepos_2016 = pd.read_csv('/Volumes/ML_projects/Medicare_Fraud_Datasets/DMEPOS/medicare_referring_provider_dmepos_puf_cy2016/Medicare_Referring_Provider_DMEPOS_PUF_CY2016.tab', sep='\t')
leie = pd.read_csv('/Volumes/ML_projects/Medicare_Fraud_Datasets/LEIE.csv')

# Selecting features and targets.

Filtering out specific exclusion codes for more severe offenses. These will be used to create `FRAUD` and `NOT FRAUD` targets.

In [3]:
#mandatory minimum penalty based on offense category

minimum_exclusion_periods = { 
    '1128a1':5, #Conviction of program-related crimes. Minimum Period: 5 years
    '1128a2':5, #Conviction relating to patient abuse or neglect. Minimum Period: 5 years
    '1128a3':5, #Felony conviction relating to health care fraud. Minimum Period: 5 years
    '1128b4':np.nan, #License revocation, suspension, or surrender. Minimum Period: Period imposed by the state licensing authority.    
    '1128b7':np.nan, #Fraud, kickbacks, and other prohibited activities. Minimum Period: None
    '1128c3gi':10, #Conviction of second mandatory exclusion offense. Minimum Period: 10 years
    '1128c3gii':100 #Conviction of third or more mandatory exclusion offenses. Permanent Exclusion
}

In [4]:
exclusion_codes = ['1128a1', '1128a2', '1128a3', '1128b4', '1128b7', '1128c3gi', '1128c3gii']
filtered_leie = leie[leie.EXCLTYPE.isin(exclusion_codes)]
filtered_leie.head(5)

Unnamed: 0,LASTNAME,FIRSTNAME,MIDNAME,BUSNAME,GENERAL,SPECIALTY,UPIN,NPI,DOB,ADDRESS,CITY,STATE,ZIP,EXCLTYPE,EXCLDATE,REINDATE,WAIVERDATE,WVRSTATE
0,,,,"#1 MARKETING SERVICE, INC",OTHER BUSINESS,SOBER HOME,,0,,239 BRIGHTON BEACH AVENUE,BROOKLYN,NY,11235,1128a1,20200319,0,0,
1,,,,14 LAWRENCE AVE PHARMACY,PHARMACY,,,0,,14 LAWRENCE AVENUE,SMITHTOWN,NY,11787,1128a1,19880830,0,0,
2,,,,143 MEDICAL EQUIPMENT CO,DME COMPANY,DME - OXYGEN,,0,,701 NW 36 AVENUE,MIAMI,FL,33125,1128b7,19970620,0,0,
3,,,,184TH STREET PHARMACY CORP,OTHER BUSINESS,PHARMACY,,1922348218,,69 E 184TH ST,BRONX,NY,10468,1128a1,20180419,0,0,
5,,,,"1ST COMMUNITY HEALTH CTR, LTD",CLINIC,,,0,,3138 W CERMAK ROAD,CHICAGO,IL,60623,1128a1,19940524,0,0,


Converting the exclusion date into a datetime.

In [5]:
filtered_leie['EXCLDATE'] = pd.to_datetime(filtered_leie.EXCLDATE, format='%Y%m%d')

As we can see, there are only 11 dates for the `WAIVERDATE` column and only 1 for the `REINDATE` column, so to infer the exclusion end dates we can add the `minimum_exclusion_periods` value to the exclusion date (`EXCLDATE`) column.

In [6]:
print("Unique `WAIVERDATE` values: ", len(filtered_leie['WAIVERDATE'].unique()))
print("Unique `REINDATE` values: ", len(filtered_leie['REINDATE'].unique()))

Unique `WAIVERDATE` values:  11
Unique `REINDATE` values:  1


In [7]:
#mapping the penalty to the minimum exclusions period, placing minimum exclusions on their own col

filtered_leie['MIN_EXCLUSION_PERIOD'] = filtered_leie['EXCLTYPE'].map(minimum_exclusion_periods)

Now we have to add the minimum exclusion periods to the original exclusion dates. This will give us an estimate of the exclusion end date. 

In [8]:
filtered_leie['END_EXCLDATE'] = filtered_leie.EXCLDATE.dt.year + filtered_leie.MIN_EXCLUSION_PERIOD

The `calculate_exclusion_end` function takes in the beginning exclusion date, then if the month is greater that `6` , adds a year to the end year. This is because the provider would have been fraudulant for most of the year.

In [9]:
def calculate_exclusion_end(exclusion_date, end_year):
    month = exclusion_date.month
    
    if month > 6:
        return end_year + 1
    else:
        return end_year
        

In [10]:
#applying the function to both columns

filtered_leie['END_EXCLDATE'] = filtered_leie[["EXCLDATE", "END_EXCLDATE"]].apply(lambda x: calculate_exclusion_end(*x), axis=1)

In [11]:
filtered_leie.head(2)

Unnamed: 0,LASTNAME,FIRSTNAME,MIDNAME,BUSNAME,GENERAL,SPECIALTY,UPIN,NPI,DOB,ADDRESS,CITY,STATE,ZIP,EXCLTYPE,EXCLDATE,REINDATE,WAIVERDATE,WVRSTATE,MIN_EXCLUSION_PERIOD,END_EXCLDATE
0,,,,"#1 MARKETING SERVICE, INC",OTHER BUSINESS,SOBER HOME,,0,,239 BRIGHTON BEACH AVENUE,BROOKLYN,NY,11235,1128a1,2020-03-19,0,0,,5.0,2025.0
1,,,,14 LAWRENCE AVE PHARMACY,PHARMACY,,,0,,14 LAWRENCE AVENUE,SMITHTOWN,NY,11787,1128a1,1988-08-30,0,0,,5.0,1994.0


Filtering out features from the three datasets.

In [12]:
partb_feats = ['npi', 'hcpcs_code', 'hcpcs_description', 'hcpcs_drug_indicator', 'provider_type', 'nppes_provider_gender', 'line_srvc_cnt', 'bene_unique_cnt', 'bene_day_srvc_cnt', 'average_submitted_chrg_amt', 'average_Medicare_payment_amt']
partd_feats = ['npi', 'specialty_description', 'bene_count', 'total_claim_count', 'total_30_day_fill_count', 'total_day_supply', 'total_drug_cost']
dmepos_feats = ['REFERRING_NPI', 'REFERRING_PROVIDER_TYPE', 'REFERRING_PROVIDER_GENDER', 'NUMBER_OF_SUPPLIERS', 'NUMBER_OF_SUPPLIER_BENEFICIARIES', 'NUMBER_OF_SUPPLIER_CLAIMS', 'NUMBER_OF_SUPPLIER_SERVICES', 'AVG_SUPPLIER_SUBMITTED_CHARGE', 'AVG_SUPPLIER_MEDICARE_PMT_AMT']

partb_features_2018 = partb_2018[partb_feats]
partd_features_2018 = partd_2018[partd_feats]
dmepos_features_2018 = dmepos_2018[dmepos_feats]

#Add a year column so we know where to add fraud labels. Activity before exclusion dates will be considered fraudulent. 
partb_features_2018['YEAR'] = 2018
partd_features_2018['YEAR'] = 2018
dmepos_features_2018['YEAR'] = 2018

partb_features_2017 = partb_2017[partb_feats]
partd_features_2017 = partd_2017[partd_feats]
dmepos_features_2017 = dmepos_2017[dmepos_feats]

partb_features_2017['YEAR'] = 2017
partd_features_2017['YEAR'] = 2017
dmepos_features_2017['YEAR'] = 2017

partb_features_2016 = partb_2016[[x.upper() for x in partb_feats]]
partd_features_2016 = partd_2016[partd_feats]
dmepos_features_2016 = dmepos_2016[dmepos_feats]

partb_features_2016['YEAR'] = 2016
partd_features_2016['YEAR'] = 2016
dmepos_features_2016['YEAR'] = 2016

In [13]:
#make 2016 headers lowercase like the other years. 

partb_features_2016.columns = [x.lower() for x in partb_features_2016.columns]

# Concatenate all the yearly data.

In [14]:
# make NPI lowercase so we can merge on it later. 

filtered_leie.rename({'NPI':'npi'}, axis=1, inplace=True)
dmepos_features_2018.rename({'REFERRING_NPI':'npi'}, axis=1, inplace=True)
dmepos_features_2017.rename({'REFERRING_NPI':'npi'}, axis=1, inplace=True)
dmepos_features_2016.rename({'REFERRING_NPI':'npi'}, axis=1, inplace=True)

In [15]:
partb_features = pd.concat([partb_features_2018, partb_features_2017, partb_features_2016])
partd_features = pd.concat([partd_features_2018, partd_features_2017, partd_features_2016])
dmepos_features = pd.concat([dmepos_features_2018, dmepos_features_2017, dmepos_features_2016])

In [16]:
partb_features.iloc[2]

YEAR                                                                       2018
average_Medicare_payment_amt                                            76.7953
average_medicare_payment_amt                                                NaN
average_submitted_chrg_amt                                              476.947
bene_day_srvc_cnt                                                            19
bene_unique_cnt                                                              19
hcpcs_code                                                                99218
hcpcs_description               Hospital observation care, typically 30 minutes
hcpcs_drug_indicator                                                          N
line_srvc_cnt                                                                19
npi                                                                  1003000126
nppes_provider_gender                                                         M
provider_type                           

# Merge all the combined yearly data on NPI

In [17]:
partb = pd.merge(filtered_leie, partb_features, on='npi', how='outer')
partd = pd.merge(filtered_leie, partd_features, on='npi', how='outer')
dmepos = pd.merge(filtered_leie, dmepos_features, on='npi', how='outer')

# Part B dataset processing.

Filter out HCPCS codes referring to prescriptions, the `line_srvc_cnt` feature for these entries refer to weight/volume of a drug and not procedure counts. Including both in the same dataset would cause conflicts. 

In [18]:
partb = partb[partb.hcpcs_drug_indicator.eq('N')]

In [19]:
partb.head(5).iloc[1]

LASTNAME                                                                      NaN
FIRSTNAME                                                                     NaN
MIDNAME                                                                          
BUSNAME                                                   ARROW-MED AMBULANCE INC
GENERAL                                                            OTHER BUSINESS
SPECIALTY                                                       AMBULANCE COMPANY
UPIN                                                                          NaN
npi                                                                    1437418506
DOB                                                                           NaN
ADDRESS                                                     2470 HIGHWAY 15 NORTH
CITY                                                                      JACKSON
STATE                                                                          KY
ZIP             

In [20]:
partb = partb.groupby(['npi','provider_type', 'nppes_provider_gender', 'year']).agg({'line_srvc_cnt':['sum', 'mean', 'median', np.std, 'min', 'max'],
                                                                                     'bene_unique_cnt':['sum', 'mean', 'median', np.std, 'min', 'max'],
                                                                                     'bene_day_srvc_cnt':['sum', 'mean', 'median', np.std, 'min', 'max'],
                                                                                     'average_submitted_chrg_amt':['sum', 'mean', 'median', np.std, 'min', 'max'],
                                                                                     'average_medicare_payment_amt': ['sum', 'mean', 'median', np.std, 'min', 'max']})

Flatten the multiindex to make it easier to work with.

In [21]:
partb.columns = ['_'.join(col) for col in partb.columns.values]

Physicians who only performed and operation or perscribed one type of drug or procedure show as `null` values, we can replace these values with 0 because there is no variability. 

In [22]:
partb.isna().sum()

line_srvc_cnt_sum                           0
line_srvc_cnt_mean                          0
line_srvc_cnt_median                        0
line_srvc_cnt_std                      101102
line_srvc_cnt_min                           0
line_srvc_cnt_max                           0
bene_unique_cnt_sum                         0
bene_unique_cnt_mean                        0
bene_unique_cnt_median                      0
bene_unique_cnt_std                    101102
bene_unique_cnt_min                         0
bene_unique_cnt_max                         0
bene_day_srvc_cnt_sum                       0
bene_day_srvc_cnt_mean                      0
bene_day_srvc_cnt_median                    0
bene_day_srvc_cnt_std                  101102
bene_day_srvc_cnt_min                       0
bene_day_srvc_cnt_max                       0
average_submitted_chrg_amt_sum              0
average_submitted_chrg_amt_mean             0
average_submitted_chrg_amt_median           0
average_submitted_chrg_amt_std    

In [23]:
#replacing NA with 0

partb.fillna(0, inplace=True)

In [67]:
partb = partb.reset_index(level=['npi', 'provider_type', 'nppes_provider_gender', 'year'])

# Part D dataset processing.

In [25]:
partd.columns

Index(['LASTNAME', 'FIRSTNAME', 'MIDNAME', 'BUSNAME', 'GENERAL', 'SPECIALTY',
       'UPIN', 'npi', 'DOB', 'ADDRESS', 'CITY', 'STATE', 'ZIP', 'EXCLTYPE',
       'EXCLDATE', 'REINDATE', 'WAIVERDATE', 'WVRSTATE',
       'MIN_EXCLUSION_PERIOD', 'END_EXCLDATE', 'specialty_description',
       'bene_count', 'total_claim_count', 'total_30_day_fill_count',
       'total_day_supply', 'total_drug_cost', 'YEAR'],
      dtype='object')

In [26]:
partd.iloc[500]

LASTNAME                                              NaN
FIRSTNAME                                             NaN
MIDNAME                                                  
BUSNAME                    HEMATOLOGY AND ONCOLOGY CENTER
GENERAL                              PHYSICIAN PRACTICE (
SPECIALTY                                      HEMATOLOGY
UPIN                                                  NaN
npi                                                     0
DOB                                                   NaN
ADDRESS                         401 BOGLE STREET, STE 101
CITY                                             SOMERSET
STATE                                                  KY
ZIP                                                 42503
EXCLTYPE                                           1128b7
EXCLDATE                              2014-01-03 00:00:00
REINDATE                                                0
WAIVERDATE                                              0
WVRSTATE      

Beneficiary counts were supressed to 0 if they were below 11, so we replace 0's with 5's as per CMS documentation.

In [31]:
partd.bene_count = partd.bene_count.replace(0,5)

Aggregate over numerical columns.

In [27]:
partd = partd.groupby(['npi','SPECIALTY', 'YEAR']).agg({'bene_count':['sum', 'mean', 'median', np.std, 'min', 'max'],
                                                     'total_claim_count':['sum', 'mean', 'median', np.std, 'min', 'max'],
                                                     'total_30_day_fill_count':['sum', 'mean', 'median', np.std, 'min', 'max'],
                                                     'total_day_supply':['sum', 'mean', 'median', np.std, 'min', 'max'],
                                                     'total_drug_cost': ['sum', 'mean', 'median', np.std, 'min', 'max']})

In [51]:
partd

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,bene_count_sum,bene_count_mean,bene_count_median,bene_count_std,bene_count_min,bene_count_max,total_claim_count_sum,total_claim_count_mean,total_claim_count_median,total_claim_count_std,...,total_day_supply_median,total_day_supply_std,total_day_supply_min,total_day_supply_max,total_drug_cost_sum,total_drug_cost_mean,total_drug_cost_median,total_drug_cost_std,total_drug_cost_min,total_drug_cost_max
npi,SPECIALTY,YEAR,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1003000407,FAMILY PRACTICE,2016.0,312.0,18.352941,14.0,11.135199,11.0,53.0,1741.0,32.240741,23.0,27.136229,...,444.0,514.103957,38.0,2213.0,86421.07,1600.390185,269.280,2957.850733,51.08,15281.17
1003000407,FAMILY PRACTICE,2017.0,218.0,15.571429,14.0,3.631365,11.0,21.0,2286.0,37.475410,25.0,38.698668,...,489.0,582.702746,47.0,2369.0,73501.23,1204.938197,311.750,2316.260818,48.11,12322.78
1003000407,FAMILY PRACTICE,2018.0,217.0,15.500000,15.0,4.090326,11.0,22.0,2255.0,37.583333,28.0,32.324253,...,373.5,401.410176,74.0,1736.0,76239.32,1270.655333,491.500,1948.498336,86.38,12313.73
1003066838,INTERNAL MEDICINE,2016.0,68.0,17.000000,14.0,8.124038,11.0,29.0,159.0,19.875000,17.5,7.510707,...,285.5,286.766705,93.0,967.0,2551.02,318.877500,360.465,186.531801,41.92,606.44
1003066838,INTERNAL MEDICINE,2017.0,247.0,17.642857,13.5,9.418534,11.0,45.0,1175.0,22.596154,18.0,13.397775,...,465.0,358.197612,96.0,1468.0,33908.28,652.082308,175.735,1206.227148,37.51,6594.06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1992768477,GENERAL PRACTICE,2016.0,5.0,,,,,,154.0,15.400000,14.5,4.742245,...,627.0,270.523690,241.0,1140.0,2037.85,203.785000,136.690,165.760781,51.96,524.87
1992768477,GENERAL PRACTICE,2017.0,5.0,,,,,,16.0,16.000000,16.0,,...,160.0,,160.0,160.0,229.49,229.490000,229.490,,229.49,229.49
1992999759,NURSE PRACTITIONER (,2016.0,65.0,32.500000,32.5,3.535534,30.0,35.0,107.0,21.400000,15.0,11.414903,...,308.0,166.318971,150.0,540.0,1216.68,243.336000,166.280,222.235355,60.22,580.16
1992999759,NURSE PRACTITIONER (,2017.0,212.0,15.142857,15.0,2.626994,12.0,20.0,573.0,19.758621,18.0,7.448179,...,810.0,662.739179,54.0,2520.0,15881.82,547.648966,174.750,1655.416719,48.91,9068.62


Flattening the multiindex!

In [50]:
partd.columns = ['_'.join(col) for col in partd.columns.values]

In [68]:
partd = partd.reset_index(level=['SPECIALTY', 'npi', 'YEAR'])

In [69]:
partd

Unnamed: 0,npi,SPECIALTY,YEAR,bene_count_sum,bene_count_mean,bene_count_median,bene_count_std,bene_count_min,bene_count_max,total_claim_count_sum,...,total_day_supply_median,total_day_supply_std,total_day_supply_min,total_day_supply_max,total_drug_cost_sum,total_drug_cost_mean,total_drug_cost_median,total_drug_cost_std,total_drug_cost_min,total_drug_cost_max
0,1003000407,FAMILY PRACTICE,2016.0,312.0,18.352941,14.0,11.135199,11.0,53.0,1741.0,...,444.0,514.103957,38.0,2213.0,86421.07,1600.390185,269.280,2957.850733,51.08,15281.17
1,1003000407,FAMILY PRACTICE,2017.0,218.0,15.571429,14.0,3.631365,11.0,21.0,2286.0,...,489.0,582.702746,47.0,2369.0,73501.23,1204.938197,311.750,2316.260818,48.11,12322.78
2,1003000407,FAMILY PRACTICE,2018.0,217.0,15.500000,15.0,4.090326,11.0,22.0,2255.0,...,373.5,401.410176,74.0,1736.0,76239.32,1270.655333,491.500,1948.498336,86.38,12313.73
3,1003066838,INTERNAL MEDICINE,2016.0,68.0,17.000000,14.0,8.124038,11.0,29.0,159.0,...,285.5,286.766705,93.0,967.0,2551.02,318.877500,360.465,186.531801,41.92,606.44
4,1003066838,INTERNAL MEDICINE,2017.0,247.0,17.642857,13.5,9.418534,11.0,45.0,1175.0,...,465.0,358.197612,96.0,1468.0,33908.28,652.082308,175.735,1206.227148,37.51,6594.06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,1992768477,GENERAL PRACTICE,2016.0,5.0,,,,,,154.0,...,627.0,270.523690,241.0,1140.0,2037.85,203.785000,136.690,165.760781,51.96,524.87
1118,1992768477,GENERAL PRACTICE,2017.0,5.0,,,,,,16.0,...,160.0,,160.0,160.0,229.49,229.490000,229.490,,229.49,229.49
1119,1992999759,NURSE PRACTITIONER (,2016.0,65.0,32.500000,32.5,3.535534,30.0,35.0,107.0,...,308.0,166.318971,150.0,540.0,1216.68,243.336000,166.280,222.235355,60.22,580.16
1120,1992999759,NURSE PRACTITIONER (,2017.0,212.0,15.142857,15.0,2.626994,12.0,20.0,573.0,...,810.0,662.739179,54.0,2520.0,15881.82,547.648966,174.750,1655.416719,48.91,9068.62
