In [None]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

# Importing Data

In [None]:
# Load the raw CMS public-use files, one DataFrame per program (Part D, Part B, DMEPOS)
# and per year (2014-2018). Every file is read as tab-separated except DMEPOS 2018,
# which is read with the default comma separator. Note that the Part D 2018 file sits
# directly under PARTD/ while the other years live in per-year sub-folders.

# --- 2018 ---
partd_2018 = pd.read_csv('/Volumes/ML_projects/Medicare_Fraud_Datasets/PARTD/PartD_Prescriber_PUF_NPI_Drug_18.txt', sep='\t')
partb_2018 = pd.read_csv('/Volumes/ML_projects/Medicare_Fraud_Datasets/PARTB/Medicare_Provider_Util_Payment_PUF_CY2018/Medicare_Provider_Util_Payment_PUF_CY2018.txt', sep='\t')
dmepos_2018 = pd.read_csv('/Volumes/ML_projects/Medicare_Fraud_Datasets/DMEPOS/Medicare_Referring_Provider_DMEPOS_PUF_CY2018/Medicare_Referring_Provider_DMEPOS_PUF_CY2018.csv')
# --- 2017 ---
partd_2017 = pd.read_csv('/Volumes/ML_projects/Medicare_Fraud_Datasets/PARTD/PartD_Prescriber_PUF_NPI_DRUG_17/PartD_Prescriber_PUF_NPI_Drug_17.txt', sep='\t')
partb_2017 = pd.read_csv('/Volumes/ML_projects/Medicare_Fraud_Datasets/PARTB/Medicare_Provider_Util_Payment_PUF_CY2017/Medicare_Provider_Util_Payment_PUF_CY2017.txt', sep='\t')
dmepos_2017 = pd.read_csv('/Volumes/ML_projects/Medicare_Fraud_Datasets/DMEPOS/medicare_referring_provider_dmepos_puf_cy2017/Medicare_Referring_Provider_DMEPOS_PUF_CY2017.tab', sep='\t')
# --- 2016 (the Part B headers for this year are upper-case; handled at feature selection) ---
partd_2016 = pd.read_csv('/Volumes/ML_projects/Medicare_Fraud_Datasets/PARTD/PartD_Prescriber_PUF_NPI_DRUG_16/PartD_Prescriber_PUF_NPI_Drug_16.txt', sep='\t')
partb_2016 = pd.read_csv('/Volumes/ML_projects/Medicare_Fraud_Datasets/PARTB/Medicare-Physician-and-Other-Supplier-PUF/Medicare_Provider_Util_Payment_PUF_CY2016.txt', sep='\t')
dmepos_2016 = pd.read_csv('/Volumes/ML_projects/Medicare_Fraud_Datasets/DMEPOS/medicare_referring_provider_dmepos_puf_cy2016/Medicare_Referring_Provider_DMEPOS_PUF_CY2016.tab', sep='\t')
# --- 2015 ---
partd_2015 = pd.read_csv('/Volumes/ML_projects/Medicare_Fraud_Datasets/PARTD/PartD_Prescriber_PUF_NPI_DRUG_15/PartD_Prescriber_PUF_NPI_Drug_15.txt', sep='\t')
partb_2015 = pd.read_csv('/Volumes/ML_projects/Medicare_Fraud_Datasets/PARTB/Medicare_Provider_Util_Payment_PUF_CY2015/Medicare_Provider_Util_Payment_PUF_CY2015.txt', sep='\t')
dmepos_2015 = pd.read_csv('/Volumes/ML_projects/Medicare_Fraud_Datasets/DMEPOS/medicare_referring_provider_dmepos_puf_cy2015/Medicare_Referring_Provider_DMEPOS_PUF_CY2015.txt', sep='\t')
# --- 2014 ---
partd_2014 = pd.read_csv('/Volumes/ML_projects/Medicare_Fraud_Datasets/PARTD/PartD_Prescriber_PUF_NPI_DRUG_14/PartD_Prescriber_PUF_NPI_Drug_14.txt', sep='\t')
partb_2014 = pd.read_csv('/Volumes/ML_projects/Medicare_Fraud_Datasets/PARTB/Medicare_Provider_Util_Payment_PUF_CY2014/Medicare_Provider_Util_Payment_PUF_CY2014.txt', sep='\t')
dmepos_2014 = pd.read_csv('/Volumes/ML_projects/Medicare_Fraud_Datasets/DMEPOS/medicare_referring_provider_dmepos_puf_cy2014/Medicare_Referring_Provider_DMEPOS_PUF_CY2014.txt', sep='\t')

In [None]:
# Exclusion list used to derive the fraud labels; read as a comma-separated file.
# Later cells rely on its EXCLTYPE, EXCLDATE, WAIVERDATE, REINDATE and NPI columns.
leie = pd.read_csv('/Volumes/ML_projects/Medicare_Fraud_Datasets/LEIE.csv')

# Selecting features and targets.

Keep only the exclusion codes that correspond to the more severe offenses. These records will be used to create the `FRAUD` and `NOT_FRAUD` targets.

In [None]:
# Mandatory minimum exclusion period, in years, keyed by LEIE exclusion type code (EXCLTYPE).
# np.nan marks codes without a fixed minimum; 100 is used as a stand-in for a permanent exclusion.

minimum_exclusion_periods = { 
    '1128a1':5, # Conviction of program-related crimes. Minimum period: 5 years.
    '1128a2':5, # Conviction relating to patient abuse or neglect. Minimum period: 5 years.
    '1128a3':5, # Felony conviction relating to health care fraud. Minimum period: 5 years.
    '1128b4':np.nan, # License revocation, suspension, or surrender. Minimum period: set by the state licensing authority.
    '1128b7':np.nan, # Fraud, kickbacks, and other prohibited activities. Minimum period: none.
    '1128c3gi':10, # Conviction of a second mandatory exclusion offense. Minimum period: 10 years.
    '1128c3gii':100 # Conviction of a third or later mandatory exclusion offense. Permanent exclusion.
}

In [None]:
# Exclusion type codes (LEIE `EXCLTYPE` values) that are kept for labelling.
exclusion_codes = ['1128a1', '1128a2', '1128a3', '1128b4', '1128b7', '1128c3gi', '1128c3gii']

# Take an explicit copy of the selection: later cells add and overwrite columns on
# `filtered_leie`, and writing into a boolean-mask slice of `leie` is ambiguous
# (pandas reports it as SettingWithCopyWarning, which this notebook silences).
filtered_leie = leie[leie.EXCLTYPE.isin(exclusion_codes)].copy()
filtered_leie.shape

Converting the dates into a datetime.

In [None]:
# All three LEIE date columns use the compact YYYYMMDD layout.
leie_date_format = '%Y%m%d'

# EXCLDATE (the most important value) is parsed strictly, so a malformed entry raises.
filtered_leie['EXCLDATE'] = pd.to_datetime(filtered_leie['EXCLDATE'], format=leie_date_format)

# WAIVERDATE and REINDATE are parsed leniently: anything that does not match becomes NaT.
for optional_date_col in ('WAIVERDATE', 'REINDATE'):
    filtered_leie[optional_date_col] = pd.to_datetime(
        filtered_leie[optional_date_col], format=leie_date_format, errors='coerce'
    )

As the output of the next cell shows, there are only 11 unique values in the `WAIVERDATE` column and only 1 in the `REINDATE` column, so these columns cannot tell us when an exclusion ended. To infer the exclusion end dates we instead add the `minimum_exclusion_periods` value to the exclusion date (`EXCLDATE`) column.

In [None]:
# Report how many distinct values each optional date column holds.
# `Series.unique()` includes the missing-value marker (NaT), so it is part of the count.
for date_col in ('WAIVERDATE', 'REINDATE'):
    distinct_values = filtered_leie[date_col].unique()
    print(f"Unique `{date_col}` values: ", len(distinct_values))

In [None]:
# Look up the minimum exclusion period (in years) for each record's exclusion type
# and store it in its own column.
penalty_years = filtered_leie['EXCLTYPE'].map(minimum_exclusion_periods)
filtered_leie['MIN_EXCLUSION_PERIOD'] = penalty_years

Now we have to add the minimum exclusion periods to the original exclusion dates. This will give us an estimate of the exclusion end date. 

In [None]:
# Estimated end of the exclusion, as a year: start year plus the minimum period.
# Rows whose minimum period is NaN end up with a NaN end year.
exclusion_start_year = filtered_leie['EXCLDATE'].dt.year
filtered_leie['END_EXCLDATE'] = exclusion_start_year + filtered_leie['MIN_EXCLUSION_PERIOD']

The `calculate_exclusion_end` function takes the exclusion start date and the estimated end year; if the start month is greater than `6`, it adds one year to the end year. This is because the provider would have been fraudulent for most of the year.

In [None]:
def calculate_exclusion_end(exclusion_date, end_year):
    """Return the exclusion end year, shifted by one for second-half exclusions.

    Parameters
    ----------
    exclusion_date : any object exposing a ``month`` attribute
        Date on which the exclusion started.
    end_year : number
        Estimated end year before adjustment.

    Returns
    -------
    ``end_year + 1`` when the exclusion started after June (month > 6),
    otherwise ``end_year`` unchanged.
    """
    started_after_june = exclusion_date.month > 6
    return end_year + 1 if started_after_june else end_year
        

In [None]:
# Adjust every estimated end year with `calculate_exclusion_end`, pairing each record's
# start date with its current end year.
# Note: this overwrites END_EXCLDATE in place, so re-running the cell shifts the years again.
adjusted_end_years = [
    calculate_exclusion_end(start_date, end_year)
    for start_date, end_year in zip(filtered_leie['EXCLDATE'], filtered_leie['END_EXCLDATE'])
]
filtered_leie['END_EXCLDATE'] = adjusted_end_years

Filtering out features from the three datasets.

In [None]:
partb_feats = ['npi', 'hcpcs_code', 'hcpcs_description', 'hcpcs_drug_indicator', 'provider_type', 'nppes_provider_gender', 'line_srvc_cnt', 'bene_unique_cnt', 'bene_day_srvc_cnt', 'average_submitted_chrg_amt', 'average_Medicare_payment_amt']
partd_feats = ['npi', 'specialty_description', 'bene_count', 'total_claim_count', 'total_30_day_fill_count', 'total_day_supply', 'total_drug_cost']
dmepos_feats = ['REFERRING_NPI', 'REFERRING_PROVIDER_TYPE', 'REFERRING_PROVIDER_GENDER', 'NUMBER_OF_SUPPLIERS', 'NUMBER_OF_SUPPLIER_BENEFICIARIES', 'NUMBER_OF_SUPPLIER_CLAIMS', 'NUMBER_OF_SUPPLIER_SERVICES', 'AVG_SUPPLIER_SUBMITTED_CHARGE', 'AVG_SUPPLIER_MEDICARE_PMT_AMT']


def select_features(raw, wanted, year, lowercase=False):
    """Return an independent copy of the `wanted` columns of `raw`, tagged with `year`.

    Parameters
    ----------
    raw : pandas.DataFrame
        One year's raw file.
    wanted : list of str
        Column names to keep. They are matched against `raw` case-insensitively,
        because the header casing differs between yearly files (the 2016 Part B
        file uses upper-case headers).
    year : int
        Value stored in the added `DATA_YEAR` column.
    lowercase : bool, default False
        When True the returned columns are named with the lower-cased `wanted`
        names; otherwise they are named exactly as spelled in `wanted`.

    Returns
    -------
    pandas.DataFrame
        A copy (never a view of `raw`), so adding or renaming columns later is safe.

    Raises
    ------
    KeyError
        If any requested column is absent from `raw`.
    """
    actual_name = {str(col).lower(): col for col in raw.columns}
    missing = [name for name in wanted if name.lower() not in actual_name]
    if missing:
        raise KeyError(f"{year}: columns not found in raw file: {missing}")

    selected = raw[[actual_name[name.lower()] for name in wanted]].copy()
    selected.columns = [name.lower() for name in wanted] if lowercase else list(wanted)
    # Add a year column so we know where to add fraud labels.
    # Activity before exclusion dates will be considered fraudulent.
    selected['DATA_YEAR'] = year
    return selected


# Part B headers are lower-cased for EVERY year. Previously only 2016 was lower-cased,
# which left `average_Medicare_payment_amt` (other years) and `average_medicare_payment_amt`
# (2016) as two different columns after concatenation, so the payment aggregates were
# computed from 2016 rows only.
partb_features_2018 = select_features(partb_2018, partb_feats, 2018, lowercase=True)
partd_features_2018 = select_features(partd_2018, partd_feats, 2018)
dmepos_features_2018 = select_features(dmepos_2018, dmepos_feats, 2018)

partb_features_2017 = select_features(partb_2017, partb_feats, 2017, lowercase=True)
partd_features_2017 = select_features(partd_2017, partd_feats, 2017)
dmepos_features_2017 = select_features(dmepos_2017, dmepos_feats, 2017)

partb_features_2016 = select_features(partb_2016, partb_feats, 2016, lowercase=True)
partd_features_2016 = select_features(partd_2016, partd_feats, 2016)
dmepos_features_2016 = select_features(dmepos_2016, dmepos_feats, 2016)

partb_features_2015 = select_features(partb_2015, partb_feats, 2015, lowercase=True)
partd_features_2015 = select_features(partd_2015, partd_feats, 2015)
dmepos_features_2015 = select_features(dmepos_2015, dmepos_feats, 2015)

partb_features_2014 = select_features(partb_2014, partb_feats, 2014, lowercase=True)
partd_features_2014 = select_features(partd_2014, partd_feats, 2014)
dmepos_features_2014 = select_features(dmepos_2014, dmepos_feats, 2014)

In [None]:
partb_features_2016

# Concatenate all the yearly data.

In [None]:
# Give the provider identifier the same name (`npi`) in every source so we can merge on it later.
filtered_leie.rename(columns={'NPI': 'npi'}, inplace=True)

for dmepos_year_frame in (
    dmepos_features_2018,
    dmepos_features_2017,
    dmepos_features_2016,
    dmepos_features_2015,
    dmepos_features_2014,
):
    dmepos_year_frame.rename(columns={'REFERRING_NPI': 'npi'}, inplace=True)

In [None]:
# Stack the yearly frames of each program, newest year first.
partb_by_year = [partb_features_2018, partb_features_2017, partb_features_2016, partb_features_2015, partb_features_2014]
partd_by_year = [partd_features_2018, partd_features_2017, partd_features_2016, partd_features_2015, partd_features_2014]
dmepos_by_year = [dmepos_features_2018, dmepos_features_2017, dmepos_features_2016, dmepos_features_2015, dmepos_features_2014]

partb_features = pd.concat(partb_by_year)
partd_features = pd.concat(partd_by_year)
dmepos_features = pd.concat(dmepos_by_year)

# Part B dataset processing.

Filter out HCPCS codes that refer to prescriptions. For these entries the `line_srvc_cnt` feature refers to the weight/volume of a drug rather than to a procedure count, so including both kinds of entries in the same dataset would cause conflicts.

In [None]:
# Keep only the rows whose HCPCS drug indicator is exactly 'N'.
is_non_drug = partb_features['hcpcs_drug_indicator'] == 'N'
partb_features = partb_features.loc[is_non_drug]

In [None]:
partb_features.head(5).iloc[1]

In [None]:
# Summarise every Part B measure per provider, provider type, gender and year.
# 'std' is passed as a string alias rather than the `np.std` callable: pandas currently
# treats the two the same inside `agg`, but passing the NumPy function is deprecated,
# so the alias pins the sample standard deviation and keeps the `_std` column names.
# Note: groupby drops rows whose grouping keys are missing (pandas default `dropna=True`).
partb_summary_stats = ['sum', 'mean', 'median', 'std', 'min', 'max']
partb_measures = [
    'line_srvc_cnt',
    'bene_unique_cnt',
    'bene_day_srvc_cnt',
    'average_submitted_chrg_amt',
    'average_medicare_payment_amt',
]
partb_features = (
    partb_features
    .groupby(['npi', 'provider_type', 'nppes_provider_gender', 'DATA_YEAR'])
    .agg({measure: partb_summary_stats for measure in partb_measures})
)

Flatten the multiindex to make it easier to work with.

In [None]:
# Collapse the two-level (measure, statistic) header into single names such as `line_srvc_cnt_sum`.
partb_features.columns = [f'{measure}_{stat}' for measure, stat in partb_features.columns]

Physicians who performed only one type of procedure (or prescribed only one type of drug) have a single row per group, so statistics such as the standard deviation show up as `null` values. We can replace these values with 0 because there is no variability.

In [None]:
partb_features.isna().sum()

In [None]:
# Replace every missing aggregate with 0.
partb_features = partb_features.fillna(0)

In [None]:
# Move all grouping keys (npi, provider_type, nppes_provider_gender, DATA_YEAR)
# out of the index and back into ordinary columns.
partb_features = partb_features.reset_index()

# Part D dataset processing.

In [None]:
partd_features.columns

Beneficiary counts were suppressed to 0 if they were below 11, so we replace 0's with 5's as per CMS documentation.

In [None]:
# Substitute 5 for beneficiary counts recorded as 0.
# NOTE(review): if suppressed counts are stored as missing values rather than 0 in the
# raw file, this replacement does nothing — confirm against the data.
partd_features['bene_count'] = partd_features['bene_count'].replace(0, 5)

Aggregate over numerical columns.

In [None]:
# Summarise every Part D measure per provider, specialty and year.
# 'std' is passed as a string alias rather than the `np.std` callable: pandas currently
# treats the two the same inside `agg`, but passing the NumPy function is deprecated,
# so the alias pins the sample standard deviation and keeps the `_std` column names.
partd_summary_stats = ['sum', 'mean', 'median', 'std', 'min', 'max']
partd_measures = [
    'bene_count',
    'total_claim_count',
    'total_30_day_fill_count',
    'total_day_supply',
    'total_drug_cost',
]
partd_features = (
    partd_features
    .groupby(['npi', 'specialty_description', 'DATA_YEAR'])
    .agg({measure: partd_summary_stats for measure in partd_measures})
)

Flattening the multiindex!

In [None]:
# Collapse the two-level (measure, statistic) header into single names such as `bene_count_sum`.
partd_features.columns = [f'{measure}_{stat}' for measure, stat in partd_features.columns]

In [None]:
# Move all grouping keys (npi, specialty_description, DATA_YEAR) back into ordinary columns.
partd_features = partd_features.reset_index()

In [None]:
partd_features

# DMEPOS dataset processing.

In [None]:
dmepos_features.columns

Beneficiary counts were suppressed to 0 if they were below 11, so we replace 0's with 5's as per CMS documentation.

In [None]:
# Substitute 5 for supplier-beneficiary counts recorded as 0.
# NOTE(review): if suppressed counts are stored as missing values rather than 0 in the
# raw file, this replacement does nothing — confirm against the data.
dmepos_features['NUMBER_OF_SUPPLIER_BENEFICIARIES'] = (
    dmepos_features['NUMBER_OF_SUPPLIER_BENEFICIARIES'].replace(0, 5)
)

In [None]:
# Summarise every DMEPOS measure per referring provider, provider type, gender and year.
# 'std' is passed as a string alias rather than the `np.std` callable: pandas currently
# treats the two the same inside `agg`, but passing the NumPy function is deprecated,
# so the alias pins the sample standard deviation and keeps the `_std` column names.
# Note: groupby drops rows whose grouping keys are missing (pandas default `dropna=True`).
dmepos_summary_stats = ['sum', 'mean', 'median', 'std', 'min', 'max']
dmepos_measures = [
    'NUMBER_OF_SUPPLIERS',
    'NUMBER_OF_SUPPLIER_BENEFICIARIES',
    'NUMBER_OF_SUPPLIER_CLAIMS',
    'NUMBER_OF_SUPPLIER_SERVICES',
    'AVG_SUPPLIER_MEDICARE_PMT_AMT',
    'AVG_SUPPLIER_SUBMITTED_CHARGE',
]
dmepos_features = (
    dmepos_features
    .groupby(['npi', 'REFERRING_PROVIDER_TYPE', 'REFERRING_PROVIDER_GENDER', 'DATA_YEAR'])
    .agg({measure: dmepos_summary_stats for measure in dmepos_measures})
)

Flattening the multiindex!

In [None]:
# Collapse the two-level (measure, statistic) header into single names such as `NUMBER_OF_SUPPLIERS_sum`.
dmepos_features.columns = [f'{measure}_{stat}' for measure, stat in dmepos_features.columns]

In [None]:
# Move all grouping keys (npi, REFERRING_PROVIDER_TYPE, REFERRING_PROVIDER_GENDER, DATA_YEAR)
# back into ordinary columns.
dmepos_features = dmepos_features.reset_index()

# Merge the combined yearly data with `LEIE` on NPI

In [None]:
# Outer-join the exclusion records onto each program's aggregated features by `npi`.
# An outer join keeps providers that appear on only one side; their columns from the
# other side are filled with missing values.
partb = filtered_leie.merge(partb_features, how='outer', on='npi')
partd = filtered_leie.merge(partd_features, how='outer', on='npi')
dmepos = filtered_leie.merge(dmepos_features, how='outer', on='npi')

In [None]:
# Keep the provider keys, every aggregated Part B measure (measure x statistic),
# and the exclusion columns — in that order.
partb_stat_suffixes = ['sum', 'mean', 'median', 'std', 'min', 'max']
partb_measure_names = [
    'line_srvc_cnt',
    'bene_unique_cnt',
    'bene_day_srvc_cnt',
    'average_submitted_chrg_amt',
    'average_medicare_payment_amt',
]
partb_exclusion_columns = [
    'EXCLTYPE', 'EXCLDATE', 'REINDATE', 'WAIVERDATE', 'WVRSTATE',
    'MIN_EXCLUSION_PERIOD', 'END_EXCLDATE',
]
partb_keep = (
    ['npi', 'provider_type', 'DATA_YEAR', 'nppes_provider_gender']
    + [f'{measure}_{stat}' for measure in partb_measure_names for stat in partb_stat_suffixes]
    + partb_exclusion_columns
)
partb = partb[partb_keep]

In [None]:
# Keep the provider keys, every aggregated Part D measure (measure x statistic),
# and the exclusion columns — in that order.
partd_stat_suffixes = ['sum', 'mean', 'median', 'std', 'min', 'max']
partd_measure_names = [
    'bene_count',
    'total_claim_count',
    'total_30_day_fill_count',
    'total_day_supply',
    'total_drug_cost',
]
partd_exclusion_columns = [
    'EXCLTYPE', 'EXCLDATE', 'REINDATE', 'WAIVERDATE', 'WVRSTATE',
    'MIN_EXCLUSION_PERIOD', 'END_EXCLDATE',
]
partd_keep = (
    ['npi', 'specialty_description', 'DATA_YEAR']
    + [f'{measure}_{stat}' for measure in partd_measure_names for stat in partd_stat_suffixes]
    + partd_exclusion_columns
)
partd = partd[partd_keep]

In [None]:
# Keep the provider id, the exclusion columns, the remaining provider keys and every
# aggregated DMEPOS measure (measure x statistic) — in that order.
dmepos_stat_suffixes = ['sum', 'mean', 'median', 'std', 'min', 'max']
dmepos_measure_names = [
    'NUMBER_OF_SUPPLIERS',
    'NUMBER_OF_SUPPLIER_BENEFICIARIES',
    'NUMBER_OF_SUPPLIER_CLAIMS',
    'NUMBER_OF_SUPPLIER_SERVICES',
    'AVG_SUPPLIER_MEDICARE_PMT_AMT',
    'AVG_SUPPLIER_SUBMITTED_CHARGE',
]
dmepos_exclusion_columns = [
    'EXCLTYPE', 'EXCLDATE', 'REINDATE', 'WAIVERDATE', 'WVRSTATE',
    'MIN_EXCLUSION_PERIOD', 'END_EXCLDATE',
]
dmepos_keep = (
    ['npi']
    + dmepos_exclusion_columns
    + ['REFERRING_PROVIDER_TYPE', 'REFERRING_PROVIDER_GENDER', 'DATA_YEAR']
    + [f'{measure}_{stat}' for measure in dmepos_measure_names for stat in dmepos_stat_suffixes]
)
dmepos = dmepos[dmepos_keep]

### Remove all rows without npi or provider info.

In [None]:
# Keep only rows that carry claims data. The outer merge with LEIE also yields rows for
# excluded providers with no claims, whose provider-type column is missing.
# The presence test is written explicitly with `.notna()`; the previous version used the
# raw string column as a boolean mask operand and relied on implicit truthiness coercion.
# NOTE(review): `npi != 0` presumably removes LEIE records with an unknown NPI — confirm.
partb = partb[(partb.npi != 0) & partb.provider_type.notna()]
partd = partd[(partd.npi != 0) & partd.specialty_description.notna()]
dmepos = dmepos[(dmepos.npi != 0) & dmepos.REFERRING_PROVIDER_TYPE.notna()]

# Add Labels to each dataset

Add the target column

In [None]:
# Create the TARGET column with the placeholder string '0'; the labelling cell overwrites it.
for claims_frame in (partb, partd, dmepos):
    claims_frame['TARGET'] = '0'

Extract the exclusion year into its own column -- we will compare this to the data collection year and then set the label to `FRAUD` or `NOT_FRAUD`.

In [None]:
# Year in which each exclusion started (missing for providers without an exclusion date).
for claims_frame in (partb, partd, dmepos):
    claims_frame['START_EXCLDATE'] = claims_frame['EXCLDATE'].dt.year

In [None]:
partb

In [None]:
def make_labels(start_exc, data_yr):
    """Label one provider-year as 'FRAUD' or 'NOT_FRAUD'.

    Parameters
    ----------
    start_exc : number
        Year in which the provider's exclusion started (NaN when there is none).
    data_yr : number
        Year the claims data was collected.

    Returns
    -------
    'FRAUD' when the exclusion starts strictly after the data year, otherwise
    'NOT_FRAUD'. A NaN `start_exc` never compares greater, so it yields 'NOT_FRAUD'.
    """
    return 'FRAUD' if start_exc > data_yr else 'NOT_FRAUD'
        

In [None]:
# Label every provider-year row in each dataset with `make_labels`, comparing the
# exclusion start year with the data collection year.
for claims_frame in (partb, partd, dmepos):
    claims_frame['TARGET'] = claims_frame[["START_EXCLDATE", "DATA_YEAR"]].apply(
        lambda year_pair: make_labels(*year_pair), axis=1
    )

As you can see from the outputs below, each of the datasets is highly imbalanced. This is an issue that will need to be revisited.

In [None]:
partd.TARGET.value_counts()

In [None]:
partb.TARGET.value_counts()

In [None]:
dmepos.TARGET.value_counts()

### Making the combined dataset

In [None]:
# Build the combined dataset in two inner joins: Part B with Part D first, then DMEPOS.
# A provider-year is kept only when it appears in all three programs with the same
# provider type / specialty label.
partb_with_partd = pd.merge(
    partb,
    partd,
    left_on=['npi', 'provider_type', 'DATA_YEAR'],
    right_on=['npi', 'specialty_description', 'DATA_YEAR'],
)
combined = partb_with_partd.merge(
    dmepos,
    left_on=['npi', 'provider_type', 'DATA_YEAR'],
    right_on=['npi', 'REFERRING_PROVIDER_TYPE', 'DATA_YEAR'],
)

In [None]:
# Drop the columns duplicated by the three-way merge.
# Part B and Part D both carry the LEIE-derived columns, so the first merge suffixes them
# with `_x` (Part B) and `_y` (Part D); the DMEPOS copies keep their plain names.
# Both suffixed sets are removed here. Previously only the `_y` set was dropped, which
# left EXCLTYPE_x, START_EXCLDATE_x, TARGET_x, ... in the exported modelling frame
# (the later cleanup only drops the unsuffixed names) and leaked the label.
# The unsuffixed `TARGET` is the single label that remains.
leie_derived_columns = [
    'EXCLTYPE', 'EXCLDATE', 'REINDATE', 'WAIVERDATE', 'WVRSTATE',
    'MIN_EXCLUSION_PERIOD', 'END_EXCLDATE', 'TARGET', 'START_EXCLDATE',
]
redundant_columns = [
    f'{column}{suffix}' for column in leie_derived_columns for suffix in ('_x', '_y')
]
# Provider gender/type are already present from Part B; `specialty_description` and
# `REFERRING_PROVIDER_TYPE` equal `provider_type` by construction of the join keys.
redundant_columns += ['REFERRING_PROVIDER_GENDER', 'REFERRING_PROVIDER_TYPE', 'specialty_description']

combined.drop(columns=redundant_columns, inplace=True)

# One Hot Encoding of Categorical Variables.

In [None]:
# Categorical columns to one-hot encode in each dataset.
partb_category_columns = ['provider_type', 'nppes_provider_gender']
partd_category_columns = ['specialty_description']
dmepos_category_columns = ['REFERRING_PROVIDER_GENDER', 'REFERRING_PROVIDER_TYPE']
# The combined frame is encoded on the Part B provider type and gender columns.
combined_category_columns = ['provider_type', 'nppes_provider_gender']

In [None]:
# One-hot encode the categorical columns of each dataset. `drop_first=True` omits the
# first level of every category so the indicator columns are not redundant.
partb, partd, dmepos, combined = (
    pd.get_dummies(claims_frame, columns=category_columns, drop_first=True)
    for claims_frame, category_columns in (
        (partb, partb_category_columns),
        (partd, partd_category_columns),
        (dmepos, dmepos_category_columns),
        (combined, combined_category_columns),
    )
)

### Drop the columns that we don't need for modeling.

In [None]:
# Exclusion bookkeeping and identifier columns that must not reach the model.
columns_to_drop = [
    'EXCLTYPE', 'EXCLDATE', 'REINDATE', 'WAIVERDATE',
    'WVRSTATE', 'MIN_EXCLUSION_PERIOD', 'END_EXCLDATE',
    'START_EXCLDATE', 'npi', 'DATA_YEAR',
]

for modelling_frame in (partb, partd, dmepos, combined):
    modelling_frame.drop(columns=columns_to_drop, inplace=True)

In [None]:
partb.shape

In [None]:
partd.shape

In [None]:
dmepos.shape

In [None]:
combined.shape

### Fill in `NaN` values with `0`.

In [None]:
# Replace the remaining missing values with 0 in the Part D, DMEPOS and combined frames.
for modelling_frame in (partd, dmepos, combined):
    modelling_frame.fillna(0, inplace=True)

# Export data for modeling.

In [None]:
# Persist each processed dataset as a pickle for the modelling step.
partd.to_pickle('/Volumes/ML_projects/Medicare_Fraud_Datasets/processed_data/partd.pkl')
partb.to_pickle('/Volumes/ML_projects/Medicare_Fraud_Datasets/processed_data/partb.pkl')
dmepos.to_pickle('/Volumes/ML_projects/Medicare_Fraud_Datasets/processed_data/dmepos.pkl')
combined.to_pickle('/Volumes/ML_projects/Medicare_Fraud_Datasets/processed_data/combined.pkl')