In [1]:
## Importing the required modules
import numpy as np
import pandas as pd

## Medicare Provider Utilization and Payment Data

In [2]:
## A function that cleans and pivots "Medicare Provider Utilization and Payment Data"
def rem_comma(s):
    """Remove thousands separators (commas) from a numeric string.

    Parameters
    ----------
    s : object
        Usually a string such as ``'1,234.50'``.

    Returns
    -------
    object
        ``s`` with every ``','`` removed when ``s`` is a string; any other
        value is returned unchanged.
    """
    # Non-string cells (e.g. a value that was already parsed as a number, or a
    # NaN) have no ``replace`` method, so pass them through untouched instead
    # of raising AttributeError.
    if not isinstance(s, str):
        return s
    return s.replace(',', '')

def clean_pivot(period, raw_dir=None):
    """Load one year of provider/HCPCS utilization data and pivot it by provider.

    Rows that share both a 'National Provider Identifier' and an 'HCPCS Code'
    are summed into one row, and three provider-by-code tables are built from
    the result.

    Warning: when rows are summed, some of the beneficiaries who received a
    service in a facility location may be the same people as those who
    received it in a non-facility location, so the summed beneficiary count
    can double count them.

    Parameters
    ----------
    period : str
        Year embedded in the CSV file name, e.g. ``'2014'``. The ``'2013'``
        file is read through its own column names (``LINE_SRVC_CNT``,
        ``BENE_UNIQUE_CNT``, ``AVERAGE_MEDICARE_PAYMENT_AMT``, ``HCPCS_CODE``).
    raw_dir : str, optional
        Directory that contains the raw CSV. When omitted, the original
        hard-coded ``RawDatasets`` directory is used.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(number_of_services, number_of_beneficiaries, payment_amount)``.
        Each table is indexed by 'National Provider Identifier' and has one
        column per 'HCPCS Code'; provider/code pairs that do not occur in the
        file are NaN.
    """
    import os

    file_name = f'Medicare_Provider_Utilization_and_Payment_Data__Physician_and_Other_Supplier_PUF_CY{period}_CA_Ophthalmology.csv'
    if raw_dir is None:
        # Raw literal: the Windows separators must not be read as (invalid)
        # escape sequences such as "\A" or "\M".
        path = r'D:\Anaconda\Jupyter Directory\Medicare Fraud Detection\California_Ophthalmology\RawDatasets' + '\\' + file_name
    else:
        path = os.path.join(raw_dir, file_name)
    data = pd.read_csv(path)

    if period == '2013':
        # The 2013 file has a trailing space in the NPI header and keeps the
        # measures under different column names.
        data = data.rename(columns={'National Provider Identifier ': 'National Provider Identifier'})
        source_of = {
            'Number of Services': 'LINE_SRVC_CNT',
            'Number of Medicare Beneficiaries': 'BENE_UNIQUE_CNT',
            'Average Medicare Payment Amount': 'AVERAGE_MEDICARE_PAYMENT_AMT',
        }
        data['HCPCS Code'] = data['HCPCS_CODE']
    else:
        source_of = {
            'Number of Services': 'Number of Services',
            'Number of Medicare Beneficiaries': 'Number of Medicare Beneficiaries',
            'Average Medicare Payment Amount': 'Average Medicare Payment Amount',
        }

    # Strip thousands separators and convert to float. Going through str first
    # also covers columns that were already read as numbers.
    for target, source in source_of.items():
        data[target] = (
            data[source].astype(str).str.replace(',', '', regex=False).astype(float)
        )

    data['Medicare Payment Amount'] = data['Number of Services'] * data['Average Medicare Payment Amount']

    key_columns = ['National Provider Identifier', 'HCPCS Code']
    value_columns = ['Number of Services', 'Number of Medicare Beneficiaries', 'Medicare Payment Amount']

    # Sum rows that repeat the same (provider, code) pair. A single groupby
    # replaces the former positional merge loop, which dropped rows by label
    # before the index had been reset and rebuilt the index after every merge.
    totals = data.groupby(key_columns, as_index=False)[value_columns].sum()

    return tuple(
        totals.pivot(index='National Provider Identifier', columns='HCPCS Code', values=column)
        for column in value_columns
    )

## Generating pivoted datasets
periods = ['2012', '2013', '2014', '2015', '2016', '2017', '2018']

# One entry per year; each entry is the [NoS, NoMB, MP] list of pivot tables
# returned by clean_pivot, with missing provider/code cells replaced by 0.0.
pivoted = [
    [table.fillna(0.0) for table in clean_pivot(year)]
    for year in periods
]
    
## Same indices and columns in all the pivoted datasets
# Pool every HCPCS code (column) and NPI (index) that occurs in any year,
# read from the first table of each year.
HCPCS_pool = set()
NPI_pool = set()
for yearly_tables in pivoted:
    HCPCS_pool.update(yearly_tables[0].columns)
    NPI_pool.update(yearly_tables[0].index)

all_codes = sorted(HCPCS_pool)
all_npis = sorted(NPI_pool)

# Bring every table to the same sorted columns and rows:
#   * codes missing from a year become 0.0 columns for that year's providers;
#   * providers missing from a year become all-NaN rows.
# Two reindex calls replace adding columns and rows one at a time, which is
# slow on wide frames and triggers pandas fragmentation warnings.
std_pivoted = [
    [
        table.reindex(columns=all_codes, fill_value=0.0).reindex(index=all_npis)
        for table in yearly_tables
    ]
    for yearly_tables in pivoted
]
    
## Merging the datasets across different features
index_list = sorted(NPI_pool)
columns_list = sorted(HCPCS_pool)
tag = ['NoS', 'NoMB', 'MP']

# For each year build one wide frame whose columns are named
# '<HCPCS>-<year>-<tag>', ordered by code and then by tag.
merged_std_pivoted = []
for year, yearly_tables in zip(periods, std_pivoted):
    # Collect all columns first and build the frame once; inserting them one
    # by one fragments the frame. The year label comes from `periods`, so it
    # cannot drift from the files that were actually loaded.
    merged_columns = {
        hcpcs + '-' + year + '-' + tag[j]: table[hcpcs]
        for hcpcs in columns_list
        for j, table in enumerate(yearly_tables)
    }
    merged_std_pivoted.append(pd.DataFrame(merged_columns, index=index_list))

## Concatenating merged datasets across different years
# A single concat over the whole list avoids re-copying the accumulated frame
# on every iteration and no longer relies on pd.concat skipping a None seed.
dataset_1 = pd.concat(merged_std_pivoted, axis=1)

## Medicare Physician and Other Supplier Data

In [3]:
## A function that cleans and pivots "Medicare Physician and Other Supplier Data"
def clean_pivot_2(period, raw_dir=None):
    """Load one year of the NPI aggregate report and keep the summary columns.

    Parameters
    ----------
    period : str
        Year embedded in the CSV file name, e.g. ``'2016'``. For ``'2012'``
        and ``'2013'`` the columns 'NPI' and 'Number of Unique Beneficiaries'
        are renamed; for ``'2014'`` and ``'2015'`` a trailing space is removed
        from 'Total Drug Medicare Payment Amount '.
    raw_dir : str, optional
        Directory that contains the raw CSV. When omitted, the original
        hard-coded ``RawDatasets`` directory is used.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'National Provider Identifier', with the columns
        'Number of Medicare Beneficiaries',
        'Total Drug Medicare Payment Amount' and
        'Total Medical Medicare Payment Amount'.

    Raises
    ------
    KeyError
        If one of the expected columns is missing after renaming.
    """
    import os

    file_name = f'Medicare_Physician_and_Other_Supplier_National_Provider_Identifier__NPI__Aggregate_Report__Calendar_Year_{period}.csv'
    if raw_dir is None:
        # Raw literal: the Windows separators must not be read as (invalid)
        # escape sequences such as "\A" or "\M".
        path = r'D:\Anaconda\Jupyter Directory\Medicare Fraud Detection\California_Ophthalmology\RawDatasets' + '\\' + file_name
    else:
        path = os.path.join(raw_dir, file_name)
    data = pd.read_csv(path)

    # Harmonise the header names that differ between years.
    if period in ("2012", "2013"):
        data = data.rename(columns={"NPI": "National Provider Identifier",
                                    "Number of Unique Beneficiaries": "Number of Medicare Beneficiaries"})
    if period in ("2014", "2015"):
        data = data.rename(columns={"Total Drug Medicare Payment Amount ": "Total Drug Medicare Payment Amount"})

    kept_columns = ["National Provider Identifier", "Number of Medicare Beneficiaries", "Total Drug Medicare Payment Amount", "Total Medical Medicare Payment Amount"]
    return data[kept_columns].set_index("National Provider Identifier")

## Generating required datasets
periods = ['2012', '2013', '2014', '2015', '2016', '2017', '2018']

# One aggregate-report frame per year, in the same order as `periods`.
dfs = [clean_pivot_2(year) for year in periods]
    
## Modify the index list of the datasets
index_list = dataset_1.index
# Retained so the cell still defines the same names as before.
index_set = set(index_list)

# Align every yearly frame to dataset_1's providers and tag each column with
# its year ('<column>-<year>'). Providers absent from a year become NaN rows;
# providers that are not in dataset_1 are dropped.
# reindex replaces the former `.loc[<set>] = ...` assignment: pandas >= 2.0
# rejects set indexers, and that assignment relied on columns being matched
# by position rather than by name.
dfs_2 = [
    yearly.reindex(index_list).add_suffix('-' + year)
    for year, yearly in zip(periods, dfs)
]
    
## Concatenating merged datasets across different years
# A single concat over the whole list avoids re-copying the accumulated frame
# on every iteration and no longer relies on pd.concat skipping a None seed.
dataset_2 = pd.concat(dfs_2, axis=1)
    
## Saving the datasets locally
# Write the per-code and the aggregate feature tables to their CSV files.
for frame, csv_path in (
    (dataset_1, r'D:\Anaconda\Jupyter Directory\Medicare Fraud Detection\California_Ophthalmology\ProcessedDatasets\dataset_codes.csv'),
    (dataset_2, r'D:\Anaconda\Jupyter Directory\Medicare Fraud Detection\California_Ophthalmology\ProcessedDatasets\dataset_aggregate.csv'),
):
    frame.to_csv(csv_path)

## Merging all the datasets

In [2]:
## Merging dataset_1 and dataset_2
# Reload both processed tables from disk, using the first CSV column as index.
processed_paths = (
    r'D:\Anaconda\Jupyter Directory\Medicare Fraud Detection\California_Ophthalmology\ProcessedDatasets\dataset_codes.csv',
    r'D:\Anaconda\Jupyter Directory\Medicare Fraud Detection\California_Ophthalmology\ProcessedDatasets\dataset_aggregate.csv',
)
dataset_1, dataset_2 = (pd.read_csv(csv_path, index_col=0) for csv_path in processed_paths)

# Place the two tables side by side, aligned on their index.
dataset = pd.concat([dataset_1, dataset_2], axis=1)
dataset.head()

Unnamed: 0,00140-2012-NoS,00140-2012-NoMB,00140-2012-MP,00142-2012-NoS,00142-2012-NoMB,00142-2012-MP,00145-2012-NoS,00145-2012-NoMB,00145-2012-MP,0191T-2012-NoS,...,Total Medical Medicare Payment Amount-2015,Number of Medicare Beneficiaries-2016,Total Drug Medicare Payment Amount-2016,Total Medical Medicare Payment Amount-2016,Number of Medicare Beneficiaries-2017,Total Drug Medicare Payment Amount-2017,Total Medical Medicare Payment Amount-2017,Number of Medicare Beneficiaries-2018,Total Drug Medicare Payment Amount-2018,Total Medical Medicare Payment Amount-2018
1003015504,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,87725.83,227.0,0.0,44332.19,91.0,0.0,8236.58,,,
1003028580,,,,,,,,,,,...,,,,,,,,20.0,0.0,7157.95
1003056516,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,76285.52,471.0,0.0,102250.61,588.0,0.0,130708.9,667.0,0.0,154779.03
1003072786,,,,,,,,,,,...,,,,,42.0,1235.58,16085.85,145.0,45919.63,88693.3
1003093774,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


## Labels

In [4]:
## Labels
# Each label file is read with its first column as the index; only that index
# is used below, as a plain list.
fraud_database = pd.read_csv(
    r'D:\Anaconda\Jupyter Directory\Medicare Fraud Detection\California_Ophthalmology\Labels\CA_Ophthalmology\Fraud_Database.csv',
    index_col=0,
)
high_utilization_database = pd.read_csv(
    r'D:\Anaconda\Jupyter Directory\Medicare Fraud Detection\California_Ophthalmology\Labels\CA_Ophthalmology\HighUtilization_Database.csv',
    index_col=0,
)
non_fraud_database = pd.read_csv(
    r'D:\Anaconda\Jupyter Directory\Medicare Fraud Detection\California_Ophthalmology\Labels\CA_Ophthalmology\NonFraud_Database.csv',
    index_col=0,
)
mislabeled_database = pd.read_csv(
    r'D:\Anaconda\Jupyter Directory\Medicare Fraud Detection\California_Ophthalmology\Labels\CA_Ophthalmology\Mislabeled_Database.csv',
    index_col=0,
)

fraud = [*fraud_database.index]
high_utilization = [*high_utilization_database.index]
non_fraud = [*non_fraud_database.index]
mislabeled = [*mislabeled_database.index]

## Dropping the mislabeled providers from the datasets
# Reassign rather than drop in place so that re-running the cell is harmless;
# errors='ignore' skips mislabeled NPIs that are not (or no longer) present.
dataset = dataset.drop(index=mislabeled, errors='ignore')

# Every provider starts at -1 and keeps it when it appears in none of the
# lists; then 0 = non_fraud, 1 = high_utilization, 2 = fraud. Later
# assignments win when a provider appears in more than one list.
labels = pd.Series(data=-1, index=dataset.index, dtype='int', name='Label')
for value, npis in ((0, non_fraud), (1, high_utilization), (2, fraud)):
    # Only label providers that are in the dataset. Assigning to an unknown
    # NPI would append it to `labels` and, after the concat below, create an
    # all-NaN feature row (including for providers dropped as mislabeled).
    labels.loc[labels.index.isin(npis)] = value

## Adding the labels to the dataset
dataset_all_cleaned_labeled = pd.concat((dataset, labels), axis=1)

## Saving the modified, labeled dataset locally
dataset_all_cleaned_labeled.to_csv(r'D:\Anaconda\Jupyter Directory\Medicare Fraud Detection\California_Ophthalmology\ProcessedDatasets\dataset_all.csv')

# Preview goes last: only a cell's final expression is displayed.
dataset_all_cleaned_labeled.head()

## Inspect the dataset

In [5]:
# Read the labeled dataset back from the CSV written above, using the first
# column as the index, and preview the first rows.
labeled_csv_path = r'D:\Anaconda\Jupyter Directory\Medicare Fraud Detection\California_Ophthalmology\ProcessedDatasets\dataset_all.csv'
data = pd.read_csv(labeled_csv_path, index_col=0)
data.head()

Unnamed: 0,00140-2012-NoS,00140-2012-NoMB,00140-2012-MP,00142-2012-NoS,00142-2012-NoMB,00142-2012-MP,00145-2012-NoS,00145-2012-NoMB,00145-2012-MP,0191T-2012-NoS,...,Number of Medicare Beneficiaries-2016,Total Drug Medicare Payment Amount-2016,Total Medical Medicare Payment Amount-2016,Number of Medicare Beneficiaries-2017,Total Drug Medicare Payment Amount-2017,Total Medical Medicare Payment Amount-2017,Number of Medicare Beneficiaries-2018,Total Drug Medicare Payment Amount-2018,Total Medical Medicare Payment Amount-2018,Label
1003015504,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,227.0,0.0,44332.19,91.0,0.0,8236.58,,,,-1
1003028580,,,,,,,,,,,...,,,,,,,20.0,0.0,7157.95,-1
1003056516,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,471.0,0.0,102250.61,588.0,0.0,130708.9,667.0,0.0,154779.03,0
1003072786,,,,,,,,,,,...,,,,42.0,1235.58,16085.85,145.0,45919.63,88693.3,-1
1003093774,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,0
