In [48]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

#for data preprocessing
from sklearn.decomposition import PCA
#for modeling
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest

In [49]:
import bios

# Load the run settings (input/output paths and IsolationForest
# hyper-parameters) from the YAML file in the working directory.
CONFIG_FILE = 'config.yaml'
my_dict = bios.read(CONFIG_FILE)

In [50]:
# Echo the parsed configuration so the values used for this run are
# visible in the notebook output.
my_dict

{'Sorce_input_data_path_with_filename': 'data/Healthcare Providers.csv',
 'Row_wise_anomaly_output_data_path': 'output/Row_wise_result.csv',
 'n_estimators': 300,
 'max_samples': 'auto',
 'contamination': 'auto',
 'max_features': 1.0,
 'bootstrap': False,
 'n_jobs': -1,
 'verbose': 1,
 'warm_start': False,
 'random_state': 2020}

In [61]:
# Read the raw provider table from the path given in the config
# (the key really is spelled 'Sorce...' in config.yaml).
raw_csv_path = my_dict['Sorce_input_data_path_with_filename']
df = pd.read_csv(raw_csv_path)



In [62]:
# Preview the first rows to check that the columns loaded as expected.
# NOTE(review): the rendered preview shows provider names and street
# addresses; consider clearing this output before sharing the notebook.
df.head()

Unnamed: 0,index,National Provider Identifier,Last Name/Organization Name of the Provider,First Name of the Provider,Middle Initial of the Provider,Credentials of the Provider,Gender of the Provider,Entity Type of the Provider,Street Address 1 of the Provider,Street Address 2 of the Provider,...,HCPCS Code,HCPCS Description,HCPCS Drug Indicator,Number of Services,Number of Medicare Beneficiaries,Number of Distinct Medicare Beneficiary/Per Day Services,Average Medicare Allowed Amount,Average Submitted Charge Amount,Average Medicare Payment Amount,Average Medicare Standardized Amount
0,8774979,1891106191,UPADHYAYULA,SATYASREE,,M.D.,F,I,1402 S GRAND BLVD,FDT 14TH FLOOR,...,99223,"Initial hospital inpatient care, typically 70 ...",N,27,24,27,200.58777778,305.21111111,157.26222222,160.90888889
1,3354385,1346202256,JONES,WENDY,P,M.D.,F,I,2950 VILLAGE DR,,...,G0202,"Screening mammography, bilateral (2-view study...",N,175,175,175,123.73,548.8,118.83,135.31525714
2,3001884,1306820956,DUROCHER,RICHARD,W,DPM,M,I,20 WASHINGTON AVE,STE 212,...,99348,"Established patient home visit, typically 25 m...",N,32,13,32,90.65,155.0,64.4396875,60.5959375
3,7594822,1770523540,FULLARD,JASPER,,MD,M,I,5746 N BROADWAY ST,,...,81002,"Urinalysis, manual test",N,20,18,20,3.5,5.0,3.43,3.43
4,746159,1073627758,PERROTTI,ANTHONY,E,DO,M,I,875 MILITARY TRL,SUITE 200,...,96372,Injection beneath the skin or into muscle for ...,N,33,24,31,26.52,40.0,19.539393939,19.057575758


In [60]:
# Columns excluded from the feature set (row ids, provider identity,
# street address, zip code and the raw HCPCS code).
# Note: this cell rebinds `df`, so running it a second time raises a
# KeyError because the columns are already gone.
DropCols = [
    'index',
    'National Provider Identifier',
    'Last Name/Organization Name of the Provider',
    'First Name of the Provider',
    'Middle Initial of the Provider',
    'Street Address 1 of the Provider',
    'Street Address 2 of the Provider',
    'Zip Code of the Provider',
    "HCPCS Code",
]
df = df.drop(columns=DropCols)

Index(['Average Medicare Allowed Amount', 'Average Submitted Charge Amount',
       'Average Medicare Payment Amount',
       'Average Medicare Standardized Amount', 'Credentials of the Provider_0',
       'Credentials of the Provider_1', 'Credentials of the Provider_2',
       'Credentials of the Provider_3', 'Credentials of the Provider_4',
       'Credentials of the Provider_5', 'Credentials of the Provider_6',
       'Credentials of the Provider_7', 'Credentials of the Provider_8',
       'Credentials of the Provider_9', 'Credentials of the Provider_10',
       'Gender of the Provider_0', 'Gender of the Provider_1',
       'Entity Type of the Provider_0', 'Entity Type of the Provider_1',
       'City of the Provider_0', 'City of the Provider_1',
       'City of the Provider_2', 'City of the Provider_3',
       'City of the Provider_4', 'City of the Provider_5',
       'City of the Provider_6', 'City of the Provider_7',
       'City of the Provider_8', 'City of the Provider_9',
    

In [52]:
def RemoveComma(x):
    """Strip thousands separators (commas) from a numeric string.

    Parameters
    ----------
    x : object
        Usually a string such as ``"1,234.50"``. Non-string values
        (e.g. NaN for a missing cell, or a float when the conversion cell
        is run a second time) are accepted as well.

    Returns
    -------
    object
        ``x`` with every ``","`` removed when ``x`` is a string; otherwise
        ``x`` unchanged.
    """
    # Only strings have commas to strip; passing other values through
    # avoids an AttributeError on NaN or already-numeric cells.
    if isinstance(x, str):
        return x.replace(",", "")
    return x
# Convert the four average-amount columns from text (with thousands
# separators) to numbers.
# NOTE(review): the three "Number of ..." count columns are reported as
# object dtype by Preprocessing and end up binary-encoded; they probably
# need the same conversion -- confirm.
AMOUNT_COLUMNS = [
    "Average Medicare Allowed Amount",
    "Average Submitted Charge Amount",
    "Average Medicare Payment Amount",
    "Average Medicare Standardized Amount",
]
for amount_col in AMOUNT_COLUMNS:
    cleaned = df[amount_col].apply(RemoveComma)
    try:
        df[amount_col] = pd.to_numeric(cleaned)
    except (ValueError, TypeError):
        # Same outcome as the former errors="ignore" (deprecated in
        # pandas 2.2): keep the comma-stripped values when a column
        # cannot be parsed as numbers.
        df[amount_col] = cleaned
import category_encoders as ce
from sklearn.preprocessing import StandardScaler

# NOTE(review): this repeats the RemoveComma definition at the top of this
# cell and silently replaces it; one of the two can be deleted.
def RemoveComma(x):
    """Strip thousands separators (commas) from a numeric string.

    Parameters
    ----------
    x : object
        Usually a string such as ``"1,234.50"``. Non-string values
        (e.g. NaN or an already-numeric cell) are accepted as well.

    Returns
    -------
    object
        ``x`` with every ``","`` removed when ``x`` is a string; otherwise
        ``x`` unchanged.
    """
    # Only strings have commas to strip; passing other values through
    # avoids an AttributeError on NaN or already-numeric cells.
    if isinstance(x, str):
        return x.replace(",", "")
    return x

In [53]:
def Preprocessing(data):
    """Impute, binary-encode and standardize a provider table.

    Steps
    -----
    1. Missing values in "Credentials of the Provider" and
       "Gender of the Provider" are filled with each column's mode.
    2. Every object-dtype column is replaced by the columns produced by
       ``ce.BinaryEncoder``; the encoded columns are appended after the
       remaining (non-object) columns, in the order the object columns
       appeared.
    3. All columns are standardized with ``StandardScaler``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the two imputed columns named above. The frame passed
        in is not modified.

    Returns
    -------
    pandas.DataFrame
        The standardized feature matrix, with a new default index.

    Notes
    -----
    Prints the list of columns that were binary-encoded.
    """
    # Work on a copy: the imputation below assigns columns and would
    # otherwise alter the caller's frame as a side effect.
    data = data.copy()

    # 1. Imputing missing values with the most frequent value.
    for col in ("Credentials of the Provider", "Gender of the Provider"):
        data[col] = data[col].fillna(data[col].mode()[0])

    # 2. Binary encoding of every object-dtype column.
    BEcols = [var for var in data.columns if data[var].dtype == "O"]
    print(BEcols)

    # One encoder per column, fitted on that column only. The pieces are
    # joined in a single concat instead of rebuilding the frame per column.
    encoded_parts = [
        ce.BinaryEncoder(cols=[col]).fit_transform(data[col]) for col in BEcols
    ]
    data = pd.concat([data.drop(columns=BEcols)] + encoded_parts, axis=1)

    # 3. Standardization (the scaler returns a bare array, so the column
    # names are re-attached afterwards).
    data_columns = data.columns
    scaled = StandardScaler().fit_transform(data)
    return pd.DataFrame(scaled, columns=data_columns)

In [54]:

# Spot-check the n_estimators value that is passed to IsolationForest below.
my_dict['n_estimators']

300

In [55]:
df = Preprocessing(df)
from sklearn.ensemble import IsolationForest

# Every hyper-parameter is taken verbatim from the YAML config.
forest_param_names = [
    'n_estimators', 'max_samples', 'contamination', 'max_features',
    'bootstrap', 'n_jobs', 'verbose', 'warm_start', 'random_state',
]
forest_kwargs = {name: my_dict[name] for name in forest_param_names}
model = IsolationForest(**forest_kwargs)
model.fit(df)

# predict() labels inliers 1 and outliers -1; recode so that
# 0 = normal row and 1 = anomaly.
raw_labels = model.predict(df)
Y = (raw_labels == -1).astype(raw_labels.dtype)

['Credentials of the Provider', 'Gender of the Provider', 'Entity Type of the Provider', 'City of the Provider', 'State Code of the Provider', 'Country Code of the Provider', 'Provider Type', 'Medicare Participation Indicator', 'Place of Service', 'HCPCS Description', 'HCPCS Drug Indicator', 'Number of Services', 'Number of Medicare Beneficiaries', 'Number of Distinct Medicare Beneficiary/Per Day Services']


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    7.9s remaining:   23.8s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    8.1s finished


In [57]:
# Reload the source table: `df` now holds the scaled features, and the
# anomaly flags are attached to the original columns instead.
source_path = my_dict['Sorce_input_data_path_with_filename']
df_shal = pd.read_csv(source_path)

{'Sorce_input_data_path_with_filename': 'data/Healthcare Providers.csv',
 'Row_wise_anomaly_output_data_path': 'output/Row_wise_result.csv',
 'n_estimators': 300,
 'max_samples': 'auto',
 'contamination': 'auto',
 'max_features': 1.0,
 'bootstrap': False,
 'n_jobs': -1,
 'verbose': 1,
 'warm_start': False,
 'random_state': 2020}

In [58]:
# Attach the recoded model output to the reloaded source rows
# (1 = anomaly, 0 = normal, per the recoding in the modelling cell).
# NOTE(review): assumes Y is a plain array, so the match is positional and
# df_shal must be in the same row order as the frame given to the model -- confirm.
df_shal['Anomaly_Flag']=Y

In [59]:
# Write the flagged rows to the configured output path.
from pathlib import Path

output_path = my_dict['Row_wise_anomaly_output_data_path']
# to_csv does not create missing folders, so make sure the target
# directory exists before writing.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
df_shal.to_csv(output_path, index=False)