<a href="https://colab.research.google.com/github/akrity8521/Medicare-Fraud-Detection/blob/main/Code_Thesis_Implementation_catboost_partD(with_autoencoders_and_without_SMOTE).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install catboost

In [None]:
#importing libraries 
import pandas as pd
import numpy as np
import scipy
import os 
import matplotlib.pyplot as plt
import random
import seaborn as sns
from pandas.api.types import is_numeric_dtype
from scipy.stats import ttest_ind

from sklearn.metrics import brier_score_loss, precision_score, recall_score,f1_score, roc_auc_score, accuracy_score 
from sklearn.metrics import confusion_matrix, roc_curve, classification_report
from sklearn.preprocessing import StandardScaler 
from sklearn.feature_extraction import DictVectorizer
from sklearn.manifold import TSNE
from sklearn import preprocessing 
from sklearn.model_selection import train_test_split

from keras.models import Model, load_model, Sequential
from keras.layers import Input, Dense
from sklearn.manifold import TSNE
from keras import regularizers

import catboost as cb
import xgboost as xgb
from sklearn import ensemble 
import lightgbm as lgb

In [None]:
# Connect Google Drive to this Colab runtime; the data files read in later
# cells live under this mount point.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
# Load the Part D prescriber table (tab-separated), capped at the first
# 1,500,000 rows.
part_d_path = '/content/drive/My Drive/PartD_Prescriber_PUF_NPI_Drug_18.txt'
dataset = pd.read_csv(part_d_path, sep="\t", nrows=1500000)

In [None]:
# Show (rows, columns) of the table right after loading.
dataset.shape

In [None]:
# List the column names available in the raw table.
dataset.columns

In [None]:
# Keep only the provider id, the three descriptive columns (specialty, state,
# drug) and the five count/cost columns.
keep_columns = [
    'npi', 'specialty_description', 'nppes_provider_state', 'drug_name',
    'bene_count', 'total_claim_count', 'total_30_day_fill_count',
    'total_day_supply', 'total_drug_cost',
]
dataset = dataset.loc[:, keep_columns]

In [None]:
# Remove rows that are exact duplicates across all selected columns.
dataset = dataset.drop_duplicates()

In [None]:
# Discard every row that has a missing value in any selected column.
dataset=dataset.dropna()

In [None]:
# Show (rows, columns) after de-duplication and dropping incomplete rows.
dataset.shape

In [None]:
# Confirm which columns remain.
dataset.columns

In [None]:
# Inspect the dtype of each remaining column.
dataset.dtypes

In [None]:
# One-hot encoding: build indicator columns for state, specialty and drug,
# append them to the table in that order, then remove the source columns.
categorical_columns = ['nppes_provider_state', 'specialty_description', 'drug_name']
rated_dummies1, rated_dummies2, rated_dummies3 = [
    pd.get_dummies(dataset[column]) for column in categorical_columns
]
dataset = pd.concat([dataset, rated_dummies1, rated_dummies2, rated_dummies3], axis=1)
dataset = dataset.drop(categorical_columns, axis=1)

In [None]:
# Show (rows, columns) after one-hot encoding.
dataset.shape

In [None]:
# Inspect column dtypes after one-hot encoding.
dataset.dtypes

In [None]:
# Read the LEIE dataset (a CSV file on the mounted Drive) into a DataFrame.
IELErawdata = "/content/drive/My Drive/LEIE.csv"
IELE_pd = pd.read_csv(IELErawdata)

In [None]:
# Keep only the provider id and the exclusion-type column.
npifraud_pd0 = IELE_pd.loc[:,['NPI','EXCLTYPE']]

In [None]:
# Keep only rows whose NPI is non-zero.
# NOTE(review): presumably 0 stands for "no NPI recorded" in this file - confirm.
npifraud_pd1 = npifraud_pd0.query('NPI !=0')

In [None]:
# Rename the key column to 'npi' so it matches the Part D table's key, and
# rename the exclusion-type column to 'is_fraud'.
rename_dict = dict(NPI='npi', EXCLTYPE='is_fraud')
npi_fraud_pd = npifraud_pd1.rename(columns=rename_dict)

In [None]:
# Overwrite the exclusion-type values with a constant 1: every NPI present in
# this table is treated as a positive (fraud) label.
npi_fraud_pd['is_fraud'] = 1

In [None]:
# Merge the two datasets: attach the exclusion label to every Part D row.
# A left join repeats each matching left-hand row once per matching right-hand
# row, so if an NPI appears more than once in the label table the Part D rows
# for that provider would be duplicated. Reduce the label table to one row per
# NPI first (all labels are 1, so nothing is lost); `validate` makes pandas
# raise if the right-hand keys are ever not unique.
fraud_labels = npi_fraud_pd.drop_duplicates(subset='npi')
Features_pd1 = pd.merge(dataset, fraud_labels, how='left', on='npi',
                        validate='many_to_one')

In [None]:
# Rows without a match in the label table come out of the left merge with a
# missing is_fraud; set those to 0. Note this fills missing values in every
# column, not only is_fraud.
Features_pd1.fillna(0, inplace=True)

In [None]:
# Show how many rows carry each label value.
Features_pd1['is_fraud'].value_counts()

In [None]:
# Per-column non-null counts over the rows labeled 1.
Features_pd1[Features_pd1['is_fraud']==1].count()

In [None]:
# Bind a second name to the same DataFrame object (no copy is made here).
FeaturesAll_pd=Features_pd1

In [None]:
# Number of distinct values in each column.
FeaturesAll_pd.nunique()

In [None]:
# The provider id was only needed as the merge key; remove it from the features.
FeaturesAll_pd = FeaturesAll_pd.drop(columns=['npi'])

In [None]:
# Display the final feature table (label column included).
FeaturesAll_pd

In [None]:
# Separate the label from the features, min-max scale every feature column,
# then split the scaled matrix into the label-0 rows and the label-1 rows.
y = FeaturesAll_pd['is_fraud']
x = FeaturesAll_pd.drop(columns=['is_fraud'])

x_scale = preprocessing.MinMaxScaler().fit_transform(x.values)
x_norm = x_scale[(y == 0).to_numpy()]
x_fraud = x_scale[(y == 1).to_numpy()]

In [None]:
# Display the (unscaled) feature table.
x

In [None]:
# Autoencoder graph: n_features -> 100 -> 50 -> 50 -> 100 -> n_features.
n_features = x.shape[1]

## input layer: one unit per feature column
input_layer = Input(shape=(n_features,))

## encoding part: the first layer carries an L1 activity penalty
activity_penalty = regularizers.l1(10e-5)
encoded = Dense(100, activation='relu', activity_regularizer=activity_penalty)(input_layer)
encoded = Dense(50, activation='relu')(encoded)

## decoding part
decoded = Dense(50, activation='tanh')(encoded)
decoded = Dense(100, activation='tanh')(decoded)

## output layer: one reconstructed value per input feature
output_layer = Dense(n_features, activation='relu')(decoded)


In [None]:
# Assemble the autoencoder from the layer graph and configure it to minimise
# mean-squared reconstruction error with the Adam optimizer.
autoencoder = Model(inputs=input_layer, outputs=output_layer)
autoencoder.compile(loss="mse", optimizer="adam")

In [None]:
# Train the autoencoder to reconstruct its own input, using only the first
# 2000 label-0 rows; 20% of them are held out for validation. Early stopping
# watches the validation loss and restores the best weights.
from keras.callbacks import ModelCheckpoint, EarlyStopping

es = EarlyStopping(
    monitor='val_loss',
    min_delta=0.0001,
    patience=20,
    restore_best_weights=True,
)
train_rows = x_norm[0:2000]
history = autoencoder.fit(
    train_rows,
    train_rows,
    batch_size=128,
    epochs=50,
    callbacks=[es],
    shuffle=True,
    validation_split=0.20,
)

In [None]:
# Plot training and validation loss per epoch on one set of axes.
for curve in ('loss', 'val_loss'):
    plt.plot(history.history[curve])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
# Encoder-only model: reuse the first three layers of the trained autoencoder
# so that predict() returns the hidden representation of a row.
hid_rep = Sequential()
for encoder_layer in autoencoder.layers[:3]:
    hid_rep.add(encoder_layer)

In [None]:
# Encode the first 3000 label-0 rows and every label-1 row.
norm_hid_rep, fraud_hid_rep = [
    hid_rep.predict(rows) for rows in (x_norm[:3000], x_fraud)
]

In [None]:
# Build a small visualisation sample: 1000 randomly drawn label-0 rows plus
# every label-1 row, shuffled together with a fresh 0..n-1 index.
non_fraud = FeaturesAll_pd[FeaturesAll_pd['is_fraud'] == 0].sample(1000)
fraud = FeaturesAll_pd[FeaturesAll_pd['is_fraud'] == 1]

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the same rows
# in the same order.
df = pd.concat([non_fraud, fraud]).sample(frac=1).reset_index(drop=True)
df.head()

In [None]:
# Split the sample into a label vector and a feature matrix (NumPy arrays).
Y = df["is_fraud"].values
X = df.drop(columns=['is_fraud']).values

In [None]:
def tsne_plot(x1, y1, name="graph.png"):
    """Embed the rows of x1 in 2-D with t-SNE and draw a class-colored scatter.

    Parameters:
        x1: 2-D array-like, one row per sample.
        y1: array-like of labels, one per row of x1; rows labeled 0 are drawn
            in green as 'Non Fraud', rows labeled 1 in red as 'Fraud'.
        name: path the figure is saved to before it is shown.
    """
    # Accept lists / pandas Series as well as arrays; the comparisons below
    # need element-wise semantics.
    labels = np.asarray(y1)

    tsne = TSNE(n_components=2, random_state=0)
    X_t = tsne.fit_transform(x1)

    plt.figure(figsize=(12, 8))
    for value, color, label in ((0, 'g', 'Non Fraud'), (1, 'r', 'Fraud')):
        # Boolean mask keeps the selected points 1-D; linewidth is numeric
        # (the string '1' is not a valid width).
        mask = labels == value
        plt.scatter(X_t[mask, 0], X_t[mask, 1], marker='o', color=color,
                    linewidth=1, alpha=0.8, label=label)

    plt.legend(loc='best')
    plt.savefig(name)
    plt.show()


tsne_plot(X, Y, "original.png")

In [None]:
# Stack the encoded rows (label-0 block first, then label-1), build the
# matching 0/1 label vector, and plot the encoded data with t-SNE.
y_n = np.zeros(len(norm_hid_rep))
y_f = np.ones(len(fraud_hid_rep))
rep_x = np.concatenate((norm_hid_rep, fraud_hid_rep), axis=0)
rep_y = np.concatenate((y_n, y_f))
tsne_plot(rep_x, rep_y, "latent_representation.png")


In [None]:
#train-test split
# Hold out 25% of the encoded rows for validation. Stratifying on the label
# keeps the class ratio the same in both parts; the fixed random_state makes
# the split repeatable.
train_x, val_x, train_y, val_y = train_test_split(rep_x, rep_y, test_size=0.25, random_state=101, stratify=rep_y)

In [None]:
# Train a CatBoost classifier on the encoded features and report metrics on
# the held-out validation rows.
clf = cb.CatBoostClassifier()
clf.fit(train_x, train_y)
pred_y = clf.predict(val_x)
# ROC AUC is a ranking metric: it must be given continuous scores, not the
# thresholded labels in pred_y. Column 1 is the probability of class 1.
pred_score = clf.predict_proba(val_x)[:, 1]

print("")
print("confusion_matrixt: ")
print(confusion_matrix(val_y, pred_y))

print("")
print(classification_report(val_y, pred_y))
print("Precision: ", precision_score(val_y, pred_y))
print("Recall: ", recall_score(val_y, pred_y))
print("F1 Score: ", f1_score(val_y, pred_y))
print("Auc Score: ", roc_auc_score(val_y, pred_score))
print("Accuracy Score: ", accuracy_score(val_y, pred_y))

In [None]:
# Train an AdaBoost classifier on the encoded features and report metrics on
# the held-out validation rows.
clf = ensemble.AdaBoostClassifier()
clf.fit(train_x, train_y)
pred_y = clf.predict(val_x)
# ROC AUC is a ranking metric: it must be given continuous scores, not the
# thresholded labels in pred_y. Column 1 is the probability of class 1.
pred_score = clf.predict_proba(val_x)[:, 1]

print("")
print("confusion_matrixt: ")
print(confusion_matrix(val_y, pred_y))

print("")
print(classification_report(val_y, pred_y))
print("Precision: ", precision_score(val_y, pred_y))
print("Recall: ", recall_score(val_y, pred_y))
print("F1 Score: ", f1_score(val_y, pred_y))
print("Auc Score: ", roc_auc_score(val_y, pred_score))
print("Accuracy Score: ", accuracy_score(val_y, pred_y))

In [None]:
# Train an XGBoost classifier on the encoded features and report metrics on
# the held-out validation rows.
clf = xgb.XGBClassifier()
clf.fit(train_x, train_y)
pred_y = clf.predict(val_x)
# ROC AUC is a ranking metric: it must be given continuous scores, not the
# thresholded labels in pred_y. Column 1 is the probability of class 1.
pred_score = clf.predict_proba(val_x)[:, 1]

print("")
print("confusion_matrixt: ")
print(confusion_matrix(val_y, pred_y))

print("")
print(classification_report(val_y, pred_y))
print("Precision: ", precision_score(val_y, pred_y))
print("Recall: ", recall_score(val_y, pred_y))
print("F1 Score: ", f1_score(val_y, pred_y))
print("Auc Score: ", roc_auc_score(val_y, pred_score))
print("Accuracy Score: ", accuracy_score(val_y, pred_y))

In [None]:
# Train a LightGBM classifier on the encoded features and report metrics on
# the held-out validation rows.
clf = lgb.LGBMClassifier()
clf.fit(train_x, train_y)
pred_y = clf.predict(val_x)
# ROC AUC is a ranking metric: it must be given continuous scores, not the
# thresholded labels in pred_y. Column 1 is the probability of class 1.
pred_score = clf.predict_proba(val_x)[:, 1]

print("")
print("confusion_matrixt: ")
print(confusion_matrix(val_y, pred_y))

print("")
print(classification_report(val_y, pred_y))
print("Precision: ", precision_score(val_y, pred_y))
print("Recall: ", recall_score(val_y, pred_y))
print("F1 Score: ", f1_score(val_y, pred_y))
print("Auc Score: ", roc_auc_score(val_y, pred_score))
print("Accuracy Score: ", accuracy_score(val_y, pred_y))