In [None]:
## Loading the required modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Dropout

In [None]:
## Reading the main dataset
# The first CSV column becomes the row index; later cells treat that index as
# the provider NPI.
data = pd.read_csv(filepath_or_buffer='dataset_all.csv', index_col=0)

In [None]:
## Preprocessing
##
## Produces, for the cells below:
##   df_labeled / df_unlabeled : selected feature columns with 'Label' last
##   scaler                    : MinMaxScaler fitted on the training split only
##   X_train, X_val            : float arrays shaped (rows, N_TIMESTEPS, features_per_step)
##   y_train, y_val            : one-hot targets with N_CLASSES columns

# NOTE(review): what the trailing 22 columns contain is not visible from this
# notebook -- confirm that dropping them is still intended for the current CSV.
N_TRAILING_COLUMNS_DROPPED = 22
N_TIMESTEPS = 7   # length of the sequence axis fed to the LSTM
N_CLASSES = 3     # must equal the width of the model's softmax output

## Dropping Nan values
# Remove the trailing columns, put 'Label' back as the last column, then drop
# every row that still has a missing value. pd.concat builds a new frame, so
# `data` is left untouched and this cell can be re-run safely.
labels = data['Label']
df = pd.concat((data.iloc[:, :-N_TRAILING_COLUMNS_DROPPED], labels), axis=1)
df = df.dropna(axis=0)

## Manual feature selection
labels = df['Label']  # labels of the rows that survived dropna

codes_list_1 = ['14061', '65800', '65855', '67973', '67975', '68326', '68335', 
                '76510', '76511', '76512', '76513', '92060', '92100', '92235', 
                '92240', '92275', '92284', '95004', '95930']
codes_list_2 = ['65100', '65210', '65222', '65400', '65426', '65722', '65815', 
                '66030', '66175', '66183', '66761', '66982', '67938', '68761', 
                '68801', '76514', '92020', '92025', '92065', '92083', '92140', 
                '92225', '92226', '92250', '92286', '99354']
codes_list_3 = ['99204', '99205',
                '99214', '99215',
                '99284', '99285',
                '99304', '99305', '99306', '99307', '99308', '99309', '99310']
codes_list_4 = ['92004', 
                '92014']
codes_to_use = [*codes_list_1, *codes_list_2, *codes_list_3, *codes_list_4]

# A column is kept when its first five characters are one of the codes above.
# A set gives constant-time membership checks.
code_lookup = frozenset(codes_to_use)
columns = list(df.columns)
columns_to_use = [c for c in columns if c[0:5] in code_lookup]

# Fail early with a readable message instead of an opaque reshape error later.
if not columns_to_use or len(columns_to_use) % N_TIMESTEPS != 0:
    raise ValueError(
        'Selected %d feature columns; expected a non-zero multiple of %d so '
        'they can be reshaped into %d time steps.'
        % (len(columns_to_use), N_TIMESTEPS, N_TIMESTEPS)
    )

df = pd.concat((df[columns_to_use], labels), axis=1)

## Separate labeled (train) and unlabeled (test) observations
# A label of -1 marks an unlabeled row.
is_unlabeled = df['Label'] == -1
df_labeled = df[~is_unlabeled]
df_unlabeled = df[is_unlabeled]

## One-hot encoding the labels
# X holds the raw (unscaled) features; scaling happens after the split below.
X = df_labeled.iloc[:, :-1]
# num_classes is fixed so the target width always matches the softmax layer,
# even if the highest class happens to be absent from the labeled rows.
y = to_categorical(df_labeled['Label'], num_classes=N_CLASSES)

## Splitting the input into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

## Scaling the input
# The scaler is fitted on the training rows only, so validation statistics do
# not leak into preprocessing; validation rows may therefore fall slightly
# outside [0, 1]. MinMaxScaler keeps zeros at zero only for columns whose
# training minimum is 0.
scaler = MinMaxScaler(feature_range=(0, 1)) ## Use scaler.transform at test time
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

## Reshaping the training and validation sets into a 3D format expected 
## by the LSTMs
# NOTE(review): this reshape treats each row as N_TIMESTEPS consecutive groups
# of features_per_step columns. Confirm the column order is time-step-major;
# if each code's time steps are adjacent instead, the axes are mixed up.
features_per_step = X_train.shape[1] // N_TIMESTEPS
X_train = X_train.reshape((X_train.shape[0], N_TIMESTEPS, features_per_step))
X_val = X_val.reshape((X_val.shape[0], N_TIMESTEPS, features_per_step))

## Encoding step
# To be implemented

## Fitting the LSTM model
# Architecture: LSTM(1024) -> Dense(256, relu) -> Dense(3, softmax).
# Dropout layers (rate 0.3) were tried around the hidden Dense layer and are
# currently disabled.
sequence_shape = (X_train.shape[1], X_train.shape[2])
model = Sequential([
    LSTM(1024, input_shape=sequence_shape),
    Dense(256, activation='relu'),
    Dense(3, activation='softmax'),
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['acc'])

# shuffle=False keeps the training rows in the same order for every epoch.
fit_options = dict(batch_size=32,
                   epochs=20,
                   validation_data=(X_val, y_val),
                   shuffle=False)
history = model.fit(X_train, y_train, **fit_options)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
## Predicting probabilities and generating lists of names
##
## Samples up to `n` unlabeled providers, looks up their names in the 2018
## Medicare file and stores the model's three class probabilities in `df_out`
## (indexed by NPI).
n = 20
sample_seed = 0   # fixed so the exported list is reproducible; use None for a fresh draw
n_timesteps = 7   # must match the time-step count used to reshape the training data
name_columns = ['Last Name', 'First Name']
class_columns = ['Nonfraudulent', 'High Utilization', 'Fraudulent']

# Sample rows rather than index labels: this never asks for more rows than
# exist and stays correct even if an NPI appears more than once in the index.
sampled = df_unlabeled.sample(n=min(n, len(df_unlabeled)), random_state=sample_seed)
npi_list = sampled.index
df_out = pd.DataFrame(index=npi_list, columns=name_columns + class_columns)

data2018 = pd.read_csv(
    'Medicare_Provider_Utilization_and_Payment_Data__'
    'Physician_and_Other_Supplier_PUF_CY2018_CA_Ophthalmology.csv'
)

# One name row per NPI (the first occurrence, as before). reindex leaves NaN
# for sampled NPIs that are missing from the 2018 file.
provider_names = (
    data2018
    .drop_duplicates(subset='National Provider Identifier')
    .set_index('National Provider Identifier')
    [['Last Name/Organization Name of the Provider',
      'First Name of the Provider']]
)
# Assign whole columns on df_out itself. The earlier chained form
# df_out.loc[npi][[...]] = ... writes to a temporary object and is silently
# lost under pandas Copy-on-Write.
df_out[name_columns] = provider_names.reindex(npi_list).to_numpy()

if len(sampled) > 0:
    # Every column except the trailing 'Label' is a feature. Passing a
    # DataFrame keeps the column names the scaler was fitted with.
    scaled = scaler.transform(sampled.iloc[:, :-1])
    batch = scaled.reshape((scaled.shape[0], n_timesteps,
                            scaled.shape[1] // n_timesteps))
    # One batched predict call instead of one call per provider.
    df_out[class_columns] = model.predict(batch)

df_out

Unnamed: 0,Last Name,First Name,Nonfraudulent,High Utilization,Fraudulent
1225004450,LOZIER,JEFFREY,0.478945,0.504824,0.0162305
1437175262,GARCIA,GEORGE,0.636364,0.344638,0.0189981
1659338788,JANSSON,ERIK,0.826455,0.152107,0.0214384
1396738530,ROBERTS,MICHAEL,0.867265,0.127253,0.00548161
1639160112,BACHARACH,JASON,0.987887,0.00243106,0.00968229
1902881972,PARK,SUSANNA,0.935455,0.0368542,0.0276906
1336177245,PASCAL,STEVEN,0.908812,0.0850001,0.00618781
1992770804,JAMESON,NANCY,0.421508,0.56053,0.0179619
1073516597,WHISLER,CHARLES,0.840687,0.150933,0.00838038
1699954644,LUKAC,JAN,0.60954,0.324126,0.0663342


In [None]:
# Write the sampled providers and their class probabilities to disk; the NPI
# index is written as the first CSV column.
# NOTE(review): the file pairs provider names with model scores -- confirm it
# is stored and shared appropriately.
df_out.to_csv(path_or_buf='ListofNames_June7.csv', index=True)