In [None]:
import numpy as np
import pandas as pd 
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

import os
# Print the full path of every file found under the Kaggle input directory.
kaggle_input_files = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in kaggle_input_files:
    print(file_path)
        
import warnings
# Install a global "ignore" filter: no category or module is given, so every warning is silenced
# for the rest of the session.
# NOTE(review): this presumably also hides pandas deprecation and scikit-learn convergence
# warnings raised by later cells - confirm that is intended.
warnings.filterwarnings("ignore")            # Suppressing Warnings

# Data Cleaning and Preparation

## Beneficiary Details Data

In [None]:
# Load the beneficiary details table and preview its first rows.
beneficiary_csv_path = '../input/healthcare-provider-fraud-detection-analysis/Train_Beneficiarydata-1542865627584.csv'
train_beneficiary_df = pd.read_csv(beneficiary_csv_path)
train_beneficiary_df.head()

In [None]:
# Report the size of the beneficiary table.
n_rows, n_cols = train_beneficiary_df.shape
print('There are {} rows and {} columns in the beneficiary details dataset.'.format(n_rows, n_cols))

In [None]:
# Keep only the columns that hold at least one value different from 0.
has_non_zero_value = train_beneficiary_df.ne(0).any(axis = 0)
train_beneficiary_df = train_beneficiary_df.loc[:, has_non_zero_value]

In [None]:
# Summarise column dtypes and non-null counts to see which columns still contain missing values.
train_beneficiary_df.info()

In [None]:
# Derive an integer 'Age' (whole days between DOB and DOD, divided by 365) and drop both date columns.
# Missing DOD values are replaced by the most frequent DOD in the table, which then acts as the
# reference date for those rows (presumably beneficiaries who have not died - confirm).
most_frequent_dod = train_beneficiary_df['DOD'].value_counts().index[0]

date_of_death = pd.to_datetime(train_beneficiary_df['DOD'].fillna(most_frequent_dod), format = '%Y-%m-%d')
date_of_birth = pd.to_datetime(train_beneficiary_df['DOB'], format = '%Y-%m-%d')
lifetime_in_days = (date_of_death - date_of_birth).dt.days
train_beneficiary_df['Age'] = (lifetime_in_days / 365).astype('int64')

train_beneficiary_df = train_beneficiary_df.drop(columns = ['DOB', 'DOD'])

train_beneficiary_df.head()

In [None]:
# RenalDiseaseIndicator is stored as an object column: list its distinct values before encoding.
train_beneficiary_df['RenalDiseaseIndicator'].unique()

In [None]:
# Replace the RenalDiseaseIndicator categories with integer codes and show the resulting codes.
# `label_encoder` is re-fitted and reused by later cells.
label_encoder = preprocessing.LabelEncoder()
renal_indicator_codes = label_encoder.fit_transform(train_beneficiary_df['RenalDiseaseIndicator'])
train_beneficiary_df['RenalDiseaseIndicator'] = renal_indicator_codes
train_beneficiary_df['RenalDiseaseIndicator'].unique()

In [None]:
# Re-check dtypes and non-null counts now that the date columns are replaced by Age
# and RenalDiseaseIndicator is label-encoded.
train_beneficiary_df.info()

## Inpatient Data

In [None]:
# Load the in-patient claims table and preview its first rows.
inpatient_csv_path = "../input/healthcare-provider-fraud-detection-analysis/Train_Inpatientdata-1542865627584.csv"
train_inPatient_df = pd.read_csv(inpatient_csv_path)
train_inPatient_df.head()

In [None]:
# Report the size of the in-patient table.
n_rows, n_cols = train_inPatient_df.shape
print('There are {} rows and {} columns in the in-patient dataset.'.format(n_rows, n_cols))

In [None]:
# Summarise column dtypes and non-null counts of the raw in-patient claims.
train_inPatient_df.info()

In [None]:
# Replace missing OperatingPhysician / OtherPhysician entries with the literal string 'None'.
# The result is assigned back instead of calling `fillna(..., inplace = True)` on a column
# selection: under pandas Copy-on-Write that call fills a temporary object and leaves the
# frame itself unchanged.
train_inPatient_df = train_inPatient_df.fillna(
    {'OperatingPhysician': 'None', 'OtherPhysician': 'None'}
)

In [None]:
# Parse the four date columns, derive the claim and admission durations in whole days,
# then discard the raw date columns.
inpatient_date_columns = ['ClaimStartDt', 'ClaimEndDt', 'AdmissionDt', 'DischargeDt']
inpatient_dates = {
    column_name: pd.to_datetime(train_inPatient_df[column_name], format = '%Y-%m-%d')
    for column_name in inpatient_date_columns
}

claim_span = inpatient_dates['ClaimEndDt'] - inpatient_dates['ClaimStartDt']
train_inPatient_df['ClaimDurationInDays'] = claim_span.dt.days.astype('int64')

admission_span = inpatient_dates['DischargeDt'] - inpatient_dates['AdmissionDt']
train_inPatient_df['AdmissionDurationInDays'] = admission_span.dt.days.astype('int64')

train_inPatient_df = train_inPatient_df.drop(columns = inpatient_date_columns)

train_inPatient_df.head()

In [None]:
# Drop procedure-code columns 5 and 6, then fill the gaps in the remaining code columns:
# procedure codes 1-4 with the number 0, diagnosis codes 1-10 with the string '0'.
# Everything is assigned back to the frame; `fillna(..., inplace = True)` on a column selection
# acts on a temporary object under pandas Copy-on-Write and would leave the frame unchanged.
train_inPatient_df = train_inPatient_df.drop(columns = ['ClmProcedureCode_5', 'ClmProcedureCode_6'])

inpatient_code_fill_values = {
    'ClmProcedureCode_{}'.format(code_number): 0 for code_number in range(1, 5)
}
inpatient_code_fill_values.update(
    {'ClmDiagnosisCode_{}'.format(code_number): '0' for code_number in range(1, 11)}
)
train_inPatient_df = train_inPatient_df.fillna(inpatient_code_fill_values)

In [None]:
# List the distinct DeductibleAmtPaid values (missing values included) before imputing them.
deductible_values = train_inPatient_df['DeductibleAmtPaid'].unique()
print('Unique values in the DeductibleAmtPaid Column:', deductible_values)

In [None]:
# Treat a missing DeductibleAmtPaid as 0.
# Assigned back explicitly: an in-place fillna on the attribute-accessed column works on a
# temporary object under pandas Copy-on-Write and would not update the frame.
train_inPatient_df['DeductibleAmtPaid'] = train_inPatient_df['DeductibleAmtPaid'].fillna(0)

In [None]:
# Fill every remaining gap with the most frequent value of its own column.
# The fill values are collected first and applied with a single DataFrame.fillna whose result is
# assigned back. The previous `apply(lambda x: x.fillna(..., inplace = True))` depended on
# mutating the column object handed to the lambda, which is not guaranteed to reach the frame,
# and raised IndexError for a column made up entirely of missing values.
inpatient_mode_fill_values = {}
for column_name in train_inPatient_df.columns:
    column_values = train_inPatient_df[column_name]
    if not column_values.isna().any():
        continue  # nothing to impute in this column
    value_frequencies = column_values.value_counts()
    if not value_frequencies.empty:  # an all-missing column has no most frequent value
        inpatient_mode_fill_values[column_name] = value_frequencies.index[0]

if inpatient_mode_fill_values:
    train_inPatient_df = train_inPatient_df.fillna(inpatient_mode_fill_values)
train_inPatient_df.head()

In [None]:
# Re-check dtypes and non-null counts after the imputation steps above.
train_inPatient_df.info()

## Outpatient Data

In [None]:
# Load the out-patient claims table and preview its first rows.
# NOTE(review): this cell uses an absolute '/kaggle/input/...' path while the other loaders use
# '../input/...' - consider aligning them.
outpatient_csv_path = '/kaggle/input/healthcare-provider-fraud-detection-analysis/Train_Outpatientdata-1542865627584.csv'
train_outPatient_df = pd.read_csv(outpatient_csv_path)
train_outPatient_df.head()

In [None]:
# Report the size of the out-patient table.
n_rows, n_cols = train_outPatient_df.shape
print('There are {} rows and {} columns in the out-patient dataset.'.format(n_rows, n_cols))

In [None]:
# Summarise column dtypes and non-null counts of the raw out-patient claims.
train_outPatient_df.info()

In [None]:
# Replace missing OperatingPhysician / OtherPhysician entries with the literal string 'None'.
# The result is assigned back instead of calling `fillna(..., inplace = True)` on a column
# selection: under pandas Copy-on-Write that call fills a temporary object and leaves the
# frame itself unchanged.
train_outPatient_df = train_outPatient_df.fillna(
    {'OperatingPhysician': 'None', 'OtherPhysician': 'None'}
)

In [None]:
# Parse the claim start/end dates, derive the claim duration in whole days,
# then discard the raw date columns.
outpatient_date_columns = ['ClaimStartDt', 'ClaimEndDt']
outpatient_dates = {
    column_name: pd.to_datetime(train_outPatient_df[column_name], format = '%Y-%m-%d')
    for column_name in outpatient_date_columns
}

claim_span = outpatient_dates['ClaimEndDt'] - outpatient_dates['ClaimStartDt']
train_outPatient_df['ClaimDurationInDays'] = claim_span.dt.days.astype('int64')

train_outPatient_df = train_outPatient_df.drop(columns = outpatient_date_columns)

train_outPatient_df.head()

In [None]:
# Drop procedure-code columns 3 to 6, then fill the gaps in the remaining code columns:
# procedure codes 1-2 with the number 0, diagnosis codes 1-10 with the string '0'.
# Everything is assigned back to the frame; `fillna(..., inplace = True)` on a column selection
# acts on a temporary object under pandas Copy-on-Write and would leave the frame unchanged.
train_outPatient_df = train_outPatient_df.drop(
    columns = ['ClmProcedureCode_3' , 'ClmProcedureCode_4', 'ClmProcedureCode_5', 'ClmProcedureCode_6']
)

outpatient_code_fill_values = {
    'ClmProcedureCode_{}'.format(code_number): 0 for code_number in range(1, 3)
}
outpatient_code_fill_values.update(
    {'ClmDiagnosisCode_{}'.format(code_number): '0' for code_number in range(1, 11)}
)
train_outPatient_df = train_outPatient_df.fillna(outpatient_code_fill_values)

In [None]:
# Fill every remaining gap with the most frequent value of its own column.
# The fill values are collected first and applied with a single DataFrame.fillna whose result is
# assigned back. The previous `apply(lambda x: x.fillna(..., inplace = True))` depended on
# mutating the column object handed to the lambda, which is not guaranteed to reach the frame,
# and raised IndexError for a column made up entirely of missing values.
outpatient_mode_fill_values = {}
for column_name in train_outPatient_df.columns:
    column_values = train_outPatient_df[column_name]
    if not column_values.isna().any():
        continue  # nothing to impute in this column
    value_frequencies = column_values.value_counts()
    if not value_frequencies.empty:  # an all-missing column has no most frequent value
        outpatient_mode_fill_values[column_name] = value_frequencies.index[0]

if outpatient_mode_fill_values:
    train_outPatient_df = train_outPatient_df.fillna(outpatient_mode_fill_values)
train_outPatient_df.head()

In [None]:
# Re-check dtypes and non-null counts after the imputation steps above.
train_outPatient_df.info()

## Merging Datasets

In [None]:
# Attach the beneficiary attributes to each in-patient claim (inner join on BeneID).
train_inPatientMerged = train_inPatient_df.merge(train_beneficiary_df, how = 'inner', on = 'BeneID')
train_inPatientMerged.head()

In [None]:
# Row and column counts of the in-patient/beneficiary join.
train_inPatientMerged.shape

In [None]:
# Column dtypes and non-null counts of the in-patient/beneficiary join.
train_inPatientMerged.info()

In [None]:
# Attach the beneficiary attributes to each out-patient claim (inner join on BeneID).
train_outPatientMerged = train_outPatient_df.merge(train_beneficiary_df, how = 'inner', on = 'BeneID')
train_outPatientMerged.head()

In [None]:
# Row and column counts of the out-patient/beneficiary join.
train_outPatientMerged.shape

In [None]:
# Column dtypes and non-null counts of the out-patient/beneficiary join.
train_outPatientMerged.info()

In [None]:
# Load the provider-level table that carries the PotentialFraud label and preview it.
provider_labels_csv_path = '../input/healthcare-provider-fraud-detection-analysis/Train-1542865627584.csv'
train_providerFraud_df = pd.read_csv(provider_labels_csv_path)
train_providerFraud_df.head()

In [None]:
# Row and column counts of the provider label table.
train_providerFraud_df.shape

In [None]:
# Re-fit the shared label encoder on PotentialFraud, store the integer codes and show them.
potential_fraud_codes = label_encoder.fit_transform(train_providerFraud_df['PotentialFraud'])
train_providerFraud_df['PotentialFraud'] = potential_fraud_codes
train_providerFraud_df['PotentialFraud'].unique()

In [None]:
# Give every in-patient claim the PotentialFraud label of its provider (inner join on Provider).
train_inPatientMergedWithProviderFraud_df = train_inPatientMerged.merge(
    train_providerFraud_df, how = 'inner', on = 'Provider'
)
train_inPatientMergedWithProviderFraud_df.head()

In [None]:
# Row and column counts of the labelled in-patient claims.
train_inPatientMergedWithProviderFraud_df.shape

In [None]:
# Column dtypes and non-null counts of the labelled in-patient claims, checked before encoding.
train_inPatientMergedWithProviderFraud_df.info()

In [None]:
# Re-fit the shared label encoder on each column in turn and keep the integer codes.
# NOTE(review): this encodes every column, numeric ones included, so their values become
# positions among the sorted unique values - confirm that is intended.
train_inPatientMergedWithProviderFraud_df = pd.DataFrame({
    column_name: label_encoder.fit_transform(train_inPatientMergedWithProviderFraud_df[column_name])
    for column_name in train_inPatientMergedWithProviderFraud_df.columns
})
train_inPatientMergedWithProviderFraud_df.head()

In [None]:
# Give every out-patient claim the PotentialFraud label of its provider (inner join on Provider).
train_outPatientMergedWithProviderFraud_df = train_outPatientMerged.merge(
    train_providerFraud_df, how = 'inner', on = 'Provider'
)
train_outPatientMergedWithProviderFraud_df.head()

In [None]:
# Row and column counts of the labelled out-patient claims.
train_outPatientMergedWithProviderFraud_df.shape

In [None]:
# Column dtypes and non-null counts of the labelled out-patient claims, checked before encoding.
train_outPatientMergedWithProviderFraud_df.info()

In [None]:
# Re-fit the shared label encoder on each column in turn and keep the integer codes.
# NOTE(review): this encodes every column, numeric ones included, so their values become
# positions among the sorted unique values - confirm that is intended.
train_outPatientMergedWithProviderFraud_df = pd.DataFrame({
    column_name: label_encoder.fit_transform(train_outPatientMergedWithProviderFraud_df[column_name])
    for column_name in train_outPatientMergedWithProviderFraud_df.columns
})
train_outPatientMergedWithProviderFraud_df.head()

In [None]:
# Remove the identifier columns from both labelled claim tables and print the resulting shapes.
identifier_columns = ['BeneID', 'ClaimID', 'Provider']
train_inPatientMergedWithProviderFraud_df = train_inPatientMergedWithProviderFraud_df.drop(
    columns = identifier_columns
)
train_outPatientMergedWithProviderFraud_df = train_outPatientMergedWithProviderFraud_df.drop(
    columns = identifier_columns
)
print(train_inPatientMergedWithProviderFraud_df.shape, train_outPatientMergedWithProviderFraud_df.shape)

# Modeling

## In-Patient Data

In [None]:
# Split the labelled in-patient claims into the model inputs and the PotentialFraud target.
target_column = 'PotentialFraud'
features = train_inPatientMergedWithProviderFraud_df.drop(columns = [target_column])
labels = train_inPatientMergedWithProviderFraud_df[target_column]

# Feature names, in column order.
feature_list = features.columns.tolist()

In [None]:
# Hold out 25% of the rows for evaluation; the fixed random_state makes the split repeatable.
split_parts = train_test_split(features, labels, test_size = 0.25, random_state = 42)
train_features, test_features, train_labels, test_labels = split_parts

In [None]:
# Print the shape of each piece of the train/test split.
split_summary = [
    ('Training Features Shape:', train_features),
    ('Training Labels Shape:', train_labels),
    ('Testing Features Shape:', test_features),
    ('Testing Labels Shape:', test_labels),
]
for description, split_part in split_summary:
    print(description, split_part.shape)

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier, constructed with its default arguments, on the training split.
lr = LogisticRegression()
lr.fit(X = train_features, y = train_labels)

In [None]:
# Predict a label for every row of the held-out test features with the fitted model.
lrPredictions = lr.predict(test_features)

In [None]:
# Display the predicted labels.
lrPredictions

In [None]:
from sklearn.metrics import recall_score, precision_score, f1_score

# Print recall, precision and F1 of the test-set predictions, each scaled to a percentage.
metric_functions = (
    ('Recall:', recall_score),
    ('Precision:', precision_score),
    ('F1 Score:', f1_score),
)
for metric_label, metric_function in metric_functions:
    print(metric_label, metric_function(test_labels, lrPredictions) * 100)