# Import Library

`data_loading` and `feature_engineering` are Python files (.py). `data_loading` contains a function called `load_data`, which loads the dataframe, and `feature_engineering` contains a function called `apply_feature_engineering`, which applies the feature engineering.

**Before running this script, make sure you have downloaded each of the raw data files from Kaggle in a folder called "raw" within a folder called "data". An example file path is:**
> `data/raw/Test_Beneficiarydata-1542969243754.csv` 

The `load_data` function calls `merge_data` and `correct_codes` to merge all of the raw datasets and apply the code corrections. You can import these modules like normal libraries, as seen below:

In [1]:
from data_loading import load_data
from feature_engineering import apply_feature_engineering
import pandas as pd
import numpy as np

# Data Loading
Here is where you call the `load_data` function from `data_loading`. No parameters are needed.

In [None]:
# Call load_data (no arguments) to get the data as a pandas dataframe
df = load_data()
# Preview the first rows; the notebook displays this last expression
df.head()

Unnamed: 0,BeneID,ClaimID,ClaimStartDt,ClaimEndDt,Provider,InscClaimAmtReimbursed,AttendingPhysician,OperatingPhysician,OtherPhysician,AdmissionDt,...,ClmDiagnosisCode_7_LongDesc,ClmDiagnosisCode_7_ShortDesc,ClmDiagnosisCode_8_LongDesc,ClmDiagnosisCode_8_ShortDesc,ClmDiagnosisCode_9_LongDesc,ClmDiagnosisCode_9_ShortDesc,ClmDiagnosisCode_10_LongDesc,ClmDiagnosisCode_10_ShortDesc,Flag_Unknown_Procedures,Flag_Unknown_Diagnoses
0,BENE11001,CLM46614,2009-04-12,2009-04-18,PRV55912,26000,PHY390922,,,2009-04-12,...,Other and unspecified hyperlipidemia,Hyperlipidemia NEC/NOS,Secondary malignant neoplasm of other specifie...,Secondary malig neo NEC,"Acute kidney failure, unspecified",Acute kidney failure NOS,,,False,False
1,BENE11001,CLM66048,2009-08-31,2009-09-02,PRV55907,5000,PHY318495,PHY318495,,2009-08-31,...,,,,,,,,,False,False
2,BENE11001,CLM68358,2009-09-17,2009-09-20,PRV56046,5000,PHY372395,,PHY324689,2009-09-17,...,,,,,,,,,False,False
3,BENE11011,CLM38412,2009-02-14,2009-02-22,PRV52405,5000,PHY369659,PHY392961,PHY349768,2009-02-14,...,"Diabetes with neurological manifestations, typ...",DMII neuro uncntrld,"Hypertensive chronic kidney disease, unspecifi...",Hy kid NOS w cr kid I-IV,Unspecified essential hypertension,Hypertension NOS,,,False,False
4,BENE11011,CLM38412,2009-02-14,2009-02-22,PRV52405,5000,PHY369659,PHY392961,PHY349768,2009-02-14,...,"Diabetes with neurological manifestations, typ...",DMII neuro uncntrld,"Hypertensive chronic kidney disease, unspecifi...",Hy kid NOS w cr kid I-IV,Unspecified essential hypertension,Hypertension NOS,,,False,False


# Feature Engineering
Here is where you call the `apply_feature_engineering` function from `feature_engineering`. The only argument passed is the dataframe returned by `load_data`.

In [None]:
# Call the apply_feature_engineering function from feature_engineering on the
# loaded dataframe and rebind df to the result, ready for ML modeling
df = apply_feature_engineering(df)

Starting feature engineering...
Adding datetime features...
Added datetime features. Time elapsed: 26.82s
Discretizing age...
Discretized age. Time elapsed: 26.83s
Filling in missing values...
Filled in missing values. Time elapsed: 30.24s
Transforming skewed distributions...
Transformed skewed distributions. Time elapsed: 30.26s
Encoding categorical columns...
Encoded categorical columns. Time elapsed: 35.47s
Dropping unnecessary columns...
Dropped unnecessary columns. Time elapsed: 35.60s
Feature engineering complete!


In [None]:
# Print each column's name, non-null count, and dtype.
# You should see the columns and data types listed in the output below.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 559877 entries, 0 to 559876
Data columns (total 57 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   Provider                         559877 non-null  int64  
 1   AttendingPhysician               559877 non-null  int64  
 2   OperatingPhysician               559877 non-null  int64  
 3   OtherPhysician                   559877 non-null  int64  
 4   ClmAdmitDiagnosisCode            559877 non-null  int64  
 5   TotalClaims                      559877 non-null  int64  
 6   Gender                           559877 non-null  int64  
 7   Race                             559877 non-null  int64  
 8   RenalDiseaseIndicator            559877 non-null  int64  
 9   State                            559877 non-null  int64  
 10  County                           559877 non-null  int64  
 11  NoOfMonths_PartACov              559877 non-null  int64  
 12  No

In [9]:
# Preview the engineered dataframe.
# The first 5 rows should look like the output below.
df.head()

Unnamed: 0,Provider,AttendingPhysician,OperatingPhysician,OtherPhysician,ClmAdmitDiagnosisCode,TotalClaims,Gender,Race,RenalDiseaseIndicator,State,...,NumProcedures,HospitalStayDays,ClaimDuration,DaysBeforeAdmission,ClaimStartMonth,ClaimStartWeekday,ClaimStartYear,DaysSinceLastClaim,AgeAtClaim,AgeGroup
0,3928,53275,0,0,3022,60,1,1,0,39,...,6,6.0,6,0.0,4,6,2009,0.0,66,2
1,3923,4951,2133,0,2298,60,1,1,0,39,...,6,2.0,2,0.0,8,0,2009,141.0,66,2
2,4034,40843,0,5202,851,14,1,1,0,39,...,6,3.0,3,0.0,9,3,2009,17.0,66,2
3,1125,39020,23576,14718,1602,22,2,2,0,1,...,6,8.0,8,0.0,2,5,2009,0.0,95,3
4,1125,39020,23576,14718,1602,22,2,2,0,1,...,6,8.0,8,0.0,2,5,2009,0.0,95,3


# Example ML Modeling: Decision Tree
An example decision tree is applied below:

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# PotentialFraud is the label we want to predict
target_col = 'PotentialFraud'
y = df[target_col]

# Every remaining column is used as a feature
X = df.drop(columns=[target_col])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Shallow decision tree (depth capped at 3) with a fixed seed
dt = DecisionTreeClassifier(max_depth=3, random_state=0)

# Train on the training split, then predict labels for the held-out test split
# (fit returns the fitted estimator, so the two calls can be chained)
y_pred = dt.fit(X_train, y_train).predict(X_test)

In [None]:
# Per-class precision / recall / F1 on the test split.
# NOTE(review): target_names are matched to the sorted label values, so this
# assumes the lower encoded value of PotentialFraud means "No Fraud" - confirm
# against the encoding in feature_engineering.
report = classification_report(
    y_test,
    y_pred,
    target_names=['No Fraud', 'Fraud'],
)
print('Classification Report:')
print(report)

# Confusion matrix: rows are the true classes, columns are the predicted classes
matrix = confusion_matrix(y_test, y_pred)
print('\nConfusion Matrix:')
print(matrix)

              precision    recall  f1-score   support

    No Fraud       0.79      1.00      0.88     68902
       Fraud       0.99      0.59      0.74     43074

    accuracy                           0.84    111976
   macro avg       0.89      0.79      0.81    111976
weighted avg       0.87      0.84      0.83    111976

[[68579   323]
 [17774 25300]]
