In [1]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Imports for decision tree visualization
import pydotplus
from IPython.display import Image

In [2]:
# Read in the Dataset
file_path = Path("Resources/reimbursement_data.csv")
reimburse_df = pd.read_csv(file_path)
reimburse_df.head()

Unnamed: 0,Resolution,Manufacturer Rejection Code,Days to resolve,Adjustment Amount,DC,DC City,DM Create Date,DM ID #,Manufacturer Name,Manufacturer ID,...,Product Name,Customer ID,Customer Group,Customer Group ID,days from order to invoice,Units Sold,Reimbursement Amount,Contract Lead,Contract ID,Chrgbk Doc #
0,Write Off,YY,12,-184.1,8182,W.Sacramento,10/1/2020,1275GTVASM,Supplier 9,9,...,Drug # 11,730048,Customer Group 9,9,61,-1,-184.1,40019,7,2896083484
1,Write Off,KK,19,-131.88,8148,Duluth,10/1/2020,1275GAKNAL,Supplier 1,1,...,Drug # 2,155260,Customer Group 2,2,51,-1,-131.88,785477,163,2895901196
2,Write Off,TT,14,-52.4,8149,Memphis,10/1/2020,1275GAKNMD,Supplier 1,1,...,Drug # 1,528125,Customer Group 1,1,98,-1,-68.51,474335,78,2895873878
3,Resubmit,A2,7,-38.64,8126,McCalla,10/1/2020,1275GVRTBI,Supplier 17,17,...,Drug # 49,504084,Customer Group 4,4,388,-1,-38.64,586439,96,2896465786
4,Write Off,A2,134,-36.0,8165,Oklahoma City,10/1/2020,1275GPIIOC,Supplier 7,7,...,Drug # 9,467382,Customer Group 1,1,416,-4,-36.0,474335,78,2896018433


In [7]:
# Clean Data, drop unnecessary feature columns and those that are too detailed and would lead to overfitting (such as customer # and individual product IDs)
reimburse_df = reimburse_df[["Resolution","Manufacturer Rejection Code","Days to resolve","DC","Manufacturer ID","Customer Group ID", "days from order to invoice","Units Sold","Reimbursement Amount","Contract Lead"]]
reimburse_df.head()

Unnamed: 0,Resolution,Manufacturer Rejection Code,Days to resolve,Adjustment Amount,DC,Manufacturer ID,Product ID,Customer ID,Customer Group ID,days from order to invoice,Units Sold,Reimbursement Amount,Contract Lead
0,Write Off,YY,12,-$184.10,8182,26107,2272227,730048,184,61,-1,-$184.10,40019
1,Write Off,KK,19,-$131.88,8148,564,3964368,155260,202,51,-1,-$131.88,785477
2,Write Off,TT,14,-$52.40,8149,564,1196047,528125,227,98,-1,-$68.51,474335
3,Resubmit,A2,7,-$38.64,8126,49012,3428547,504084,386,388,-1,-$38.64,586439
4,Write Off,A2,134,-$36.00,8165,35367,1605567,467382,227,416,-4,-$36.00,474335


In [5]:
# perform binary encoding on resolution and rejection code columns

reimburse_binary_df = pd.get_dummies(reimburse_df, columns=["Resolution", "Manufacturer Rejection Code"])
reimburse_binary_df.head()

Unnamed: 0,Days to resolve,Adjustment Amount,DC,DC City,DM Create Date,DM ID #,Manufacturer ID,Manufacturer Name,Product ID,Product Name,...,Manufacturer Rejection Code_MM,Manufacturer Rejection Code_NN,Manufacturer Rejection Code_RR,Manufacturer Rejection Code_SS,Manufacturer Rejection Code_TT,Manufacturer Rejection Code_UU,Manufacturer Rejection Code_VV,Manufacturer Rejection Code_WW,Manufacturer Rejection Code_XX,Manufacturer Rejection Code_YY
0,12,-$184.10,8182,W.Sacramento,10/1/2020,1275GTVASM,26107,Supplier 9,2272227,Drug # 11,...,0,0,0,0,0,0,0,0,0,1
1,19,-$131.88,8148,Duluth,10/1/2020,1275GAKNAL,564,Supplier 1,3964368,Drug # 2,...,0,0,0,0,0,0,0,0,0,0
2,14,-$52.40,8149,Memphis,10/1/2020,1275GAKNMD,564,Supplier 1,1196047,Drug # 1,...,0,0,0,0,1,0,0,0,0,0
3,7,-$38.64,8126,McCalla,10/1/2020,1275GVRTBI,49012,Supplier 17,3428547,Drug # 49,...,0,0,0,0,0,0,0,0,0,0
4,134,-$36.00,8165,Oklahoma City,10/1/2020,1275GPIIOC,35367,Supplier 7,1605567,Drug # 9,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Specify features 
X = reimburse_df.copy()
X.drop("Resolution", axis=1, inplace=True)
X.head()

Unnamed: 0,Manufacturer Rejection Code,Days to resolve,Adjustment Amount,DC,DC City,DM Create Date,DM ID #,Manufacturer ID,Manufacturer Name,Product ID,Product Name,Customer ID,Customer Group,Customer Group ID,days from order to invoice,Units Sold,Reimbursement Amount,Contract Lead,Contract ID,Chrgbk Doc #
0,YY,12,-$184.10,8182,W.Sacramento,10/1/2020,1275GTVASM,26107,Supplier 9,2272227,Drug # 11,730048,Customer Group 9,184,61,-1,-$184.10,40019,661468-1,2896083484
1,KK,19,-$131.88,8148,Duluth,10/1/2020,1275GAKNAL,564,Supplier 1,3964368,Drug # 2,155260,Customer Group 2,202,51,-1,-$131.88,785477,800227,2895901196
2,TT,14,-$52.40,8149,Memphis,10/1/2020,1275GAKNMD,564,Supplier 1,1196047,Drug # 1,528125,Customer Group 1,227,98,-1,-$68.51,474335,800227,2895873878
3,A2,7,-$38.64,8126,McCalla,10/1/2020,1275GVRTBI,49012,Supplier 17,3428547,Drug # 49,504084,Customer Group 4,386,388,-1,-$38.64,586439,VD2CLRS118,2896465786
4,A2,134,-$36.00,8165,Oklahoma City,10/1/2020,1275GPIIOC,35367,Supplier 7,1605567,Drug # 9,467382,Customer Group 1,227,416,-4,-$36.00,474335,C22878-1,2896018433


In [11]:
# check data type
type(X), type(y)

(pandas.core.frame.DataFrame, pandas.core.series.Series)

In [14]:
# Define dependent variable/target vector
y = reimburse_df[["Resolution"]]

In [15]:
# check shape
X.shape, y.shape

((56399, 20), (56399, 1))

In [None]:
# convert text to number

In [16]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [17]:
# Create StandardScaler instance
scaler = StandardScaler()

In [18]:
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

ValueError: could not convert string to float: 'XX'