In [26]:
# Import dependencies
import pandas as pd
import hvplot.pandas
from pathlib import Path

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


# Load Data

In [27]:
# Load the data into a Pandas DataFrame
df = pd.read_csv("../Resources/Health_insurance_clean.csv")

# Display data
df.head()

Unnamed: 0,Patient,Age,Age_Group,Sex,Diagnosis_Code,Diagnosis_Group,Diagnosis_Family,Diagnosis_Description,Med_Code,Med_Description,Med_Description_Simp,Quantity,Status,Amount_Billed,Amount_Paid
0,2112140237,37,26-45,Male,K21.9,K2,K,Gastro-esophageal reflux disease without esoph...,17381119111006,(SODIUM CHLORIDE : 9 MG/ML) SOLUTION FOR INFU...,SODIUM CHLORIDE,1,Paid,3.0,3.0
1,2002110188,38,26-45,Male,I21.3,I0,I,ST elevation (STEMI) myocardial infarction of ...,9933855010391,(CLOPIDOGREL (AS BESILATE) : 75 MG) FILM COAT...,CLOPIDOGREL,4,Rejected,17.44,0.0
2,1510110229,59,46-65,Male,B34.2,B3,B,"Coronavirus infection, unspecified",1372428020342,(PANTOPRAZOLE (AS SODIUM) : 40 MG) ENTERIC CO...,PANTOPRAZOLE,2,Paid,5.36,5.36
3,2312040128,38,26-45,Male,I69.354,I1,I,Hemiplegia and hemiparesis following cerebral ...,271792030391,(AMLODIPINE : 5 MG) (VALSARTAN : 160 MG) FILM...,"AMLODIPINE, VALSARTAN",7,Paid,51.66,51.66
4,2311110151,44,26-45,Male,J32.9,J3,J,"Chronic sinusitis, unspecified",3551202010381,(GENTAMICIN : 0.3%) EYE OINTMENT,GENTAMICIN,1,Paid,8.5,8.5


# One-hot Encode input date

In [28]:
# View datatypes again, and take note of 'object' type for dummy conversion
df.dtypes

Patient                    int64
Age                        int64
Age_Group                 object
Sex                       object
Diagnosis_Code            object
Diagnosis_Group           object
Diagnosis_Family          object
Diagnosis_Description     object
Med_Code                   int64
Med_Description           object
Med_Description_Simp      object
Quantity                   int64
Status                    object
Amount_Billed            float64
Amount_Paid              float64
dtype: object

In [29]:
# Encode using get_dummies() for input 
dummies_for_x = pd.get_dummies(df[[
                                'Diagnosis_Group',
                                'Sex',
                                'Age_Group']], prefix = None)

In [30]:
# Counting the number of features, to avoid calculation errors later on
print("Number of features:", dummies_for_x.shape[1])

dummies_for_x.columns

Number of features: 81


Index(['Diagnosis_Group_A0', 'Diagnosis_Group_A1', 'Diagnosis_Group_A4',
       'Diagnosis_Group_A5', 'Diagnosis_Group_A8', 'Diagnosis_Group_A9',
       'Diagnosis_Group_B3', 'Diagnosis_Group_B4', 'Diagnosis_Group_B5',
       'Diagnosis_Group_B9', 'Diagnosis_Group_C0', 'Diagnosis_Group_C4',
       'Diagnosis_Group_C5', 'Diagnosis_Group_C6', 'Diagnosis_Group_C7',
       'Diagnosis_Group_C8', 'Diagnosis_Group_D0', 'Diagnosis_Group_D1',
       'Diagnosis_Group_D2', 'Diagnosis_Group_D3', 'Diagnosis_Group_D4',
       'Diagnosis_Group_D5', 'Diagnosis_Group_E0', 'Diagnosis_Group_E1',
       'Diagnosis_Group_E2', 'Diagnosis_Group_E5', 'Diagnosis_Group_E7',
       'Diagnosis_Group_E8', 'Diagnosis_Group_F1', 'Diagnosis_Group_F2',
       'Diagnosis_Group_G0', 'Diagnosis_Group_G2', 'Diagnosis_Group_G3',
       'Diagnosis_Group_G8', 'Diagnosis_Group_H0', 'Diagnosis_Group_H6',
       'Diagnosis_Group_I0', 'Diagnosis_Group_I1', 'Diagnosis_Group_I8',
       'Diagnosis_Group_I9', 'Diagnosis_Group_J0', 

In [31]:
# Get encoded y variable
y_dummy = pd.get_dummies(df['Med_Description_Simp'], prefix = None)

In [32]:
# Counting the number of features, to avoid calculation errors later on
print("Number of features:", y_dummy.shape[1])

y_dummy.columns

Number of features: 507


Index(['ACECLOFENAC', 'ACETAZOLAMIDE', 'ACETYLCYSTEINE', 'ACTIVATED CHARCOAL',
       'ACTIVATED CHARCOAL,SIMETHICONE', 'ACYCLOVIR', 'ADALIMUMAB',
       'ADENOSINE', 'ADRENALINE', 'ADRENALINE, ARTICAINE',
       ...
       'WATER', 'WATER FOR INJECTIONS', 'XYLOMETAZOLINE HYDROCHLORIDE', 'ZINC',
       'ZINC (AS ZINC OXIDE), CALCIUM (AS CARBONATE & CITRATE)  , MAGNESIUM (AS OXIDE & CITRATE) , VITAMIN D3 (CHOLECALCIFEROL)',
       'ZINC, FOLIC ACID, COPPER, CYANOCOBALAMIN, PYRIDOXINE, IRON',
       'ZINC,CALAMINE', 'ZINC,CETRIMIDE,DEXPANTHANOL',
       'ZINC,GINSENG EXTRACT,IRON,VITAMIN B2,VITAMIN D3,NICOTINAMIDE,BIOTIN,LECITHIN,FOLIC ACID,VITAMIN B1,VITAMIN B6,VITAMIN C,VITAMIN A,VITAMIN E,SELENIUM,COPPER,MANGANESE,CALCIUM,MAGNESIUM,VITAMIN B12',
       'ZINC,MAGNESIUM,CALCIUM'],
      dtype='object', length=507)

# Prepare dataset into training and testing sets

In [33]:
# Define X (input columns)
X = dummies_for_x.values
X[:10]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0

In [34]:
# Defining target vector ('y', for output)
y = y_dummy.values

y[:10]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Scale Data

In [36]:
# Creating Scaler instance and fitting
scaled_x = StandardScaler().fit(X_train)

In [37]:
# Scaling data
X_train_scaled = scaled_x.transform(X_train)
X_test_scaled = scaled_x.transform(X_test)

# Prepare and execute random forest model

In [38]:
# NOTE: the more n_estimators the more machine power
model = RandomForestClassifier(criterion='gini', max_depth = 30, min_samples_split=1000, random_state=42, n_estimators=500)

In [39]:
# Fit data into random forest model
model = model.fit(X_train_scaled,y_train)

In [40]:
# Creating predictions
predictions = model.predict(X_test_scaled)

# Post-exeuction: Evaluate Model using confusion matrix

In [41]:
# Calculating the confusion matrix
from sklearn.metrics import hamming_loss
hamming_loss_value = hamming_loss(y_test , predictions)
print("Hamming Loss:", hamming_loss_value)

Hamming Loss: 0.0019723865877712033


In [42]:
# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)
print(acc_score)

0.0


## Trial two 

In [None]:
# Encode using get_dummies() for input 
dummies_for_x1 = pd.get_dummies(df[[
                                'Diagnosis_Group',
                                'Sex',
                                'Age_Group']], prefix = None)

In [None]:
# Get encoded y variable
y_dummy1 = pd.get_dummies(df['Med_Description_Simp'], prefix = None)

In [None]:
# Define X (input columns)
X1 = dummies_for_x1.values

In [None]:
# Defining target vector ('y', for output)
y1 = y_dummy1.values

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X1, y1, random_state=1)

In [None]:
# Creating Scaler instance and fitting
scaled_x = StandardScaler().fit(X_train)

In [None]:
# Scaling data
X_train_scaled = scaled_x.transform(X_train)
X_test_scaled = scaled_x.transform(X_test)

In [None]:
# NOTE: the more n_estimators the more machine power
model1 = RandomForestClassifier(n_estimators=500, random_state=1)

In [None]:
# Calculating feature importance
feature_importances = pd.Series(model1.feature_importances_, index=X_train.columns)
print(feature_importances)

In [None]:
# Sort the features by their importance
sorted(zip(model1.feature_importances_, X.columns), reverse=True)

In [None]:
predictions = model1.predict(X_test_scaled)

In [None]:
# Create confusion matrix using y_test in relation to predictions
confusion_results = confusion_matrix(y_test, predictions)

results_df = pd.DataFrame(confusion_results,columns=['Actual Values','Predictions'], index= [])
# qudsia: will resume from here

# Create accuracy report using y_test in relation to predictions
accuracy_results = accuracy_score(y_test,predictions)