In [1]:
# Import dependencies
import pandas as pd
import hvplot.pandas
from pathlib import Path

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


# Load Data

In [8]:
# Load the data into a Pandas DataFrame
df = pd.read_csv("../Resources/Health_insurance_clean.csv")

# Display data
df.head()

Unnamed: 0,Patient,Age,Age_Group,Sex,Diagnosis_Code,Diagnosis_Group,Diagnosis_Family,Diagnosis_Description,Med_Code,Med_Description,Med_Description_Simp,Quantity,Status,Amount_Billed,Amount_Paid
0,2112140237,37,26-45,Male,K21.9,K2,K,Gastro-esophageal reflux disease without esoph...,17381119111006,(SODIUM CHLORIDE : 9 MG/ML) SOLUTION FOR INFU...,SODIUM CHLORIDE,1,Paid,3.0,3.0
1,2002110188,38,26-45,Male,I21.3,I0,I,ST elevation (STEMI) myocardial infarction of ...,9933855010391,(CLOPIDOGREL (AS BESILATE) : 75 MG) FILM COAT...,CLOPIDOGREL,4,Rejected,17.44,0.0
2,1510110229,59,46-65,Male,B34.2,B3,B,"Coronavirus infection, unspecified",1372428020342,(PANTOPRAZOLE (AS SODIUM) : 40 MG) ENTERIC CO...,PANTOPRAZOLE,2,Paid,5.36,5.36
3,2312040128,38,26-45,Male,I69.354,I1,I,Hemiplegia and hemiparesis following cerebral ...,271792030391,(AMLODIPINE : 5 MG) (VALSARTAN : 160 MG) FILM...,"AMLODIPINE, VALSARTAN",7,Paid,51.66,51.66
4,2311110151,44,26-45,Male,J32.9,J3,J,"Chronic sinusitis, unspecified",3551202010381,(GENTAMICIN : 0.3%) EYE OINTMENT,GENTAMICIN,1,Paid,8.5,8.5


In [None]:
# Plot data to view distribution
df.hvplot.line(
    rot = 90,
    height = 200,
    width= 600
)

# One-hot Encode input date

In [None]:
# View datatypes again, and take note of 'object' type for dummy conversion
df.dtypes

In [None]:
# Encode using get_dummies() for input CHECKAGAIN
dummies_for_x = pd.get_dummies(df[[
                                'Diagnosis_Family',
                                'Diagnosis_Group',
                                'Sex',
                                'Age_Group']], prefix = None)

In [None]:
# Counting the number of features, to avoid calculation errors later on
print("Number of features:", dummies_for_x.shape[1])

dummies_for_x.columns

In [None]:
# Get encoded y variable
y_dummy = pd.get_dummies(df['Med_Description'], prefix = None)

In [None]:
# Counting the number of features, to avoid calculation errors later on
print("Number of features:", y_dummy.shape[1])

y_dummy.columns

# Prepare dataset into training and testing sets

In [None]:
# Define X (input columns)
X = dummies_for_x.values
X.head()

In [None]:
# Defining target vector ('y', for output)
y = y_dummy.ravel()

y[:10]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Scale Data (?)

In [None]:
# Creating Scaler instance and fitting
scaled_x = StandardScaler().fit(X_train)

In [None]:
# Scaling data
X_train_scaled = scaled_x.transform(X_train)
X_test_scaled = scaled_x.transform(X_test)

# Prepare and execute random forest model

In [None]:
# NOTE: the more n_estimators the more machine power
model = RandomForestClassifier(n_estimators=500, random_state=1)

In [None]:
# Fit data into random forest model
model = model.fit(X_train_scaled,y_train)

In [10]:
# Assess quality of features as safe practice

# Calculating feature importance
feature_importances = pd.Series(model.feature_importances_, index=X_train.columns)
print(feature_importances)

# measures Gini purity : how often a randomly chosen element from the set would be incorrectly classified.
# e.x. how many times 'Diagnosis_Group' is incorrectly classified and 'Diagnosis_Family',
# if latter scored higher it means in our model, latter feature is more important than former.
#from this we can re-run model exclusively with important features as they predict outcome better

#scores ideally should sum to 1. This allows you to interpret the scores as relative importance.
#The higher the importance score, the more influential the feature is in making accurate predictions. 
#Features with higher importance contribute more to the reduction of impurity in the decision trees.

NameError: name 'model' is not defined

In [None]:
# Sort the features by their importance
sorted(zip(model.feature_importances_, X.columns), reverse=True)

In [11]:
# Creating predictions
predictions = model.predict(X_test_scaled)

NameError: name 'model' is not defined

# Post-exeuction: Evaluate Model using confusion matrix

In [None]:
# Create confusion matrix using y_test in relation to predictions
confusion_results = confusion_matrix(y_test, predictions)

results_df = pd.DataFrame(confusion_results,columns=['Actual Values','Predictions'])
## FFIX IFX FIXING

# Create accuracy report using y_test in relation to predictions
accuracy_results = accuracy_score(y_test,predictions)
