# Healthcare Classification Using PyCaret & mlrun
dataset = [kaggle](https://www.kaggle.com/datasets/prasad22/healthcare-dataset)

In [1]:
import pandas as pd
from pycaret.classification import *
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('./healthcare_dataset.csv')
df.head()

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,Tiffany Ramirez,81,Female,O-,Diabetes,2022-11-17,Patrick Parker,Wallace-Hamilton,Medicare,37490.983364,146,Elective,2022-12-01,Aspirin,Inconclusive
1,Ruben Burns,35,Male,O+,Asthma,2023-06-01,Diane Jackson,"Burke, Griffin and Cooper",UnitedHealthcare,47304.064845,404,Emergency,2023-06-15,Lipitor,Normal
2,Chad Byrd,61,Male,B-,Obesity,2019-01-09,Paul Baker,Walton LLC,Medicare,36874.896997,292,Emergency,2019-02-08,Lipitor,Normal
3,Antonio Frederick,49,Male,B-,Asthma,2020-05-02,Brian Chandler,Garcia Ltd,Medicare,23303.322092,480,Urgent,2020-05-03,Penicillin,Abnormal
4,Mrs. Brandy Flowers,51,Male,O-,Arthritis,2021-07-09,Dustin Griffin,"Jones, Brown and Murray",UnitedHealthcare,18086.344184,477,Urgent,2021-08-02,Paracetamol,Normal


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Name                10000 non-null  object 
 1   Age                 10000 non-null  int64  
 2   Gender              10000 non-null  object 
 3   Blood Type          10000 non-null  object 
 4   Medical Condition   10000 non-null  object 
 5   Date of Admission   10000 non-null  object 
 6   Doctor              10000 non-null  object 
 7   Hospital            10000 non-null  object 
 8   Insurance Provider  10000 non-null  object 
 9   Billing Amount      10000 non-null  float64
 10  Room Number         10000 non-null  int64  
 11  Admission Type      10000 non-null  object 
 12  Discharge Date      10000 non-null  object 
 13  Medication          10000 non-null  object 
 14  Test Results        10000 non-null  object 
dtypes: float64(1), int64(2), object(12)
memory usage: 1.1+

In [4]:
# Let's remove unnessary features in relation to Test Results
col_to_rem = ['Name', 'Date of Admission', 'Doctor', 'Hospital', 'Insurance Provider', 'Billing Amount', 'Room Number', 'Admission Type', 'Discharge Date']
df = df.drop(columns=col_to_rem)

In [5]:
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Age                10000 non-null  int64 
 1   Gender             10000 non-null  object
 2   Blood Type         10000 non-null  object
 3   Medical Condition  10000 non-null  object
 4   Medication         10000 non-null  object
 5   Test Results       10000 non-null  object
dtypes: int64(1), object(5)
memory usage: 468.9+ KB


In [7]:
train_df

Unnamed: 0,Age,Gender,Blood Type,Medical Condition,Medication,Test Results
9254,19,Male,A-,Diabetes,Lipitor,Abnormal
1561,37,Male,A+,Hypertension,Lipitor,Inconclusive
1670,22,Female,B+,Cancer,Paracetamol,Normal
6087,67,Female,AB+,Hypertension,Aspirin,Abnormal
6669,38,Female,O-,Hypertension,Penicillin,Normal
...,...,...,...,...,...,...
5734,34,Male,B-,Cancer,Aspirin,Inconclusive
5191,74,Female,B-,Diabetes,Paracetamol,Abnormal
5390,81,Male,B-,Hypertension,Aspirin,Inconclusive
860,45,Female,AB-,Asthma,Aspirin,Normal


In [8]:
for i in df.columns:
    print(f'{i}: {df[i].nunique()} - {df[i].dtype}')

Age: 68 - int64
Gender: 2 - object
Blood Type: 8 - object
Medical Condition: 6 - object
Medication: 5 - object
Test Results: 3 - object


In [12]:
clf = setup(data=train_df,
            target='Test Results',
            ordinal_features={ 
                              'Blood Type': ['O-', 'O+', 'B-', 'AB+', 'A+', 'AB-', 'A-', 'B+']
                             },
            categorical_features=['Gender', 'Medical Condition', 'Medication'],
            numeric_features=['Age'],
            pca=True,
            log_experiment=True,
            experiment_name='healthcare_exp1'
           )
best_model = compare_models()


Unnamed: 0,Description,Value
0,Session id,3201
1,Target,Test Results
2,Target type,Multiclass
3,Target mapping,"Abnormal: 0, Inconclusive: 1, Normal: 2"
4,Original data shape,"(8000, 6)"
5,Transformed data shape,"(8000, 15)"
6,Transformed train set shape,"(5600, 15)"
7,Transformed test set shape,"(2400, 15)"
8,Ordinal features,2
9,Numeric features,1


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
nb,Naive Bayes,0.3475,0.508,0.3475,0.3495,0.337,0.02,0.0205,0.016
ridge,Ridge Classifier,0.3459,0.0,0.3459,0.3456,0.3329,0.0159,0.0164,0.014
lr,Logistic Regression,0.345,0.5073,0.345,0.3445,0.3323,0.0146,0.0151,0.017
dummy,Dummy Classifier,0.3438,0.5,0.3438,0.1182,0.1759,0.0,0.0,0.016
qda,Quadratic Discriminant Analysis,0.3425,0.5101,0.3425,0.3429,0.3406,0.0133,0.0134,0.013
lda,Linear Discriminant Analysis,0.3425,0.5057,0.3425,0.341,0.3318,0.0109,0.0113,0.011
gbc,Gradient Boosting Classifier,0.3393,0.5007,0.3393,0.3388,0.3356,0.0069,0.007,0.499
xgboost,Extreme Gradient Boosting,0.3377,0.4981,0.3377,0.3371,0.3368,0.0057,0.0057,0.247
svm,SVM - Linear Kernel,0.3375,0.0,0.3375,0.3218,0.3025,0.0046,0.0049,0.028
rf,Random Forest Classifier,0.3355,0.4961,0.3355,0.3348,0.3347,0.0028,0.0028,0.167


In [10]:
dataset_transformed = get_config('dataset_transformed')
dataset_transformed

Unnamed: 0,Age,Gender,Blood Type,Medical Condition_Arthritis,Medical Condition_Hypertension,Medical Condition_Obesity,Medical Condition_Cancer,Medical Condition_Diabetes,Medical Condition_Asthma,Medication_Ibuprofen,Medication_Lipitor,Medication_Penicillin,Medication_Aspirin,Medication_Paracetamol,Test Results
7115,69.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1
236,71.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1
8529,66.0,1.0,5.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
1766,41.0,0.0,6.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1
4810,32.0,0.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2027,73.0,1.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1
5008,54.0,0.0,4.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0
7460,47.0,1.0,6.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0
5428,29.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1


In [11]:
tuned_model = tune_model(best_model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.3446,0.5,0.3446,0.1188,0.1767,0.0,0.0
1,0.3446,0.5,0.3446,0.1188,0.1767,0.0,0.0
2,0.3446,0.5,0.3446,0.1188,0.1767,0.0,0.0
3,0.3446,0.5,0.3446,0.1188,0.1767,0.0,0.0
4,0.3429,0.5,0.3429,0.1176,0.1751,0.0,0.0
5,0.3429,0.5,0.3429,0.1176,0.1751,0.0,0.0
6,0.3429,0.5,0.3429,0.1176,0.1751,0.0,0.0
7,0.3429,0.5,0.3429,0.1176,0.1751,0.0,0.0
8,0.3446,0.5076,0.3446,0.3644,0.2398,0.0051,0.0089
9,0.3446,0.4802,0.3446,0.1188,0.1767,0.0,0.0


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
