#Install the libary

In [None]:
!pip install mljar-supervised

#Loading the data

In [4]:
from google.colab import drive
import pandas as pd
drive.mount('/content/drive')

df_vanco = pd.read_csv('/content/drive/My Drive/data/Vancomycin_CRF_ver11.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Make functions to changing the values

In [5]:
def age_normalization(age):
    if age < 0 or age > 112:
        raise ValueError("Age must be between 0 and 112")
    elif age < 13:
        return 0
    elif age <= 18:
        return 1
    elif age <= 29:
        return 2
    elif age <= 49:
        return 3
    elif age <= 64:
        return 4
    else:  # age >= 65
        return 5

def crcl_normalization(crcl):
  if crcl < 70:
    return 1
  else: #
    return 2

def eGFR_normalization(egfr):
  if egfr < 90:
    return 1
  else:
    return 2

def bun_normalization(bun):
  if bun <7:
    return 1
  else:
    return 2

def crp_normalization(crp):
  if crp> 10:
    return 1
  else: # normal
    return 2

def hb_normalization(hb,gender):
  if gender == 1: # Male
    if hb < 13.5:
      return 0
    elif hb > 17.5:
      return 1
    else:
      return 2
  else: # Female
    if hb < 12.5:
      return 0
    elif hb >15.5:
      return 1

def plt_normalization(plt):
  if plt <150: # 150000
    return 0
  elif plt >450:
    return 1
  else:
    return 2

def bmi_normalization(bmi):
  if bmi < 18.5:
    return 0
  elif bmi < 22.9:
    return 1
  elif bmi < 24.9:
    return 2
  elif bmi < 29.9:
    return 3
  else:
    return 4

## Change the values in each column

In [6]:
df_vanco['Age'] = df_vanco['Age'].apply(age_normalization)
df_vanco['CrCl'] = df_vanco['CrCl'].apply(crcl_normalization)
df_vanco['BUN'] = df_vanco['BUN'].apply(bun_normalization)
df_vanco['CRP'] = df_vanco['CRP'].apply(crp_normalization)
df_vanco['Hb'] = df_vanco.apply(lambda row: hb_normalization(row['Hb'], row['Gender']), axis=1)
df_vanco['PLT'] = df_vanco['PLT'].apply(plt_normalization)
df_vanco['BMI'] = df_vanco['BMI'].apply(bmi_normalization)
df_vanco['eGFR'] = df_vanco['eGFR'].apply(eGFR_normalization)

#Separating data for model

In [7]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(
    df_vanco, train_size=0.8, test_size=0.2, random_state=42
)

target = "Initial VCM_daily_dose"
X_train,y_train = train_data.drop([target],axis=1),train_data[target]
X_test,y_test = test_data.drop([target],axis=1),test_data[target]

#Model

In [8]:
from supervised.automl import AutoML
from sklearn.metrics import accuracy_score,mean_absolute_error,mean_squared_error,r2_score


### Define the mode by setting parameters in AutoML

In [9]:
# Set a folder name to save trained models
result_path = 'AutoML_vacomycin'

# For more details for each parameters : https://supervised.mljar.com/features/modes/
automl = AutoML(results_path = result_path,
                mode="Perform",
                eval_metric = 'mae',
                train_ensemble=True,
                ml_task = 'regression',
                total_time_limit= 30* 60, #Each model will be trained for 30 minutes (30*60 seconds).
                golden_features = True,
                features_selection=True,
                explain_level = 2, # to obtain detailed explanation for each model
                random_state = 42)

In [10]:
automl.fit(X_train, y_train)

AutoML directory: AutoML_vacomycin
The task is regression with evaluation metric mae
AutoML will use algorithms: ['Linear', 'Random Forest', 'LightGBM', 'Xgboost', 'CatBoost', 'Neural Network']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'ensemble']
* Step simple_algorithms will try to check up to 1 model
1_Linear mae 296.412883 trained in 36.5 seconds (1-sample predict time 0.1652 seconds)
* Step default_algorithms will try to check up to 5 models
2_Default_LightGBM mae 223.563143 trained in 21.49 seconds (1-sample predict time 0.0758 seconds)
3_Default_Xgboost mae 224.064282 trained in 24.6 seconds (1-sample predict time 0.0717 seconds)
4_Default_CatBoost mae 220.299873 trained in 6.3 seconds (1-sample predict time 0.0689 seconds)
5_Default_NeuralNetwork mae 261.108393 trained in 7.14 seconds (1-sample predict tim



11_LightGBM mae 231.919736 trained in 25.09 seconds (1-sample predict time 0.0654 seconds)
7_Xgboost mae 224.881602 trained in 28.16 seconds (1-sample predict time 0.1116 seconds)
15_CatBoost mae 221.098784 trained in 9.54 seconds (1-sample predict time 0.1195 seconds)
19_RandomForest mae 212.563101 trained in 31.36 seconds (1-sample predict time 0.1307 seconds)
23_NeuralNetwork mae 263.051668 trained in 9.95 seconds (1-sample predict time 0.1567 seconds)
12_LightGBM mae 229.126741 trained in 26.43 seconds (1-sample predict time 0.0662 seconds)
8_Xgboost mae 207.274766 trained in 27.49 seconds (1-sample predict time 0.0654 seconds)
16_CatBoost mae 222.232412 trained in 12.67 seconds (1-sample predict time 0.0679 seconds)
20_RandomForest mae 222.253249 trained in 40.46 seconds (1-sample predict time 0.1659 seconds)
24_NeuralNetwork mae 257.651267 trained in 13.64 seconds (1-sample predict time 0.1178 seconds)
13_LightGBM mae 224.326081 trained in 29.6 seconds (1-sample predict time 0.06

The best modle is "Ensemble" and its mae is 187.379119.

The details of Ensemble is in the folder "AutoML_vacomycin/Ensemble"

In [23]:
predictions = automl.predict_all(X_test)
predictions[:3]


Unnamed: 0,prediction
0,1982.200183
1,1993.732278
2,1995.447174


In [40]:
import numpy as np
df_result = pd.DataFrame()
df_result['true'] = y_test
df_result['prediction'] = predictions.values
df_result['accuracy'] = np.where(abs(df_result['true'] - df_result['prediction']) <= 250, 1, 0)
df_result[:3]

Unnamed: 0,true,prediction,accuracy
128,2000,1982.200183,1
45,2000,1993.732278,1
134,2000,1995.447174,1


In [41]:
print('R2:  ',r2_score(df_result['true'], df_result['prediction']))
print('MSE: ',mean_squared_error(df_result['true'],df_result['prediction']))
print('RMSE:',mean_squared_error(df_result['true'],df_result['prediction']) ** 0.5)
print('MAE: ',mean_absolute_error(df_result['true'],df_result['prediction']))
print('Accuracy: ', round(len(df_result.loc[df_result['accuracy'] == 1])/ len(df_result), 1))

R2:   0.11553997422576978
MSE:  107568.21835537054
RMSE: 327.9759417325767
MAE:  226.90561528039973
Accuracy:  0.5
