In [None]:
import time, os, joblib
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Optional one-time setup if catboost is missing from the kernel environment:
# %pip install catboost

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import HistGradientBoostingClassifier

In [None]:
# Load the dataset from its HDF5 file and show its dimensions as a quick check.
DATA_FILE = "PatientDataClass_25Hz_88_99_feats_44_lag.h5"
df = pd.read_hdf(DATA_FILE)
df.shape

((401469, 44),)

In [None]:
# Preview the first rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,time_stamp_corrected,time_stamp,red,ir,red_amplitude,ir_amplitude,accel_x,accel_y,accel_z,gyro_pitch,...,amp_inhale_lag_125,amp_exhale_lag_125,dur_inhale_lag_125,dur_exhale_lag_125,rr_lag_250,amp_inhale_lag_250,amp_exhale_lag_250,dur_inhale_lag_250,dur_exhale_lag_250,R
0,5.1731,2098.60374,232406.0,245777.0,99.0,72.0,0.3459,0.5294,0.8017,-1.61,...,3.975327,-3.013573,1.88,1.64,15.874008,2.86174,-3.260328,2.28,2.28,1.021095
1,5.21447,2098.64511,232370.0,245766.0,99.0,72.0,0.3494,0.5294,0.8022,-1.33,...,3.975327,-3.013573,1.88,1.64,15.874008,2.86174,-3.260328,2.28,2.28,1.022678
2,5.42132,2098.85196,232236.0,245709.0,99.0,72.0,0.344,0.5314,0.8008,-1.33,...,3.975327,-3.013573,1.88,1.64,15.874008,2.86174,-3.260328,2.28,2.28,1.020785
3,5.50406,2098.9347,232181.0,245686.0,99.0,72.0,0.3459,0.5299,0.8008,-1.68,...,3.975327,-3.013573,1.88,1.64,15.874008,2.86174,-3.260328,2.28,2.28,1.020648
4,5.54543,2098.97607,232121.0,245657.0,99.0,72.0,0.3445,0.5319,0.7998,-1.4,...,3.975327,-3.013573,1.88,1.64,15.874008,2.86174,-3.260328,2.28,2.28,1.02043


In [None]:
# Columns that are not model inputs: they are dropped from the frame before
# training (timestamps, identifier-like columns, the raw SpO2 reading and the
# 'category' label that is derived from it).
columns_to_remove = [
    "time_stamp_corrected",
    "time_stamp",
    "location",
    "calibration",
    "category",
    "patient_ID",
    "SpO2(%)",
]

In [None]:
def getNormalizedData(Type, X_train, X_val):
  """
  Fit a feature scaler on the training data and apply it to both splits.

  args:
  Type: scaling technique. "minmax" -> MinMaxScaler with range (0, 1),
        "std" -> StandardScaler, any other value -> RobustScaler
  X_train: Training data (the scaler is fitted on this split only)
  X_val: Validation data (transformed with the scaler fitted on X_train)

  return X_train_norm, X_val_norm, scaler
    X_train_norm: scaled training data
    X_val_norm: scaled validation data
    scaler: the fitted scaler object, so the same transform can be saved/reused
  """

  if Type == "minmax":
    scaler = MinMaxScaler(feature_range=(0, 1))
  elif Type == "std":
    scaler = StandardScaler()
  else:
    # Kept for backward compatibility: unrecognised names fall back to RobustScaler.
    scaler = RobustScaler()

  # Fit once, on the training split only, so validation data never
  # influences the scaling parameters.
  scaler.fit(X_train)

  X_train_norm, X_val_norm = scaler.transform(X_train), scaler.transform(X_val)

  return X_train_norm, X_val_norm, scaler

In [None]:
def mappingLabel(x):
    """
    Map an SpO2 reading (%) to a 3-class label.

    96 <= x <= 99 - Normal --> 2
    92 <= x <  96 - Low    --> 1
    88 <= x <  92 - Danger --> 0

    The bins are contiguous, so fractional readings (e.g. 91.5 or 95.5) are
    labelled as well; integer readings get the same label as before.

    Returns None for readings outside [88, 99] (NaN included).
    """

    if 88 <= x < 92:
        return 0
    if 92 <= x < 96:
        return 1
    if 96 <= x <= 99:
        return 2

    # Out-of-range or NaN reading: no class is defined for it.
    return None


def getAugmentedFeatures(df, cols_to_work):
    """
    Add per-patient temporal and summary-statistic features.

    Rows are processed one patient at a time, ordered by `time_stamp`. For
    every column in `cols_to_work` (except the `*_lag_125` / `*_lag_250`
    columns listed below, which are passed through unchanged) ten columns
    are appended:

      <col>_shift_{1,2,3}   value 1/2/3 rows earlier (0 where there is no earlier row)
      <col>_diff_{1,2,3}    value minus the corresponding shifted value
      <col>_min / _max      distance from the patient's minimum / maximum
      <col>_mean / _median  deviation from the patient's mean / median

    When at least one column is augmented, missing values anywhere in a
    patient's rows are replaced with 0 before the features are derived.

    args:
    df: DataFrame containing `patient_ID`, `time_stamp` and `cols_to_work`
    cols_to_work: iterable of column names to derive features from

    return: new DataFrame with the patients concatenated in order of first
    appearance, a fresh 0..n-1 index, and the derived columns appended.
    """

    # Already lag-based inputs: no shift/diff/statistic features are derived from them.
    lag_cols = {
        'amp_exhale_lag_125', 'amp_exhale_lag_250',
        'amp_inhale_lag_125', 'amp_inhale_lag_250',
        'dur_exhale_lag_125', 'dur_exhale_lag_250',
        'dur_inhale_lag_125', 'dur_inhale_lag_250',
        'rr_lag_125', 'rr_lag_250',
    }
    feature_cols = [col for col in cols_to_work if col not in lag_cols]

    patient_records = []
    for patient in df['patient_ID'].unique():
        patient_df = df[df['patient_ID'] == patient]
        patient_df = patient_df.sort_values('time_stamp').reset_index(drop=True)

        if feature_cols:
            # Fill the frame once up front instead of re-filling every column
            # after each feature is processed.
            patient_df = patient_df.fillna(0)

        # Collect the new columns and attach them in one go; inserting them
        # one at a time fragments the frame and slows every later insert.
        derived = {}
        for col in feature_cols:
            values = patient_df[col]

            # Taking previous timestamps as features (0 where no earlier row exists)
            shifted = {k: values.shift(periods=k).fillna(0) for k in (1, 2, 3)}
            for k, lagged in shifted.items():
                derived[f'{col}_shift_{k}'] = lagged

            # Computing difference between feature at t and (t-k)
            for k, lagged in shifted.items():
                derived[f'{col}_diff_{k}'] = values - lagged

            # Computing difference w.r.t. min, max, mean, median of this patient
            derived[col + '_min'] = values - values.min()
            derived[col + '_max'] = values.max() - values
            derived[col + '_mean'] = values - values.mean()
            derived[col + '_median'] = values - values.median()

        if derived:
            derived = pd.DataFrame(derived, index=patient_df.index)
            # A derived column replaces any existing column with the same name
            # rather than producing duplicate labels.
            base = patient_df.drop(columns=derived.columns.intersection(patient_df.columns))
            patient_df = pd.concat([base, derived], axis=1)

        patient_records.append(patient_df)

    patient_records = pd.concat(patient_records).reset_index(drop=True)

    return patient_records

In [None]:
# Columns to derive features from: everything except the columns that are
# dropped before training and the target / patient identifier. Filtering
# df.columns (instead of taking a set difference) keeps the order stable
# across kernel sessions, so the generated feature columns are always laid
# out identically.
excluded_cols = set(columns_to_remove) | {"SpO2(%)", 'patient_ID'}
cols_to_work = [col for col in df.columns if col not in excluded_cols]

# NOTE: `df` is overwritten here, so re-running this cell without reloading
# the data would augment the already-augmented frame.
df = getAugmentedFeatures(df, cols_to_work)

# Class label (0/1/2) derived from the SpO2 reading of each row.
df['category'] = df['SpO2(%)'].apply(mappingLabel)

In [None]:
# Output directory for the per-split scaler pickles. exist_ok=True keeps the
# cell safe to re-run (os.mkdir raises FileExistsError if the directory exists).
os.makedirs("Tree", exist_ok=True)

In [None]:
# Inspect the frame's dimensions after feature augmentation and labelling.
df.shape

(401469, 325)

In [None]:
# Pull the target vector out first, then strip the non-feature columns
# (which include 'category' itself) so `df` holds model inputs only.
label = df['category'].values
df = df.drop(columns=columns_to_remove)

In [None]:
# Repeated stratified hold-out evaluation: every iteration draws a new 90/10
# split (seeded with the iteration index), fits a StandardScaler on the
# training part, saves it, and trains/evaluates each classifier on that split.
#
# NOTE(review): the split is row-wise, so rows of one patient (including
# neighbouring rows that share shift/diff features) can land in both the
# training and validation parts. Validation scores may therefore be optimistic
# for unseen patients - TODO confirm whether a patient-grouped split is wanted.
N_SPLITS = 5
SEED = 42


def _printSplitReport(split_name, y_true, y_pred):
    """Print the confusion matrix and classification report for one data split."""
    print(f"Confusion Matrix for {split_name} data:")
    print(confusion_matrix(y_true, y_pred))
    print()
    print(f"Classification Report for {split_name} data:")
    print(classification_report(y_true, y_pred, digits=4))
    print()
    print()


for i in range(N_SPLITS):
    print(f"Fold {i+1}:")

    # Reset Keras state and re-seed so the MLP starts identically in every split.
    tf.keras.backend.clear_session()
    tf.keras.utils.set_random_seed(SEED)
    tf.config.experimental.enable_op_determinism()

    # Building a Multi-layer perceptron model
    mlp = tf.keras.Sequential([
        tf.keras.layers.Dense(318, activation="relu"),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(512, activation="relu"),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(3, activation="softmax"),  # one output per class (0/1/2)
    ])

    mlp.compile(optimizer=tf.optimizers.Adam(learning_rate=0.001), metrics=['accuracy'], loss="sparse_categorical_crossentropy")

    X_train, X_val, y_train, y_val = train_test_split(df, label, test_size=0.10, stratify=label, random_state=i)

    X_train_norm, X_val_norm, scaler = getNormalizedData("std", X_train, X_val)

    # Keep the fitted scaler of this split so the same transform can be reapplied later.
    joblib.dump(scaler, os.path.join(os.getcwd(), "Tree", "Scaler_fold_v_" + str(i+1) + ".pkl"))

    print(f"Training data: {X_train_norm.shape}")
    print(f"Validation data: {X_val_norm.shape}")
    print()

    print(f"Training data y: {y_train.shape}")
    print(f"Validation data y: {y_val.shape}")
    print()

    classifiers_name = ["MLP", "LogisticRegression", "RandomForest", "LGBM", "ExtraTrees", "XGBR", "CatBoost", "HistGradientBoosting"]

    # max_iter raised from the default: with the default limit the solver
    # stopped before converging on this data (ConvergenceWarning in every split).
    lr = LogisticRegression(max_iter=1000)
    rf = RandomForestClassifier(n_estimators=100, criterion='gini', max_features='log2', max_depth=11, min_samples_split=2, random_state=SEED)
    lgbm = LGBMClassifier(n_estimators=50, boosting_type='dart', max_depth=11, num_leaves=10, learning_rate=0.45, random_state=SEED)
    exrf = ExtraTreesClassifier(n_estimators=50, criterion='gini', max_features='sqrt', max_depth=14, min_samples_split=12, random_state=SEED)
    xgb = XGBClassifier(n_estimators=100, max_depth=5, learning_rate=0.15, booster='gbtree', tree_method='auto', random_state=SEED)
    catbr = CatBoostClassifier(n_estimators=50, learning_rate=0.2, max_depth=8, boosting_type='Plain', loss_function="MultiClass", verbose=False, random_state=SEED)
    hr = HistGradientBoostingClassifier(learning_rate=0.36, max_leaf_nodes=31, min_samples_leaf=20, max_iter=120, max_depth=3, random_state=SEED)

    clfs = [mlp, lr, rf, lgbm, exrf, xgb, catbr, hr]

    for name, clf in zip(classifiers_name, clfs):
        is_mlp = name == "MLP"

        print(f"Training starts for {name} :")
        strt_time = time.time()
        if is_mlp:
            # NOTE(review): early stopping monitors the same validation split
            # that is reported below, so the MLP's validation scores are
            # selected on that split.
            clf.fit(X_train_norm, y_train, validation_data=(X_val_norm, y_val), epochs=20, batch_size=512, verbose=0,
                    callbacks=[tf.keras.callbacks.EarlyStopping(patience=3, monitor='val_loss', mode='auto', restore_best_weights=True)])
        else:
            # Only the scikit-learn style estimators take the plain fit call.
            # Running it for the Keras model as well would train it a second
            # time with default settings, past the restored best weights.
            clf.fit(X_train_norm, y_train)
        print(f"Total time taken to train {name} is {time.time() - strt_time} sec.")
        print()

        if is_mlp:
            # The Keras model outputs class probabilities; take the most likely class.
            pred_train = np.argmax(clf.predict(X_train_norm), 1)
            pred_val = np.argmax(clf.predict(X_val_norm), 1)
        else:
            pred_train = clf.predict(X_train_norm)
            pred_val = clf.predict(X_val_norm)

        _printSplitReport("Training", y_train, pred_train)
        print("*"*51)
        print()
        print()
        _printSplitReport("Validation", y_val, pred_val)

Fold 1:
Training data: (361322, 318)
Validation data: (40147, 318)

Training data y: (361322,)
Validation data y: (40147,)

Training starts for MLP :
Total time taken to train MLP is 168.6224720478058 sec.

Confusion Matrix for Training data:
[[116836   3451   1235]
 [  6350  97274  16300]
 [  1559   7969 110348]]

Classification Report for Training data:
              precision    recall  f1-score   support

           0     0.9366    0.9614    0.9489    121522
           1     0.8949    0.8111    0.8510    119924
           2     0.8629    0.9205    0.8908    119876

    accuracy                         0.8980    361322
   macro avg     0.8981    0.8977    0.8969    361322
weighted avg     0.8983    0.8980    0.8971    361322



***************************************************


Confusion Matrix for Validation data:
[[12968   405   129]
 [  742 10706  1877]
 [  183   982 12155]]

Classification Report for Validation data:
              precision    recall  f1-score   support

    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Total time taken to train LogisticRegression is 53.761338233947754 sec.

Confusion Matrix for Training data:
[[93626 18351  9545]
 [24348 60573 35003]
 [15256 34094 70526]]

Classification Report for Training data:
              precision    recall  f1-score   support

           0     0.7027    0.7704    0.7350    121522
           1     0.5360    0.5051    0.5201    119924
           2     0.6129    0.5883    0.6003    119876

    accuracy                         0.6220    361322
   macro avg     0.6172    0.6213    0.6185    361322
weighted avg     0.6176    0.6220    0.6190    361322



***************************************************


Confusion Matrix for Validation data:
[[10513  1999   990]
 [ 2760  6684  3881]
 [ 1639  3784  7897]]

Classification Report for Validation data:
              precision    recall  f1-score   support

           0     0.7050    0.7786    0.7400     13502
           1     0.5361    0.5016    0.5183     13325
           2     0.6185    0.5929    0.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Total time taken to train LogisticRegression is 53.21032452583313 sec.

Confusion Matrix for Training data:
[[93728 18382  9412]
 [24406 60573 34945]
 [15089 34089 70698]]

Classification Report for Training data:
              precision    recall  f1-score   support

           0     0.7035    0.7713    0.7359    121522
           1     0.5358    0.5051    0.5200    119924
           2     0.6145    0.5898    0.6019    119876

    accuracy                         0.6227    361322
   macro avg     0.6179    0.6220    0.6192    361322
weighted avg     0.6183    0.6227    0.6198    361322



***************************************************


Confusion Matrix for Validation data:
[[10422  2009  1071]
 [ 2692  6706  3927]
 [ 1713  3861  7746]]

Classification Report for Validation data:
              precision    recall  f1-score   support

           0     0.7029    0.7719    0.7358     13502
           1     0.5332    0.5033    0.5178     13325
           2     0.6078    0.5815    0.5

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Total time taken to train LogisticRegression is 54.87801909446716 sec.

Confusion Matrix for Training data:
[[93733 18284  9505]
 [24321 60524 35079]
 [15087 34066 70723]]

Classification Report for Training data:
              precision    recall  f1-score   support

           0     0.7040    0.7713    0.7361    121522
           1     0.5362    0.5047    0.5200    119924
           2     0.6133    0.5900    0.6014    119876

    accuracy                         0.6227    361322
   macro avg     0.6179    0.6220    0.6192    361322
weighted avg     0.6182    0.6227    0.6197    361322



***************************************************


Confusion Matrix for Validation data:
[[10361  2036  1105]
 [ 2702  6751  3872]
 [ 1666  3798  7856]]

Classification Report for Validation data:
              precision    recall  f1-score   support

           0     0.7034    0.7674    0.7340     13502
           1     0.5364    0.5066    0.5211     13325
           2     0.6122    0.5898    0.6

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Total time taken to train LogisticRegression is 53.49441075325012 sec.

Confusion Matrix for Training data:
[[93759 18327  9436]
 [24336 60689 34899]
 [15104 34082 70690]]

Classification Report for Training data:
              precision    recall  f1-score   support

           0     0.7039    0.7715    0.7362    121522
           1     0.5366    0.5061    0.5209    119924
           2     0.6146    0.5897    0.6019    119876

    accuracy                         0.6231    361322
   macro avg     0.6184    0.6224    0.6196    361322
weighted avg     0.6187    0.6231    0.6202    361322



***************************************************


Confusion Matrix for Validation data:
[[10422  2044  1036]
 [ 2754  6745  3826]
 [ 1657  3848  7815]]

Classification Report for Validation data:
              precision    recall  f1-score   support

           0     0.7026    0.7719    0.7356     13502
           1     0.5338    0.5062    0.5196     13325
           2     0.6165    0.5867    0.6

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Total time taken to train LogisticRegression is 54.19049406051636 sec.

Confusion Matrix for Training data:
[[93750 18329  9443]
 [24427 60616 34881]
 [15115 34115 70646]]

Classification Report for Training data:
              precision    recall  f1-score   support

           0     0.7033    0.7715    0.7358    121522
           1     0.5361    0.5055    0.5203    119924
           2     0.6145    0.5893    0.6016    119876

    accuracy                         0.6227    361322
   macro avg     0.6180    0.6221    0.6193    361322
weighted avg     0.6184    0.6227    0.6198    361322



***************************************************


Confusion Matrix for Validation data:
[[10376  2066  1060]
 [ 2684  6661  3980]
 [ 1706  3764  7850]]

Classification Report for Validation data:
              precision    recall  f1-score   support

           0     0.7027    0.7685    0.7341     13502
           1     0.5333    0.4999    0.5160     13325
           2     0.6090    0.5893    0.5