In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from math import sqrt
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score

# Root directory that contains the nested scenario/day/patient folders
root_dir = 'Dataset'
pd.set_option('display.max_rows', 500)
# One dataframe per CSV file found for patientJ in the selected scenario
dfs = []
scenario = 1
# Let os.path.join supply the separators instead of hard-coding the
# Windows-only '\\', so the same script also runs on Linux/macOS.
folder1_path = os.path.join(root_dir, "scenario" + str(scenario))
if os.path.isdir(folder1_path):
    # os.listdir returns entries in arbitrary order; sorting makes the order
    # of `dfs` (and everything derived from it) reproducible.
    for folder2 in sorted(os.listdir(folder1_path)):
        folder2_path = os.path.join(folder1_path, folder2, 'patientJ')
        if not os.path.isdir(folder2_path):
            continue
        lista = sorted(os.listdir(folder2_path))
        if lista:
            # Move the last entry to the front (guarded so an empty folder
            # no longer raises IndexError on pop()).
            # NOTE(review): presumably the file for the lowest initial
            # condition sorts last by name (e.g. "80" after "100") - confirm
            # against the dataset's file naming.
            lista.insert(0, lista.pop())
        # The first CSV of each day is labelled 80, each following one +20
        initial_condition = 80
        for file in lista:
            if file.endswith('.csv'):
                file_path = os.path.join(folder2_path, file)
                df = pd.read_csv(file_path)
                # Record the day folder name and the running label as columns
                df['day'] = folder2
                df['initial_condition'] = initial_condition
                dfs.append(df)
                initial_condition += 20

# Concatenate all dataframes into a single dataframe
combined_df = pd.concat(dfs, ignore_index=True)

# Count the FALSE vs TRUE values in the target column "hazard_flag".
# The result is printed explicitly: a bare expression is only echoed when it
# is the last statement of a notebook cell, and never when run as a script.
print(combined_df['hazard_flag'].value_counts())

# Pre-process data
# Check for missing values: one heatmap row per column, one column per sample
sns.heatmap(combined_df.isna().transpose(),
            cmap="plasma",
            cbar_kws={'label': 'Missing Data'})
plt.show()
# Show the first rows (printed for the same reason as the counts above)
print(combined_df.head())

# Discard empty and unnecessary columns and store both flags as integers.
# The frames in `dfs` are modified in place because later steps reuse them.
for df in dfs:
    df.drop(labels=['Unnamed: 0',
                    'unsafe_action_reason',
                    'alert_msg',
                    'hazard_msg',
                    'detection'], axis=1, inplace=True)
    df["hazard_flag"] = df["hazard_flag"].astype(int)
    df["faultinjection"] = df["faultinjection"].astype(int)

# Correlation between variables
combined_df = pd.concat(dfs, ignore_index=True)
# 'day' holds folder names (strings). DataFrame.corr() stopped silently
# skipping non-numeric columns in pandas 2.0 and raises instead, so the
# numeric/boolean columns are selected explicitly (works on every version).
numeric_df = combined_df.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_df.corr(), annot=True, annot_kws={"size": 5},
            square=True, cmap='plasma', cbar=False)
# Finish this figure so later plots do not draw on top of the heatmap
plt.show()

# Fit a random forest on every frame, report its scores, and keep only the
# frames whose rounded test MAE is non-zero.
dfs_importance = []
# enumerate supplies the frame index; the original counter was never
# incremented, so every frame was reported as 0.
for i, df in enumerate(dfs):
    X = df.filter(['bg',
                   'eq_BG',
                   'CGM_glucose',
                   'BGI',
                   'Label'])
    Y = df['hazard_flag']
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42, shuffle=True)

    rf = RandomForestRegressor(random_state=0).fit(X_train, y_train.values.ravel())
    predictions_rf = rf.predict(X_test)
    # Computed once and reused for both the report and the keep/drop decision
    mae = round(mean_absolute_error(y_test, predictions_rf), 2)
    print(i)
    print('Model score:',              round(rf.score(X_test, y_test), 2))
    print('Mean absolute error:',      mae)
    print('Root mean squared error:',  round(sqrt(mean_squared_error(y_test, predictions_rf)), 2))
    print('R2:',                       round(r2_score(y_test, predictions_rf), 2))

    # Drop frames that have a single class (only one hazard type); a rounded
    # MAE of exactly 0 is used as the indicator for that case.
    if mae != 0:
        dfs_importance.append(df)
        print('Kept')

# Concatenate all retained dataframes into a single dataframe
combined_df_importance = pd.concat(dfs_importance, ignore_index=True)

# Feature importances of one forest fitted on all retained frames together
feature_cols = ['bg', 'eq_BG', 'CGM_glucose', 'BGI', 'Label']
X = combined_df_importance.filter(feature_cols)
Y = combined_df_importance['hazard_flag']
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42, shuffle=True)

rf = RandomForestRegressor(random_state=0)
rf.fit(X_train, y_train.values.ravel())
features = list(X_train.columns)

# Gini importance: pair each feature with its rounded score, highest first
importances = sorted(
    ((name, round(score, 2))
     for name, score in zip(features, rf.feature_importances_)),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print out the feature and importances
for pair in importances:
    print('Variable: {:30} Importance: {}'.format(*pair))

# Train one linear SGD classifier incrementally, one retained frame per
# batch, and record the per-class hit rate on that frame's test split.
performance = []
cla = SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0,
        fit_intercept=True, l1_ratio=0.15, learning_rate='optimal',
        loss='hinge', max_iter=10, n_jobs=1, penalty='l2', power_t=0.5,
        random_state=None, shuffle=False,
        verbose=0, warm_start=True)

for df in dfs_importance:
    X = df.filter(['BGI',
                   'Label'])
    Y = df['hazard_flag']
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42, shuffle=True)
    cla.partial_fit(X_train, y_train, classes=[0, 1])
    # Predict once and reuse the result for the matrix and the later report
    y_pred = cla.predict(X_test)
    # labels=[0, 1] keeps the matrix 2x2 even when a test split contains a
    # single class; otherwise `performance` becomes ragged and cannot be
    # plotted. A class with no test samples yields NaN (a gap in the plot).
    conf = confusion_matrix(y_test, y_pred, labels=[0, 1])
    performance.append(np.diag(conf) / np.sum(conf, axis=1))

plt.plot(performance)
plt.xlabel('training batches')
plt.legend(['False', 'True'])
plt.ylabel('accuracy')

# from_predictions already draws the matrix on a new figure; the extra
# disp.plot() call that used to follow produced a second, duplicate figure.
# y_test / y_pred are those of the last batch processed by the loop above.
disp = ConfusionMatrixDisplay.from_predictions(y_test, y_pred, labels=[0, 1],
                              display_labels=cla.classes_, cmap='plasma')
plt.show()

target_names = ['False', 'True']
# labels=[0, 1] keeps the report aligned with the two target names even when
# one class is missing from the last batch.
print(classification_report(y_test, y_pred, labels=[0, 1], target_names=target_names))

# Feed-forward network on the two selected features of the retained frames
X = combined_df_importance.filter(['BGI',
                'Label'])
Y = combined_df_importance['hazard_flag']

# Ordered (unshuffled) split. Both halves use the same boundary so that no
# row is lost; the previous `:3471` / `3472:` slices dropped row 3471.
split_idx = 3471
X_train = X.iloc[:split_idx, :]
X_test = X.iloc[split_idx:, :]
y_train = Y.iloc[:split_idx]
y_test = Y.iloc[split_idx:]

classifier = Sequential()

# Adding the input layer and the first hidden layer
classifier.add(Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu', input_dim = 2))

# Adding the second hidden layer
classifier.add(Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu'))

# Adding the output layer
classifier.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))

# Compiling the ANN with the Adam optimizer and a mean-squared-error loss
classifier.compile(optimizer = 'adam', loss = 'mean_squared_error', metrics = ['accuracy'])

# Fitting the ANN to the Training set
classifier.fit(X_train, y_train, batch_size = 50, epochs = 50,verbose = 0)

score, acc = classifier.evaluate(X_train, y_train,
                            batch_size=10)
print('Train score:', score)
print('Train accuracy:', acc)

# Predicting the Test set results
y_pred = classifier.predict(X_test)
# NOTE(review): this 0.01 cut-off is much lower than the threshold the
# 'accuracy' metric presumably applies, so the confusion matrix below may not
# match the printed test accuracy - confirm the low cut-off is intentional.
y_pred = (y_pred > 0.01)

print('*'*20)
score, acc = classifier.evaluate(X_test, y_test,
                            batch_size=5)
print('Test score:', score)
print('Test accuracy:', acc)
# Making the Confusion Matrix (confusion_matrix is imported at the top of
# the file, so the duplicate import that used to sit here was removed)
cm = confusion_matrix(y_test, y_pred)

p = sns.heatmap(pd.DataFrame(cm), annot=True, cmap="plasma" ,fmt='g')
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
# Finish this figure so the next plot does not draw on top of the heatmap
plt.show()
target_names = ['False', 'True']
print(classification_report(y_test, y_pred, target_names=target_names))

#### For general Population #####
generalP = ["patientA",
                "patientB",
                "patientC",
                "patientD",
                "patientE",
                "patientF",
                "patientG",
                "patientH",
                "patientI",
                "patientJ"]
# Root directory that contains the nested scenario/day/patient folders
root_dir = 'Dataset'
pd.set_option('display.max_rows', 500)
# One dataframe per CSV file, across every patient in generalP
dfs_g = []

scenario = 1
# Let os.path.join supply the separators instead of hard-coding the
# Windows-only '\\', so the same script also runs on Linux/macOS.
folder1_path = os.path.join(root_dir, "scenario" + str(scenario))
for patient in generalP:
    if not os.path.isdir(folder1_path):
        continue
    # os.listdir returns entries in arbitrary order; sorting makes the order
    # of `dfs_g` (and everything derived from it) reproducible.
    for folder2 in sorted(os.listdir(folder1_path)):
        folder2_path = os.path.join(folder1_path, folder2, patient)
        if not os.path.isdir(folder2_path):
            continue
        lista = sorted(os.listdir(folder2_path))
        if lista:
            # Move the last entry to the front (guarded so an empty folder
            # no longer raises IndexError on pop()).
            # NOTE(review): presumably the file for the lowest initial
            # condition sorts last by name (e.g. "80" after "100") - confirm
            # against the dataset's file naming.
            lista.insert(0, lista.pop())
        # The first CSV of each day is labelled 80, each following one +20
        initial_condition = 80
        for file in lista:
            if file.endswith('.csv'):
                file_path = os.path.join(folder2_path, file)
                df = pd.read_csv(file_path)
                # Record the day folder name and the running label as columns
                df['day'] = folder2
                df['initial_condition'] = initial_condition
                dfs_g.append(df)
                initial_condition += 20

# Concatenate all general-population dataframes into a single dataframe.
# This previously concatenated `dfs` (the patientJ-only list) by mistake.
combined_df_g = pd.concat(dfs_g, ignore_index=True)

# Remove the unused columns from every general-population frame (in place)
# and store the two flag columns as integers.
unused_columns = ['Unnamed: 0',
                  'unsafe_action_reason',
                  'alert_msg',
                  'hazard_msg',
                  'detection']
for df in dfs_g:
    df.drop(columns=unused_columns, inplace=True)
    for flag_column in ("hazard_flag", "faultinjection"):
        df[flag_column] = df[flag_column].astype(int)

# Keep only the general-population frames on which a random forest has a
# non-zero rounded test MAE; j counts how many frames are kept.
dfs_importance_g = []
j = 0
for df in dfs_g:
    X = df.filter(['bg', 'eq_BG', 'CGM_glucose', 'BGI', 'Label'])
    Y = df['hazard_flag']
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.3, random_state=42, shuffle=True)

    rf = RandomForestRegressor(random_state=0)
    rf.fit(X_train, y_train.values.ravel())
    predictions_rf = rf.predict(X_test)

    # Skip frames that have a single class (only one hazard type)
    if round(mean_absolute_error(y_test, predictions_rf), 2) == 0:
        continue
    dfs_importance_g.append(df)
    j += 1


# Merge the retained frames into a single dataframe
combined_df_importance_g = pd.concat(dfs_importance_g, ignore_index=True)
print(j)

# Fit one forest on all retained general-population frames and rank features
X = combined_df_importance_g.filter(['bg', 'eq_BG', 'CGM_glucose', 'BGI', 'Label'])
Y = combined_df_importance_g['hazard_flag']
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42, shuffle=True)

rf = RandomForestRegressor(random_state=0)
rf.fit(X_train, y_train.values.ravel())
features = list(X_train.columns)

# Gini importance, rounded to two decimals and ordered from highest to lowest
importances = [(feature, round(importance, 2))
               for feature, importance in zip(features, rf.feature_importances_)]
importances.sort(key=lambda pair: pair[1], reverse=True)

# Print out the feature and importances
for pair in importances:
    print('Variable: {:30} Importance: {}'.format(*pair))

# Train one linear SGD classifier incrementally, one retained
# general-population frame per batch, and record the per-class hit rate on
# that frame's test split.
performance = []
cla = SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0,
        fit_intercept=True, l1_ratio=0.15, learning_rate='optimal',
        loss='hinge', max_iter=10, n_jobs=1, power_t=0.5,
        random_state=None, shuffle=False,
        verbose=0, warm_start=True)

for df in dfs_importance_g:
    X = df.filter(['BGI',
                   'Label'])
    Y = df['hazard_flag']
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42, shuffle=True)
    cla.partial_fit(X_train, y_train, classes=[0, 1])
    # Predict once and reuse the result for the matrix and the later report
    y_pred = cla.predict(X_test)
    # labels=[0, 1] keeps the matrix 2x2 even when a test split contains a
    # single class; otherwise `performance` becomes ragged and cannot be
    # plotted. A class with no test samples yields NaN (a gap in the plot).
    conf = confusion_matrix(y_test, y_pred, labels=[0, 1])
    performance.append(np.diag(conf) / np.sum(conf, axis=1))

plt.plot(performance)
plt.xlabel('training batches')
plt.legend(['False', 'True'])
plt.ylabel('accuracy')

# from_predictions already draws the matrix on a new figure; the extra
# disp.plot() call that used to follow produced a second, duplicate figure.
# y_test / y_pred are those of the last batch processed by the loop above.
disp = ConfusionMatrixDisplay.from_predictions(y_test, y_pred, labels=[0, 1],
                              display_labels=cla.classes_, cmap='plasma')

plt.show()

target_names = ['False', 'True']
# labels=[0, 1] keeps the report aligned with the two target names even when
# one class is missing from the last batch.
print(classification_report(y_test, y_pred, labels=[0, 1], target_names=target_names))

# Feed-forward network on the two selected features of the retained
# general-population frames
X = combined_df_importance_g.filter(['BGI',
                'Label'])
Y = combined_df_importance_g['hazard_flag']

# Ordered (unshuffled) split. Both halves use the same boundary so that no
# row is lost; the previous `:29500` / `29501:` slices dropped row 29500.
split_idx = 29500
X_train = X.iloc[:split_idx, :]
X_test = X.iloc[split_idx:, :]
y_train = Y.iloc[:split_idx]
y_test = Y.iloc[split_idx:]

classifier = Sequential()

# Adding the input layer and the first hidden layer
classifier.add(Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu', input_dim = 2))

# Adding the second hidden layer
classifier.add(Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu'))

# Adding the output layer
classifier.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))

# Compiling the ANN with the Adam optimizer and a mean-squared-error loss
classifier.compile(optimizer = 'adam', loss = 'mean_squared_error', metrics = ['accuracy'])

# Fitting the ANN to the Training set
classifier.fit(X_train, y_train, batch_size = 50, epochs = 50,verbose = 0)

score, acc = classifier.evaluate(X_train, y_train,
                            batch_size=20)
print('Train score:', score)
print('Train accuracy:', acc)

print('*'*20)
score, acc = classifier.evaluate(X_test, y_test,
                            batch_size=5)
print('Test score:', score)
print('Test accuracy:', acc)

# Making the Confusion Matrix
# Predicting the Test set results
y_pred = classifier.predict(X_test)
# NOTE(review): this 0.00001 cut-off is far lower than the threshold the
# 'accuracy' metric presumably applies, so the confusion matrix below may not
# match the printed test accuracy - confirm the low cut-off is intentional.
y_pred = (y_pred > 0.00001)
cm = confusion_matrix(y_test, y_pred)

p = sns.heatmap(pd.DataFrame(cm), annot=True, cmap="plasma" ,fmt='g')
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
# Display the figure; without this call nothing is shown when run as a script
plt.show()
target_names = ['False', 'True']
print(classification_report(y_test, y_pred, target_names=target_names))

#### Using K-fold Cross Validation ####
def build_classifier(input_dim=2):
    """Build and compile the binary-classification network used for K-fold CV.

    The network has two hidden ReLU layers of 6 units each and a single
    sigmoid output unit, compiled with Adam and binary cross-entropy.

    Args:
        input_dim: Number of input features. Defaults to 2 because every
            caller in this file passes the two-column ('BGI', 'Label')
            feature matrix; the previously hard-coded value of 12 did not
            match that input.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    classifier = Sequential()
    classifier.add(Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu', input_dim = input_dim))
    classifier.add(Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu'))
    classifier.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))
    classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
    return classifier

## For general population
# 10-fold cross-validation of the Keras model on the current training split
classifier = KerasClassifier(
    build_fn=build_classifier,
    batch_size=10,
    epochs=100,
    verbose=0,
)
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)
# Note: `variance` holds the standard deviation of the fold accuracies
mean, variance = accuracies.mean(), accuracies.std()

mean_line = 'Mean accuracy score of 10 different models using Kfold cross validation: {}'
std_line = 'Standard Deviation of accuracy score of 10 different models using Kfold cross validation: {}'
print(mean_line.format(mean))
print(std_line.format(variance))

## Per patient
# Rebind X / Y to the single-patient data first. Without this the slices
# below were taken from the general-population features that X / Y still
# held from the preceding section.
X = combined_df_importance.filter(['BGI',
                'Label'])
Y = combined_df_importance['hazard_flag']

# Ordered (unshuffled) split. Both halves use the same boundary so that no
# row is lost; the previous `:3471` / `3472:` slices dropped row 3471.
split_idx = 3471
X_train = X.iloc[:split_idx, :]
X_test = X.iloc[split_idx:, :]
y_train = Y.iloc[:split_idx]
y_test = Y.iloc[split_idx:]

# 10-fold cross-validation of the Keras model on the per-patient training split
classifier = KerasClassifier(build_fn = build_classifier, batch_size = 10, epochs = 100,verbose=0)
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10)
mean = accuracies.mean()
# Note: `variance` holds the standard deviation of the fold accuracies
variance = accuracies.std()

print('Mean accuracy score of 10 different models using Kfold cross validation: {}'.format(mean))
print('Standard Deviation of accuracy score of 10 different models using Kfold cross validation: {}'.format(variance))