Load essential libraries

In [None]:
## Load essential libraries
import numpy as np
import pandas as pd
import sys
import matplotlib.pyplot as plt
import matplotlib.cm as cm
plt.style.use('seaborn-whitegrid')
%matplotlib inline

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

Mount the Google Drive folder, if needed, for accessing data

In [None]:
# ## Mount the Google Drive folder, if needed, for accessing data
# if('google.colab' in sys.modules):
#     from google.colab import drive
#     drive.mount('/content/drive', force_remount = True)
#     # Change path below starting from /content/drive/MyDrive/Colab Notebooks/
#     # depending on how data is organized inside your Colab Notebooks folder in
#     # Google Drive
#     DIR = '/content/drive/MyDrive/Colab Notebooks/MAHE/MSIS Coursework/OddSem2023MAHE'
#     DATA_DIR = DIR+'/Data/'
# else:
#     DATA_DIR = 'Data/'

Load the ICU dataset

In [None]:
## Load the ICU dataset
#FILENAME = DATA_DIR + 'ICU_Complete.csv'
FILENAME =  'Data/ICU_Complete.csv'
df = pd.read_csv(FILENAME)
print('ICU dataset')
print('-----------')
print(df.info(verbose = True))

in hospital death is target variable

Create lists of categorical and continuous features

In [None]:
## Create lists of categorical and continuous features
categorical_features = [ 'In-hospital_death','Gender', 'MechVent']
continuous_features = df.columns[~df.columns.isin(categorical_features)].to_list()


target_variable= 'In-hospital_death'
categorical_features.remove(target_variable)

print("categorical_features")
print(categorical_features)
print("continuous_features")
print(continuous_features)

Assign 'category' datatype to categorical columns

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Define the sigmoid function
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# Create an array of x values from -10 to 10
x = np.linspace(-10, 10, 400)

# Calculate the corresponding y values using the sigmoid function
y = sigmoid(x)

# Create the plot
plt.figure(figsize=(8, 6))
plt.plot(x, y, label='Sigmoid Function', color='b')
plt.xlabel('x')
plt.ylabel('Sigmoid(x)')
plt.title('Sigmoid Function')
plt.grid(True)
plt.legend()
plt.show()


One-hot encode the categorical features

In [None]:
## One-hot encode the categorical features
df = pd.concat([df, pd.get_dummies(df[categorical_features])], axis = 1).drop(categorical_features, axis = 1)

In [None]:
# pd.get_dummies(df[categorical_features])

Label encode the target variable

In [None]:
## Label encode the output label
labenc = LabelEncoder()
df[target_variable] = labenc.fit_transform(df[target_variable])

How balanced is the dataset w.r.t. the target variable?

In [None]:
## How balanced is the dataset w.r.t. the
## target variable 'In-hospital_death'?
print(np.mean(df['In-hospital_death'] == 0)*100)
print(np.mean(df['In-hospital_death'] == 1)*100)
df['In-hospital_death'].value_counts().plot(kind = 'barh')

The final dataframe for analysis

In [None]:
## The final dataframe for analysis
df.head(5)

Stratified train and test split of the data

In [None]:
## Stratified train and test split of the data
X = df.drop('In-hospital_death', axis = 1)
y = df['In-hospital_death']
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    stratify = y,
                                                    test_size = 0.2,
                                                    random_state = 1)

Check the proportion of output labels in train and test set

In [None]:
## Check the proportion of output labels in train and test set
print('Proportion of 1-to-0 labels in train set = %f, test set = %f\n'%
      (np.mean(y_train == 1), np.mean(y_test == 1)))

Logistic regression starts from here

In [None]:
num_samples =  X.shape[0] # number of samples
num_features = X.shape[1] # number of features (a.k.a. dimensionality)
num_labels = len(np.unique(y)) # number of unique target variable labels

Add bias feature to all samples

In [20]:
num_features

16

In [25]:
## Add bias feature to all samples
X = np.hstack([X, np.ones((X.shape[0], 1))])
num_features += 1

Initial weight vector including bias term

In [26]:
## Initial weight vector including bias term

w = np.random.choice(np.arange(-0.1, 0.1), size=num_features, replace = True)
print('w = ')
print(w)

w = 
[-0.1 -0.1 -0.1 -0.1 -0.1 -0.1 -0.1 -0.1 -0.1 -0.1 -0.1 -0.1 -0.1 -0.1
 -0.1 -0.1 -0.1]


Define sigmoid function

In [28]:
## Define sigmoid function
sigmoid = lambda x : 1 / (1+np.exp(-x))

Calculate raw and sigmoid-activated scores for all samples

In [29]:
## Calculate raw scores for all samples
z = np.dot(X, w)

## Calculate sigmoid-activated scores for all samples
a = sigmoid(z)

Calculate average data loss

In [30]:
## Calculate total average data loss
loss_data = np.mean(y*np.log(a) + (1-y)*np.log(1-a))

Calculate regularization loss

In [31]:
## Calculate regularization loss
reg = 0.1 # regularization strength
loss_reg = reg * np.sum(w * w)

Calculate total loss as sum of data loss and regularization loss

In [32]:
## Calculate total loss as sum of data loss and regularization loss
loss = loss_data + loss_reg
print('Total loss = data loss + regularization loss')
print(loss)

Total loss = data loss + regularization loss
-15.076904277848369


Calculate the gradient w.r.t. weights

In [33]:
## Calculate the gradient w.r.t. weights
## Sum of gradient of average data loss and the gradient of the regularization loss
dw = (1 / num_samples) * np.dot(X.T, (a - y)) + 2 * reg * np.hstack([w[:-1], 0])
print('Gradient vector = ')
print(dw)

Gradient vector = 
[ -2.06741744  -9.22321761 -22.50060965  -8.02550381 -21.52508044
 -12.53058425  -4.87770533  -5.47215919 -23.79984759 -27.34387807
  -1.9795597  -11.36949196  -0.07884843  -0.09451312  -0.03312447
  -0.14023709  -0.13336156]
