#  LOT perdiciton Model Training
The task is to predict if the LOS > 3 Days by logistic regression.

This document is a based on [this code](https://github.com/MLforHealth/MIMIC_Extract/blob/master/notebooks/Baselines%20for%20Mortality%20and%20LOS%20prediction%20-%20Sklearn.ipynb)

In [1]:
from __future__ import print_function, division

In [2]:
import os, pickle, pandas as pd, numpy as np, scipy.stats as ss

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score, roc_auc_score, accuracy_score, f1_score

import torch


In [3]:
DATA_FILEPATH     = './mimic_data/final/grouping_5/all_hourly_data.h5'
RAW_DATA_FILEPATH = './mimic_data/final/nogrouping_5/all_hourly_data.h5'
GAP_TIME          = 6  # In hours
WINDOW_SIZE       = 24 # In hours
SEED              = 1
ID_COLS           = ['subject_id', 'hadm_id', 'icustay_id']
GPU               = '2'

os.environ['CUDA_VISIBLE_DEVICES'] = GPU
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f9392ed1fc0>

In [4]:
data_full_lvl2 = pd.read_hdf(DATA_FILEPATH, 'vitals_labs')
data_full_raw  = pd.read_hdf(RAW_DATA_FILEPATH, 'vitals_labs') 
statics        = pd.read_hdf(DATA_FILEPATH, 'patients')

In [5]:
class DictDist():
    def __init__(self, dict_of_rvs): self.dict_of_rvs = dict_of_rvs
    def rvs(self, n):
        a = {k: v.rvs(n) for k, v in self.dict_of_rvs.items()}
        out = []
        for i in range(n): out.append({k: vs[i] for k, vs in a.items()})
        return out
    
class Choice():
    def __init__(self, options): self.options = options
    def rvs(self, n): return [self.options[i] for i in ss.randint(0, len(self.options)).rvs(n)]

In [6]:
data_full_lvl2.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,LEVEL2,alanine aminotransferase,alanine aminotransferase,alanine aminotransferase,albumin,albumin,albumin,albumin ascites,albumin ascites,albumin ascites,albumin pleural,...,white blood cell count,white blood cell count urine,white blood cell count urine,white blood cell count urine,ph,ph,ph,ph urine,ph urine,ph urine
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Aggregation Function,count,mean,std,count,mean,std,count,mean,std,count,...,std,count,mean,std,count,mean,std,count,mean,std
subject_id,hadm_id,icustay_id,hours_in,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
3,145834,211552,0,2.0,25.0,0.0,2.0,1.8,0.0,0.0,,,0.0,...,4.012837,0.0,,,9.0,7.4,0.147733,1.0,5.0,
3,145834,211552,1,0.0,,,0.0,,,0.0,,,0.0,...,,0.0,,,0.0,,,0.0,,
3,145834,211552,2,0.0,,,0.0,,,0.0,,,0.0,...,,0.0,,,3.0,7.26,0.0,0.0,,
3,145834,211552,3,0.0,,,0.0,,,0.0,,,0.0,...,,0.0,,,0.0,,,0.0,,
3,145834,211552,4,0.0,,,0.0,,,0.0,,,0.0,...,,0.0,,,0.0,,,0.0,,


In [7]:
data_full_raw.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,itemid,51,51,51,52,52,52,89,89,89,90,...,227465,227466,227466,227466,227467,227467,227467,227468,227468,227468
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,label,arterial bp [systolic],arterial bp [systolic],arterial bp [systolic],arterial bp mean,arterial bp mean,arterial bp mean,c.o. (fick),c.o. (fick),c.o. (fick),c.o.(thermodilution),...,prothrombin time,ptt,ptt,ptt,inr,inr,inr,fibrinogen,fibrinogen,fibrinogen
Unnamed: 0_level_2,Unnamed: 1_level_2,Unnamed: 2_level_2,LEVEL1,systolic blood pressure (arterial),systolic blood pressure (arterial),systolic blood pressure (arterial),mean blood pressure (arterial),mean blood pressure (arterial),mean blood pressure (arterial),cardiac output fick,cardiac output fick,cardiac output fick,cardiac output thermodilution,...,prothrombin time,partial thromboplastin time,partial thromboplastin time,partial thromboplastin time,prothrombin time,prothrombin time,prothrombin time,fibrinogen,fibrinogen,fibrinogen
Unnamed: 0_level_3,Unnamed: 1_level_3,Unnamed: 2_level_3,LEVEL2,systolic blood pressure,systolic blood pressure,systolic blood pressure,mean blood pressure,mean blood pressure,mean blood pressure,cardiac output fick,cardiac output fick,cardiac output fick,cardiac output thermodilution,...,prothrombin time pt,partial thromboplastin time,partial thromboplastin time,partial thromboplastin time,prothrombin time inr,prothrombin time inr,prothrombin time inr,fibrinogen,fibrinogen,fibrinogen
Unnamed: 0_level_4,Unnamed: 1_level_4,Unnamed: 2_level_4,Aggregation Function,count,mean,std,count,mean,std,count,mean,std,count,...,std,count,mean,std,count,mean,std,count,mean,std
subject_id,hadm_id,icustay_id,hours_in,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5,Unnamed: 15_level_5,Unnamed: 16_level_5,Unnamed: 17_level_5,Unnamed: 18_level_5,Unnamed: 19_level_5,Unnamed: 20_level_5,Unnamed: 21_level_5,Unnamed: 22_level_5,Unnamed: 23_level_5,Unnamed: 24_level_5
3,145834,211552,0,2.0,39.0,55.154329,2.0,159.5,140.714249,0.0,,,0.0,...,,0.0,,,0.0,,,0.0,,
3,145834,211552,1,4.0,77.75,7.088723,4.0,60.25,5.123475,0.0,,,0.0,...,,0.0,,,0.0,,,0.0,,
3,145834,211552,2,3.0,91.0,8.185353,3.0,71.0,5.0,0.0,,,0.0,...,,0.0,,,0.0,,,0.0,,
3,145834,211552,3,4.0,117.0,19.714631,4.0,84.75,12.230427,0.0,,,0.0,...,,0.0,,,0.0,,,0.0,,
3,145834,211552,4,1.0,102.0,,1.0,77.0,,0.0,,,0.0,...,,0.0,,,0.0,,,0.0,,


In [8]:
statics.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,gender,ethnicity,age,insurance,admittime,diagnosis_at_admission,dischtime,discharge_location,fullcode_first,dnr_first,...,outtime,los_icu,admission_type,first_careunit,mort_icu,mort_hosp,hospital_expire_flag,hospstay_seq,readmission_30,max_hours
subject_id,hadm_id,icustay_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
3,145834,211552,M,WHITE,76.526792,Medicare,2101-10-20 19:08:00,HYPOTENSION,2101-10-31 13:58:00,SNF,1.0,0.0,...,2101-10-26 20:43:09,6.06456,EMERGENCY,MICU,0,0,0,1,0,145
4,185777,294638,F,WHITE,47.845047,Private,2191-03-16 00:28:00,"FEVER,DEHYDRATION,FAILURE TO THRIVE",2191-03-23 18:41:00,HOME WITH HOME IV PROVIDR,1.0,0.0,...,2191-03-17 16:46:31,1.678472,EMERGENCY,MICU,0,0,0,1,0,40
6,107064,228232,F,WHITE,65.942297,Medicare,2175-05-30 07:15:00,CHRONIC RENAL FAILURE/SDA,2175-06-15 16:00:00,HOME HEALTH CARE,1.0,0.0,...,2175-06-03 13:39:54,3.672917,ELECTIVE,SICU,0,0,0,1,0,88
9,150750,220597,M,UNKNOWN/NOT SPECIFIED,41.790228,Medicaid,2149-11-09 13:06:00,HEMORRHAGIC CVA,2149-11-14 10:15:00,DEAD/EXPIRED,1.0,0.0,...,2149-11-14 20:52:14,5.323056,EMERGENCY,MICU,1,1,1,1,0,127
11,194540,229441,F,WHITE,50.148295,Private,2178-04-16 06:18:00,BRAIN MASS,2178-05-11 19:00:00,HOME HEALTH CARE,1.0,0.0,...,2178-04-17 20:21:05,1.58441,EMERGENCY,SICU,0,0,0,1,0,38


In [9]:
def simple_imputer(df):
    idx = pd.IndexSlice
    df = df.copy()
    if len(df.columns.names) > 2: df.columns = df.columns.droplevel(('label', 'LEVEL1', 'LEVEL2'))
    
    df_out = df.loc[:, idx[:, ['mean', 'count']]]
    icustay_means = df_out.loc[:, idx[:, 'mean']].groupby(ID_COLS).mean()
    
    df_out.loc[:,idx[:,'mean']] = df_out.loc[:,idx[:,'mean']].groupby(ID_COLS).fillna(
        method='ffill'
    ).groupby(ID_COLS).fillna(icustay_means).fillna(0)
    
    df_out.loc[:, idx[:, 'count']] = (df.loc[:, idx[:, 'count']] > 0).astype(float)
    df_out.rename(columns={'count': 'mask'}, level='Aggregation Function', inplace=True)
    
    is_absent = (1 - df_out.loc[:, idx[:, 'mask']])
    hours_of_absence = is_absent.cumsum()
    time_since_measured = hours_of_absence - hours_of_absence[is_absent==0].fillna(method='ffill')
    time_since_measured.rename(columns={'mask': 'time_since_measured'}, level='Aggregation Function', inplace=True)

    df_out = pd.concat((df_out, time_since_measured), axis=1)
    df_out.loc[:, idx[:, 'time_since_measured']] = df_out.loc[:, idx[:, 'time_since_measured']].fillna(100)
    
    df_out.sort_index(axis=1, inplace=True)
    return df_out

In [10]:
Ys = statics[statics.max_hours > WINDOW_SIZE + GAP_TIME][['mort_hosp', 'mort_icu', 'los_icu']]
Ys['los_3'] = Ys['los_icu'] > 3
Ys.drop(columns=['los_icu'], inplace=True)
Ys.astype(float)

lvl2, raw = [df[
    (df.index.get_level_values('icustay_id').isin(set(Ys.index.get_level_values('icustay_id')))) &
    (df.index.get_level_values('hours_in') < WINDOW_SIZE)
] for df in (data_full_lvl2, data_full_raw)]

raw.columns = raw.columns.droplevel(level=['label', 'LEVEL1', 'LEVEL2'])

train_frac, dev_frac, test_frac = 0.7, 0.1, 0.2
lvl2_subj_idx, raw_subj_idx, Ys_subj_idx = [df.index.get_level_values('subject_id') for df in (lvl2, raw, Ys)]
lvl2_subjects = set(lvl2_subj_idx)
assert lvl2_subjects == set(Ys_subj_idx), "Subject ID pools differ!"
assert lvl2_subjects == set(raw_subj_idx), "Subject ID pools differ!"

np.random.seed(SEED)
subjects, N = np.random.permutation(list(lvl2_subjects)), len(lvl2_subjects)
N_train, N_dev, N_test = int(train_frac * N), int(dev_frac * N), int(test_frac * N)
train_subj = subjects[:N_train]
dev_subj   = subjects[N_train:N_train + N_dev]
test_subj  = subjects[N_train+N_dev:]

[(lvl2_train, lvl2_dev, lvl2_test), (raw_train, raw_dev, raw_test), (Ys_train, Ys_dev, Ys_test)] = [
    [df[df.index.get_level_values('subject_id').isin(s)] for s in (train_subj, dev_subj, test_subj)] \
    for df in (lvl2, raw, Ys)
]

idx = pd.IndexSlice
lvl2_means, lvl2_stds = lvl2_train.loc[:, idx[:,'mean']].mean(axis=0), lvl2_train.loc[:, idx[:,'mean']].std(axis=0)
raw_means, raw_stds = raw_train.loc[:, idx[:,'mean']].mean(axis=0), raw_train.loc[:, idx[:,'mean']].std(axis=0)

lvl2_train.loc[:, idx[:,'mean']] = (lvl2_train.loc[:, idx[:,'mean']] - lvl2_means)/lvl2_stds
lvl2_dev.loc[:, idx[:,'mean']] = (lvl2_dev.loc[:, idx[:,'mean']] - lvl2_means)/lvl2_stds
lvl2_test.loc[:, idx[:,'mean']] = (lvl2_test.loc[:, idx[:,'mean']] - lvl2_means)/lvl2_stds

raw_train.loc[:, idx[:,'mean']] = (raw_train.loc[:, idx[:,'mean']] - raw_means)/raw_stds
raw_dev.loc[:, idx[:,'mean']] = (raw_dev.loc[:, idx[:,'mean']] - raw_means)/raw_stds
raw_test.loc[:, idx[:,'mean']] = (raw_test.loc[:, idx[:,'mean']] - raw_means)/raw_stds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


If raw or lvl2 data

In [None]:
# raw_train, raw_dev, raw_test, lvl2_train, lvl2_dev, lvl2_test = [
#     simple_imputer(df) for df in (raw_train, raw_dev, raw_test, lvl2_train, lvl2_dev, lvl2_test)
# ]
# raw_flat_train, raw_flat_dev, raw_flat_test, lvl2_flat_train, lvl2_flat_dev, lvl2_flat_test = [
#     df.pivot_table(index=['subject_id', 'hadm_id', 'icustay_id'], columns=['hours_in']) for df in (
#         raw_train, raw_dev, raw_test, lvl2_train, lvl2_dev, lvl2_test
#     )
# ]

# for df in lvl2_train, lvl2_dev, lvl2_test, raw_train, raw_dev, raw_test: assert not df.isnull().any().any()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [11]:
lvl2_train, lvl2_dev, lvl2_test = [ simple_imputer(df) for df in ( lvl2_train, lvl2_dev, lvl2_test)]
lvl2_flat_train, lvl2_flat_dev, lvl2_flat_test = [df.pivot_table(index=['subject_id', 'hadm_id', 'icustay_id'], 
                                                    columns=['hours_in']) for df in (lvl2_train, lvl2_dev, lvl2_test) ]

for df in lvl2_train, lvl2_dev, lvl2_test: assert not df.isnull().any().any()
for df in lvl2_flat_train, lvl2_flat_dev, lvl2_flat_test: assert not df.isnull().any().any()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [None]:
# # Group DataFrames in a list
# dataframes = [raw_flat_train, raw_flat_dev, raw_flat_test, lvl2_flat_train, lvl2_flat_dev, lvl2_flat_test ]
# raw_flat_train.to_csv('raw_flat_train.csv')
# raw_flat_dev.to_csv('raw_flat_dev.csv')
# raw_flat_test.to_csv('raw_flat_test.csv')
# lvl2_flat_train.to_csv('lvl2_flat_train.csv')
# lvl2_flat_dev.to_csv('lvl2_flat_dev.csv')
# lvl2_flat_test.to_csv('lvl2_flat_test.csv')
# Ys_train.to_csv('Ys_train.csv')
# Ys_dev.to_csv('Ys_dev.csv')
# Ys_test.to_csv('Ys_test.csv')

In [None]:
# raw_flat_train = pd.read_csv('raw_flat_train.csv', index_col=[0,1,2])
# raw_flat_dev = pd.read_csv('raw_flat_dev.csv', index_col=[0,1,2])
# raw_flat_test = pd.read_csv('raw_flat_test.csv', index_col=[0,1,2])
# lvl2_flat_train = pd.read_csv('lvl2_flat_train.csv', index_col=[0,1,2])
# lvl2_flat_dev = pd.read_csv('lvl2_flat_dev.csv', index_col=[0,1,2])
# lvl2_flat_test = pd.read_csv('lvl2_flat_test.csv', index_col=[0,1,2])
# Ys_train = pd.read_csv('Ys_train.csv', index_col=[0])
# Ys_dev = pd.read_csv('Ys_dev.csv', index_col=[0])
# Ys_test = pd.read_csv('Ys_test.csv', index_col=[0])

In [12]:
Ys_test['los_3'].head

<bound method NDFrame.head of subject_id  hadm_id  icustay_id
32          175413   295037         True
33          176176   296681        False
42          119203   210828        False
44          181750   291554         True
61          176332   252348        False
64          172056   232593        False
65          143430   244776         True
68          170467   294232         True
98          188606   216929        False
99          187373   251343        False
106         145167   252051         True
123         195632   227264        False
124         172461   255660         True
141         168006   234668         True
152         117181   279643        False
160         161672   257626        False
172         148505   235343        False
211         193975   270493         True
251         117937   230307        False
253         176189   272631        False
269         106296   206613         True
270         188028   220345        False
281         111199   257572         

In [13]:

hyperparams = dict({
    'C': 0.18544999360231632,
    'penalty': 'l2',
    'solver': 'liblinear',
    'max_iter': 100
})
np.random.seed(SEED)

In [14]:
def run_only_final(model, hyperparams, X_flat_train, X_flat_dev, X_flat_test):
    best_M = model(**hyperparams)
    best_M.fit(pd.concat((X_flat_train, X_flat_dev)), pd.concat((Ys_train, Ys_dev))['los_3'])
    y_true  = Ys_test['los_3']
    y_score = best_M.predict_proba(X_flat_test)[:, 1]
    y_pred  = best_M.predict(X_flat_test)

    auc   = roc_auc_score(y_true, y_score)
    auprc = average_precision_score(y_true, y_score)
    acc   = accuracy_score(y_true, y_pred)
    F1    = f1_score(y_true, y_pred)
    
    return best_M, hyperparams, auc, auprc, acc, F1

In [16]:
results = run_only_final(LogisticRegression,
                                hyperparams,
                                lvl2_flat_train,
                                lvl2_flat_dev,
                                lvl2_flat_test) 
results

If we want to have  pytorch LR model

In [52]:
from torch.utils.data import DataLoader, TensorDataset

import torch
import torch.nn as nn
import torch.optim as optim
import pickle
from pathlib import Path

class LogisticRegressionModel(nn.Module):
    def __init__(self, input_dim: int):
        """Initialize the logistic regression model with a single linear layer.

        Args:
        ----
            input_dim (int): The size of the input feature vector.
        """
        super(LogisticRegressionModel, self).__init__()
        self.linear = nn.Linear(input_dim, 1)  # Binary classification (1 output)
        # Metadata initialization
        self.init_params = {"input_dim": input_dim}

    def forward(self, x):
        """Forward pass through the model."""
        return torch.sigmoid(self.linear(x))  # Sigmoid to produce probabilities for binary classification


# Function to save the model and metadata
# def save_model_and_metadata(  # noqa: PLR0913
#     model: torch.nn.Module,
#     data_split: dict,
#     configs: dict,
#     train_acc: float,
#     test_acc: float,
#     train_loss: float,
#     test_loss: float,
#     optimizer: optim.Optimizer,
#     loss: nn.Module,
#     n: str
# ) -> None:
#     """Save the model and metadata."""
#     log_dir = configs["run"]["log_dir"]
#     Path(log_dir).mkdir(parents=True, exist_ok=True)

#     with open(f"{log_dir}/target_model" + n + ".pkl", "wb") as f:
#         torch.save(model.state_dict(), f)

#     meta_data = {}

#     meta_data["init_params"] = model.init_params if hasattr(model, "init_params") else {}
#     # meta_data["train_indices"] = data_split["train_indices"]
#     # meta_data["test_indices"] = data_split["test_indices"]
#     # meta_data["num_train"] = len(data_split["train_indices"])

#     # read out optimizer parameters
#     meta_data["optimizer"] = {}
#     meta_data["optimizer"]["name"] = optimizer.__class__.__name__.lower()
#     meta_data["optimizer"]["lr"] = optimizer.param_groups[0].get("lr", 0)
#     meta_data["optimizer"]["weight_decay"] = optimizer.param_groups[0].get("weight_decay", 0)
#     meta_data["optimizer"]["momentum"] = optimizer.param_groups[0].get("momentum", 0)
#     meta_data["optimizer"]["dampening"] = optimizer.param_groups[0].get("dampening", 0)
#     meta_data["optimizer"]["nesterov"] = optimizer.param_groups[0].get("nesterov", False)

#     # read out loss parameters
#     meta_data["loss"] = {}
#     meta_data["loss"]["name"] = loss.__class__.__name__.lower()

#     meta_data["batch_size"] = configs["train"]["batch_size"]
#     meta_data["epochs"] = configs["train"]["epochs"]
#     meta_data["learning_rate"] = configs["train"]["learning_rate"]
#     meta_data["weight_decay"] = configs["train"]["weight_decay"]
#     meta_data["train_acc"] = train_acc
#     meta_data["test_acc"] = test_acc
#     meta_data["train_loss"] = train_loss
#     meta_data["test_loss"] = test_loss
#     meta_data["dataset"] = configs["data"]["dataset"]

#     with open(f"{log_dir}/model_metadata"+ n + ".pkl", "wb") as f:
#         pickle.dump(meta_data, f)

import torch.nn.init as init
# Training and evaluation setup
def train_and_save_logistic_regression(X_train, y_train, X_test, y_test, configs):
    # Convert the inverse regularization parameter C to weight_decay (regularization strength)


    # Initialize the model
    input_dim = X_train.shape[1]  # Assuming X_train is a NumPy array or similar
    print(input_dim)
    model = LogisticRegressionModel(input_dim)
    criterion = nn.BCELoss()


    optimizer = optim.SGD(model.parameters(), lr =configs["train"]["learning_rate"], 
                            weight_decay=configs["train"]["weight_decay"])

    # Training loop (max_iter = number of epochs)
    epochs =  configs["train"]["epochs"]
    batch_size = configs["train"]["batch_size"]
    
    inputs = torch.tensor(X_train.values, dtype=torch.float32)
    # print(f"inputs {y_train[:,None].shape}")
    labels = torch.tensor(y_train[:,None], dtype=torch.float32)
   
    # Create a TensorDataset and DataLoader for batch processing
    dataset = TensorDataset(inputs, labels)
    batch_size = 128
    train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
        
    for epoch in range(epochs):
        model.train()

        epoch_loss = 0
        for batch_inputs, batch_labels in train_loader:

            optimizer.zero_grad()  # Zero the gradient buffers

            outputs = model(batch_inputs)  # Forward pass
            loss = criterion(outputs, batch_labels)  # Calculate loss
            loss.backward()  # Backward pass

            optimizer.step()  # Optimize
            epoch_loss += loss.item() * batch_size
        
        epoch_loss /= len(dataset)
        if epoch % 2 == 0:
            print(f'Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.4f}')

    # Evaluation on test set
    with torch.no_grad():
        inputs = torch.tensor(X_test.values, dtype=torch.float32)
        labels = torch.tensor(y_test, dtype=torch.float32)
        outputs = model(inputs).squeeze()
        predicted = (outputs >= 0.5).float()
        correct = (predicted == labels).float().sum()
        test_acc = correct / len(labels)
        test_loss = criterion(outputs, labels).item()

    # Calculate training accuracy and loss
    with torch.no_grad():
        inputs = torch.tensor(X_train.values, dtype=torch.float32)
        labels = torch.tensor(y_train, dtype=torch.float32)
        outputs = model(inputs).squeeze()
        predicted = (outputs >= 0.5).float()
        correct = (predicted == labels).float().sum()
        train_acc = correct / len(labels)
        train_loss = criterion(outputs, labels).item()

    print(f'Test Accuracy: {test_acc.item():.4f}, Test Loss: {test_loss:.4f}')
    print(f'Train Accuracy: {train_acc.item():.4f}, Train Loss: {train_loss:.4f}')
    # Save the model and metadata
    # save_model_and_metadata(
    #     model=model,
    #     data_split=data_split,
    #     configs=configs,
    #     train_acc=train_acc.item(),
    #     test_acc=test_acc.item(),
    #     train_loss=train_loss,
    #     test_loss=test_loss,
    #     optimizer=optimizer,
    #     loss=criterion,
    #     n=n
    # )


# Example configurations and data split
configs = {
    "run": {"log_dir": "./logs"},
    "train": {"batch_size": 128, "epochs": 10, "learning_rate": 0.001, "weight_decay": 0.0001},

}



# for n, X_flat_train, X_flat_dev, X_flat_test in (
#     ('lvl2', lvl2_flat_train, lvl2_flat_dev, lvl2_flat_test),
#     ('raw', raw_flat_train, raw_flat_dev, raw_flat_test)):
    # results = run_only_final(LogisticRegression,
    #                             best_hyperparams,
    #                             X_flat_train,
    #                             X_flat_dev,
    #                             X_flat_test,
    #                             'los_3')


  

train_and_save_logistic_regression(lvl2_flat_train,
                                        Ys_train['los_3'],
                                        lvl2_flat_test,
                                        Ys_test['los_3'],
                                        configs)

7488
Epoch [1/10], Loss: 43.2495
Epoch [3/10], Loss: 43.1651
Epoch [5/10], Loss: 43.1667
Epoch [7/10], Loss: 43.1655
Epoch [9/10], Loss: 43.1659
Test Accuracy: 0.5718, Test Loss: 42.8184
Train Accuracy: 0.5686, Train Loss: 43.1444


Class Distibution

In [None]:
class_0_count = ( Ys_train['los_3'] == 0).sum()  # Count of class 0 samples
class_1_count = ( Ys_train['los_3'] == 1).sum()  # Count of class 1 samples
pos_weight = class_0_count / class_1_count  # Compute class weight for class 1
pos_weight

1.3177983681371872

In [48]:

for df in lvl2_flat_train, lvl2_flat_test : assert not df.isnull().any().any()
lvl2_flat_train.values.shape


In [35]:
class DeepBinaryClassifier(nn.Module):
    def __init__(self, input_dim):
        super(DeepBinaryClassifier, self).__init__()
        
        # Hidden Layer 1
        self.fc1 = nn.Linear(input_dim, 512)
        self.bn1 = nn.BatchNorm1d(512)  # Batch Normalization
        self.dropout1 = nn.Dropout(0.3)  # Dropout for regularization
        
        # Hidden Layer 2
        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.dropout2 = nn.Dropout(0.3)
        
        # Hidden Layer 3
        self.fc3 = nn.Linear(256, 128)
        self.bn3 = nn.BatchNorm1d(128)
        self.dropout3 = nn.Dropout(0.3)
        
        # Output Layer (Binary Classification)
        self.output = nn.Linear(128, 1)
    
    def forward(self, x):
        x = torch.relu(self.bn1(self.fc1(x)))
        x = self.dropout1(x)
        
        x = torch.relu(self.bn2(self.fc2(x)))
        x = self.dropout2(x)
        
        x = torch.relu(self.bn3(self.fc3(x)))
        x = self.dropout3(x)
        
        return self.output(x)  # Do not apply sigmoid here, we will use BCEWithLogitsLoss for stability




# Convert the dataset to PyTorch tensors
X_train_tensor = torch.tensor(lvl2_flat_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor( Ys_train['los_3'], dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.tensor(lvl2_flat_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(Ys_test['los_3'], dtype=torch.float32).unsqueeze(1)

# Create a DataLoader for batching
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)




# Instantiate the model, loss function, and optimizer
model = DeepBinaryClassifier(input_dim =X_train_tensor.shape[1])

criterion = nn.BCEWithLogitsLoss()  # Combines sigmoid + BCE loss in a stable manner
optimizer = optim.Adam(model.parameters(), lr= 0.001)

# Training loop
for epoch in range(10):
    model.train()
    running_loss = 0.0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()  # Clear previous gradients
        outputs = model(X_batch)  # Forward pass
        loss = criterion(outputs, y_batch)  # Compute loss
        loss.backward()  # Backward pass
        optimizer.step()  # Update weights
        
        running_loss += loss.item()
    
    print(f"Epoch [{epoch+1}/10], Loss: {running_loss/len(train_loader):.4f}")

# Evaluation on the test set
model.eval()
with torch.no_grad():
    test_outputs = model(X_test_tensor)
    predicted = torch.sigmoid(test_outputs).round()  # Convert logits to probabilities and round
    accuracy = (predicted == y_test_tensor).float().mean()
    print(f"Test Accuracy: {accuracy.item():.4f}")

Epoch [1/10], Loss: 0.6973
Epoch [2/10], Loss: 0.6902
Epoch [3/10], Loss: 0.6860
Epoch [4/10], Loss: 0.6862
Epoch [5/10], Loss: 0.6853
Epoch [6/10], Loss: 0.6847
Epoch [7/10], Loss: 0.6845
Epoch [8/10], Loss: 0.6839
Epoch [9/10], Loss: 0.6838
Epoch [10/10], Loss: 0.6833
Test Accuracy: 0.5718


In [None]:
X_train_tensor.shape[1]

7488