In [1]:
## Import dependencies 
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt 
import numpy
# %matplotlib inline

import pickle

import seaborn as sns
import matplotlib.ticker as ticker
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
pio.renderers.default = 'iframe'

import os, sys, pathlib, gc
import re, math, random, time
import datetime as dt
from tqdm import tqdm
from typing import Optional, Union, Tuple
from collections import OrderedDict

import sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import roc_auc_score, confusion_matrix
from torcheval.metrics.functional.aggregation.auc import auc


# import tensorflow as tf
# from tensorflow import keras
# from keras import layers
# import tensorflow_addons as tfa

import torch 
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import warnings
warnings.filterwarnings('ignore')

print('import done!')

# Fall back to CPU when no CUDA device is available so the notebook still runs end-to-end.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

import done!


In [2]:
## For reproducible results    
def seed_all(s):
    """Seed every RNG used in this notebook and request deterministic kernels.

    Args:
        s: integer seed applied to Python's `random`, NumPy and PyTorch (CPU and
            all CUDA devices).

    Side effects:
        Sets cuDNN to deterministic mode, turns on PyTorch's deterministic
        algorithms (warning instead of raising when an op has no deterministic
        implementation) and exports a few environment variables.
    """
    random.seed(s)
    np.random.seed(s)
    torch.manual_seed(s)
    # Seed every visible GPU, not only the current one; a no-op without CUDA.
    torch.cuda.manual_seed_all(s)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # cuBLAS needs a fixed workspace configuration to be deterministic on CUDA >= 10.2.
    os.environ.setdefault('CUBLAS_WORKSPACE_CONFIG', ':4096:8')
    # This is a function: it must be *called*. Assigning `= True` to it (as before)
    # silently replaced the function and enabled nothing.
    torch.use_deterministic_algorithms(True, warn_only=True)
    os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
    # NOTE: PYTHONHASHSEED only affects interpreters started after this point.
    os.environ['PYTHONHASHSEED'] = str(s)
    print('Seeds setted!')
global_seed = 2
seed_all(global_seed)

## Limit GPU Memory in TensorFlow
## Because TensorFlow, by default, allocates the full amount of available GPU memory when it is launched. 
# physical_devices = tf.config.list_physical_devices('GPU')
# if len(physical_devices) > 0:
#     for device in physical_devices:
#         tf.config.experimental.set_memory_growth(device, True)
#         print('{} memory growth: {}'.format(device, tf.config.experimental.get_memory_growth(device)))
# else:
#     print("Not enough GPU hardware devices available")
    
## For Seaborn Setting
# Matplotlib rc overrides layered on top of seaborn's 'whitegrid' style below:
# hide the top/right spines, soften the grid and use an off-white background.
custom_params = {
    "axes.spines.right": False,
    "axes.spines.top": False,
    'grid.alpha': 0.3,
    'figure.figsize': (16, 6),
    'axes.titlesize': 'Large',
    'axes.labelsize': 'Large',
    'figure.facecolor': '#fdfcf6',
    'axes.facecolor': '#fdfcf6',
}
# Hex palette; in this cell it is only referenced by the commented-out `palette=` argument.
cluster_colors = ['#b4d2b1', '#568f8b', '#1d4a60', '#cd7e59', '#ddb247', '#d15252']
sns.set_theme(
    style='whitegrid',
    #palette=sns.color_palette(cluster_colors),
    rc=custom_params,)

Seeds setted!


## Generate seasonal data

In [None]:
%%time

#######Create seasonal data


# NOTE: pickle.load can execute arbitrary code - only point these paths at trusted project artefacts.
with open('../Clean_data/dataset_scaled_2vs1_25102024.sav', 'rb') as fh:
    gridSearchData, crossValData, internalEvaluationData, externalEvaluationData = pickle.load(fh)
with open('../Clean_data/sequence_features_long_full.sav', 'rb') as fh:
    features = pickle.load(fh)

# Inclusive (first_month, last_month) window of each of the 41 seasons built from the
# 120 monthly columns: season 1 = months 1-2, seasons 2..40 = consecutive triples
# (3-5, 6-8, ..., 117-119), season 41 = month 120 on its own.
SEASON_MONTHS = [(1, 2)] + [(3 * k, 3 * k + 2) for k in range(1, 40)] + [(120, 120)]


def _attach_monthly_features(split_df, monthly_features):
    """Join the monthly sequence features onto one split (one row per `patid`, NaN -> 0)."""
    return (
        split_df[['patid', 'outcome_12months']]
        .merge(monthly_features, on='patid', how='inner')
        .fillna(0)
        .drop_duplicates('patid')
        .reset_index(drop=True)
    )


def _to_seasonal(df):
    """Collapse the `<name>_month<k>` columns of `df` into `<name>_month_season<j>` columns.

    Non-BTS variables are summed over the months of a season. BTS variables hold a
    step level, so they are averaged and rounded instead. Season 1 previously summed
    the two BTS months (a value outside the step range, unlike every other season);
    it now uses the same rounded mean. `np.round` semantics apply (ties go to the
    even integer). Single-month seasons are copied unchanged.

    Returns a frame holding `patid` plus every column whose name contains 'season'.
    """
    # A variable is recognised by its '<name>_month111' column; strip the '111'.
    prefixes = [col[:-3] for col in df.columns[df.columns.str.contains('_month111')]]
    seasonal = {}
    for prefix in prefixes:
        print(prefix, 'seasons 1-41')
        is_bts = 'BTS' in prefix
        for season, (first, last) in enumerate(SEASON_MONTHS, start=1):
            month_cols = [prefix + str(month) for month in range(first, last + 1)]
            if first == last:
                values = df[month_cols[0]]
            elif is_bts:
                values = df[month_cols].mean(axis=1).round()
            else:
                values = df[month_cols].sum(axis=1)
            seasonal[prefix + '_season' + str(season)] = values
    # Column-wise (vectorised) aggregation and a single concat replace ~41 row-wise
    # `apply(axis=1)` passes per variable and avoid fragmenting the frame.
    combined = pd.concat([df, pd.DataFrame(seasonal, index=df.index)], axis=1)
    return combined[['patid'] + combined.columns[combined.columns.str.contains('season')].tolist()]


gridSearch_seq = _attach_monthly_features(gridSearchData, features)
crossVal_seq = _attach_monthly_features(crossValData, features)
internalVal_seq = _attach_monthly_features(internalEvaluationData, features)
externalVal_seq = _attach_monthly_features(externalEvaluationData, features)


categorical_seq_columns = gridSearch_seq.columns[gridSearch_seq.columns.str.contains('BTS')].tolist()
numerical_seq_columns = gridSearch_seq.columns[(~gridSearch_seq.columns.str.contains('BTS'))&(gridSearch_seq.columns.str.contains('_month'))].tolist()
print(f'total features:  {len(categorical_seq_columns)+len(numerical_seq_columns)}')


#fix BTS step column from object to int
gridSearch_seq[categorical_seq_columns] = gridSearch_seq[categorical_seq_columns].astype('int')
crossVal_seq[categorical_seq_columns] = crossVal_seq[categorical_seq_columns].astype('int')
internalVal_seq[categorical_seq_columns] = internalVal_seq[categorical_seq_columns].astype('int')
externalVal_seq[categorical_seq_columns] = externalVal_seq[categorical_seq_columns].astype('int')


gridSearch_seq = _to_seasonal(gridSearch_seq)
crossVal_seq = _to_seasonal(crossVal_seq)
internalVal_seq = _to_seasonal(internalVal_seq)
externalVal_seq = _to_seasonal(externalVal_seq)


# pickle.dump([gridSearch_seq, crossVal_seq, internalVal_seq, externalVal_seq], open('./OPCRD_ASTHMA/Clean_data/seasonal_long_dataset_28102024.sav', 'wb'))

In [None]:
datasets = gridSearch_seq, crossVal_seq, internalVal_seq, externalVal_seq

In [None]:
# The context manager guarantees the file is flushed and closed even if pickling fails.
with open('../Clean_data/seasonal_long_dataset_28102024.sav', 'wb') as fh:
    pickle.dump(datasets, fh)

## Prepare sequence of categorical vars

In [None]:
# Reload the seasonal frames written by the previous section; `with` closes the handle.
# NOTE: pickle.load can execute arbitrary code - only load trusted project artefacts.
with open('../Clean_data/seasonal_long_dataset_28102024.sav', 'rb') as fh:
    gridSearch_seq, crossVal_seq, internalVal_seq, externalVal_seq = pickle.load(fh)
datasets = [gridSearch_seq, crossVal_seq, internalVal_seq, externalVal_seq]
datasets_name = ['gridSearch_seq', 'crossVal_seq', 'internalVal_seq', 'externalVal_seq']

In [None]:
def cat_continuous(x):
    """Bucket a count into the string labels '0', '1', '2' or '>=3'.

    Exact matches on 0, 1 and 2 map to their own label and anything above 2 maps
    to '>=3'. Every other value (negatives, fractions below 2, NaN) yields None.
    """
    for exact in (0, 1, 2):
        if x == exact:
            return str(exact)
    if x > 2:
        return '>=3'
    return None
    
#Average daily ICS
def cat_ics(x):
    """Bucket an ICS dose into '0', '1-200', '201-400' or '>400'.

    Any positive dose up to and including 200 falls in '1-200'. The lower bound used
    to be `x >= 1`, so doses strictly between 0 and 1 matched no branch and silently
    became None. Negative values and NaN still return None.
    """
    if x == 0:
        return '0'
    if 0 < x <= 200:
        return '1-200'
    if 200 < x <= 400:
        return '201-400'
    if x > 400:
        return '>400'
    return None
    
# features['cat_average_daily_dose_ICS'] = features.average_daily_dose_ICS.apply(lambda x: cat_ics(x))

In [None]:
# Column-name fragments of the count variables bucketed with `cat_continuous`.
num_vars = ['numAsthmaAttacks_',
       'numAntibioticsEvents_',
       'numAntibioticswithLRTI_', 'numOCSEvents_',
       'numOCSwithLRTI_', 'numHospEvents_',
       'numPCS_', 'numPCSAsthma_']
# Column-name fragments of the ICS dose variables bucketed with `cat_ics`.
ics_vars = ['average_daily_dose_ICS_', 'prescribed_daily_dose_ICS_']
count=0  # unused in this cell; kept in case a later cell still reads it
output = []
for data, data_name in zip(datasets, datasets_name):
    print(data_name)
    # Work on a copy: categorising in place turned the loaded frames into strings, so
    # re-running this cell fed strings back into the numeric comparisons.
    data = data.copy()
    for fragments, categorise in ((num_vars, cat_continuous), (ics_vars, cat_ics)):
        for fragment in fragments:
            for var in data.columns[data.columns.str.contains(fragment)]:
                print(var)
                data[var] = data[var].map(categorise)
    output.append(data)
# `with` guarantees the pickle is flushed and the handle closed.
with open('../Clean_data/seasonal_cat_dataset_04112024.sav', 'wb') as fh:
    pickle.dump(output, fh)

## Load data asthma OPCRD

In [3]:
# Experiment-level settings read by the DataLoader and optimiser cells of this notebook.
exp_config = {
    'n_bins': 10,  # not referenced in the cells visible here - TODO confirm it is still needed
    'n_splits': 5,  # not referenced in the cells visible here - TODO confirm it is still needed
    'batch_size': 128,  # DataLoader batch size
    'learning_rate': 1e-3,  # AdamW learning rate
    'weight_decay': 0.0001,  # AdamW weight decay
    'train_epochs': 50,  # epoch count used to size the cosine LR schedule
    'finalize': True,  # presumably forwarded to train_model(finalize=...) - verify at the call site
    'finalize_epochs': 8,  # presumably the epoch count of the finalize run - verify
}

# Architecture settings unpacked in the "TabTransformer Model Check" cell.
model_config = {
    'cat_embedding_dim': 2,  # embedding width per categorical feature, also the attention emb_dim
    'num_transformer_blocks': 4,  # number of stacked TabTransformerBlock layers
    'num_heads': 2,  # attention heads per block
    'tf_dropout_rates': [0.3, 0.3, 0.3, 0.3,],  # per-block attention dropout (read as attn_dropout_rates)
    'ff_dropout_rates': [0.3, 0.3, 0.3, 0.3,],  # per-block feed-forward dropout
    'mlp_dropout_rates': [0.2, 0.1],  # dropout after each MLP layer
    'mlp_hidden_units_factors': [2, 1],  # MLP layer widths as multiples of the input feature count
    'patience': 5  # presumably the early-stopping patience handed to train_model - verify
}

print('Parameters setted!')

Parameters setted!


In [4]:
# Load the categorised seasonal frames; `with` closes the file handle after unpickling.
# NOTE: pickle.load can execute arbitrary code - only load trusted project artefacts.
with open('../Clean_data/seasonal_cat_dataset_04112024.sav', 'rb') as fh:
    gridSearch_seq, crossVal_seq, internalVal_seq, externalVal_seq = pickle.load(fh)

In [None]:
# Static (non-sequence) columns that are ordinal-encoded and embedded by the model.
# The order matters: the encoder's categories are matched to these names by position.
categorical_columns = ['sex', 'BMI_cat', 'ethnic_group', 'smokingStatus' , 'PEFStatus', 'EosinophilLevel', 
                       'DeviceType', 'imd_decile', 'PriorEducation', 'rhinitis', 'cardiovascular', 'heartfailure', 
                       'psoriasis', 'anaphylaxis', 'diabetes', 'ihd', 'anxiety', 'eczema', 'nasalpolyps',
                      'BTS_step', 'age_cat',
                      ]

# numerical_columns = ['age', 
#                      'numOCSEvents', 'numPCS', 'numPCSAsthma', 'numAntibioticsEvents',
#                       'numAntibioticswithLRTI', 'numOCSEvents', 'numOCSwithLRTI',
#                       'numAsthmaAttacks', 'numAcuteRespEvents', 'numHospEvents',
#                     ]

# categorical_columns=['sex', 'rhinitis', 'cardiovascular', 'heartfailure', 'psoriasis', 'anaphylaxis', 'diabetes', 'ihd', 
#                   'anxiety', 'eczema', 'nasalpolyps', 'asthmaPlan', 'BMI_cat_normal', 'BMI_cat_not recorded', 
#                   'BMI_cat_obese', 'BMI_cat_overweight', 'BMI_cat_underweight', 'ethnic_group_Asian', 'ethnic_group_Black', 
#                   'ethnic_group_Mixed', 'ethnic_group_Other', 'ethnic_group_White', 'ethnic_group_not recorded', 
#                   'smokingStatus_current', 'smokingStatus_former', 'smokingStatus_never', 'imd_decile_0', 
#                   'imd_decile_1', 'imd_decile_2', 'imd_decile_3', 'imd_decile_4', 'imd_decile_5', 'imd_decile_6', 
#                   'imd_decile_7', 'imd_decile_8', 'imd_decile_9', 'imd_decile_10', 'CharlsonScore_0.0', 
#                   'CharlsonScore_1.0', 'CharlsonScore_2.0', 'CharlsonScore_3.0', 'CharlsonScore_4.0', 'CharlsonScore_5.0', 
#                   'CharlsonScore_6.0', 'CharlsonScore_7.0', 'CharlsonScore_8.0', 'CharlsonScore_9.0', 'CharlsonScore_10.0', 
#                   'CharlsonScore_11.0', 'CharlsonScore_12.0', 'PEFStatus_60-80', 'PEFStatus_less than 60', 'PEFStatus_more than 80', 
#                   'PEFStatus_not recorded', 'EosinophilLevel_high', 'EosinophilLevel_normal', 'EosinophilLevel_not recorded', 
#                   'BTS_step_0', 'BTS_step_1', 'BTS_step_2', 'BTS_step_3', 'BTS_step_4', 'BTS_step_5', 'DeviceType_BAI', 
#                   'DeviceType_DPI', 'DeviceType_NEB', 'DeviceType_not recorded', 'DeviceType_pMDI', 'PriorEducation_No', 
#                   'PriorEducation_Yes']
# Static numeric columns passed to the model as float features (extended with the
# seasonal columns in a later cell).
numerical_columns = ['age', 'numAsthmaManagement', 'numAsthmaReview', 'numAsthmaMedReview', 'numAsthmaReviewRCP']

# Join key and prediction target, kept as one-element lists so they can be concatenated
# with the column lists above.
identifier = ['patid']
outcome = ['outcome_12months']


print('total features: ', len(categorical_columns)+len(numerical_columns))

In [9]:
# `with` closes the file handle after unpickling.
# NOTE: pickle.load can execute arbitrary code - only load trusted project artefacts.
with open('../Clean_data/dataset_scaled_2vs1_25102024.sav', 'rb') as fh:
    gridSearch, crossVal, internalEvaluation, externalEvaluation = pickle.load(fh)

In [13]:
gridSearch.numAsthmaReview_cat.unique()

array([0, '0', '1', '>=3', 2, 1, '2'], dtype=object)

In [None]:
# Load the (unscaled) splits and the cleaned feature table; `with` closes each handle.
# NOTE: pickle.load can execute arbitrary code - only load trusted project artefacts.
with open('../Clean_data/dataset_2vs1_25102024.sav', 'rb') as fh:
    gridSearch, crossVal, internalEvaluation, externalEvaluation = pickle.load(fh)
with open('../Clean_data/cleaned_features_22102024.sav', 'rb') as fh:
    features = pickle.load(fh)

# Keep only the identifier, model inputs and target. `.copy()` makes each fold an
# independent frame, so later column assignments neither warn nor write back into the
# loaded splits.
selected_columns = identifier+categorical_columns+numerical_columns+outcome
gridSearch_fold = gridSearch[selected_columns].copy()
train_fold = crossVal[selected_columns].copy()
valid_fold = internalEvaluation[selected_columns].copy()
test = externalEvaluation[selected_columns].copy()
print('Grid search: {:d}, Train set: {:d}, Validation set: {:d}, Test set: {:d}'.\
      format(gridSearch_fold.shape[0], train_fold.shape[0], valid_fold.shape[0], test.shape[0]))

In [None]:
# Attach each split's seasonal columns to its static columns (inner join on 'patid').
gridSearch_fold, train_fold, valid_fold, test = [
    static_part.merge(seasonal_part, on='patid', how='inner').reset_index(drop=True)
    for static_part, seasonal_part in (
        (gridSearch_fold, gridSearch_seq),
        (train_fold, crossVal_seq),
        (valid_fold, internalVal_seq),
        (test, externalVal_seq),
    )
]
split_sizes = (len(gridSearch_fold), len(train_fold), len(valid_fold), len(test))
print('Grid search: {:d}, Train set: {:d}, Validation set: {:d}, Test set: {:d}'.format(*split_sizes))

In [None]:
# Sub-sample each split with a fixed seed: 3% of train, 5% of validation, 20% of test.
# NOTE(review): this comment previously said "30%", which matches none of the fractions
# below - confirm which fractions are intended. The grid-search fold is not sub-sampled.

train_fold = train_fold.sample(frac=.03, random_state=global_seed).reset_index(drop=True)
valid_fold = valid_fold.sample(frac=.05, random_state=global_seed).reset_index(drop=True)
test = test.sample(frac=.2, random_state=global_seed).reset_index(drop=True)

print('Train set: {:d}, Validation set: {:d}, Test set: {:d}'.\
      format(train_fold.shape[0], valid_fold.shape[0], test.shape[0]))

In [None]:
# Class balance (relative frequency of each outcome value) per split, in the order
# grid search, train, validation, test.
for split_frame in (gridSearch_fold, train_fold, valid_fold, test):
    outcome_share = split_frame[outcome].value_counts(normalize=True)
    print(outcome_share)

In [None]:
# categorical_seq_columns = training_seq.columns[training_seq.columns.str.contains('BTS')].tolist()
# numerical_seq_columns = training_seq.columns[(~training_seq.columns.str.contains('BTS'))&(training_seq.columns.str.contains('_season'))].tolist()
# print(f'total features:  {len(categorical_seq_columns)+len(numerical_seq_columns)}')

# categorical_columns 
# # = categorical_columns + categorical_seq_columns
# numerical_columns = numerical_columns + numerical_seq_columns

# print('total features: ', len(categorical_columns)+len(numerical_columns))

In [None]:
#numerical to categorical

# def convertNum2Cat(x):
#     if x == 0:
#         return 0
#     elif (x > 0) & (x<2):
#         return 1
#     else:
#         return 2

# datasets = [train_fold, valid_fold, test]
# for dataset in datasets:
#     for var in numerical_seq_columns:
#         print(var)
#         dataset[var] = dataset[var].apply(lambda x: convertNum2Cat(x))

In [None]:

# Build the lookup table for the categorical features (the target is not encoded here)
# using sklearn.preprocessing.OrdinalEncoder.
# handle_unknown='error': a category that appears in the validation / grid-search / test
# folds but not in the (sub-sampled) training fold makes `transform` raise.
oe = OrdinalEncoder(handle_unknown='error',
                    dtype=np.int64, )

# Fit on the training fold only, then reuse the same mapping for every other split.
encoded = oe.fit_transform(train_fold[categorical_columns].values)
#decoded = oe.inverse_transform(encoded)
train_fold[categorical_columns] = encoded

valid_fold[categorical_columns] = oe.transform(valid_fold[categorical_columns].values)
gridSearch_fold[categorical_columns] = oe.transform(gridSearch_fold[categorical_columns].values)
test[categorical_columns] = oe.transform(test[categorical_columns].values)

# One array of known categories per column, in `categorical_columns` order; the
# Preprocessor sizes each embedding table from these.
encoder_categories = oe.categories_
print(len(encoder_categories))
# encoder_categories

In [None]:
# Seasonal (non-BTS) columns are fed to the model through the numerical branch.
# NOTE(review): the seasonal_cat pickle stores these columns as string buckets
# ('0', '1', '>=3', ...) - confirm they are numeric before they reach BuildDataset.
numerical_seq_columns = gridSearch_seq.columns[(~gridSearch_seq.columns.str.contains('BTS'))&(gridSearch_seq.columns.str.contains('_season'))].tolist()
print(f'total features:  {len(numerical_seq_columns)}')

# Append only columns that are not already present. The plain
# `numerical_columns + numerical_seq_columns` duplicated every seasonal column each
# time this cell was re-run.
already_listed = set(numerical_columns)
numerical_columns = numerical_columns + [col for col in numerical_seq_columns if col not in already_listed]

print('total features: ', len(categorical_columns)+len(numerical_columns))

In [None]:
## Dataset
class BuildDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding one row of a DataFrame as a dict of 1-element tensors.

    Args:
        df: source frame; its values are snapshotted into tensors at construction.
        numerical_columns: column names returned as float32 tensors of shape (1,).
        categorical_columns: column names returned as int32 tensors of shape (1,).
        target: optional label column; when given, items are `(data, label)` with a
            float32 label of shape (1,), otherwise just `data`.

    Rows are addressed by position (0 .. len-1), which is what DataLoader samplers
    produce. The previous `df[col][index]` was a label lookup: it only worked on a
    freshly reset RangeIndex and paid one pandas lookup per column per item.
    """

    def __init__(self, df, numerical_columns,
                 categorical_columns, target=None):
        self.df = df
        self.numerical_columns = numerical_columns
        self.categorical_columns = categorical_columns
        self.target = target

        # Convert every column once so __getitem__ is plain tensor indexing.
        self._numerical = {
            nc: torch.tensor(df[nc].to_numpy(dtype='float32'), dtype=torch.float32)
            for nc in numerical_columns
        }
        self._categorical = {
            cc: torch.tensor(df[cc].to_numpy(dtype='int32'), dtype=torch.int32)
            for cc in categorical_columns
        }
        self._labels = None
        if target is not None:
            self._labels = torch.tensor(df[target].to_numpy(dtype='float32'),
                                        dtype=torch.float32)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        # Numerical entries first, then categorical, matching the original key order.
        data = {nc: values[index].unsqueeze(0)
                for nc, values in self._numerical.items()}
        for cc, values in self._categorical.items():
            data[cc] = values[index].unsqueeze(0)

        if self._labels is None:
            return data
        return data, self._labels[index].unsqueeze(-1)

In [None]:
## Create Datasets
# One BuildDataset per split, all sharing the same feature lists and target column.
gridSearch_ds, train_ds, val_ds, test_ds = (
    BuildDataset(split_frame, numerical_columns, categorical_columns, target=outcome[0])
    for split_frame in (gridSearch_fold, train_fold, valid_fold, test)
)

## Operation Check
# Show the first item to eyeball keys, shapes and dtypes.
index = 0
print(gridSearch_ds[index])

In [None]:
## Create DataLoaders
batch_size = exp_config['batch_size']

# Grid-search, train and validation loaders shuffle; the test loader keeps row order.
gridSearch_dl, train_dl, val_dl = (
    torch.utils.data.DataLoader(split_ds, batch_size=batch_size, shuffle=True)
    for split_ds in (gridSearch_ds, train_ds, val_ds)
)
test_dl = torch.utils.data.DataLoader(test_ds, batch_size=batch_size, shuffle=False)

dl_dict = {'train': train_dl, 'val': val_dl}

## Operation Check
# Pull one training batch and record the dtype of every input tensor.
sample_data, sample_label = next(iter(dl_dict['train']))
input_dtypes = {}
for key, batch_tensor in sample_data.items():
    input_dtypes[key] = batch_tensor.dtype
    print(f'{key}, shape:{batch_tensor.shape}, dtype:{batch_tensor.dtype}')

print('Label shape: ', sample_label.shape)

## Model

In [None]:
class Preprocessor(nn.Module):
    """Turn a batch dict of per-column tensors into stacked numeric and embedded inputs.

    Each categorical column gets its own `nn.Embedding` whose vocabulary size is the
    length of the matching entry (by position) of `encoder_categories`.
    """

    def __init__(self, numerical_columns, categorical_columns, encoder_categories, emb_dim):
        super().__init__()
        self.numerical_columns = numerical_columns
        self.categorical_columns = categorical_columns
        self.encoder_categories = encoder_categories
        self.emb_dim = emb_dim
        self.embed_layers = nn.ModuleDict()

        for position, column in enumerate(categorical_columns):
            vocabulary_size = len(self.encoder_categories[position])
            self.embed_layers[column] = nn.Embedding(
                num_embeddings=vocabulary_size,
                embedding_dim=self.emb_dim,
            )

    def forward(self, x):
        """Return `(x_nums, x_cats)` for the batch dict `x`.

        Every numerical tensor gets a new axis at dim 1 and the results are
        concatenated along dim 1; every categorical tensor is embedded and the
        embeddings are concatenated along dim 1. A branch with no columns yields
        an empty float32 tensor instead.
        """
        numeric_parts = [torch.unsqueeze(x[column], dim=1)
                         for column in self.numerical_columns]
        if numeric_parts:
            x_nums = torch.cat(numeric_parts, dim=1)
        else:
            x_nums = torch.tensor(numeric_parts, dtype=torch.float32)

        embedded_parts = [self.embed_layers[column](x[column])
                          for column in self.categorical_columns]
        if embedded_parts:
            x_cats = torch.cat(embedded_parts, dim=1)
        else:
            x_cats = torch.tensor(embedded_parts, dtype=torch.float32)

        return x_nums, x_cats
    
## Operation Check
preprocessor = Preprocessor(numerical_columns,
                            categorical_columns,
                            encoder_categories,
                            emb_dim=model_config['cat_embedding_dim'])
x_nums, x_cats = preprocessor(sample_data)
# Reshape with the batch dimension the tensor actually has: the sampled batch is smaller
# than exp_config['batch_size'] whenever the training fold has fewer rows than one full
# batch, and reshaping to the configured size then raises.
x_nums = x_nums.reshape(x_nums.shape[0], len(numerical_columns))
x_nums.shape, x_cats.shape

In [None]:
class MLPBlock(nn.Module):
    """Stack of BatchNorm1d -> Linear -> SELU -> Dropout stages.

    Stage `i` maps the previous width (initially `n_features`) to `hidden_units[i]`
    and applies dropout with probability `dropout_rates[i]`. Sub-modules are
    registered as `norm_i`, `dense_i`, `act_i` and `dropout_i`.
    """

    def __init__(self, n_features, hidden_units,
                 dropout_rates):
        super().__init__()
        self.mlp_layers = nn.Sequential()
        in_width = n_features
        for stage, out_width in enumerate(hidden_units):
            stage_layers = (
                ('norm', nn.BatchNorm1d(in_width)),
                ('dense', nn.Linear(in_width, out_width)),
                ('act', nn.SELU()),
                ('dropout', nn.Dropout(dropout_rates[stage])),
            )
            for label, layer in stage_layers:
                self.mlp_layers.add_module(f'{label}_{stage}', layer)
            in_width = out_width

    def forward(self, x):
        """Run `x` through every registered stage in order."""
        return self.mlp_layers(x)

In [None]:
class TabTransformerBlock(nn.Module):
    """Post-norm transformer encoder block over the categorical embeddings.

    Self-attention and a two-layer GELU feed-forward network (hidden width
    `4 * emb_dim`), each followed by a residual connection and a LayerNorm.
    """

    def __init__(self, num_heads, emb_dim,
                 attn_dropout_rate, ff_dropout_rate):
        super().__init__()
        self.attn = nn.MultiheadAttention(
            emb_dim,
            num_heads,
            dropout=attn_dropout_rate,
            batch_first=True,
        )
        self.norm_1 = nn.LayerNorm(emb_dim)
        self.norm_2 = nn.LayerNorm(emb_dim)
        hidden_width = emb_dim*4
        self.feedforward = nn.Sequential(
            nn.Linear(emb_dim, hidden_width),
            nn.GELU(),
            nn.Dropout(ff_dropout_rate),
            nn.Linear(hidden_width, emb_dim),
        )

    def forward(self, x_cat):
        """Apply attention and feed-forward sub-layers to `x_cat` (batch-first)."""
        # Queries, keys and values are all the same tensor (self-attention); the
        # attention weights are not needed.
        attended, _ = self.attn(x_cat, x_cat, x_cat)
        after_attention = self.norm_1(x_cat + attended)
        return self.norm_2(after_attention + self.feedforward(after_attention))

In [None]:
class TabTransformer(nn.Module):
    """TabTransformer head: contextualise embeddings, join numerics, classify.

    The categorical embeddings pass through `num_transformer_blocks` transformer
    blocks and are flattened; the numerical inputs are layer-normalised; both are
    concatenated and fed to an MLP whose layer widths are
    `factor * n_features` for each factor in `mlp_hidden_units_factors`. A final
    linear layer plus sigmoid produces one probability per row.
    """

    def __init__(self, numerical_columns, categorical_columns,
                 num_transformer_blocks, num_heads, emb_dim,
                 attn_dropout_rates, ff_dropout_rates,
                 mlp_dropout_rates,
                 mlp_hidden_units_factors,
                 ):
        super().__init__()
        self.transformers = nn.Sequential()
        for block_idx in range(num_transformer_blocks):
            encoder_block = TabTransformerBlock(num_heads,
                                                emb_dim,
                                                attn_dropout_rates[block_idx],
                                                ff_dropout_rates[block_idx])
            self.transformers.add_module(f'transformer_{block_idx}', encoder_block)

        self.flatten = nn.Flatten()
        n_numerical = len(numerical_columns)
        self.num_norm = nn.LayerNorm(n_numerical)

        # MLP input width: one flattened embedding per categorical column plus the numerics.
        self.n_features = (len(categorical_columns) * emb_dim) + n_numerical
        mlp_hidden_units = []
        for factor in mlp_hidden_units_factors:
            mlp_hidden_units.append(int(factor * self.n_features))
        self.mlp = MLPBlock(self.n_features, mlp_hidden_units,
                            mlp_dropout_rates)

        self.final_dense = nn.Linear(mlp_hidden_units[-1], 1)
        self.final_sigmoid = nn.Sigmoid()

    def forward(self, x_nums, x_cats):
        """Return sigmoid outputs for numeric inputs `x_nums` and embeddings `x_cats`."""
        flat_cats = self.flatten(self.transformers(x_cats))

        # With no numerical features the embeddings alone feed the MLP.
        if x_nums.shape[-1] > 0:
            features = torch.cat((self.num_norm(x_nums), flat_cats), -1)
        else:
            features = flat_cats

        return self.final_sigmoid(self.final_dense(self.mlp(features)))

In [None]:
## TabTransformer Model Check

## Settings for TabTransformer
emb_dim = model_config['cat_embedding_dim']
num_transformer_blocks = model_config['num_transformer_blocks']
num_heads = model_config['num_heads']
attn_dropout_rates = model_config['tf_dropout_rates']
ff_dropout_rates = model_config['ff_dropout_rates']
mlp_dropout_rates = model_config['mlp_dropout_rates']
mlp_hidden_units_factors = model_config['mlp_hidden_units_factors']

## Building Models
preprocessor = Preprocessor(numerical_columns, categorical_columns,
                            encoder_categories, emb_dim)

model = TabTransformer(numerical_columns, categorical_columns,
                       num_transformer_blocks, num_heads, emb_dim,
                       attn_dropout_rates, ff_dropout_rates,
                       mlp_dropout_rates, mlp_hidden_units_factors)

## Operation, Parameters and Model Structure Check
x_nums, x_cats = preprocessor(sample_data)
# Reshape with the tensor's own batch dimension: the sampled batch can hold fewer rows
# than exp_config['batch_size'], and reshaping to the configured size then raises.
x_nums = x_nums.reshape(x_nums.shape[0], len(numerical_columns))
y = model(x_nums, x_cats)
print('Numerical Input shape: ', x_nums.shape)
print('Categorical Input shape: ', x_cats.shape)
print('Output shape: ', y.shape)

print('# of Preprocessor parameters: ',
      sum(p.numel() for p in preprocessor.parameters() if p.requires_grad))
# The label used to read "N-BEATS", but the model counted here is the TabTransformer.
print('# of TabTransformer parameters: ',
      sum(p.numel() for p in model.parameters() if p.requires_grad))

model

## Training

In [None]:
def BCELoss_class_weighted(weights):
    """Build a class-weighted binary cross-entropy loss.

    `weights[0]` scales the negative-class term and `weights[1]` the positive-class
    term. The returned callable takes probabilities `input` and labels `target`,
    clamps the probabilities to [1e-7, 1 - 1e-7] so the logarithms stay finite, and
    returns the mean loss.
    """
    def loss(input, target):
        clipped = torch.clamp(input,min=1e-7,max=1-1e-7)
        positive_term = weights[1] * target * torch.log(clipped)
        negative_term = (1 - target) * weights[0] * torch.log(1 - clipped)
        return torch.mean(-positive_term - negative_term)
    return loss

In [None]:
## Loss Function
# Count classes on the outcome *Series*. `train_fold[outcome]` is a one-column DataFrame,
# whose value_counts() is keyed by a MultiIndex, so `[0]` / `[1]` were not plain label lookups.
class_counts = train_fold[outcome[0]].value_counts()
pos_weight = np.round(class_counts[0]/class_counts[1], 2)
weights = torch.FloatTensor([1, pos_weight])  #class weight
criterion = nn.BCELoss()
# criterion = BCELoss_class_weighted(weights)

## Optimizer and Learning Rate Scheduler
epochs = exp_config['train_epochs']
batch_size = exp_config['batch_size']
# Use the loader's true batch count. `len(train_fold) // batch_size` dropped the final
# partial batch (so the cosine schedule ended early) and was 0 for folds smaller than one
# batch, which makes T_max zero.
# NOTE(review): assumes the scheduler is stepped once per training batch - confirm in train_model.
steps_per_epoch = len(train_dl)

learning_rate = exp_config['learning_rate']
weight_decay = exp_config['weight_decay']
# The embedding tables live in the preprocessor, so both modules are optimised together.
params = list(preprocessor.parameters()) + list(model.parameters())
optimizer = torch.optim.AdamW(
    params=params,
    lr=learning_rate,
    weight_decay=weight_decay
)
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer=optimizer,
    T_max=epochs*steps_per_epoch
)

In [None]:
## Displaying Learning Rate
def lr_plot(lr_scheduler, steps):
    """Plot the learning rate `lr_scheduler` would produce over `steps` steps.

    The schedule is simulated on the scheduler's own optimizer and both are restored
    afterwards, so the objects can still be used for training. The previous version
    stepped a global `optimizer` and left the scheduler at the end of its schedule.

    Args:
        lr_scheduler: a torch LR scheduler (its `.optimizer` supplies the rates).
        steps: number of scheduler steps to simulate.
    """
    import copy      # local imports: only needed for the snapshot / warning guard
    import warnings

    optimizer = lr_scheduler.optimizer
    optimizer_snapshot = copy.deepcopy(optimizer.state_dict())
    scheduler_snapshot = copy.deepcopy(lr_scheduler.state_dict())

    lrs = []
    try:
        # The optimizer is deliberately not stepped (that could move parameters which
        # hold gradients); silence the resulting step-order warning instead.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            for _ in range(steps):
                lrs.append(optimizer.param_groups[0]["lr"])
                lr_scheduler.step()
    finally:
        lr_scheduler.load_state_dict(scheduler_snapshot)
        optimizer.load_state_dict(optimizer_snapshot)

    xs = list(range(1, steps + 1))
    plt.figure(figsize=(7,5))
    ax = sns.lineplot(x = xs, y = lrs)
    ax.set_xlabel('Steps')
    ax.set_ylabel('Learning Rate')
    ax.set_ylabel('Learning Rate')
    


## Create New Optimizer and Lr_scheduler
# Fresh AdamW + cosine schedule over the same parameters, then plot the schedule.
# NOTE(review): confirm that `lr_plot` leaves this scheduler un-advanced before these
# objects are used for training; if it consumes the schedule, recreate both after plotting.
optimizer = torch.optim.AdamW(
    params=params,
    lr=learning_rate,
    weight_decay=weight_decay
)
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer=optimizer,
    T_max=epochs*steps_per_epoch
)
lr_plot(lr_scheduler, epochs*steps_per_epoch)

In [None]:
# Calculate AUC
def auc_metric(predictions, targets):
    """Return the ROC AUC of `predictions` scored against `targets`.

    Both arguments are handed to sklearn's `roc_auc_score` unchanged (targets first),
    so they must already be array-likes that sklearn accepts.
    """
    from sklearn.metrics import roc_auc_score
    return roc_auc_score(targets, predictions)


In [None]:
## Function for the Model Training
def train_model(model, preprocessor,
                dl_dict, criterion,
                optimizer, lr_scheduler,
                num_epochs, best_model_path,
                patience, finalize=False):
    """Train ``model`` (fed by ``preprocessor``) with AUC-based early stopping.

    Parameters
    ----------
    model, preprocessor
        Modules switched between ``train()`` and ``eval()`` per phase. The
        preprocessor turns a raw batch into ``(x_nums, x_cats)``.
    dl_dict : dict
        Maps ``'train'`` (and ``'val'`` unless ``finalize``) to a DataLoader
        yielding ``(data, labels)``.
    criterion : callable
        Loss applied to ``(outputs, labels)``.
    optimizer, lr_scheduler
        Both are stepped once per training batch.
    num_epochs : int
        Maximum number of epochs.
    best_model_path : str
        Where the checkpoint with the best validation AUC is written.
    patience : int
        Training stops once the validation AUC has failed to improve for more
        than ``patience`` consecutive epochs.
    finalize : bool, default False
        If True only the training phase runs. No checkpoint is written in
        that mode, because saving happens in the validation phase.

    Returns
    -------
    tuple
        ``(model, preprocessor, losses, AUCs)``. ``model`` and
        ``preprocessor`` are the objects passed in, in their final-epoch
        state (the best checkpoint is not reloaded). ``losses`` and ``AUCs``
        map each phase to a list with one value per completed epoch.
    """
    ## Checking usability of GPU
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print(f'device: {device}')
    print('-------Start Training-------')
    model.to(device)

    ## Training only when finalizing, otherwise training followed by validation
    phases = ['train'] if finalize else ['train', 'val']

    losses = {phase: [] for phase in phases}
    AUCs = {phase: [] for phase in phases}
    best_auc = 0
    break_counter = 0
    break_flag = False
    for epoch in range(num_epochs):
        for phase in phases:
            is_train = phase == 'train'
            if is_train:
                preprocessor.train()
                model.train()
            else:
                preprocessor.eval()
                model.eval()

            epoch_loss = 0.0
            epoch_corrects = 0
            batch_outputs = []
            batch_labels = []
            for data, labels in tqdm(dl_dict[phase]):
                labels = labels.to(device)

                ## Optimizer Initialization
                optimizer.zero_grad()

                ## Forward Processing (gradients are tracked in the training phase only)
                with torch.set_grad_enabled(is_train):
                    x_nums, x_cats = preprocessor(data)
                    x_nums = x_nums.reshape(x_nums.shape[0], x_nums.shape[1])
                    outputs = model(x_nums.to(device), x_cats.to(device))
                    loss = criterion(outputs, labels)

                    ## Backward Processing and Optimization
                    if is_train:
                        loss.backward()
                        optimizer.step()
                        lr_scheduler.step()

                # Detach before storing so the autograd graph of every batch
                # is not kept alive until the end of the epoch.
                outputs = outputs.detach()
                preds = torch.where(outputs > 0.5, 1., 0.)

                epoch_loss += loss.item() * x_cats.size(0)
                epoch_corrects += torch.sum(preds == labels).item()
                batch_outputs.append(outputs)
                batch_labels.append(labels)

            n_samples = len(dl_dict[phase].dataset)
            epoch_loss = epoch_loss / n_samples
            losses[phase].append(epoch_loss)
            epoch_acc = epoch_corrects / n_samples
            raw_preds = torch.cat(batch_outputs).cpu().flatten().numpy()
            all_labels = torch.cat(batch_labels).cpu().flatten().numpy()
            epoch_auc = roc_auc_score(all_labels, raw_preds)
            AUCs[phase].append(epoch_auc)
            ## Displaying results
            print('Epoch {}/{} | {:^5} |  Loss: {:.4f} Acc: {:.4f} AUC: {:.4f} '.\
                  format(epoch+1, num_epochs, phase, epoch_loss, epoch_acc, epoch_auc))
            if phase == 'val':
                if epoch_auc > best_auc:  # save model with best auc
                    torch.save({
                        'epoch': epoch,
                        'model_state_dict': model.state_dict(),
                        # Stored as well so a checkpoint can restore the
                        # preprocessor that was trained alongside the model.
                        'preprocessor_state_dict': preprocessor.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'val loss': epoch_loss,
                        # Plain Python float keeps the checkpoint free of NumPy objects.
                        'val auc': float(epoch_auc)
                        }, best_model_path)
                    best_auc = epoch_auc  # update best score
                    break_counter = 0
                else:
                    break_counter += 1
                    if break_counter > patience:
                        break_flag = True
                        break  # leave the phase loop
        if break_flag:
            print('no more AUC improvement . . . . . . ')
            break  # leave the epoch loop

    return model, preprocessor, losses, AUCs

In [None]:
## Function for Plotting Losses
def plot_losses(losses, title=None, ylabel='Loss'):
    """Plot one curve per phase from a per-epoch metric history.

    Parameters
    ----------
    losses : dict
        Maps a phase name to a list with one value per epoch; it is converted
        to a DataFrame with one column per phase.
    title : str, optional
        Figure title. No title is set when omitted.
    ylabel : str, default 'Loss'
        Y-axis label, so the same helper can be used for other per-epoch
        metrics (for example AUC) without mislabelling the axis.
    """
    history = pd.DataFrame(losses)
    # Epochs are shown 1-based on the x-axis
    history.index = list(range(1, len(history) + 1))
    _, ax = plt.subplots(figsize=(7, 5))
    sns.lineplot(data=history, ax=ax)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(ylabel)
    ax.legend()
    if title is not None:
        ax.set_title(title)

In [None]:
# initiate model, optimiser and LR-scheduler instances
model = TabTransformer(numerical_columns, categorical_columns,
                           num_transformer_blocks, num_heads, emb_dim,
                           attn_dropout_rates, ff_dropout_rates,
                           mlp_dropout_rates, mlp_hidden_units_factors)

# Move the new model to the training device before building the optimizer, so that
# optimizer state restored from a checkpoint ends up on the same device as the weights.
train_device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model.to(train_device)

# Rebuild the parameter list from the freshly created model. A `params` list built
# before this point references the weights of the previous `model` object, so an
# optimizer created from it would never update this model.
params = list(preprocessor.parameters()) + list(model.parameters())
optimizer = torch.optim.AdamW(
    params=params,
    lr=learning_rate,
    weight_decay=weight_decay
)


continue_training = False #set to True if you want to continue training using a pretrained model

if continue_training: #load pretrained model
    best_model_path= '../MODELS/TabTransformerLong_22072024.tar' #specify pretrained model here
    checkpoint = torch.load(best_model_path, map_location=train_device)
    model.load_state_dict(checkpoint['model_state_dict']) #load saved parameter to the model
    optimizer.load_state_dict(checkpoint['optimizer_state_dict']) #load saved parameter to the optimiser
    epoch = checkpoint['epoch']
    loss = checkpoint['val loss']
    # NOTE(review): this rebinds the name `auc` imported from torcheval at the top of the notebook
    auc = checkpoint['val auc']

# The scheduler must drive the optimizer created above; a scheduler built for an
# earlier optimizer would leave this one at a constant learning rate.
# The schedule starts from its beginning, also when resuming from a checkpoint.
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer=optimizer,
    T_max=epochs*steps_per_epoch
)

In [None]:
##### Training
## train_model writes the checkpoint with the best validation AUC to best_model_path
# NOTE(review): patience is read from model_config, while the other hyper-parameters
# in this notebook come from exp_config - confirm this is intended.
best_model_path = '../MODELS/TabTransformerLong.tar'
model_trained, preprocessor_trained, losses, AUCs = train_model(
    model=model,
    preprocessor=preprocessor,
    dl_dict=dl_dict,
    criterion=criterion,
    optimizer=optimizer,
    lr_scheduler=lr_scheduler,
    num_epochs=epochs,
    best_model_path=best_model_path,
    patience=model_config['patience'],
)



In [None]:
## Per-epoch loss curves returned by train_model
plot_losses(losses=losses)

In [None]:
## Plot AUCs
plot_losses(AUCs, title='AUC per epoch')
# plot_losses labels the y-axis 'Loss' by default; relabel it for the AUC curves
plt.gca().set_ylabel('AUC')

In [None]:
## Save the weights currently held by `model` (state_dict only)
torch.save(
    obj=model.state_dict(),
    f='./OPCRD_ASTHMA/MODELS/tabTranasformer_seqLong_09072024.pt',
)

In [None]:
## Evaluate on the test set
# Follow the device the model lives on instead of hard-coding 'cuda'
eval_device = next(model.parameters()).device

# Inference mode: set both modules to eval() explicitly rather than relying on
# whatever mode an earlier cell left them in.
preprocessor.eval()
model.eval()

raw_preds, final_preds, all_labels = [], [], []
# no_grad: without it every batch output keeps its autograd graph alive
# for the whole pass over the test set.
with torch.no_grad():
    for data, labels in tqdm(test_dl):
        labels = labels.to(eval_device)
        x_nums, x_cats = preprocessor(data)
        x_nums = x_nums.reshape(x_nums.shape[0], x_nums.shape[1])
        outputs = model(x_nums.to(eval_device), x_cats.to(eval_device))
        preds = torch.where(outputs > 0.5, 1., 0.)  # hard labels at the 0.5 threshold
        raw_preds.append(outputs)
        final_preds.append(preds)
        all_labels.append(labels)

raw_preds = torch.cat(raw_preds).cpu().flatten().numpy()
final_preds = torch.cat(final_preds).cpu().flatten().numpy()
all_labels = torch.cat(all_labels).cpu().flatten().numpy()
print(roc_auc_score(all_labels, raw_preds))

In [None]:
## Confusion matrix of the thresholded test predictions against the true labels
confusion_matrix(y_true=all_labels, y_pred=final_preds)

In [None]:
## Plot Losses
# plot_losses(losses)

In [None]:
# NOTE(review): debugging leftover - displays whatever `data` is currently bound to,
# presumably the last batch from the test loop; confirm this cell is still needed.
data

In [None]:
import 