## Potential Differences

- transformers instead of transformers_custom
- new_xallFile instead of xallFile

In [1]:
import numpy as np
import random
import pickle
import scipy.sparse
from tqdm.auto import tqdm
from transformers_custom import BertForMaskedLM, BertConfig, AdamW, get_scheduler, BertForTokenClassification, BertTokenizerFast
import torch

In [2]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print (device)

cuda


### Data Read In

In [7]:
headerFile = "/gpfs/data/razavianlab/capstone/2021_ehr/preprocessed_headers.pkl"
mrnsFile = "/gpfs/data/razavianlab/capstone/2021_ehr/mrns_all_deid.pkl"
#xallFile = "/gpfs/data/razavianlab/capstone/2021_ehr/preprocessed_xall.npz"
new_xallFile = "/gpfs/data/razavianlab/capstone/2021_ehr/preprocessed_xall_with_outcome.npz"

headers = np.load(headerFile, allow_pickle=True)
data = scipy.sparse.load_npz(new_xallFile)

print (data.shape)
dementia_outcomes = data[:, -1]
data = data[:,:(data.shape[1]-1)]
print (data.shape)

dementia_outcomes = dementia_outcomes.toarray()

positive_indices = np.where(dementia_outcomes == 1)[0]
negative_indices = np.where(dementia_outcomes == 0)[0]

(10950619, 192330)
(10950619, 192329)


## Functions

In [10]:
class OurDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        #return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return {key: (val[idx].clone().detach()) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings["input_ids"])


def masking(input_ids, maskedTokenID, masked_precentage):
    # labelling
    inputs = {'input_ids':input_ids}
    labels = inputs["input_ids"].detach().clone()

    # create random array of floats with equal dimensions to input_ids tensor
    rand = torch.rand(inputs["input_ids"].shape)
    
    # create mask array
    mask_arr = (rand < masked_precentage) * (inputs["input_ids"] != 0) #* (inputs["input_ids"] != 102) * (inputs["input_ids"] != 0)
    non_mask_arr = ~mask_arr
    selection = []
    normal_selection = []
    # getting true (masked value) indexes
    for i in range(inputs["input_ids"].shape[0]):
        selection.append(
            torch.flatten(mask_arr[i].nonzero()).tolist()
          )
        normal_selection.append(
            torch.flatten(non_mask_arr[i].nonzero()).tolist()
          )

    # giving a new input mask input id of vocabSize
    for i in range(inputs["input_ids"].shape[0]):
        inputs["input_ids"][i, selection[i]] = maskedTokenID
        labels[i,normal_selection[i]] = -100

    inputs["labels"] = labels

    return inputs

def mapping(input_ids_tensor):
    
    old_max = torch.max(input_ids_tensor).item()
    # mapping input ids to lower numbers 
    #(i.e. if largest input id is 110,000. Then mapped to 99 if max seq length is 100)
    dict_keys = torch.unique(input_ids_tensor).tolist()
    dict_values = list(range(torch.unique(input_ids_tensor).shape[0]))
    d = dict(zip(dict_keys,dict_values))
    new_ids_tensor = input_ids_tensor.apply_(d.get)
    
    
    print("shifting values from max of: ", old_max, "to",dict_values[len(dict_values)-1])
    
    return new_ids_tensor, d

# creating a tensor of padded input id sequences
def input_ids_creator(data, MAX_SEQUENCE_LENGTH,MIN_SEQUENCE_LENGTH, N, negative_indices, positive_indices):
    final_list = []
    lens = []
    study_indexes = []

    negative_indices_sample = random.sample(list(negative_indices), data.shape[0]//N)
    
    indices_to_train_on = negative_indices_sample+list(positive_indices)

    # getting an input sequence from every 1000 patients
    for i in indices_to_train_on:

        patient_study_index = i
        row = data.indices[data.indptr[patient_study_index]:data.indptr[patient_study_index+1]]


        if (len(row)<MAX_SEQUENCE_LENGTH and len(row)>MIN_SEQUENCE_LENGTH):
            lens.append(len(row))
            final_list.append(torch.tensor(row))
            study_indexes.append(patient_study_index)

    emb_length = max(lens)
    print("embedding length", emb_length)
    print("number of patients", len(final_list), "out of a possible", str(data.shape[0]//N), "iterated on") 
    
    # stacking sequences and padding with zeroes
    input_ids_tensor = torch.stack([torch.cat([i, i.new_zeros(emb_length - i.size(0))], 0) for i in final_list],0)
    
    return input_ids_tensor, emb_length, study_indexes, final_list

def get_train_val_indexes(input_ids_tensor, precentage):
    indexes = list(range(input_ids_tensor.shape[0]))
    random.shuffle(indexes)
    max_train_ind = int(len(indexes)*.8)
    train_indexes = indexes[:max_train_ind]
    val_indexes = indexes[max_train_ind:len(indexes)]
    
    return train_indexes, val_indexes

def train_val_split(inputs, trainIDs, valIDs):
    
    train = {}
    val = {}
    
    train["input_ids"] = inputs['input_ids'][trainIDs,:]
    train["labels"] = inputs['labels'][trainIDs,:]
    
    val["input_ids"] = inputs['input_ids'][valIDs,:]
    val["labels"] = inputs['labels'][valIDs,:]
    
    return train,val

### Creating Input

In [11]:
MAX_SEQUENCE_LENGTH = 100 # max number of features
MIN_SEQUENCE_LENGTH = 0
N = 5 # divides dataset by N and loop through every patient

# creating a tensor of padded input id sequences
input_ids_tensor, emb_length, study_indexes, final_list  = input_ids_creator(data, MAX_SEQUENCE_LENGTH,MIN_SEQUENCE_LENGTH, N, negative_indices, positive_indices)

embedding length 99
number of patients 1799820 out of a possible 2190123 iterated on


In [13]:
# creating train, validation indexes
# .8 is the train/val split
trainIDs, valIDs = get_train_val_indexes(input_ids_tensor, .8)

### Mapping and Masking 

In [14]:
# maps feature ids to a lower integers to space model space
input_ids_tensor, mapping_dict = mapping(input_ids_tensor)

shifting values from max of:  192328 to 59840


In [16]:
masked_precentage = .3

# grabbing largest token ID
vocabSize = torch.max(input_ids_tensor).item()
# creating maskedToken
dimentia = vocabSize
maskedTokenID = vocabSize + 1

# creating labels and masking select input ids
inputs = masking(input_ids_tensor, maskedTokenID, masked_precentage)

print("maskedTokenID", maskedTokenID)

maskedTokenID 59841


### Train/Val Split

In [18]:
train_inputs, val_inputs = train_val_split(inputs, trainIDs, valIDs)
train_dataset = OurDataset(train_inputs)
val_dataset = OurDataset(val_inputs)

### BERT CONFIG

In [26]:
# config 
config = BertConfig(
    vocab_size=vocabSize+2,
    max_position_embeddings=emb_length,
    intermediate_size=3072,
    hidden_size=512,
    num_attention_heads=8,
    num_hidden_layers=8,
    #type_vocab_size=5,
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    #num_labels=3,
    position_embedding_type = None
)

# MLM
model = BertForMaskedLM(config)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-4)

# Steps and Epochs
NUM_EPOCHS = 1
BATCH_SIZE = 32

print (len(train_dataset), len(val_dataset))

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
print("sending model to ", device)

sending model to  cuda


### TRAINING

In [29]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [30]:
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True)
model.train()

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(59842, 512, padding_idx=0)
      (token_type_embeddings): Embedding(2, 512)
      (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=512, out_features=512, bias=True)
              (key): Linear(in_features=512, out_features=512, bias=True)
              (value): Linear(in_features=512, out_features=512, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=512, out_features=512, bias=True)
              (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inpla

In [None]:
# Single GPU
# model = BertForMaskedLM(config)
# print("sending model to ", device)

In [None]:
# model.train()

file = open("epochs.txt","w+")
file = open("epochs_val.txt", "w+")

train_losses = []
train_iterations = []

validation_losses = []
validation_iterations = []

for epoch in range(NUM_EPOCHS):
  
    loop = tqdm(train_dataloader, leave=True)
    iteration = 0
    
    total_validation_loss = 0.0
    total_train_loss = 0.0


    for batch in loop:

        outputs = model(input_ids = batch["input_ids"].long().to(device), labels=batch["labels"].long().to(device), return_dict=True)
        loss = outputs.loss
        loss.backward()
        
        # saving model
        if (iteration%50 == 0 or iteration == 0):
            train_iterations.append(iteration)
            train_losses.append(loss.item()/BATCH_SIZE)
            print (iteration, loss.item()/BATCH_SIZE)
            file = open("epochs.txt", "a")
            file.write(str(epoch) + " at iteration " + str(iteration) + "\n")
            file.write("loss " + str(loss.item()/BATCH_SIZE) + "\n")
            file.close()
            torch.save(model, 'model.pkl')
            
            trained_embedding_matrix = model.bert.embeddings.word_embeddings.weight.data
            torch.save(trained_embedding_matrix, 'trained_embedding_matrix.pkl')

        # step, zero grad
        optimizer.step()
        optimizer.zero_grad()
        
       
        # validation set
        if (iteration%500 == 0 or iteration == 0):
            print ("val iteration", iteration)
            validation_iterations.append(iteration)
            val_iteration = 0
            total_validation_loss = 0.0
            with torch.no_grad():
                for val_batch in val_dataloader:
                    val_iteration+=1
                    outputs = model(input_ids = val_batch["input_ids"].long().to(device), labels=val_batch["labels"].long().to(device), return_dict=True)
                    total_validation_loss += outputs.loss.detach()
            mean_validation_loss = total_validation_loss / len(val_dataset)
            print (total_validation_loss, len(val_dataset), mean_validation_loss)
            validation_losses.append(mean_validation_loss.item())
            print ("validation loss:", mean_validation_loss)
            file = open("epochs_val.txt", "a")
            #file.write(str(epoch) + " at iteration " + str(iteration) + "\n")
            file.write("loss " + str(mean_validation_loss) + "\n")
            file.write("iteration " + str(iteration) + "\n")
            file.close()              
        iteration += 1         
        gc.collect()
        torch.cuda.empty_cache()

file.close()

  0%|          | 0/44996 [00:00<?, ?it/s]

0 0.3470538854598999
val iteration 0
tensor(119747.5625, device='cuda:0') 359964 tensor(0.3327, device='cuda:0')
validation loss: tensor(0.3327, device='cuda:0')
50 0.17420728504657745
100 0.16393838822841644
150 0.2005435675382614
200 0.23440062999725342
250 0.19395661354064941
300 0.22425399720668793
350 0.18233589828014374
400 0.16007289290428162
450 0.21747060120105743
500 0.2001754194498062
val iteration 500
tensor(66478.0391, device='cuda:0') 359964 tensor(0.1847, device='cuda:0')
validation loss: tensor(0.1847, device='cuda:0')
550 0.17144235968589783
600 0.14796429872512817
650 0.17956772446632385
700 0.2062743902206421
750 0.22472381591796875
800 0.1879940927028656
850 0.17452885210514069
900 0.20086173713207245
950 0.16901443898677826
1000 0.22801579535007477
val iteration 1000


#### Saving iteration losses

In [35]:
with open('train_iterations.pkl','wb') as f:
    pickle.dump(train_iterations, f)
    
with open('train_losses.pkl','wb') as f:
    pickle.dump(train_losses, f)
    
with open('validation_iterations.pkl','wb') as f:
    pickle.dump(validation_iterations, f)
    
with open('validation_losses.pkl','wb') as f:
    pickle.dump(validation_losses, f)

### Plotting losses

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
sns.lineplot(validation_iterations, validation_losses)
plt.show()

sns.lineplot(train_iterations, train_losses)
plt.show()

In [None]:
validation_iterations

In [None]:
validation_losses

In [None]:
train_losses

### Multiple GPU Training Run

In [None]:
from parallel import DataParallelModel, DataParallelCriterion, gather
import gc

In [None]:
# Multiple GPUs

model = BertForMaskedLM(config)
#model= nn.DataParallel(model, device_ids=[0, 1, 2, 3, 4, 5, 6, 7],output_device=[0])

model= DataParallelModel(model)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
print("sending model to ", device)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=16, pin_memory=True)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=16, pin_memory=True)

In [None]:
model.train()

file = open("epochs.txt","w+")

train_losses = []
validation_losses = []

for epoch in range(NUM_EPOCHS):
  
    loop = tqdm(train_dataloader, leave=True)
    iteration = 0
    
    total_validation_loss = 0.0
    total_train_loss = 0.0


    for batch in loop:

    # forward, loss, backprop
        batch["input_ids"] = batch["input_ids"].cuda()
        batch["labels"] = batch["labels"].cuda()
        outputs = model(input_ids = batch["input_ids"].long().cuda(), labels=batch["labels"].long().cuda(), return_dict=True)
        outputs = gather(outputs, target_device=0)
        loss = outputs.loss.mean()
#         print (loss)
        total_train_loss += loss.item()
        loss.backward()
        print (loss)
        # saving model
        if (iteration%5 == 0):
            file = open("epochs.txt", "a")
            file.write(str(epoch) + " at iteration " + str(iteration) + "\n")
            file.write("loss " + str(loss) + "\n")
            file.close()
            torch.save(model, 'model.pkl')
            
            # save word embeddings after each check-in
            #trained_embedding_matrix = model.bert.embeddings.word_embeddings.weight.data
            trained_embedding_matrix = model.module.bert.embeddings.word_embeddings.weight.data
            torch.save(trained_embedding_matrix, 'trained_embedding_matrix.pkl')
        

        # step, zero grad
        optimizer.step()
        optimizer.zero_grad()
        iteration += 1
        
        # validation set
#         with torch.no_grad():
#             for val_batch in val_dataloader:
#                 outputs = model(input_ids = val_batch["input_ids"].long().to(device), labels=val_batch["labels"].long().to(device), return_dict=True)
#                 outputs = gather(outputs, target_device=0)
#                 val_loss = outputs.loss.mean()
#                 total_validation_loss += val_loss.item()
        
        del outputs
        gc.collect()
        torch.cuda.empty_cache()
        
    # loss calculations
    mean_train_loss = total_train_loss / len(train_dataset)
    mean_validation_loss = total_validation_loss / len(val_dataset)

    validation_losses.append(mean_validation_loss)
    train_losses.append(mean_train_loss)
        

file.close()

In [None]:
torch.cuda.empty_cache()

In [None]:
import matplotlib.pyplot as plt

plt.figure(dpi=125)

#plt.subplot(121)
plt.plot(train_losses, label='train')
plt.plot(validation_losses, label='validation')
plt.xlabel('Epochs')
plt.ylabel('loss')
plt.title('Losses')
plt.legend()
plt.tight_layout()
plt.show()

### Data Saving for DownStream Task

In [None]:
model.bert.embeddings.word_embeddings.weight.shape

In [None]:
# mapping_dict
trained_embedding_matrix = model.bert.embeddings.word_embeddings.weight.data

In [None]:
trained_embedding_matrix.shape

In [None]:
torch.save(trained_embedding_matrix, 'trained_embedding_matrix_LARGE.pkl')

In [31]:
with open('mapping_dict_LARGE.pkl','wb') as f:
    pickle.dump(mapping_dict, f)
with open('study_indexes_LARGE.pkl','wb') as f:
    pickle.dump(study_indexes, f)

In [32]:
final_list_modified = []

for x in final_list:
    final_list_modified.append([mapping_dict[i] for i in x.numpy()])

with open('final_list_modified_LARGE.pkl','wb') as f:
    pickle.dump(final_list_modified, f)

In [None]:
cd /gpfs/data/razavianlab/capstone/2021_ehr

In [None]:
with open('trained_embedding_matrix_LARGE.pkl','wb') as f:
    pickle.dump(trained_embedding_matrix, f)