In [None]:
# !pip3 install transformers
# !pip3 install accelerate
# !pip3 install datasets
# !pip3 install nltk


In [1]:
import pickle
import pandas as pd
import numpy as np

In [2]:
# Load the pickled code vocabularies. The context managers close the file
# handles deterministically instead of leaving them to the garbage collector.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open('../SeqModel/all_vocab_clinical_new.sav', 'rb') as vocab_file:
    vocab_clinical = pickle.load(vocab_file)
with open('../SeqModel/all_vocab_therapy_new.sav', 'rb') as vocab_file:
    vocab_therapy = pickle.load(vocab_file)

# Per-record clinical / therapy tables stored in Feather format.
data_clinical = pd.read_feather('../SeqModel/all_data_clinical_new.feather')
data_therapy = pd.read_feather('../SeqModel/all_data_therapy_new.feather')

# Pivot tables of clinical / therapy codes by country (contents not inspected in this notebook).
vocab_stat_clinical = pd.read_csv('../FinalData/pivotClinicalCodesbyCountry.csv')
vocab_stat_therapy = pd.read_csv('../FinalData/pivotTherapyCodesbyCountry.csv')

In [3]:
def remove_padding(x):
    """Return a copy of ``x`` with every entry equal to ``'PAD'`` removed.

    ``x`` is compared element-wise against the string ``'PAD'`` and the
    matching positions are deleted with ``np.delete``.
    """
    pad_positions = np.where(x == 'PAD')
    return np.delete(x, pad_positions)
# Strip the 'PAD' filler from each padded code sequence, reusing the
# remove_padding helper instead of repeating its body in a lambda.
data_clinical['read_code_seq_padded_noPAD'] = data_clinical.read_code_seq_padded.apply(remove_padding)
# One space-separated string of codes per record.
data_clinical['text'] = data_clinical['read_code_seq_padded_noPAD'].apply(lambda x: ' '.join(x))
# Same text with every '.' replaced by '^'.
data_clinical['text_no_dot'] = data_clinical.text.apply(lambda x: x.replace('.', '^'))

In [7]:
#Tokenizer NLTK
from tokenization_nltk import NlktTokenizer

In [None]:
# vocab = pickle.load(open('../SeqModel/all_vocab_clinical_new.sav', 'rb'))
# with open("../SeqModel/vocab_clinical_nltk.txt", "w") as txt_file:
#     for code in vocab:
#         txt_file.write("".join(code) + "\n") # works with any number of elements in a line

In [8]:
# Build the project-local NLTK-based tokenizer from the clinical vocabulary file.
tokenizerNLTK = NlktTokenizer(vocab_file='../SeqModel/vocab_clinical_nltk_new.txt', eos_token = "<s>")

# Smoke test on the record labelled 1, using the '^'-substituted text.
text = data_clinical.text_no_dot[1]
tokens = tokenizerNLTK.tokenize(text)
print("Tokens:", tokens) # prints the list of code tokens produced for this record


Tokens: ['42A^^', 'X77Wi', '44O^^', '423^^', '428^^', '42ZD^', '44J3^', 'Xab9D', 'XaEJK', '5372^', '892^^', '892^^', '8C15^', 'XaPkd', 'XaPVj', 'XaIrp', 'XaIvz', 'XaIrp', '9N42^', 'XaFBm', '9^^^^', 'XaZfY', '1713^', 'XaIqy', '9OJ^^', 'XaCH9', 'XaJYi', 'XaCJ0', 'XaORP', 'Ub1na', '8B314', 'XaQVY', 'H33^^', '136^^', 'XM0aD', '22K^^', 'XaZ4m', 'XaIyE', 'XaK6I', 'XaMiI', '2469^', '73050', 'XaIIW', 'XaKbt', '2431^', '8CA5^', '22A^^', 'XM1YA', '8795^', '246A^', 'XaIUi', 'XaQHq', '663g1', 'XaIQ0', 'XaIeq', 'XaY2V', 'XaINb', 'XaIfK', '242^^', '663Q^', 'XaIuD', '8CA4^', '246^^', 'XaPPD', 'XaLIn', 'XaEES', 'XaMiI', '44P^^', '44J3^', '44M4^', 'XaERu', '44I5^', 'XaEUq', 'XaELV', 'X77Wi', '44g1^', 'XM0lt', '44P5^', '44O^^', '44I4^', 'X77WP', 'XaLJx', '44F^^', 'XaK8y', '44Q^^', '44P6^', '1719^', '1719^', 'XaMiI', '1719^', 'XaMiI', 'Xa9Sm', 'XaMiI', '2431^', '1719^', '1719^', 'XaMiI', '535^^', 'XaMiI', 'H33^^', '3395^', '1719^', 'XaMiI', '9N4F^', 'XaIw3', 'Xa0Yg', '1719^', 'XaMiI', 'XaMiI', 'XaMiI', '

In [9]:
# Map every token to its vocabulary id; the tokens are passed through unchanged (no lower-casing).
token_ids = [tokenizerNLTK.convert_token_to_id(token) for token in tokens]
print(token_ids) # prints one integer id per token

[9, 36, 118, 43, 47, 433, 51, 29, 40, 276, 156, 156, 1111, 147, 105, 1659, 1709, 1659, 79, 3985, 41, 206, 924, 861, 738, 306, 3784, 459, 127, 67, 50, 104, 25, 45, 333, 3, 404, 371, 158, 27, 4, 391, 151, 30, 59, 208, 2, 502, 203, 5, 499, 152, 95, 145, 1, 128, 72, 63, 35, 7, 1737, 176, 34, 478, 322, 114, 27, 96, 51, 64, 33, 52, 49, 28, 36, 217, 22, 38, 118, 53, 26, 19, 69, 24, 115, 84, 238, 238, 27, 238, 27, 3533, 27, 59, 238, 238, 27, 189, 27, 25, 61, 238, 27, 122, 729, 2087, 238, 27, 27, 27, 729, 2087, 238, 27, 27, 27, 713, 238, 493, 91, 3621, 805, 235, 23, 27, 289, 6981, 391, 34, 4918, 180, 4, 5, 226, 18253, 526, 51, 24, 328, 180, 5, 27, 2087, 53, 729, 34, 36, 22, 52, 4]


In [10]:
# Keep only the first 1000 rows for this experiment.
# NOTE(review): this rebinds data_clinical, so the full frame is unavailable until it is re-read from disk.
data_clinical = data_clinical.iloc[:1000]
# dataset = data_clinical[['read_code_string']]
from datasets import load_dataset, DatasetDict, Dataset
# Wrap the 'text' column in a DatasetDict with a single 'train' split.
# NOTE(review): the tokenizer demo used 'text_no_dot' ('.' replaced by '^') while this uses 'text' —
# confirm which column matches the tokenizer vocabulary.
dataset = DatasetDict({'train': Dataset.from_pandas(data_clinical[['text']])})

In [11]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1000
    })
})

In [12]:
from transformers import AutoModel, AutoTokenizer

In [13]:
tokenizer_base = AutoTokenizer.from_pretrained("bert-base-cased")

In [14]:
# Encode the training text at position 1 with the pretrained bert-base-cased tokenizer.
# Note: this rebinds `tokens`, which previously held the NLTK tokenizer output.
tokens = tokenizer_base(dataset["train"]['text'][1]).input_ids
# [tokenizer_base.decode(id) for id in tokens]
     

Token indices sequence length is longer than the specified maximum sequence length for this model (638 > 512). Running this sequence through the model will result in indexing errors


In [15]:
# Stream the training texts in chunks of 1000 rows.
# The length must be taken on the 'train' split: len(dataset) on a DatasetDict
# is the number of splits, which would stop the generator after the first chunk.
training_corpus = (
    dataset['train'][i : i + 1000]["text"]
    for i in range(0, len(dataset['train']), 1000)
)

In [16]:
# Rebind training_corpus to the whole 'text' column of the train split
# (one string per record), superseding any earlier definition.
training_corpus = (
    dataset['train']["text"]
)

In [17]:
# Disabled alternative: derive the vocabulary size from the unique codes in the corpus.
# all_code_list = []
# for text in dataset['train']['text']:
#     all_code_list = all_code_list + text.split(' ')

# #total unique tokens in corpus
# vocabsize = len(set(all_code_list))
# Fixed target vocabulary size passed to the tokenizer trainer.
vocabsize = 3000
print(vocabsize)

3000


In [18]:
# Train a new tokenizer from tokenizer_base on the clinical-code corpus
# via train_new_from_iterator, targeting `vocabsize` vocabulary entries.
tokenizer = tokenizer_base.train_new_from_iterator(training_corpus, vocabsize)






In [19]:
tokenizer.vocab_size

3000

In [None]:
tokenizer.save_pretrained("../SeqModel/tokenizer_BERTEHR_08032024")

In [20]:
from transformers import BertConfig, BertForMaskedLM

# Compact BERT configuration for masked-language-model pretraining.
# The vocabulary size is taken from the NLTK tokenizer, not from the
# retrained `tokenizer` object.
config = BertConfig(
    hidden_size = 384,
    vocab_size= len(tokenizerNLTK.vocab),
    num_hidden_layers = 6,
    num_attention_heads = 6,
    intermediate_size = 1024,
    max_position_embeddings = 100
)

# Model built from the config only; no pretrained checkpoint is loaded here.
model = BertForMaskedLM(config=config)
print(model.num_parameters()) # total parameter count; varies with the tokenizer vocabulary size

37659425


In [None]:
data_clinical

In [None]:
tokenizer.mask_token

In [21]:
from transformers import DataCollatorForLanguageModeling

# Collator for masked-language modelling with a masking probability of 0.15,
# configured with the NLTK tokenizer.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizerNLTK, mlm=True, mlm_probability=0.15
)

2024-03-18 14:39:40.892796: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-18 14:39:40.927011: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-18 14:39:40.927040: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-18 14:39:40.927060: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-18 14:39:40.934529: I tensorflow/core/platform/cpu_feature_g

In [22]:
import torch
from torch.utils.data import Dataset
from accelerate import Accelerator, DistributedType

class LineByLineTextDataset(Dataset):
    """Tokenize the ``'text'`` column of ``raw_datasets``, one line per example.

    Args:
        tokenizer: Callable tokenizer. It is invoked with a list of strings plus
            the keyword arguments ``padding``, ``truncation``, ``max_length``
            and ``return_special_tokens_mask``.
        raw_datasets: Object exposing ``map`` and ``set_format``
            (presumably a ``datasets.DatasetDict`` — confirm against the caller).
        max_length: Length every sequence is padded / truncated to.
    """

    def __init__(self, tokenizer, raw_datasets, max_length: int):
        self.padding = "max_length"
        self.text_column_name = 'text'
        self.max_length = max_length
        self.accelerator = Accelerator(gradient_accumulation_steps=1)
        self.tokenizer = tokenizer

        # Run the tokenization under the accelerator's main_process_first() context.
        with self.accelerator.main_process_first():
            self.tokenized_datasets = raw_datasets.map(
                self.tokenize_function,
                batched=True,
                num_proc=4,
                remove_columns=[self.text_column_name],
                desc="Running tokenizer on dataset line_by_line",
            )
            # Only 'input_ids' is exposed, formatted as torch long tensors.
            self.tokenized_datasets.set_format('torch',columns=['input_ids'],dtype=torch.long)

    def tokenize_function(self, examples):
        """Tokenize one batch, skipping empty and whitespace-only lines.

        Args:
            examples: Mapping holding a list of strings under ``'text'``.

        Returns:
            Whatever ``self.tokenizer`` returns for the retained lines.
        """
        # Test the whole line rather than its first character: ``line[0]``
        # raises IndexError on an empty string and wrongly discards lines
        # that merely start with a space.
        examples[self.text_column_name] = [
            line for line in examples[self.text_column_name]
            if len(line) > 0 and not line.isspace()
        ]
        return self.tokenizer(
            examples[self.text_column_name],
            padding=self.padding,
            truncation=True,
            max_length=self.max_length,
            return_special_tokens_mask=True,
        )

    def __len__(self):
        # Delegates to the mapped object.
        # NOTE(review): if raw_datasets is a DatasetDict this is the number of splits, not rows — confirm intent.
        return len(self.tokenized_datasets)

    def __getitem__(self, i):
        # Delegates indexing to the mapped object (e.g. a split name such as 'train').
        return self.tokenized_datasets[i]

In [24]:
# Tokenize the raw dataset with the NLTK tokenizer, padding / truncating to 50 tokens.
tokenized_dataset_train = LineByLineTextDataset(
    tokenizer= tokenizerNLTK,
    raw_datasets = dataset,
    max_length=50, # adjust this based on your requirements
)



Running tokenizer on dataset line_by_line (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

AttributeError: 'NlktTokenizer' object has no attribute '_in_target_context_manager'

In [27]:
def age_vocab(max_age, mon=1, symbol=None):
    """Build forward and reverse lookup tables for age tokens.

    Args:
        max_age: Upper bound on age, in years.
        mon: ``12`` creates one entry per year (``max_age`` entries);
            ``1`` creates one entry per month (``max_age * 12`` entries).
            Any other value makes the function return ``(None, None)``.
        symbol: Special tokens placed at the first indices; defaults to
            ``['PAD', 'UNK']``.

    Returns:
        ``(age2idx, idx2age)``: a dict from token string to index and its
        inverse, or ``(None, None)`` for an unsupported ``mon``.
    """
    specials = ['PAD', 'UNK'] if symbol is None else symbol
    offset = len(specials)

    age2idx = {}
    idx2age = {}

    # Special tokens occupy indices 0 .. offset - 1.
    for pos in range(offset):
        label = str(specials[pos])
        age2idx[label] = pos
        idx2age[pos] = label

    if mon == 12:
        n_buckets = max_age
    elif mon == 1:
        n_buckets = max_age * 12
    else:
        return None, None

    # Age buckets follow directly after the special tokens.
    for bucket in range(n_buckets):
        age2idx[str(bucket)] = offset + bucket
        idx2age[offset + bucket] = str(bucket)
    return age2idx, idx2age

# Notebook-wide settings. Only 'max_age', 'month' and 'age_symbol' are read
# in this cell (by the age_vocab call below).
global_params = {
    'max_seq_len': 64,
    'max_age': 110,
    'month': 1,
    'age_symbol': None,
    'min_visit': 5,
    'gradient_accumulation_steps': 1
}

# With month=1 the vocabulary has one entry per month of age (110 * 12 entries)
# after the default 'PAD' / 'UNK' symbols; the reverse mapping is discarded.
ageVocab, _ = age_vocab(max_age=global_params['max_age'], mon=global_params['month'], symbol=global_params['age_symbol'])

In [None]:
ageVocab

In [25]:
tokenizer._in_target_context_manager

False

In [None]:
tokenized_dataset_train['train']['input_ids']

In [None]:
dataset['train']['text'][5]

In [None]:
tokenized_dataset_train['train'][5]['input_ids']

In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration for MLM pretraining: 500 optimisation steps, with
# evaluation and logging every 100 steps.
# NOTE(review): output_dir is the same '../SeqModel' directory the input data is read from — confirm this is intended.
# NOTE(review): hub_model_id / hub_private_repo presumably have no effect while push_to_hub=False — confirm.
training_args = TrainingArguments(
    output_dir="../SeqModel",
    overwrite_output_dir=True,
    push_to_hub=False,
    hub_model_id="asthmaAttack",
    learning_rate=1e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    max_steps=500,
    eval_steps=100,
    logging_steps=100,
    weight_decay=0.01,
    evaluation_strategy="steps",
    save_strategy="steps",
    report_to='none',
    hub_private_repo = True,
    load_best_model_at_end=True,
)

# NOTE(review): eval_dataset is the training split, so the evaluation loss is
# not a held-out estimate.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset_train['train'],
    eval_dataset= tokenized_dataset_train['train'], # change to your actual evaluation dataset
    )



In [None]:
# Run MLM pretraining, then write the resulting model to disk.
trainer.train()
trainer.save_model("../SeqModel/transformer_08032024")

In [None]:
# save_pretrained() requires a target directory and raises TypeError without one.
# Write to the same directory that trainer.save_model uses.
model.save_pretrained("../SeqModel/transformer_08032024")

In [None]:
results = trainer.evaluate()

In [None]:
import math

# Perplexity is reported as exp of the 'eval_loss' entry returned by trainer.evaluate().
print(f">>> Perplexity: {math.exp(results['eval_loss']):.2f}")

In [None]:
model.base_model.embeddings.word_embeddings.weight

In [None]:
model

In [None]:
modelDownstream

In [None]:
len(tokenizer.vocab.keys())

In [None]:
# for code in all_code_list:
#     if ('Op' in code):
#         print(code)

In [None]:
tokenizer.vocab_size

In [None]:
# set(all_code_list)

In [None]:
tokenizer.vocab.keys()

# Fine-tune for downstream classification

In [None]:
from datasets import load_dataset

In [None]:
# NOTE(review): this public Yelp dataset is never tokenized below — `dataset` is rebound to the
# clinical DatasetDict before tokenization. Looks like tutorial leftover; confirm it can be dropped.
dataset = load_dataset('yelp_review_full')

In [None]:
dataset

In [None]:
# Re-read the full clinical table (data_clinical was truncated to 1000 rows earlier).
data_clinical = pd.read_feather('../SeqModel/all_data_clinical_new.feather')
# Reuse the remove_padding helper instead of repeating the PAD-stripping lambda.
data_clinical['read_code_seq_padded_noPAD'] = data_clinical.read_code_seq_padded.apply(remove_padding)
data_clinical['text'] = data_clinical['read_code_seq_padded_noPAD'].apply(lambda x: ' '.join(x))
# Rows at positions 100000-199999 form the pool for downstream fine-tuning.
train_downstream = data_clinical.iloc[100000:200000]


In [None]:
from sklearn.model_selection import train_test_split
# 80/20 train/validation split, stratified on the '12months' column, with a fixed random seed.
trainingData, validationData = train_test_split(train_downstream, test_size=0.2, stratify=train_downstream['12months'], random_state=1234)

In [None]:
trainingData.shape

In [None]:
# Expose the '12months' outcome under the column name 'label'.
# assign() returns new frames, so no column is written into a frame derived
# from another one (which can raise pandas' SettingWithCopyWarning).
trainingData = trainingData.assign(label=trainingData['12months'])
validationData = validationData.assign(label=validationData['12months'])

In [None]:
# dataset = data_clinical[['read_code_string']]
from datasets import load_dataset, DatasetDict, Dataset
# Build train / test splits holding only 'label' and 'text'.
# preserve_index=False keeps the shuffled pandas index from being stored as an
# extra '__index_level_0__' feature.
dataset = DatasetDict({'train': Dataset.from_pandas(trainingData[['label', 'text']], preserve_index=False),
                      'test': Dataset.from_pandas(validationData[['label', 'text']], preserve_index=False)})

In [None]:
dataset

In [None]:
from transformers import PreTrainedTokenizerFast
# Load a fast tokenizer from the serialized tokenizer.json in the tokenizer_BERTEHR_08032024 directory.
tokenizer = PreTrainedTokenizerFast(tokenizer_file="../SeqModel/tokenizer_BERTEHR_08032024/tokenizer.json")

In [None]:
# Register '[PAD]' as the padding token; the tokenization below pads to a fixed length.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

In [None]:
import torch
from torch.utils.data import Dataset
from accelerate import Accelerator, DistributedType

class LineByLineTextDataset(Dataset):
    """Tokenize the ``'text'`` column of ``raw_datasets``, one line per example.

    Variant that does not request the special-tokens mask from the tokenizer.

    Args:
        tokenizer: Callable tokenizer. It is invoked with a list of strings plus
            the keyword arguments ``padding``, ``truncation``, ``max_length``
            and ``return_special_tokens_mask``.
        raw_datasets: Object exposing ``map`` and ``set_format``
            (presumably a ``datasets.DatasetDict`` — confirm against the caller).
        max_length: Length every sequence is padded / truncated to.
    """

    def __init__(self, tokenizer, raw_datasets, max_length: int):
        self.padding = "max_length"
        self.text_column_name = 'text'
        self.max_length = max_length
        self.accelerator = Accelerator(gradient_accumulation_steps=1)
        self.tokenizer = tokenizer

        # Run the tokenization under the accelerator's main_process_first() context.
        with self.accelerator.main_process_first():
            self.tokenized_datasets = raw_datasets.map(
                self.tokenize_function,
                batched=True,
                num_proc=4,
                remove_columns=[self.text_column_name],
                desc="Running tokenizer on dataset line_by_line",
            )
            # Format the tokenizer output, not 'text': that column has just been
            # removed by remove_columns, so naming it here would fail.
            # NOTE(review): other columns (e.g. labels) are hidden by this format — confirm what callers need.
            self.tokenized_datasets.set_format('torch',columns=['input_ids'],dtype=torch.long)

    def tokenize_function(self, examples):
        """Tokenize one batch, skipping empty and whitespace-only lines.

        Args:
            examples: Mapping holding a list of strings under ``'text'``.

        Returns:
            Whatever ``self.tokenizer`` returns for the retained lines.
        """
        # Test the whole line rather than its first character: ``line[0]``
        # raises IndexError on an empty string and wrongly discards lines
        # that merely start with a space.
        examples[self.text_column_name] = [
            line for line in examples[self.text_column_name]
            if len(line) > 0 and not line.isspace()
        ]
        return self.tokenizer(
            examples[self.text_column_name],
            padding=self.padding,
            truncation=True,
            max_length=self.max_length,
            return_special_tokens_mask=False,
        )

    def __len__(self):
        # Delegates to the mapped object.
        # NOTE(review): if raw_datasets is a DatasetDict this is the number of splits, not rows — confirm intent.
        return len(self.tokenized_datasets)

    def __getitem__(self, i):
        # Delegates indexing to the mapped object (e.g. a split name such as 'train').
        return self.tokenized_datasets[i]

In [None]:
# tokenized_datasets = LineByLineTextDataset(
#     tokenizer= tokenizer,
#     raw_datasets = dataset,
#     max_length=50, # adjust this based on your requrements
# )


In [None]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` entries of a batch with the module-level ``tokenizer``.

    Sequences are padded and truncated to exactly 50 tokens.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", max_length=50, truncation=True)
    return encoded

# Apply tokenize_function to `dataset` in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
from transformers import AutoModelForSequenceClassification
# Load the checkpoint from the transformer_08032024 directory as a 2-label sequence classifier.
# NOTE(review): that checkpoint is saved from a masked-LM model, so the classification head is presumably freshly initialised — confirm.
modelDownstream = AutoModelForSequenceClassification.from_pretrained("../SeqModel/transformer_08032024/", num_labels=2)

In [None]:
# from transformers import TrainingArguments
# training_args = TrainingArguments(output_dir="../SeqModel/testResult")

In [None]:
import numpy as np
import evaluate
metric = evaluate.load('accuracy')

from sklearn.metrics import roc_auc_score

def compute_metrics(eval_pred):
    """Compute ROC AUC for a binary classifier from one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` is assumed to be
            an ``(n_samples, 2)`` array of class logits, matching ``num_labels=2``
            of the downstream model — TODO confirm.

    Returns:
        dict with the single key ``"AUC"``.
    """
    predictions, labels = eval_pred
    predictions = np.asarray(predictions)
    # Rank by a continuous score rather than argmax labels: hard 0/1 predictions
    # collapse the ROC curve to a single operating point. The logit difference is
    # monotonic in P(class 1), so it gives the same AUC as the softmax probability.
    positive_score = predictions[:, 1] - predictions[:, 0]
    auc = roc_auc_score(y_true=labels, y_score=positive_score)
    return {"AUC": auc}

In [None]:
# NOTE(review): load_metric is deprecated in recent `datasets` releases in favour of
# evaluate.load — confirm against the installed version.
from datasets import load_metric
metric = load_metric('accuracy')

def compute_metrics(eval_pred):
    """Compute accuracy with the module-level ``metric`` object.

    Note: this definition replaces any earlier ``compute_metrics`` in the
    notebook namespace.

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``predictions`` holds
            per-class scores and is reduced with argmax over axis 1.

    Returns:
        The result of ``metric.compute`` for the predicted class ids.
    """
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    # No ROC AUC is computed here: its value was never part of the returned
    # result, and the call could fail when only one class is present.
    return metric.compute(predictions=predictions, references=labels)

In [None]:
from transformers import DataCollatorWithPadding
# Collator built on the fast tokenizer; this rebinds `data_collator`, replacing the MLM collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

from transformers import Trainer, TrainingArguments

# Fine-tuning configuration: 500 optimisation steps, with evaluation and logging every 100 steps.
# NOTE(review): output_dir is the same '../SeqModel' directory the input data is read from — confirm this is intended.
training_args = TrainingArguments(
    output_dir="../SeqModel",
    overwrite_output_dir=True,
    push_to_hub=False,
    hub_model_id="asthmaAttackDownstream",
    learning_rate=1e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    max_steps=500,
    eval_steps=100,
    logging_steps=100,
    weight_decay=0.01,
    evaluation_strategy="steps",
    save_strategy="steps",
    report_to='none',
    load_best_model_at_end=True,
)

# Train on the 'train' split and evaluate on the 'test' split of the tokenized data.
trainer = Trainer(
    model=modelDownstream,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
    eval_dataset= tokenized_datasets['test'], # change to your actual evaluation dataset
    compute_metrics=compute_metrics,
    )



In [None]:
trainer.train()

In [None]:
tokenizer.vocab.keys()

# Embedding visualisation

In [None]:
model.base_model.embeddings.word_embeddings.weight.shape

In [None]:
def tune_figure(ax, title:str='Title'):
    """Hide the axes of ``ax``, set its title and restyle its legend.

    Reads the module-level ``tfont`` (keyword arguments for the title) and
    ``lfont`` (``'fontname'`` and ``'fontsize'`` for the legend text).
    """
    ax.axis('off')
    ax.set_title(title, **tfont)

    # Fetch the legend once and apply all tweaks to it.
    legend = ax.get_legend()
    legend.set_title("")
    legend_font = legend.prop
    legend_font.set_family(lfont['fontname'])
    legend_font.set_size(lfont['fontsize'])
    legend.get_frame().set_linewidth(0.0)
    
# NOTE(review): `plt`, `sns`, `projected`, `tfont` and `lfont` are not defined in any cell shown
# in this notebook — this cell presumably depends on state from elsewhere; confirm before running.
# Two side-by-side panels showing the same 2-D projection, coloured by 'ddc1' and 'ddc2'.
f, axs = plt.subplots(1,2,figsize=(14,6))
axs = axs.flatten()

sns.scatterplot(data=projected, x='Dim 1', y='Dim 2', hue='ddc1', s=5, alpha=0.1, ax=axs[0]);
tune_figure(axs[0], 'DDC1 Group')

sns.scatterplot(data=projected, x='Dim 1', y='Dim 2', hue='ddc2', s=5, alpha=0.1, ax=axs[1]);
tune_figure(axs[1], 'DDC2 Group')

# If you want to save the output then uncomment the next line
#plt.savefig(os.path.join('data','DDC_Plot.png'), dpi=150)
plt.show()