In [2]:
%load_ext autoreload
%autoreload 2

from src.data.medal import MeDALSubset
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch
import yaml
from src.models.trainer import ModelTrainer
from src.vectorizer.trainable import TrainableEmbedding
from sklearn.decomposition import TruncatedSVD
from tqdm import tqdm
from src.utils import save_embeddings_to_file
import pyarrow.parquet as pq
import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Build the MeDAL subset wrapper and load every split it exposes.
medal_dataset = MeDALSubset('MeDAL')
data, train_data, val_data, test_data = medal_dataset.load_dataset()
class_to_idx = medal_dataset.class_to_idx

# Drop only the names this notebook never reads again. `medal_dataset` is kept
# alive on purpose: later cells assign `medal_dataset.val_data` and call
# `medal_dataset.embed(...)`, which would raise NameError on a fresh
# top-to-bottom run if the object were deleted here.
del data, test_data

MeDAL dataset initialized with name: MeDAL
Dataset downloaded to: /Users/prashanthjaganathan/.cache/kagglehub/datasets/xhlulu/medal-emnlp/versions/4
Dataset moved to: /Users/prashanthjaganathan/Desktop/CS6120 - NLP/pretaining-language-models-for-medical-text/dataset
Total number of classes: 22555


In [10]:
%load_ext autoreload
%autoreload 2

def load_config(path):
    """Read a YAML file and return its parsed content.

    Args:
        path: Location of the YAML file to read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file's content.
    """
    # Pin the encoding so the result does not depend on the platform's
    # locale default (open() otherwise uses the locale's preferred encoding).
    with open(path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)

# Project-wide settings, parsed once from the YAML file and indexed by later cells.
config = load_config(path='config/config.yaml')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# processed_train = medal_dataset.preprocess(['train', 'valid'])
# NOTE: preprocessing took 503 minutes

In [None]:
# train_df, val_df = processed_train

# Save to CSV
# NOTE: deliberately commented out. This code is very sensitive: the target files hold a huge corpus of preprocessed data.
# DO NOT OVERWRITE THE FILES
# train_df.to_csv("dataset/medal/preprocessed_subset/train.csv", index=False)
# val_df.to_csv("dataset/medal/preprocessed_subset/valid.csv", index=False)

# print("CSV files saved successfully!")


CSV files saved successfully!


## Read and load pre-processed dataset

In [4]:
# Only the validation split is restored here. The equivalent for the training
# split is kept for reference:
# medal_dataset.train_data = pd.read_csv('dataset/medal/preprocessed_subset/train.csv')

# Attach the preprocessed frame straight to the dataset object so that no
# temporary module-level name has to be created and deleted afterwards.
medal_dataset.val_data = pd.read_csv('dataset/medal/preprocessed_subset/valid.csv')
print(len(medal_dataset.val_data))

1000000


In [None]:
# val_tokens = medal_dataset.tokenize('nltk', splits=['valid'])
# file_name = "dataset/medal/nltk_tokenized_subset/valid.parquet"
# val_tokens.to_frame().to_parquet(file_name)
# print('Parquet file saved successfully!')

# type(val_tokens)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=125000), Label(value='0 / 125000')…

pandas.core.series.Series

In [63]:
# Read the tokenized training split; squeeze() turns the loaded frame into a
# Series (the file is expected to hold a single column).
train_tokens = pd.read_parquet(
    "dataset/medal/nltk_tokenized_subset/train.parquet",
    engine="pyarrow",
).squeeze()

# Turn every stored document into a plain Python list -> list[list[str]].
tokenized_train_corpus = [
    document.tolist()
    for document in tqdm(train_tokens, desc='Docs', total=len(train_tokens))
]
print(f'Number of documents in train corpus: {len(tokenized_train_corpus)}')

Docs: 100%|██████████| 3000000/3000000 [00:19<00:00, 155347.46it/s]

Number of documents in train corpus: 3000000





In [8]:
# Read the tokenized validation split; squeeze() turns the loaded frame into a
# Series (the file is expected to hold a single column).
val_tokens = pd.read_parquet(
    "dataset/medal/nltk_tokenized_subset/valid.parquet",
    engine="pyarrow",
).squeeze()

# Turn every stored document into a plain Python list -> list[list[str]].
tokenized_val_corpus = [
    document.tolist()
    for document in tqdm(val_tokens, desc='Docs', total=len(val_tokens))
]
print(f'Number of documents in val corpus: {len(tokenized_val_corpus)}')

Docs: 100%|██████████| 1000000/1000000 [00:03<00:00, 264271.45it/s]


Number of documents in val corpus: 1000000


### Training Word2Vec model on the entire corpus

In [None]:
# Word2Vec configuration: 100-dimensional vectors, context window of 5,
# min_count of 2. The same corpus object is used to build the model and to
# produce the embeddings.
embedding_model = TrainableEmbedding(
    tokenized_corpus=tokenized_train_corpus,
    algorithm="word2vec",
    vector_size=100,
    window=5,
    min_count=2,
)
embeddings = embedding_model.embed(tokenized_train_corpus)

# Report the length of the first vector of the first document.
print(f'Embedding dimensions: {len(embeddings[0][0])}')

### Training FastText model on the entire corpus

In [None]:
# FastText configuration: 100-dimensional vectors, context window of 7,
# min_count of 2, built from the tokenized training corpus.
embedding_model = TrainableEmbedding(
        tokenized_corpus=tokenized_train_corpus,
        algorithm="fasttext",
        vector_size=100,
        window=7,
        min_count=2
    )
# Embed the same training corpus and report the length of the first vector
# of the first document.
embeddings = embedding_model.embed(tokenized_train_corpus)
print(f'Embeddings Dimensions: {len(embeddings[0][0])}')
# NOTE(review): the embeddings above come from the *train* corpus, yet the
# target file name says "val" — confirm which one is intended before relying
# on this file.
save_embeddings_to_file(embeddings, "embeddings/fasttext_val_embeddings.h5")

Reducing the embedding dimensionality with Truncated Singular Value Decomposition (SVD)

In [None]:
# Convert the token Series to a list once and reuse it for both building the
# model and embedding. The original converted the full training Series twice,
# doubling the work and the temporary memory. Passing one corpus object to
# both calls mirrors the Word2Vec and FastText cells.
tfidf_train_docs = train_tokens.tolist()

embedding_model = TrainableEmbedding(
    tokenized_corpus=tfidf_train_docs,
    algorithm="tfidf",
    vector_size=100,
    window=5,
    min_count=2,
)
embeddings = embedding_model.embed(tfidf_train_docs)

# Unlike the other embedding cells, the result here exposes `.shape`.
print(f'Embedding dimensions: {embeddings.shape}')

In [None]:
# NOTE: the bio_wordvec model is roughly 12 GB and could not be loaded into memory to build embeddings

# Embed the training split with the pretrained bio_wordvec binary, reusing the
# tokens that were loaded earlier.
train_embeddings = medal_dataset.embed(
    'bio_wordvec',
    splits=['train'],
    tokenized_data=train_tokens,
    model_path='trained_models/embeddings/pretrained/bio_wordvec.bin',
)

# Displayed as the cell result.
len(train_embeddings)

We are trying to avoid BioBERT because it is a transformer model and our architecture is limited to
LSTM + Self-Attention; therefore, we are looking at static embedding models only.

In [None]:
# Embed the training split with the BioBERT checkpoint named below.
train_embeddings = medal_dataset.embed(
    'bio_bert',
    splits=['train'],
    model_name='dmis-lab/biobert-base-cased-v1.1',
)

# Displayed as the cell result.
len(train_embeddings)

In [5]:
# Rebind the name to an empty list, so the previously built corpus is no
# longer reachable through it.
# NOTE(review): the later `TrainableEmbedding(tokenized_corpus=tokenized_train_corpus, ...)`
# call receives this empty list when the cells run in order — confirm that is intended.
tokenized_train_corpus = []

## Perform Model Training

First, let's create the dataloader with embeddings as features and labels.

In [15]:
import random


class LazyEmbeddingDataset(Dataset):
    """Dataset that embeds one tokenized document per requested item.

    The tokenized documents are read from the ``TEXT`` column of a parquet
    file when the dataset is created; the embedding of a document is computed
    inside ``__getitem__`` each time that item is requested.
    """

    def __init__(self, file_path, trainable_embed_model, labels, class_to_idx,
                 max_seq_len=None, embedding_dim=None):
        """
        Args:
            file_path (str): Parquet file whose ``TEXT`` column holds the
                tokenized documents.
            trainable_embed_model: Object exposing ``embed(docs)``. It is
                called with a one-document list and the first element of the
                result is used as that document's sequence of token vectors.
            labels: One label per document, in the same order as the rows of
                the parquet file. A pandas Series is read by position
                (``.iloc``); any other sequence is read with plain indexing.
            class_to_idx (dict): Mapping from class label to integer index.
            max_seq_len (int, optional): If provided, every item is padded
                with zero vectors or truncated to exactly this many rows.
            embedding_dim (int, optional): Width of one token vector. It is
                only needed to build an item for a document that yields no
                vectors; when omitted it is taken from the first non-empty
                document that is requested.
        """
        self.file_path = file_path
        table = pq.read_table(self.file_path)
        self.tokenized_corpus = table['TEXT']

        self.trainable_embed_model = trainable_embed_model
        self.labels = labels
        self.class_to_idx = class_to_idx
        self.max_seq_len = max_seq_len
        self.embedding_dim = embedding_dim

    def __len__(self):
        """Number of items, defined by the number of labels."""
        return len(self.labels)

    def _label_at(self, idx):
        """Return the label stored at *position* ``idx``.

        ``series[idx]`` on a pandas Series looks ``idx`` up in the index, which
        fails or returns the wrong row when the index is not 0..n-1 (for
        example after subsetting a larger frame). ``.iloc`` is positional and
        therefore matches the row order of the parquet file.
        """
        labels = self.labels
        if hasattr(labels, 'iloc'):
            return labels.iloc[idx]
        return labels[idx]

    def __getitem__(self, idx):
        """Embed document ``idx`` and return ``(embeddings, mask, label_index)``.

        Returns:
            tuple of three tensors:
                * float32 embeddings with one row per token vector
                  (``max_seq_len`` rows when ``max_seq_len`` is set),
                * float32 mask with 1 for real rows and 0 for padding rows,
                * int64 scalar holding the class index of the document.

        Raises:
            ValueError: If the document yields no vectors and the embedding
                width is not known yet.
        """
        # Arrow scalar -> plain Python value for the embedding model.
        tokens = self.tokenized_corpus[idx].as_py()
        # Embed this single document on the fly.
        vectors = self.trainable_embed_model.embed([tokens])[0]
        embedding_seq = np.asarray(vectors, dtype=np.float32)
        seq_len = len(embedding_seq)

        if seq_len:
            embedding_dim = embedding_seq.shape[1]
            if self.embedding_dim is None:
                # Remember the width so a later empty document can be built.
                self.embedding_dim = embedding_dim
        else:
            # An empty document has no first vector to measure.
            if self.embedding_dim is None:
                raise ValueError(
                    f"Document {idx} produced no embeddings and the embedding "
                    "dimension is unknown; pass embedding_dim to LazyEmbeddingDataset."
                )
            embedding_dim = self.embedding_dim
            embedding_seq = np.zeros((0, embedding_dim), dtype=np.float32)

        if self.max_seq_len is None:
            # No fixed length requested: every row is a real token.
            mask = np.ones(seq_len, dtype=np.float32)
        else:
            # One code path for padding and truncation: copy the rows that
            # fit into a zero-filled buffer and mark exactly those rows.
            kept = min(seq_len, self.max_seq_len)
            padded = np.zeros((self.max_seq_len, embedding_dim), dtype=np.float32)
            padded[:kept] = embedding_seq[:kept]
            mask = np.zeros(self.max_seq_len, dtype=np.float32)
            mask[:kept] = 1.0
            embedding_seq = padded

        label_idx = self.class_to_idx[self._label_at(idx)]

        return (torch.tensor(embedding_seq, dtype=torch.float32),
                torch.tensor(mask, dtype=torch.float32),
                torch.tensor(label_idx, dtype=torch.int64))



def create_lazy_dataloader(file_path, trainable_embed_model, labels, class_to_idx, batch_size,
                           max_seq_len=None, max_samples=1000000, shuffle=True):
    """Build a DataLoader over the leading items of a LazyEmbeddingDataset.

    Args:
        file_path: Parquet file passed through to ``LazyEmbeddingDataset``.
        trainable_embed_model: Embedding model passed through to the dataset.
        labels: Labels passed through to the dataset.
        class_to_idx (dict): Label-to-index mapping passed through to the dataset.
        batch_size (int): Batch size of the returned loader.
        max_seq_len (int, optional): Passed through to the dataset.
        max_samples (int, optional): Upper bound on the number of leading
            items exposed by the loader. ``None`` exposes the whole dataset.
            The default keeps the previous hard-coded limit of 1,000,000.
        shuffle (bool): Whether the loader reshuffles the items every epoch.

    Returns:
        torch.utils.data.DataLoader over the first
        ``min(max_samples, len(dataset))`` items.

    Raises:
        ValueError: If ``max_samples`` is negative.
    """
    if max_samples is not None and max_samples < 0:
        raise ValueError(f"max_samples must be non-negative, got {max_samples}")

    dataset = LazyEmbeddingDataset(file_path, trainable_embed_model, labels, class_to_idx, max_seq_len)

    # Never hand out indices beyond the end of the dataset: a fixed
    # range(1000000) fails with an index error during iteration as soon as the
    # dataset holds fewer items than that.
    available = len(dataset)
    limit = available if max_samples is None else min(max_samples, available)

    # The subset is the first `limit` items in file order (not a random sample).
    leading_items = torch.utils.data.Subset(dataset, range(limit))
    return DataLoader(leading_items, batch_size=batch_size, shuffle=shuffle)

In [18]:
# Settings shared by both loaders come from the parsed YAML config.
batch_size = config['training']['hyperparameters']['batch_size']
max_seq_len = config['datasets']['medal']['max_sequence_length']

# A single embedding model serves the training and validation loaders.
trainable_embed_model = TrainableEmbedding(
    tokenized_corpus=tokenized_train_corpus,
    algorithm="word2vec",  # use fasttext to better handle OOV words
    vector_size=100,
    window=5,
    min_count=0,
)

# Training split: tokens from the train parquet, labels from `train_data`.
trainloader = create_lazy_dataloader(
    file_path='dataset/medal/nltk_tokenized_subset/train.parquet',
    trainable_embed_model=trainable_embed_model,
    labels=train_data['LABEL'],
    class_to_idx=class_to_idx,
    batch_size=batch_size,
    max_seq_len=max_seq_len,
)

# Validation split: tokens from the valid parquet, labels from `val_data`.
valloader = create_lazy_dataloader(
    file_path='dataset/medal/nltk_tokenized_subset/valid.parquet',
    trainable_embed_model=trainable_embed_model,
    labels=val_data['LABEL'],
    class_to_idx=class_to_idx,
    batch_size=batch_size,
    max_seq_len=max_seq_len,
)

In [None]:
# Use the new dataloaders
model_trainer = ModelTrainer()
train_results = model_trainer.train(
    trainloader, 
    valloader, 
    dataset='medal', 
    embedding_dim=100
)


------- lstm_and_self_attention --------
{'lstm_units': 3, 'lstm_hidden_dim': 50, 'dropout': 0.1, 'num_classes': 22555, 'embedding_dim': 100}


Training:   2%|▏         | 320/15625 [05:33<4:41:36,  1.10s/it]

In [None]:
# Plot the results object returned by `model_trainer.train(...)`.
model_trainer.plot_results(train_results)

# MIMIC III Dataset