# Initial Code Repository and Structure

This notebook shows the early work we did for the project. 
We built features that let us easily switch between different tokenizers 
and embedding methods just by changing function arguments. 
The goal was to make the project more flexible and scalable for NLP tasks.

NOTE: This code doesn't show any final results — it's just our starting point 
to test out different ideas like embedding models and tokenizers.


In [1]:
%load_ext autoreload
%autoreload 2
%run ../setup.py

from src.data.medal import MeDALSubset
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch
import yaml
from src.models.trainer import ModelTrainer
from src.vectorizer.trainable import TrainableEmbedding
from src.vectorizer.glove_embeddings import GloVeEmbedding
from sklearn.decomposition import TruncatedSVD
from tqdm import tqdm
from src.utils import save_embeddings_to_file
import pyarrow.parquet as pq
import numpy as np
from src.vectorizer.bio_bert import BioBERTModel

Environment set up: sys.path updated, working dir set to project root.


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/prashanthjaganathan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/prashanthjaganathan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [2]:
medal_dataset = MeDALSubset('MeDAL')
data, train_data, val_data, test_data = medal_dataset.load_dataset()

MeDAL dataset initialized with name: MeDAL
Dataset downloaded to: /home/jaganathan.p/.cache/kagglehub/datasets/xhlulu/medal-emnlp/versions/4
Dataset moved to: /home/jaganathan.p/pretaining-language-models-for-medical-text/dataset
Total number of classes: 22555


In [8]:
%load_ext autoreload
%autoreload 2

def load_config(path):
    with open(path, 'r') as f:
        return yaml.safe_load(f)

config = load_config('config/config.yaml')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
processed_train = medal_dataset.preprocess(['train', 'valid'])
# NOTE: Pre-processed for 503 minutes

In [None]:
train_df, val_df = processed_train

# Save to CSV
# NOTE: commented out v sensitive code, files contain huge corpus of preprocessed data
# DO NOT OVERWRITE THE FILES
train_df.to_csv("dataset/medal/preprocessed_subset/train.csv", index=False)
val_df.to_csv("dataset/medal/preprocessed_subset/valid.csv", index=False)

print("CSV files saved successfully!")


## Read and load pre-processed dataset

In [None]:
# preprocessed_train = pd.read_csv('dataset/medal/preprocessed_subset/train.csv')
preprocessed_val = pd.read_csv('dataset/medal/preprocessed_subset/valid.csv')
# medal_dataset.train_data = preprocessed_train
medal_dataset.val_data = preprocessed_val
preprocessed_val.head(1)['TEXT']


In [None]:
train_abbr_tokens = medal_dataset.tokenize('nltk', splits=['train'])
df = pd.DataFrame(train_abbr_tokens, columns=['tokenized_text', 'abbreviation'])

# Save as a Parquet file
file_name = "dataset/medal/nltk_tokenized_preprocessed_subset/train.parquet"
df.to_parquet(file_name)
print('Parquet file saved successfully!')


In [None]:
val_abbr_tokens = medal_dataset.tokenize('nltk', splits=['valid'])
df = pd.DataFrame(val_abbr_tokens, columns=['tokenized_text', 'abbreviation'])

# Save as a Parquet file
file_name = "dataset/medal/nltk_tokenized_preprocessed_subset/valid.parquet"
df.to_parquet(file_name)
print('Parquet file saved successfully!')


In [None]:
train_tokens = pd.read_parquet(
    "dataset/medal/nltk_tokenized_preprocessed_subset/train.parquet", 
    engine="pyarrow"
    ).squeeze()

# to make it as a list[list[str]]
tokenized_train_corpus = [(doc.tolist(), abbr) for doc, abbr in tqdm(zip(train_tokens['tokenized_text'], train_tokens['abbreviation']), 'Docs', len(train_tokens))] 
tokenized_train_corpus[:3]
# print(f'Number of documents in train corpus: {len(tokenized_train_corpus)}')
# train_tokens = [doc for doc, _ in tokenized_train_corpus]
# train_abbr = [abbv for _, abbv in tokenized_train_corpus]

In [None]:
val_tokens = pd.read_parquet("dataset/medal/nltk_tokenized_preprocessed_subset/valid.parquet", engine="pyarrow").squeeze()

# to make it as a list[list[str]]
tokenized_val_corpus = [doc.tolist() for doc in tqdm(val_tokens, 'Docs', len(val_tokens))] 
print(f'Number of documents in val corpus: {len(tokenized_val_corpus)}')
val_tokens = [doc for doc, _ in tokenized_val_corpus]
val_abbr = [abbv for _, abbv in tokenized_val_corpus]

### Training Word2Vec model on the entire corpus

In [None]:
embedding_model = TrainableEmbedding(
        tokenized_corpus=tokenized_train_corpus,
        algorithm="word2vec",
        vector_size=100,
        window=5,
        min_count=2
    )
embeddings = embedding_model.embed(tokenized_train_corpus)
print(f'Embedding dimensions: {len(embeddings[0][0])}')

### Training FastText model on the entire corpus

In [None]:
embedding_model = TrainableEmbedding(
        tokenized_corpus=tokenized_train_corpus,
        algorithm="fasttext",
        vector_size=100,
        window=7,
        min_count=2
    )
embeddings = embedding_model.embed(tokenized_train_corpus)
print(f'Embeddings Dimensions: {len(embeddings[0][0])}')

Reducing the embedding to Truncated Singular Value Decomposition (SVD)

In [None]:
embedding_model = TrainableEmbedding(
        tokenized_corpus=train_tokens.tolist(),
        algorithm="tfidf",
        vector_size=100,
        window=5,
        min_count=2
    )
embeddings = embedding_model.embed(train_tokens.tolist())
print(f'Embedding dimensions: {embeddings.shape}')

In [None]:
# NOTE: bio wordvec model is like 12GB and unable to load it in the memeory and build embeddings

train_embeddings = medal_dataset.embed(
    'bio_wordvec',
    splits=['train'],
    tokenized_data = train_tokens,
    model_path = 'trained_models/embeddings/pretrained/bio_wordvec.bin'
)
len(train_embeddings)

Trying not to use bio bert as it involves trasformer models and our architecture is limited to using 
LSTM + Self Attention, therefore, looking for static embedding models only

## Perform Model Training

First, let's create the dataloader with embeddings as features and labels.

In [51]:
class LazyEmbeddingDataset(Dataset):
    def __init__(
            self, 
            file_path, 
            embedding_model,
            class_to_idx, 
            max_seq_len=None,
            return_tokens=True,):
        """
        Args:
            file_path (str): Path to the Parquet file containing the tokenized text.
            embedding_model: The custom embedding model (e.g., GloVeEmbedding).
            labels (list): Labels corresponding to each document.
            class_to_idx (dict): Mapping from class label to integer index.
            max_seq_len (int, optional): Max sequence length for padding/truncating.
        """
        self.file_path = file_path
        self.preprocessed_corpus = pd.read_csv(file_path)
        self.contexts = self.preprocessed_corpus['TEXT']
        self.corresponding_abbreviations = self.preprocessed_corpus['ABBREVIATION']
        self.labels = self.preprocessed_corpus['LABEL']
        self.embedding_model = embedding_model
        self.class_to_idx = class_to_idx
        self.max_seq_len = max_seq_len
        self.return_tokens = return_tokens

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        context = self.contexts[idx]
        abbreviation = self.corresponding_abbreviations[idx]

        # Compute the embeddings for this document on the fly using the embedding model
        embedding, attention_mask = self.embedding_model.embed(context, self.max_seq_len)  # shape: (seq_len, embedding_dim)

        if embedding.ndim == 3 and embedding.shape[0] == 1:
            embedding = embedding.squeeze(0)
        if attention_mask.ndim == 2 and attention_mask.shape[0] == 1:
            attention_mask = attention_mask.squeeze(0)
            
        # Convert label to index
        label = self.labels[idx]
        label_idx = self.class_to_idx[label]

        return (torch.tensor(embedding, dtype=torch.float32),
                torch.tensor(attention_mask, dtype=torch.float32),  # Return the mask
                torch.tensor(label_idx, dtype=torch.long))


def create_lazy_dataloader(file_path, embedding_model, class_to_idx, batch_size, max_seq_len=None):
    dataset = LazyEmbeddingDataset(file_path, embedding_model, class_to_idx, max_seq_len=max_seq_len, return_tokens=True)
    return DataLoader(
        dataset, 
        batch_size=batch_size, 
        shuffle=True
        )

In [52]:
bert_embedding_model = BioBERTModel(**config['embedding_models']['bio_bert'])

max_seq_len = config['datasets']['medal']['max_sequence_length']
batch_size = config['training']['hyperparameters']['batch_size']

trainloader = create_lazy_dataloader(
    'dataset/medal/preprocessed_subset/train.csv', 
    bert_embedding_model, 
    medal_dataset.class_to_idx, 
    batch_size=batch_size,
    max_seq_len=max_seq_len
    )


valloader = create_lazy_dataloader(
    'dataset/medal/preprocessed_subset/valid.csv', 
    bert_embedding_model, 
    medal_dataset.class_to_idx, 
    batch_size=batch_size,
    max_seq_len=max_seq_len
    )

In [None]:
# Use the new dataloaders
model_trainer = ModelTrainer(config_file='config.yaml')
train_results = model_trainer.train(
    trainloader, 
    valloader, 
    dataset='medal', 
    embedding_dim=100,
    embedding_model = bert_embedding_model
)

# Stopped because training was too long

------- lstm_and_self_attention --------
{'lstm_units': 2, 'lstm_hidden_dim': 128, 'num_attention_heads': 4, 'dropout': 0.3, 'num_classes': 22555, 'embedding_dim': 100, 'create_embedding_layer': False, 'embedding_model': BioBERTModel(
  (model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Drop

  return (torch.tensor(embedding, dtype=torch.float32),
  torch.tensor(attention_mask, dtype=torch.float32),  # Return the mask
Training:   2%|▏         | 1018/46875 [1:46:52<78:39:25,  6.17s/it]

In [None]:
model_trainer.plot_results(train_results)