In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
from collections import Counter
import random
import spacy
import re
import os
import zipfile

# -------------------------------
# Set random seeds for reproducibility
# -------------------------------
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# -------------------------------
# Load spaCy English model
# -------------------------------
# Disable parser and NER to speed up tokenization.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# -------------------------------
# Unzip the dataset files if needed
# -------------------------------
if not os.path.exists("train.csv"):
    with zipfile.ZipFile('train.csv.zip', 'r') as zip_ref:
        zip_ref.extractall('.')
if not os.path.exists("train.csv"):
    with zipfile.ZipFile('archive (3).zip', 'r') as zip_ref:
        zip_ref.extractall('.')

# =======================
# Step 1: Load the Dataset
# =======================
try:
    df = pd.read_csv('train.csv')
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print("Error: 'train.csv' not found. Please ensure the dataset is in the working directory.")
    exit()

# -------------------------------
# Reduce the dataset size for development
# -------------------------------
max_samples = 900000  # reduce to 300K samples
if len(df) > max_samples:
    df = df.sample(n=max_samples, random_state=42).reset_index(drop=True)
    print(f"Dataset reduced to {len(df)} samples.")

# ================================
# Step 2: Preprocess and Clean Data
# ================================
# Use 'TEXT', 'LABEL', and 'LOCATION' columns.
df = df.dropna(subset=['TEXT', 'LABEL', 'LOCATION'])
df['LOCATION'] = df['LOCATION'].astype(int)

# --- Reduce Label Space by Filtering Rare Labels ---
min_label_freq = 5  # drop labels occurring fewer than 5 times
label_counts = df['LABEL'].value_counts()
valid_labels = label_counts[label_counts >= min_label_freq].index
df = df[df['LABEL'].isin(valid_labels)].reset_index(drop=True)
print(f"After filtering, {len(df)} samples remain with {df['LABEL'].nunique()} unique labels.")

# --- Advanced Preprocessing Functions ---
def extract_context(text, location, window_size=50):
    """
    Extract a window of tokens around the abbreviation.
    Assumes LOCATION is based on simple whitespace tokenization.
    """
    tokens = text.split()
    start = max(0, location - window_size // 2)
    end = min(len(tokens), location + window_size // 2)
    return " ".join(tokens[start:end])

# Create a new column 'CONTEXT' using the LOCATION info.
df['CONTEXT'] = df.apply(lambda row: extract_context(row['TEXT'], row['LOCATION'], window_size=50), axis=1)

# Use the context column for training.
texts = df['CONTEXT'].tolist()
labels = df['LABEL'].tolist()

# ===============================
# Step 3: Encode Labels
# ===============================
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)
num_classes = len(label_encoder.classes_)
print(f"Number of classes after filtering: {num_classes}")

# ============================================
# Step 4: Tokenization and Vocabulary Construction (with Batch Processing)
# ============================================
def batch_advanced_tokenize(texts, batch_size=1000):
    """
    Use spaCy's nlp.pipe to tokenize texts in batches.
    """
    tokenized_texts = []
    for doc in nlp.pipe(texts, batch_size=batch_size):
        tokens = [token.text for token in doc if not token.is_punct and not token.is_space]
        tokenized_texts.append(tokens)
    return tokenized_texts

# Tokenize all texts in batches.
tokenized_texts = batch_advanced_tokenize(texts, batch_size=1000)
print("Tokenization complete.")

# Build vocabulary from tokenized texts.
all_tokens = [token for tokens in tokenized_texts for token in tokens]
vocab_counter = Counter(all_tokens)
min_word_freq = 2  # ignore words that appear less than 2 times
vocab = {token for token, count in vocab_counter.items() if count >= min_word_freq}

# Reserve indices: 0 for padding, 1 for unknown tokens.
word_to_index = {"<PAD>": 0, "<UNK>": 1}
for word in sorted(vocab):
    word_to_index[word] = len(word_to_index)
vocab_size = len(word_to_index)
print(f"Vocabulary size: {vocab_size}")

# Convert texts to sequences of indices.
def text_to_sequence(tokens):
    return [word_to_index.get(token, word_to_index["<UNK>"]) for token in tokens]

sequences = [text_to_sequence(tokens) for tokens in tokenized_texts]

# ===========================
# Step 5: Pad Sequences
# ===========================
max_len = 256  # Fixed maximum length for input sequences.
def pad_sequence_fn(seq, max_len):
    if len(seq) < max_len:
        return seq + [0] * (max_len - len(seq))
    else:
        return seq[:max_len]

padded_sequences = [pad_sequence_fn(seq, max_len) for seq in sequences]
X = np.array(padded_sequences)
y = np.array(labels_encoded)

# ==================================
# Step 6: Train/Validation Data Split
# ==================================
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Training samples: {len(X_train)}, Validation samples: {len(X_val)}")

# ============================================
# Step 7: Create PyTorch Dataset and DataLoader
# ============================================
class MedicalAbbrDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return torch.tensor(self.X[idx], dtype=torch.long), torch.tensor(self.y[idx], dtype=torch.long)

batch_size = 64
train_dataset = MedicalAbbrDataset(X_train, y_train)
val_dataset = MedicalAbbrDataset(X_val, y_val)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# ============================================
# Step 8: Load Pre-trained GloVe Embeddings and Build Embedding Matrix
# ============================================
def load_glove_embeddings(filepath, embedding_dim):
    embeddings_index = {}
    with open(filepath, encoding="utf8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            if vector.shape[0] == embedding_dim:
                embeddings_index[word] = vector
    return embeddings_index

embedding_dim = 100 
glove_path = "glove.6B.100d.txt"  # Update the path if needed

if not os.path.exists(glove_path):
    raise FileNotFoundError(f"{glove_path} not found. Please download it and place it in the working directory.")

glove_embeddings = load_glove_embeddings(glove_path, embedding_dim)
print(f"Loaded {len(glove_embeddings)} word vectors from GloVe.")

# Build the embedding matrix.
embedding_matrix = np.zeros((vocab_size, embedding_dim), dtype=np.float32)
for word, idx in word_to_index.items():
    if word in glove_embeddings:
        embedding_matrix[idx] = glove_embeddings[word]
    else:
        embedding_matrix[idx] = np.random.normal(scale=0.6, size=(embedding_dim,))

# ============================================
# Step 9: Define the LSTM-only Model Using Pre-trained GloVe Embeddings
# ============================================
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, output_dim, dropout=0.3,
                 pretrained_embeddings=None, freeze_embeddings=False):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        if pretrained_embeddings is not None:
            self.embedding.weight.data.copy_(torch.tensor(pretrained_embeddings))
            self.embedding.weight.requires_grad = not freeze_embeddings
        # Bidirectional LSTM
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, 
                            batch_first=True, dropout=dropout, bidirectional=True)
        # Concatenate final forward and backward hidden states
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        embedded = self.embedding(x)  # [batch_size, seq_len, embedding_dim]
        lstm_out, (h_n, _) = self.lstm(embedded)
        # h_n shape: (num_layers*2, batch_size, hidden_dim)
        forward_h = h_n[-2, :, :]
        backward_h = h_n[-1, :, :]
        hidden = torch.cat((forward_h, backward_h), dim=1)
        hidden = self.dropout(hidden)
        logits = self.fc(hidden)
        return logits

hidden_dim = 128
num_layers = 2
output_dim = num_classes
dropout = 0.3

model = LSTMClassifier(vocab_size, embedding_dim, hidden_dim, num_layers, output_dim, dropout,
                       pretrained_embeddings=embedding_matrix, freeze_embeddings=False)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(model)

# ============================================
# Step 10: Define Loss, Optimizer, and Training Loop
# ============================================
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

def train_epoch(model, loader, criterion, optimizer, device):
    model.train()
    epoch_loss, epoch_correct = 0, 0
    for inputs, labels in loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item() * inputs.size(0)
        epoch_correct += (outputs.argmax(dim=1) == labels).sum().item()
    return epoch_loss / len(loader.dataset), epoch_correct / len(loader.dataset)

def evaluate_epoch(model, loader, criterion, device):
    model.eval()
    epoch_loss, epoch_correct = 0, 0
    with torch.no_grad():
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            epoch_loss += loss.item() * inputs.size(0)
            epoch_correct += (outputs.argmax(dim=1) == labels).sum().item()
    return epoch_loss / len(loader.dataset), epoch_correct / len(loader.dataset)

num_epochs = 5
for epoch in range(num_epochs):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
    val_loss, val_acc = evaluate_epoch(model, val_loader, criterion, device)
    print(f"Epoch {epoch+1}/{num_epochs}: Train Loss={train_loss:.4f}, Train Acc={train_acc*100:.2f}% | Val Loss={val_loss:.4f}, Val Acc={val_acc*100:.2f}%")

# ============================================
# Step 11: Inference Function
# ============================================
def predict_abbreviation(model, text, location, word_to_index, max_len, device, label_encoder):
    # Extract context using the same procedure as during training.
    context = extract_context(text, location, window_size=50)
    # Use advanced tokenization (one text at a time is acceptable for inference)
    tokens = [token.text for token in nlp(context) if not token.is_punct and not token.is_space]
    seq = [word_to_index.get(token, word_to_index["<UNK>"]) for token in tokens]
    seq = pad_sequence_fn(seq, max_len)
    input_tensor = torch.tensor(seq, dtype=torch.long).unsqueeze(0).to(device)
    model.eval()
    with torch.no_grad():
        logits = model(input_tensor)
    pred_class = logits.argmax(dim=1).item()
    pred_label = label_encoder.inverse_transform([pred_class])[0]
    return pred_label

# ============================================
# Step 12: Example Inference
# ============================================
sample_text = "The patient was diagnosed with acute MI and was admitted to the ICU for further monitoring."
# Assume the abbreviation appears near token index 6 (adjust as needed)
sample_location = 6
predicted_expansion = predict_abbreviation(model, sample_text, sample_location, word_to_index, max_len, device, label_encoder)
print(f"\nFor the text: '{sample_text}' with abbreviation at position {sample_location}, predicted expansion is: {predicted_expansion}")

Dataset loaded successfully.
Dataset reduced to 900000 samples.
After filtering, 892980 samples remain with 19920 unique labels.
Number of classes after filtering: 19920
Tokenization complete.
Vocabulary size: 232241
Training samples: 714384, Validation samples: 178596
Loaded 400000 word vectors from GloVe.
LSTMClassifier(
  (embedding): Embedding(232241, 100, padding_idx=0)
  (lstm): LSTM(100, 128, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
  (fc): Linear(in_features=256, out_features=19920, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)
Epoch 1/5: Train Loss=8.7647, Train Acc=0.64% | Val Loss=6.2100, Val Acc=10.02%
Epoch 2/5: Train Loss=3.5599, Train Acc=35.48% | Val Loss=1.7365, Val Acc=54.71%
Epoch 3/5: Train Loss=1.7297, Train Acc=56.28% | Val Loss=1.2766, Val Acc=62.60%
Epoch 4/5: Train Loss=1.3238, Train Acc=63.62% | Val Loss=1.0877, Val Acc=66.96%
Epoch 5/5: Train Loss=1.1046, Train Acc=68.64% | Val Loss=1.0313, Val Acc=69.30%

For the text: 'The 

In [None]:
!pip install spacy && python -m spacy download en_core_web_sm

In [None]:
!pip install gensim

In [1]:
!pip install --upgrade typing_extensions

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
