# **Preprocessing**

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
%matplotlib inline

In [None]:
# Read the two raw inputs: the extracted lab-test table and the patient/ICD mapping table.
lab_tests_df, conversations_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/extracted_lab_tests_sorted.csv",
        "/content/patient_icd_mapping.csv",
    )
)

In [None]:
# Give the mapping table the same key name ("Patient_ID") that the merge joins on.
conversations_df = conversations_df.rename(columns={"Patient ID": "Patient_ID"})

In [None]:
# Inner-join lab results with the ICD mapping on the patient key; every missing
# cell in the joined frame is replaced by the sentinel -1.
df = lab_tests_df.merge(conversations_df, on="Patient_ID", how="inner").fillna(-1)

# Give each distinct ICD code an integer label, numbered in order of first appearance.
unique_codes = df["ICD Code"].unique()
icd_mapping = dict(zip(unique_codes, range(len(unique_codes))))
df["ICD_Label"] = df["ICD Code"].map(icd_mapping)

In [None]:
# Listing lab test values and finding errors if any.
# Columns that are identifiers, labels or free text rather than lab measurements.
# The merged frame spells the code column "ICD Code" (with a space); the underscore
# spelling is kept too so either naming is excluded.
non_lab_columns = {
    "Patient_ID", "ICD Code", "ICD_Code", "Conversation", "ICD_Label",
    "Diagnosis", "Symptoms", "Medications", "Exposure History", "Disease Category",
}
lab_test_columns = [col for col in df.columns if col not in non_lab_columns]
df_inspect = df.copy()  # work on a copy so the inspection never alters df

for col in lab_test_columns:
    # Pull the first (optionally signed) number out of each cell; cells without
    # any number become NaN and are reported below.
    first_number = df_inspect[col].astype(str).str.extract(r'([-+]?(?:\d*\.\d+|\d+))', expand=False)
    df_inspect[col] = pd.to_numeric(first_number, errors='coerce')
error_rows = df_inspect[df_inspect[lab_test_columns].isna().any(axis=1)]
print("Rows with potential non-numeric lab test values or units:")
print(error_rows[["Patient_ID", "ICD Code"] + lab_test_columns].head(10))

In [None]:
# Lab-test columns = everything except identifiers, labels and free-text columns.
# The merged frame names the code column "ICD Code" (with a space), so that spelling
# must be excluded; "ICD_Code" is kept in the list in case an input uses it.
lab_test_columns = [col for col in df.columns if col not in ["Patient_ID", "ICD Code", "ICD_Code", "Conversation", "ICD_Label", "Diagnosis", "Symptoms", "Medications", "Exposure History", "Disease Category"]]
print("Data types in lab test columns:")
print(df[lab_test_columns].dtypes)

def get_cell_types(x):
    """Return the Python type object of a single cell value."""
    cell_type = type(x)
    return cell_type

# Element-wise type lookup. Done column by column with Series.map because
# DataFrame.applymap is deprecated in recent pandas releases.
cell_types = df[lab_test_columns].apply(lambda column: column.map(get_cell_types))
print("\nExample of cell types in lab test columns (first 10 rows):")
# print(cell_types.head(10))

# Additionally, check for non-numeric values by trying to convert each cell to float.
def safe_float(x):
    """Convert ``x`` to float; if the conversion raises, return the marker string ``"Error: <x>"``."""
    try:
        result = float(x)
    except Exception:
        result = f"Error: {x}"
    return result

# Try the float conversion on every cell. Done column by column with Series.map
# because DataFrame.applymap is deprecated in recent pandas releases.
converted = df[lab_test_columns].apply(lambda column: column.map(safe_float))
print("\nConverted values (first 10 rows):")
# print(converted.head(10))

In [None]:
import re
import numpy as np
import pandas as pd

# Removing units from lab tests: the unit strings stripped from a cell before
# its number is extracted.
units_to_remove = {
    '%',
    '/L',
    'U/L',
    'Z',
    'g/dL',
    'mg/dL',
    'mmHg',
    'ng/mL',
    'µU/mL',
}

def clean_lab_value(value, units):
    """
    Convert one raw lab-test cell to a float.

    Args:
        value: The raw cell: a number, a string such as ``"120 mg/dL"``, or a
            missing value (``None`` / NaN).
        units: Iterable of unit strings to strip from text cells before parsing.

    Returns:
        The first numeric value found, as a float (negative numbers and
        scientific notation such as ``"2.5e-3"`` are supported), or NaN when the
        cell is missing or contains no number.
    """
    if pd.isna(value):
        return np.nan

    # Cells that are already numeric are returned as-is. Round-tripping them
    # through str() would turn e.g. 1e-05 into "1e-05" and parse it as 1.0.
    if isinstance(value, (int, float, np.integer, np.floating)) and not isinstance(value, bool):
        return float(value)

    value_str = str(value)
    # Strip longer units first so that 'U/L' is removed before its substring '/L',
    # independent of the (arbitrary) iteration order of a set.
    for unit in sorted(units, key=len, reverse=True):
        value_str = value_str.replace(unit, "")
    value_str = value_str.strip()

    # First number in the text: optional sign, integer or decimal, optional exponent.
    match = re.search(r'-?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?', value_str)
    return float(match.group()) if match else np.nan

# Testing: run the cleaner over a few representative raw cells and show the results.
sample_values = ["120 mg/dL", "7.2%", "15 U/L", "-85 mmHg", "-12.5 ng/mL", "Not Detected", None]
cleaned_values = list(map(lambda raw: clean_lab_value(raw, units_to_remove), sample_values))

print(cleaned_values)

In [None]:
# Identify lab test columns (excluding identifier, label and free-text columns).
# The merged frame spells the code column "ICD Code" (with a space). It must be
# excluded here, otherwise the ICD codes themselves are run through
# clean_lab_value and replaced by whatever number they happen to contain.
ignore_columns = ["Patient_ID", "ICD Code", "ICD_Code", "Conversation", "ICD_Label", "Diagnosis",
                  "Symptoms", "Medications", "Exposure History", "Disease Category"]
lab_test_columns = [col for col in df.columns if col not in ignore_columns]
for col in lab_test_columns:
    df[col] = df[col].apply(lambda x: clean_lab_value(x, units_to_remove))

# Rows where at least one lab cell could not be parsed to a number.
error_rows = df[df[lab_test_columns].isna().any(axis=1)]
print("Rows with NaN after cleaning lab test values:", len(error_rows))
print(error_rows.head(10))

In [None]:
# Saving cleaned dataset for modelling, plus the rows that still contain NaN.
for frame, out_name in ((df, "cleaned_lab_data.csv"), (error_rows, "error_rows.csv")):
    frame.to_csv(out_name, index=False)

print("Cleaned data saved as 'cleaned_lab_data.csv'")
print("Rows with NaN values saved as 'error_rows.csv'")

# ***Modelling***

In [None]:
!pip install pytorch-lightning transformers torch

In [None]:
# End-to-end Dual Encoder Fusion Model with PyTorch Lightning
import torch
from torch.utils.data import DataLoader, Dataset, random_split
import torch.nn as nn
import pytorch_lightning as pl
from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder
from sklearn.metrics import precision_score, recall_score
import pandas as pd
import numpy as np

In [None]:
class HealthcareDataset(Dataset):
    """Tabular dataset yielding lab / conversation feature tensors and a label.

    Continuous feature groups are standardised with a scaler; categorical groups
    are integer-encoded with one ``OrdinalEncoder`` per column (categories unseen
    at fit time are encoded as -1).

    Args:
        df: DataFrame containing every referenced column.
        lab_cont_features, conv_cont_features: Names of continuous columns
            (an empty list disables the group).
        lab_cat_features, conv_cat_features: Names of categorical columns
            (an empty list disables the group).
        label_feature: Name of the integer label column.
        lab_cat_encoders, conv_cat_encoders: Optional ``{column: fitted encoder}``
            dicts to reuse; columns missing from the dict get a newly fitted
            encoder. The dict passed in is not modified.
        lab_scaler, conv_scaler: Optional scalers to reuse. A scaler that has
            already been fitted (it exposes ``n_features_in_``) is applied with
            ``transform`` only, so statistics learned on training data are not
            overwritten; an unfitted or omitted scaler is fitted on ``df``.

    Attributes:
        lab_scaler, conv_scaler, lab_cat_encoders, conv_cat_encoders: The
            scalers / encoders actually used, available for reuse on other data.
    """

    def __init__(self, df, lab_cont_features, lab_cat_features, conv_cont_features, conv_cat_features, label_feature,
                 lab_cat_encoders=None, conv_cat_encoders=None, lab_scaler=None, conv_scaler=None):

        # Lab continuous / categorical features
        self.lab_cont_features, self.lab_scaler = self._scale_continuous(df, lab_cont_features, lab_scaler)
        self.lab_cat_features, self.lab_cat_encoders = self._encode_categorical(df, lab_cat_features, lab_cat_encoders)

        # Conversation continuous / categorical features
        self.conv_cont_features, self.conv_scaler = self._scale_continuous(df, conv_cont_features, conv_scaler)
        self.conv_cat_features, self.conv_cat_encoders = self._encode_categorical(df, conv_cat_features, conv_cat_encoders)

        # Labels
        self.labels = df[label_feature].values.astype('int64')

    @staticmethod
    def _scale_continuous(df, cont_features, scaler):
        """Return ``(scaled float32 values or None, scaler)`` for one continuous feature group."""
        # sklearn estimators set n_features_in_ when they are fitted.
        already_fitted = scaler is not None and hasattr(scaler, "n_features_in_")
        if scaler is None:
            scaler = StandardScaler()
        if not cont_features:
            return None, scaler

        values = df[cont_features].values.astype('float32')
        if already_fitted:
            # Reuse the supplied statistics instead of re-fitting on this data.
            return scaler.transform(values), scaler
        return scaler.fit_transform(values), scaler

    def _encode_categorical(self, df, cat_features, encoders):
        """Return ``(int64 code matrix, {column: encoder})``, or ``(None, None)`` when the group is empty."""
        if not cat_features:
            return None, None

        cat_features_df = df[cat_features].astype(str)  # Convert to string type to prevent errors
        encoders = dict(encoders) if encoders else {}  # copy: never mutate the caller's dict

        # Use OrdinalEncoder for handling unknown categories
        for col in cat_features:
            if col not in encoders:
                oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
                cat_features_df[col] = oe.fit_transform(cat_features_df[[col]])
                encoders[col] = oe
            else:
                cat_features_df[col] = encoders[col].transform(cat_features_df[[col]])

        return cat_features_df.values.astype('int64'), encoders

    def __len__(self):
        """Number of samples (rows of the label column)."""
        return len(self.labels)

    @staticmethod
    def _row_tensor(array, idx):
        """Tensor for row ``idx`` of ``array``, or an empty tensor when the group is disabled."""
        return torch.tensor(array[idx]) if array is not None else torch.tensor([])

    def __getitem__(self, idx):
        """Return one sample as a dict with keys lab_cont, lab_cat, conv_cont, conv_cat and label."""
        return {
            'lab_cont': self._row_tensor(self.lab_cont_features, idx),
            'lab_cat': self._row_tensor(self.lab_cat_features, idx),
            'conv_cont': self._row_tensor(self.conv_cont_features, idx),
            'conv_cat': self._row_tensor(self.conv_cat_features, idx),
            'label': torch.tensor(self.labels[idx])
        }


# **Adding a Residual Block to capture more relations by building a deeper net**

In [None]:
import torch
import torch.nn as nn

class ResidualBlock(nn.Module):
    """Two-layer fully connected block with a skip connection.

    Computes ``relu(fc2(dropout(relu(fc1(x)))) + shortcut(x))``. The shortcut is
    a linear projection when ``in_features != out_features`` and the identity
    otherwise.
    """

    def __init__(self, in_features, out_features, dropout=0.2):
        super().__init__()
        self.fc1 = nn.Linear(in_features, out_features)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(out_features, out_features)
        self.dropout = nn.Dropout(dropout)
        # The skip path only needs a projection when the widths differ.
        same_width = in_features == out_features
        self.shortcut = nn.Identity() if same_width else nn.Linear(in_features, out_features)

    def forward(self, x):
        """Apply the block to ``x`` and return the activated sum of main and skip paths."""
        main_path = self.fc2(self.dropout(self.relu(self.fc1(x))))
        return self.relu(main_path + self.shortcut(x))


In [None]:
import torch.nn.functional as F
import pytorch_lightning as pl
from sklearn.metrics import precision_score, recall_score

class DualEncoderModel(pl.LightningModule):
    """Fusion classifier over lab and conversation features.

    Each continuous feature group is encoded by two ``ResidualBlock`` layers
    (output width 64); each categorical column has its own embedding table. All
    resulting vectors are concatenated and passed to a two-layer classifier.

    Args:
        lab_cont_dim: Number of continuous lab features (0 disables the encoder).
        lab_cat_dims: One size per categorical lab column; each embedding table
            gets ``size + 1`` rows.
        conv_cont_dim: Number of continuous conversation features (0 disables).
        conv_cat_dims: One size per categorical conversation column.
        embedding_dim: Width of every categorical embedding.
        num_classes: Number of output classes.
        lr: Learning rate for Adam.

    Note:
        Categorical indices are clamped to ``>= 0`` in ``forward``, so a negative
        index shares embedding row 0 with category 0.
    """

    def __init__(self, lab_cont_dim, lab_cat_dims, conv_cont_dim, conv_cat_dims, embedding_dim, num_classes, lr=1e-3):
        super().__init__()
        self.save_hyperparameters()

        # Lab continuous encoder with residual blocks:
        if lab_cont_dim > 0:
            self.lab_cont_encoder = nn.Sequential(
                ResidualBlock(lab_cont_dim, 64, dropout=0.2),
                ResidualBlock(64, 64, dropout=0.2)
            )
        else:
            self.lab_cont_encoder = None

        # Lab categorical embeddings
        self.lab_cat_embeddings = nn.ModuleList([nn.Embedding(dim + 1, embedding_dim) for dim in lab_cat_dims])

        # Conversation continuous encoder with residual blocks:
        if conv_cont_dim > 0:
            self.conv_cont_encoder = nn.Sequential(
                ResidualBlock(conv_cont_dim, 64, dropout=0.2),
                ResidualBlock(64, 64, dropout=0.2)
            )
        else:
            self.conv_cont_encoder = None

        # Conversation categorical embeddings
        self.conv_cat_embeddings = nn.ModuleList([nn.Embedding(dim + 1, embedding_dim) for dim in conv_cat_dims])

        # Fusion and classification: the classifier input width is the sum of
        # the widths of every enabled branch.
        total_dim = 0
        if self.lab_cont_encoder is not None: total_dim += 64
        if lab_cat_dims: total_dim += embedding_dim * len(lab_cat_dims)
        if self.conv_cont_encoder is not None: total_dim += 64
        if conv_cat_dims: total_dim += embedding_dim * len(conv_cat_dims)

        self.classifier = nn.Sequential(
            nn.Linear(total_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes)
        )

        self.loss_fn = nn.CrossEntropyLoss()
        self.test_step_outputs = []

    def forward(self, lab_cont, lab_cat, conv_cont, conv_cat):
        """Return class logits for one batch.

        A branch is skipped when it is disabled in the model or when the
        corresponding input tensor has no elements.
        """
        embeddings = []

        # Process lab features. Encoders are compared against None explicitly
        # rather than relying on the truthiness (length) of nn.Sequential.
        if self.lab_cont_encoder is not None and lab_cont.nelement() > 0:
            embeddings.append(self.lab_cont_encoder(lab_cont))
        if len(self.lab_cat_embeddings) > 0 and lab_cat.nelement() > 0:
            embeddings.extend([emb(torch.clamp(lab_cat[:, i], min=0)) for i, emb in enumerate(self.lab_cat_embeddings)])

        # Process conversation features (if available)
        if self.conv_cont_encoder is not None and conv_cont.nelement() > 0:
            embeddings.append(self.conv_cont_encoder(conv_cont))
        if len(self.conv_cat_embeddings) > 0 and conv_cat.nelement() > 0:
            embeddings.extend([emb(torch.clamp(conv_cat[:, i], min=0)) for i, emb in enumerate(self.conv_cat_embeddings)])

        fused_emb = torch.cat(embeddings, dim=1)
        return self.classifier(fused_emb)

    def _logits_for_batch(self, batch):
        """Run ``forward`` on a batch dict produced by the dataset."""
        return self(
            lab_cont=batch['lab_cont'],
            lab_cat=batch['lab_cat'],
            conv_cont=batch['conv_cont'],
            conv_cat=batch['conv_cat']
        )

    def training_step(self, batch, batch_idx):
        """Compute and log the cross-entropy loss for one training batch."""
        loss = self.loss_fn(self._logits_for_batch(batch), batch['label'])
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log loss, accuracy and macro precision/recall for one validation batch."""
        logits = self._logits_for_batch(batch)
        loss = self.loss_fn(logits, batch['label'])
        preds = torch.argmax(logits, dim=1)
        acc = (preds == batch['label']).float().mean()
        # Precision / recall are computed per batch here, not over the whole epoch.
        precision = precision_score(batch['label'].cpu(), preds.cpu(), average='macro', zero_division=0)
        recall = recall_score(batch['label'].cpu(), preds.cpu(), average='macro', zero_division=0)
        self.log_dict({'val_loss': loss, 'val_acc': acc, 'val_precision': precision, 'val_recall': recall}, prog_bar=True)

    #Adding scheduler to see if it works better
    def configure_optimizers(self):
        """Adam plus a ReduceLROnPlateau scheduler that halves the LR when ``val_loss`` stalls."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.lr)
        # The `verbose` argument is deprecated in recent PyTorch releases, so it
        # is not passed.
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode='min',
            factor=0.5,
            patience=2
        )
        return {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "val_loss"}


## **Using Augmented Dataset**

In [None]:
# Load dataset
df = pd.read_csv("/content/augmented_lab_data.csv")

# Feature groups consumed by HealthcareDataset / DualEncoderModel.
lab_cont_features = ['ALT (SGPT)',
 'AST (SGOT)',
 'Bilirubin',
 'Albumin',
 'Platelet Count',
 'Total Cholesterol',
 'BP Systolic',
 'BP Diastolic',
 'Troponin',
 'Ejection Fraction',
 'HbA1c',
 'Fasting Glucose',
 'Postprandial Glucose',
 'Triglycerides',
 'Insulin Level',
 'WBC Count',
 'Fever',
 'Hematocrit']
lab_cat_features = []
conv_cont_features = []
conv_cat_features = ['Symptoms', 'Medications', 'Exposure History']
label_feature = 'ICD_Label'

# Keep only the modelling columns, in the order: lab features, conversation
# features, label. The copy avoids chained-assignment warnings below.
df = df[lab_cont_features + conv_cat_features + [label_feature]].copy()

# Convert lab continuous features (e.g., '272 mg/dL') to float.
# Numeric columns are cast directly. Text columns have their first number
# extracted, keeping the sign and any exponent, so the -1 "missing" sentinel
# is not turned into +1.
numeric_pattern = r'([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)'
for col in lab_cont_features:
    if pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].astype(float)
    else:
        df[col] = pd.to_numeric(df[col].astype(str).str.extract(numeric_pattern, expand=False), errors='coerce')

# Cells without any parsable number get the -1 sentinel used for missing lab
# values elsewhere in this notebook.
df[lab_cont_features] = df[lab_cont_features].fillna(-1)


In [None]:
#Initialize dataset
from torch.utils.data import DataLoader, random_split
dataset = HealthcareDataset(df, lab_cont_features, lab_cat_features, conv_cont_features, conv_cat_features, label_feature)

# Train-Validation-Test Split: 80% train, the remainder halved into validation
# and test. A seeded generator keeps the membership of each split identical
# across runs, so reported metrics are reproducible.
split_generator = torch.Generator().manual_seed(42)
train_size = int(0.8 * len(dataset))
holdout_size = len(dataset) - train_size
val_size = int(0.5 * holdout_size)
test_size = holdout_size - val_size
train_set, val_set, test_set = random_split(
    dataset, [train_size, val_size, test_size], generator=split_generator
)

#DataLoaders
train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
val_loader = DataLoader(val_set, batch_size=32)
test_loader = DataLoader(test_set, batch_size=32)

# Number of unique ICD codes (for classification)
num_classes = len(df[label_feature].unique())

# Get category dimensions for embeddings: number of fitted categories plus one
# extra slot per column (Handle unknown categories).
cat_dims_lab = [(len(dataset.lab_cat_encoders[col].categories_[0]) + 1) for col in lab_cat_features] if lab_cat_features else []
cat_dims_conv = [(len(dataset.conv_cat_encoders[col].categories_[0]) + 1) for col in conv_cat_features] if conv_cat_features else []

Increased Epochs

In [None]:
import pytorch_lightning as pl

# Build the model from the feature-group sizes computed above.
model_kwargs = dict(
    lab_cont_dim=len(lab_cont_features),
    lab_cat_dims=cat_dims_lab,
    # An empty / missing conversation-continuous group means width 0.
    conv_cont_dim=len(conv_cont_features) if conv_cont_features else 0,
    conv_cat_dims=cat_dims_conv,
    embedding_dim=16,
    num_classes=num_classes,
)
model = DualEncoderModel(**model_kwargs)

# Fit for 15 epochs on whatever accelerator Lightning selects.
trainer = pl.Trainer(max_epochs=15, accelerator='auto')
trainer.fit(model, train_loader, val_loader)

In [None]:
import torch
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
import numpy as np

def _as_model_input(tensor, dtype, batch_size, device):
    """Cast a batch tensor for the model; a tensor without elements becomes shape (batch_size, 0)."""
    if tensor.nelement() == 0:
        tensor = torch.empty((batch_size, 0))
    return tensor.to(device=device, dtype=dtype)


def evaluate_model(model, dataloader, class_names):
    """Print a classification report and plot per-class precision / recall / F1.

    Args:
        model: Model called as ``model(lab_cont, lab_cat, conv_cont, conv_cat)``.
        dataloader: Yields dict batches with keys lab_cont, lab_cat, conv_cont,
            conv_cat and label.
        class_names: Display name for each class index ``0..len(class_names)-1``.

    The report always contains one row per entry of ``class_names``, even when
    some classes do not occur in this split; their scores are reported as 0.
    """
    model.eval()
    # Run on whatever device the model's parameters currently live on.
    device = next(model.parameters()).device
    label_ids = list(range(len(class_names)))
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            lab_cont = batch['lab_cont'].float().to(device)
            batch_size = lab_cont.shape[0]
            lab_cat = _as_model_input(batch['lab_cat'], torch.long, batch_size, device)
            conv_cont = _as_model_input(batch['conv_cont'], torch.float, batch_size, device)
            conv_cat = _as_model_input(batch['conv_cat'], torch.long, batch_size, device)
            labels = batch['label'].long()

            logits = model(lab_cont, lab_cat, conv_cont, conv_cat)
            preds = torch.argmax(logits, dim=1)

            all_preds.append(preds.cpu())
            all_labels.append(labels.cpu())

    y_pred = torch.cat(all_preds).numpy()
    y_true = torch.cat(all_labels).numpy()

    # Classification report. Passing `labels` pins the rows to class_names, so a
    # split that lacks some classes no longer raises a length-mismatch error.
    report_kwargs = dict(labels=label_ids, target_names=class_names, zero_division=0)
    report_dict = classification_report(y_true, y_pred, output_dict=True, **report_kwargs)
    print("Classification Report:")
    print(classification_report(y_true, y_pred, **report_kwargs))

    # Precision / Recall / F1-score chart only: look the per-class rows up by
    # name instead of slicing off the trailing summary rows.
    labels_clean = list(class_names)
    precision = [report_dict[label]['precision'] for label in labels_clean]
    recall = [report_dict[label]['recall'] for label in labels_clean]
    f1 = [report_dict[label]['f1-score'] for label in labels_clean]

    x = np.arange(len(labels_clean))
    width = 0.25

    fig, ax = plt.subplots(figsize=(14, 6))
    ax.bar(x - width, precision, width, label='Precision', color='tab:blue')
    ax.bar(x, recall, width, label='Recall', color='tab:orange')
    ax.bar(x + width, f1, width, label='F1-score', color='tab:green')

    ax.set_ylabel('Score')
    ax.set_title('Per-Class Precision, Recall, and F1-score')
    ax.set_xticks(x)
    ax.set_xticklabels(labels_clean, rotation=45, ha='right')
    ax.set_ylim([0, 1.1])
    ax.legend()
    ax.grid(True, linestyle='--', alpha=0.5)
    plt.tight_layout()
    plt.show()
# Placeholder display names, one per class index. Replace with real labels if available.
class_names = [f"ICD-{class_idx}" for class_idx in range(num_classes)]
evaluate_model(model=model, dataloader=val_loader, class_names=class_names)

In [None]:
# Persist the trained weights (the state_dict only) to disk.
import torch

weights_file = "dual_encoder_model.pth"
torch.save(model.state_dict(), weights_file)
print("Model saved successfully.")

In [None]:
# Rebuild the architecture with the same dimensions used for training, then
# load the saved weights on CPU and switch to inference mode.
model = DualEncoderModel(
    len(lab_cont_features),   # lab_cont_dim
    cat_dims_lab,             # lab_cat_dims
    len(conv_cont_features),  # conv_cont_dim
    cat_dims_conv,            # conv_cat_dims
    embedding_dim=16,
    num_classes=num_classes,
)

saved_weights = torch.load("/content/dual_encoder_model.pth", map_location=torch.device("cpu"))
model.load_state_dict(saved_weights)
model.eval()

In [None]:
# List every saved parameter together with its tensor shape.
state_dict = torch.load("/content/dual_encoder_model.pth", map_location="cpu")
for key in state_dict:
    value = state_dict[key]
    print(f"{key}: {value.shape}")


In [None]:
import torch

# Names of all lab continuous features, in the order the tensor columns follow.
lab_cont_features_list = [
    'ALT (SGPT)', 'AST (SGOT)', 'Bilirubin',
    'Albumin', 'Platelet Count', 'Total Cholesterol',
    'BP Systolic', 'BP Diastolic', 'Troponin',
    'Ejection Fraction', 'HbA1c', 'Fasting Glucose',
    'Postprandial Glucose', 'Triglycerides', 'Insulin Level',
    'WBC Count', 'Fever', 'Hematocrit',
]

# Example lab data (replace with actual values)
lab_data = {
    'ALT (SGPT)': 35, 'AST (SGOT)': 42, 'Bilirubin': 1.2,
    'Albumin': 4.3, 'Platelet Count': 250, 'Total Cholesterol': 180,
    'BP Systolic': 120, 'BP Diastolic': 80, 'Troponin': 0.01,
    'Ejection Fraction': 55, 'HbA1c': 5.4, 'Fasting Glucose': 90,
    'Postprandial Glucose': 130, 'Triglycerides': 150, 'Insulin Level': 17.3,
    'WBC Count': -1, 'Fever': -1, 'Hematocrit': -1,
}

# Build a single-row float tensor whose columns follow lab_cont_features_list.
ordered_values = [lab_data[name] for name in lab_cont_features_list]
lab_cont_tensor = torch.tensor([ordered_values], dtype=torch.float32)

# Print tensor for verification
print("Lab Continuous Tensor Shape:", lab_cont_tensor.shape)
print("Lab Continuous Tensor:", lab_cont_tensor)


In [None]:
import torch

# Lab continuous feature names; the tensor columns below follow this order.
lab_cont_features_list = [
    'ALT (SGPT)', 'AST (SGOT)',
    'Bilirubin', 'Albumin',
    'Platelet Count', 'Total Cholesterol',
    'BP Systolic', 'BP Diastolic',
    'Troponin', 'Ejection Fraction',
    'HbA1c', 'Fasting Glucose',
    'Postprandial Glucose', 'Triglycerides',
    'Insulin Level', 'WBC Count',
    'Fever', 'Hematocrit',
]

# Example lab data (replace with actual values)
lab_data = {
    'ALT (SGPT)': 35, 'AST (SGOT)': 42,
    'Bilirubin': 1.2, 'Albumin': 4.3,
    'Platelet Count': 250, 'Total Cholesterol': 180,
    'BP Systolic': 120, 'BP Diastolic': 80,
    'Troponin': 0.01, 'Ejection Fraction': 55,
    'HbA1c': 5.4, 'Fasting Glucose': 90,
    'Postprandial Glucose': 130, 'Triglycerides': 150,
    'Insulin Level': 17.3, 'WBC Count': -1,
    'Fever': -1, 'Hematocrit': -1,
}

# Convert lab data to a one-row tensor (one column per listed feature).
single_row = list(map(lab_data.__getitem__, lab_cont_features_list))
lab_cont_tensor = torch.tensor([single_row], dtype=torch.float32)

# Ensure tensor shape matches expected input
print("Lab Continuous Tensor Shape:", lab_cont_tensor.shape)  # Should be [1, 18]

In [None]:

# Zero-width placeholders (one row, no columns) for feature groups with no input.
lab_cat_tensor, conv_cat_tensor = (
    torch.zeros((1, 0), dtype=torch.int64) for _ in range(2)
)  # integer-typed: categorical lab / conversation features
conv_cont_tensor = torch.zeros((1, 0), dtype=torch.float32)  # float-typed: continuous conversation features

In [None]:
# Report the input width expected by the first layer of the lab encoder.
first_module = model.lab_cont_encoder[0]
print("Type of first module in lab_cont_encoder:", type(first_module))
# If the module has an inner fc1 layer, read the width from it; otherwise read
# it from the module itself.
input_layer = first_module.fc1 if hasattr(first_module, 'fc1') else first_module
print("Expected Input Shape:", input_layer.in_features)

In [None]:
# Debug tensor shapes before inference
print("Lab Continuous Shape:", lab_cont_tensor.shape)  # Should be [1, 18]
print("Lab Categorical Shape:", lab_cat_tensor.shape)  # [1, 0]: no categorical lab features
print("Conv Continuous Shape:", conv_cont_tensor.shape)  # [1, 0]: no continuous conversation features
print("Conv Categorical Shape:", conv_cat_tensor.shape)  # [1, 0] until padded below


def _fit_width(tensor, width):
    """Zero-pad or truncate the last dimension of a 2-D tensor to exactly ``width`` columns."""
    current = tensor.shape[1]
    if current < width:
        return torch.nn.functional.pad(tensor, (0, width - current), "constant", 0)
    return tensor[:, :width]


# Read the widths the loaded model actually expects from the model itself
# instead of hard-coding them, so they cannot drift from the checkpoint.
if model.conv_cont_encoder is not None:
    expected_conv_cont_features = model.conv_cont_encoder[0].fc1.in_features
else:
    expected_conv_cont_features = 0  # branch disabled in this model
expected_conv_cat_features = len(model.conv_cat_embeddings)

# Missing conversation inputs are zero-padded (category index 0 / value 0.0).
conv_cont_tensor = _fit_width(conv_cont_tensor, expected_conv_cont_features)
conv_cat_tensor = _fit_width(conv_cat_tensor, expected_conv_cat_features)

# Print final shapes after correction
print("Corrected Conv Continuous Shape:", conv_cont_tensor.shape)
print("Corrected Conv Categorical Shape:", conv_cat_tensor.shape)

# The model was trained on features standardised by the dataset's scaler, so
# the raw lab values must go through the same scaler before inference.
lab_cont_scaled = torch.tensor(
    dataset.lab_scaler.transform(lab_cont_tensor.numpy()), dtype=torch.float32
)

# Perform inference with corrected tensors
with torch.no_grad():
    logits = model(lab_cont_scaled, lab_cat_tensor, conv_cont_tensor, conv_cat_tensor)
    print("Output Shape:", logits.shape)
    predicted_class = torch.argmax(logits, dim=1).item()

print(f"Predicted ICD Code: {predicted_class}")

In [None]:
# print(model)

# **Inference - Model Usage By uploading a Single PDF**

In [None]:
!pip install langchain_community tiktoken langchain-openai langchainhub chromadb langchain pdfplumber pypdf torch fitz faiss-cpu

In [None]:
import pdfplumber

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``, joined by newlines.

    Pages for which ``extract_text()`` returns nothing (``None`` or an empty
    string) are skipped.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Extract each page exactly once; text extraction is the expensive step.
        page_texts = (page.extract_text() for page in pdf.pages)
        text = "\n".join(page_text for page_text in page_texts if page_text)
    return text

# Read the uploaded lab report into one text string.
pdf_path = "/content/Diabetes_Lab_Report.pdf"
pdf_text = extract_text_from_pdf(pdf_path)

# Show only the beginning of the text as a quick sanity check.
preview_chars = 1000
print(pdf_text[:preview_chars])


In [None]:
import os
from getpass import getpass

# LangSmith tracing configuration.
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'

# API keys: keep any key already present in the environment and prompt only
# for missing ones. Assigning an empty string here would overwrite a configured
# key, and pasting a real key into the notebook would leak it.
for _key_name in ("LANGCHAIN_API_KEY", "OPENAI_API_KEY"):
    if not os.environ.get(_key_name):
        os.environ[_key_name] = getpass(f"Enter {_key_name}: ")

In [None]:
import pdfplumber

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``, joined by newlines.

    Pages for which ``extract_text()`` returns nothing (``None`` or an empty
    string) are skipped.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Extract each page exactly once; text extraction is the expensive step.
        page_texts = (page.extract_text() for page in pdf.pages)
        text = "\n".join(page_text for page_text in page_texts if page_text)
    return text

# Load the text of the uploaded report.
pdf_path = "/content/Diabetes_Lab_Report.pdf"
pdf_text = extract_text_from_pdf(pdf_path)

# Debug aid: print the first 1000 characters of the extracted text.
text_preview = pdf_text[:1000]
print(text_preview)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
import re

# Step 1: Chunking for Better Retrieval
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
documents = text_splitter.create_documents([pdf_text])

# Step 2: Create Vector Store using FAISS
embedding_model = OpenAIEmbeddings(model="text-embedding-3-large")
vectorstore = FAISS.from_documents(documents, embedding=embedding_model)

# Step 3: Define Retriever
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

# Step 4: QA Chain Setup
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, chain_type="refine")

# Step 5: Query for Lab Test Extraction
query_lab_tests = """
Extract only lab test names and values from this medical report.
Return the data as a JSON object where keys are test names and values are their respective results with units.
For example:
{
    "Triglycerides": "335.00 mg/dL",
    "Blood Glucose": "100.00 mg/dL",
    "BMI": "95 kg/m²"
}
Do not add any explanations, just return a valid JSON object.
"""

# Step 6: Extract Lab Tests
lab_tests_response = qa.run(query_lab_tests)

# Step 7: Clean and Print Extracted Tests
# Drop stray symbols but keep everything that can be part of a test name, value
# or JSON structure. Parentheses and '%' must survive, otherwise names such as
# "ALT (SGPT)" no longer match the model's feature names and units like "%" vanish.
lab_tests_response = re.sub(r"[^\w\s:{}\[\]().,%\"/+-]", "", lab_tests_response)

print("Extracted Lab Test Values:\n", lab_tests_response)

In [None]:
def extract_lab_tests_dict(lab_tests_response):
    """Parse ``name: value`` pairs out of an LLM response into ``{name: float}``.

    Accepts plain or bulleted lines (``ALT: 35 U/L``), several pairs on one line
    separated by commas, and JSON-style quoted pairs
    (``"Triglycerides": "335.00 mg/dL"``). Units after the number are ignored.
    Signed values are supported. A test name never spans a line break. If a
    name occurs more than once, the last value wins.

    Args:
        lab_tests_response: Text returned by the extraction query.

    Returns:
        Dict mapping the stripped test name to its numeric value; empty when
        nothing matches.
    """
    import re  # local import keeps this cell runnable on its own

    pattern = (
        r'[-•*]?[ \t]*"?'                      # optional bullet and opening quote
        r'([\w /()%.\-]+?)'                    # test name (no newlines)
        r'"?[ \t]*:[ \t]*"?'                   # closing quote, colon, opening quote
        r'([-+]?(?:\d+(?:\.\d*)?|\.\d+))'      # signed integer or decimal value
    )

    lab_dict = {}
    for test, value in re.findall(pattern, lab_tests_response):
        cleaned_test = test.strip()
        if not cleaned_test:
            continue
        try:
            lab_dict[cleaned_test] = float(value)
        except ValueError:
            continue  # skip if conversion fails
    return lab_dict

In [None]:
# Architecture sizes recovered by reading the tensor shapes in the saved state_dict.
lab_cont_dim = 18  # from shape: [64, 18]
lab_cat_dims = []  # Not used based on missing weights for lab_cat embeddings
conv_cont_dim = 0  # conv_cont_encoder is not in state_dict → was not used during training
conv_cat_dims = [49, 17, 17]  # from conv_cat_embeddings shapes: [50, 16], [18, 16], [18, 16]
embedding_dim = 16  # from conv_cat_embeddings weights
num_classes = 18  # from classifier.3.bias → shape [18]


# Instantiate the architecture with those sizes, load the checkpoint on CPU and
# switch to inference mode.
model = DualEncoderModel(
    lab_cont_dim=lab_cont_dim,
    lab_cat_dims=lab_cat_dims,
    conv_cont_dim=conv_cont_dim,
    conv_cat_dims=conv_cat_dims,
    embedding_dim=embedding_dim,
    num_classes=num_classes,
)
checkpoint_weights = torch.load("/content/dual_encoder_model.pth", map_location=torch.device("cpu"))
model.load_state_dict(checkpoint_weights)
model.eval()

In [None]:
import torch

# Dummy single-sample batch (batch size 1) for a smoke test of the model.
n_samples = 1
lab_cont = torch.rand((n_samples, 18))  # Random values for lab continuous features
lab_cat = torch.zeros((n_samples, 0), dtype=torch.long)  # Empty tensor as lab_cat_dims = []

conv_cont = torch.zeros((n_samples, 0))  # Not used
example_category_indices = [[10, 5, 8]]  # Replace with real category indices
conv_cat = torch.tensor(example_category_indices, dtype=torch.long)

In [None]:
# Forward pass without gradient tracking; the arg-max over logits is the predicted class index.
with torch.no_grad():
    output = model(lab_cont, lab_cat, conv_cont, conv_cat)
    predicted_class = output.argmax(dim=1).item()
    print("Predicted ICD class:", predicted_class)

In [None]:
import pandas as pd

def load_icd_mapping(csv_path):
    """Build a lookup from integer class label to ICD information.

    Args:
        csv_path: Path of a CSV with the columns ``ICD_Label``, ``ICD Code``
            and ``Diagnosis``.

    Returns:
        Dict ``{int label: (ICD code, ICD label, diagnosis)}``. When a label
        occurs on several rows the first row is used; rows without a label are
        ignored.

    Raises:
        ValueError: If any required column is missing (all missing columns are
            named in the message).
    """
    # Load CSV
    df = pd.read_csv(csv_path)

    # Sanity check: ensure required columns are present
    required_columns = ['ICD_Label', 'ICD Code', 'Diagnosis']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise ValueError(f"Missing required column: {', '.join(missing_columns)}")

    # Drop rows without a label (they cannot be looked up), then remove
    # duplicates based on ICD_Label to avoid mapping conflicts.
    df = df.dropna(subset=['ICD_Label']).drop_duplicates(subset='ICD_Label')

    # Create the mapping: label index → (ICD code, ICD label, diagnosis).
    # Keys are plain ints so they match the int returned by argmax(...).item().
    icd_mapping = {
        int(label): (code, label, diagnosis)
        for code, label, diagnosis in zip(df['ICD Code'], df['ICD_Label'], df['Diagnosis'])
    }

    return icd_mapping

# Build the label → ICD lookup from the cleaned dataset.
csv_path = '/content/cleaned_lab_data.csv'
icd_mapping = load_icd_mapping(csv_path)

# Optional: show the first five entries as a sanity check.
sample_entries = list(icd_mapping.items())[:5]
for label, (code, icd_label, diagnosis) in sample_entries:
    print(f"Label: {label}, ICD Code: {code}, Diagnosis: {diagnosis}")


In [None]:
import torch
import torch.nn as nn
import pdfplumber
import re
import pandas as pd
import torch.nn.functional as F
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

# ---- DualEncoderModel Class ----
class DualEncoderModel(pl.LightningModule):
    """Inference-only definition of the fusion classifier (no training hooks).

    Continuous feature groups go through two ``ResidualBlock`` layers (width 64);
    each categorical column has its own embedding table with ``dim + 1`` rows.
    The concatenated vectors feed a two-layer classifier.
    """

    def __init__(self, lab_cont_dim, lab_cat_dims, conv_cont_dim, conv_cat_dims, embedding_dim, num_classes, lr=1e-3):
        super().__init__()
        self.save_hyperparameters()

        # Lab branch: continuous encoder, then one embedding table per categorical column.
        self.lab_cont_encoder = self._residual_encoder(lab_cont_dim)
        self.lab_cat_embeddings = nn.ModuleList(
            [nn.Embedding(dim + 1, embedding_dim) for dim in lab_cat_dims]
        )

        # Conversation branch: same layout as the lab branch.
        self.conv_cont_encoder = self._residual_encoder(conv_cont_dim)
        self.conv_cat_embeddings = nn.ModuleList(
            [nn.Embedding(dim + 1, embedding_dim) for dim in conv_cat_dims]
        )

        # Classifier input width: 64 per enabled encoder plus embedding_dim per
        # categorical column.
        enabled_encoders = sum(
            encoder is not None for encoder in (self.lab_cont_encoder, self.conv_cont_encoder)
        )
        total_dim = 64 * enabled_encoders + embedding_dim * (len(lab_cat_dims) + len(conv_cat_dims))

        self.classifier = nn.Sequential(
            nn.Linear(total_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes)
        )

        self.loss_fn = nn.CrossEntropyLoss()
        self.test_step_outputs = []

    @staticmethod
    def _residual_encoder(input_dim):
        """Two stacked residual blocks mapping ``input_dim`` → 64, or None when ``input_dim`` is not positive."""
        if input_dim > 0:
            return nn.Sequential(
                ResidualBlock(input_dim, 64, dropout=0.2),
                ResidualBlock(64, 64, dropout=0.2)
            )
        return None

    @staticmethod
    def _embed_columns(tables, indices):
        """Look up column ``i`` of ``indices`` in table ``i``; negative indices are clamped to 0."""
        return [table(torch.clamp(indices[:, i], min=0)) for i, table in enumerate(tables)]

    def forward(self, lab_cont, lab_cat, conv_cont, conv_cat):
        """Return class logits; a branch is skipped when disabled or when its input has no elements."""
        pieces = []

        # Lab features
        if self.lab_cont_encoder is not None and lab_cont.nelement() > 0:
            pieces.append(self.lab_cont_encoder(lab_cont))
        if len(self.lab_cat_embeddings) > 0 and lab_cat.nelement() > 0:
            pieces.extend(self._embed_columns(self.lab_cat_embeddings, lab_cat))

        # Conversation features
        if self.conv_cont_encoder is not None and conv_cont.nelement() > 0:
            pieces.append(self.conv_cont_encoder(conv_cont))
        if len(self.conv_cat_embeddings) > 0 and conv_cat.nelement() > 0:
            pieces.extend(self._embed_columns(self.conv_cat_embeddings, conv_cat))

        return self.classifier(torch.cat(pieces, dim=1))





# ---- Lab Features ----
# Column order of the continuous lab features fed to the model.
lab_cont_features_list = [
    'ALT (SGPT)',
    'AST (SGOT)',
    'Bilirubin',
    'Albumin',
    'Platelet Count',
    'Total Cholesterol',
    'BP Systolic',
    'BP Diastolic',
    'Troponin',
    'Ejection Fraction',
    'HbA1c',
    'Fasting Glucose',
    'Postprandial Glucose',
    'Triglycerides',
    'Insulin Level',
    'WBC Count',
    'Fever',
    'Hematocrit',
]

# ---- PDF Extraction ----
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, joined with newlines.

    Pages for which ``extract_text()`` returns a falsy value (``None`` or an
    empty string) are left out of the result.

    Args:
        pdf_path: Path handed to ``pdfplumber.open``.

    Returns:
        A single string with the non-empty page texts separated by ``"\\n"``;
        an empty string when no page yields text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Extract each page exactly once: text extraction is the expensive
        # step, so do not repeat it for the filter and for the value.
        page_texts = (page.extract_text() for page in pdf.pages)
        # The generator is consumed here, while the PDF is still open.
        return "\n".join(text for text in page_texts if text)

def extract_lab_tests_dict(response_text):
    """Parse ``Test: Value Unit`` entries out of free text.

    Each entry must sit on a single line. A leading bullet (``-``, ``•``,
    ``*``) or list number (``1.`` / ``1)``) is dropped, so the returned keys
    are the bare test names. The unit is matched only to consume the rest of
    the line; it is not returned.

    Args:
        response_text: Text to scan, e.g. the answer of the QA chain.

    Returns:
        Dict mapping the stripped test name to its value as ``float``.
        Entries whose value cannot be converted (such as ``1.2.3``) are
        skipped; a name that occurs more than once keeps its last value.
    """
    # Only horizontal whitespace is allowed inside an entry. Letting the name
    # or the unit cross a newline made keys absorb the previous line's
    # leftovers (e.g. "%\n- Glucose") and made a unit-less value swallow the
    # whole following line.
    pattern = (
        r"[ \t]*(?:[-•*][ \t]*|\d+[.)][ \t]+)?"  # optional bullet or list number
        r"(\w[\w \t/()%.-]*?)"                    # test name, starts with a word character
        r":[ \t]*([\d.]+)"                        # numeric value
        r"[ \t]*(?:\w+/?.*)?"                     # unit and remainder of the line
    )
    lab_dict = {}
    for test, value in re.findall(pattern, response_text):
        try:
            lab_dict[test.strip()] = float(value)
        except ValueError:
            # Not a valid number (e.g. "1.2.3" or "."); ignore this entry.
            continue
    return lab_dict

def prepare_lab_tensor(lab_data, feature_list):
    """Build a single-row float32 tensor of lab values in ``feature_list`` order.

    Args:
        lab_data: Mapping from test name to numeric value.
        feature_list: Feature names that define the column order.

    Returns:
        ``torch.float32`` tensor of shape ``(1, len(feature_list))``; a feature
        that is absent from ``lab_data`` is filled with ``-1``.
    """
    row = []
    for name in feature_list:
        # -1 marks a test that was not found in the report.
        row.append(lab_data.get(name, -1))
    return torch.tensor([row], dtype=torch.float32)

# ---- Updated ICD Mapping Loader ----
def load_icd_mapping(csv_path):
    """Load the label -> (ICD code, label, diagnosis) lookup from a CSV file.

    Args:
        csv_path: Path of a CSV file with at least the columns ``ICD_Label``,
            ``ICD Code`` and ``Diagnosis``.

    Returns:
        Dict keyed by ``int(ICD_Label)``; each value is the tuple
        ``(ICD Code, ICD_Label, Diagnosis)`` taken from the first row that
        carries that label.

    Raises:
        ValueError: If any of the three required columns is missing.
    """
    required_columns = {'ICD_Label', 'ICD Code', 'Diagnosis'}
    table = pd.read_csv(csv_path)

    # Fail early with a readable message instead of a KeyError further down.
    if required_columns - set(table.columns):
        raise ValueError("Required columns missing from CSV: ICD_Label, ICD Code, Diagnosis")

    mapping = {}
    # One row per label is enough; drop_duplicates keeps the first occurrence.
    for _, record in table.drop_duplicates(subset='ICD_Label').iterrows():
        label = record['ICD_Label']
        mapping[int(label)] = (record['ICD Code'], label, record['Diagnosis'])
    return mapping

# ---- Main Prediction Pipeline ----
def process_pdf_and_predict(pdf_path, model_path, icd_csv_path, top_k=3):
    """Run the PDF -> lab values -> diagnosis pipeline and print the top predictions.

    Steps:
      1. Extract the PDF text and split it into overlapping chunks.
      2. Index the chunks in a FAISS store (OpenAI embeddings) and build a
         "refine" RetrievalQA chain on gpt-3.5-turbo.
      3. Ask the chain for the lab tests and parse its answer into a dict.
      4. Build the model inputs; only the continuous lab features come from
         the report, the categorical conversation inputs are fixed at zero.
      5. Load ``DualEncoderModel`` weights from ``model_path`` on CPU.
      6. Take the ``top_k`` most probable classes.
      7. Print ICD code, label, diagnosis and confidence for each of them.

    Args:
        pdf_path: Path of the lab report PDF.
        model_path: Path of the saved model state dict.
        icd_csv_path: CSV read by ``load_icd_mapping``.
        top_k: Number of predictions to print. Capped at the number of
            classes the model outputs.

    Returns:
        None. The results are printed.
    """
    # Step 1: Extract text & split
    text = extract_text_from_pdf(pdf_path)
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    docs = splitter.create_documents([text])

    # Step 2: Vector store and QA chain
    # NOTE(review): presumably needs OpenAI credentials in the environment — confirm.
    embedding_model = OpenAIEmbeddings(model="text-embedding-3-large")
    vectorstore = FAISS.from_documents(docs, embedding=embedding_model)
    retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
    llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
    qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever, chain_type="refine")

    # Step 3: Lab test extraction
    query = "List lab test names and values only with units (no suggestions). Format: Test: Value Unit"
    lab_tests_response = qa.run(query)
    lab_data = extract_lab_tests_dict(lab_tests_response)

    # Step 4: Prepare input tensors
    lab_cont_tensor = prepare_lab_tensor(lab_data, lab_cont_features_list)
    lab_cat_tensor = torch.zeros((1, 0), dtype=torch.int64)
    conv_cont_tensor = torch.zeros((1, 0), dtype=torch.float32)
    # No conversation is available at inference time, so every categorical
    # conversation feature is fed as index 0.
    conv_cat_tensor = torch.tensor([[0, 0, 0]], dtype=torch.int64)

    # Step 5: Load model
    # NOTE(review): these hyperparameters presumably have to match the ones the
    # checkpoint was trained with, otherwise load_state_dict fails — confirm.
    model = DualEncoderModel(
        lab_cont_dim=len(lab_cont_features_list),
        lab_cat_dims=[],
        conv_cont_dim=0,
        conv_cat_dims=[49, 17, 17],
        embedding_dim=16,
        num_classes=18
    )
    # NOTE(review): torch.load unpickles the file; only load checkpoints from a
    # trusted source.
    model.load_state_dict(torch.load(model_path, map_location=torch.device("cpu")))
    model.eval()

    # Step 6: Predict
    with torch.no_grad():
        logits = model(lab_cont_tensor, lab_cat_tensor, conv_cont_tensor, conv_cat_tensor)
        probs = F.softmax(logits, dim=1)
        # torch.topk raises when k exceeds the number of classes, so cap it.
        k = min(top_k, probs.shape[1])
        top_k_probs = torch.topk(probs, k, dim=1)

    # Step 7: ICD mapping
    icd_mapping = load_icd_mapping(icd_csv_path)
    sum_top_k_probs = torch.sum(top_k_probs.values).item()

    print("\n🔍 **Predicted Diagnoses:**")
    ranked = zip(top_k_probs.indices[0].tolist(), top_k_probs.values[0].tolist())
    for rank, (label_idx, prob) in enumerate(ranked, start=1):
        icd_code, icd_label, diagnosis = icd_mapping.get(label_idx, ("Unknown", "Unknown", "No Description Available"))
        # Share of this class within the displayed top-k probability mass.
        confidence_score = prob / sum_top_k_probs

        # Confidence emoji (based on the raw probability, not the renormalised score)
        if prob > 0.6:
            confidence = "🔵 High"
        elif prob > 0.3:
            confidence = "🟡 Medium"
        else:
            confidence = "🔴 Low"

        print(f"\n{rank}. **{diagnosis}**")
        print(f"   - ICD Code: {icd_code}")
        print(f"   - Disease Label: {icd_label}")
        print(f"   - Probability: {prob * 100:.2f}%")
        print(f"   - Confidence Score: {confidence_score:.2f}")
        print(f"   - Confidence Level: {confidence}")


In [None]:
# Inputs for the end-to-end demo run.
pdf_path = "/content/Diabetes_Lab_Report.pdf"  # lab report to analyse
model_path = "/content/dual_encoder_model.pth"  # state dict loaded into DualEncoderModel
icd_csv_path = "/content/cleaned_lab_data.csv"  # must contain the ICD_Label, ICD Code and Diagnosis columns

# Run the whole pipeline (PDF text -> lab values -> model -> ICD lookup)
# and print the three most probable diagnoses.
process_pdf_and_predict(pdf_path, model_path, icd_csv_path, top_k=3)