In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier

import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader
import matplotlib.pyplot as plt


In [4]:
#diagnosis file
df_diag = pd.read_csv("eicu-collaborative-research-database-2.0/diagnosis.csv.gz", compression="gzip")
df_diag.head()


Unnamed: 0,diagnosisid,patientunitstayid,activeupondischarge,diagnosisoffset,diagnosisstring,icd9code,diagnosispriority
0,4222318,141168,False,72,cardiovascular|chest pain / ASHD|coronary arte...,"414.00, I25.10",Other
1,3370568,141168,True,118,cardiovascular|ventricular disorders|cardiomyo...,,Other
2,4160941,141168,False,72,pulmonary|disorders of the airways|COPD,"491.20, J44.9",Other
3,4103261,141168,True,118,pulmonary|disorders of the airways|COPD,"491.20, J44.9",Other
4,3545241,141168,True,118,cardiovascular|ventricular disorders|congestiv...,"428.0, I50.9",Other


In [5]:
#hospital file
# df_hosp = pd.read_csv("annabel_data/hospital.csv.gz", compression="gzip")
# print("Hospital columns:", df_hosp.columns.tolist())
# df_hosp.head()


In [6]:
# Cleaning and preprocessing data
# Encode diagnosis priority string labels into numbers


# Drop rows with missing diagnosis info
df_diag = df_diag.dropna(subset=["diagnosisstring", "diagnosispriority"])

# One LabelEncoder reused
le = LabelEncoder()

# Encode diagnosisstring as target
df_diag["label"] = le.fit_transform(df_diag["diagnosisstring"])

# Overwrite le with new encoding for feature
df_diag["diagnosispriority_encoded"] = le.fit_transform(df_diag["diagnosispriority"])

# Filter to top 10 most common diagnosis labels
top_labels = df_diag["label"].value_counts().nlargest(10).index
df_filtered = df_diag[df_diag["label"].isin(top_labels)]

# Final feature + label
X = df_filtered[["diagnosispriority_encoded"]]
y = df_filtered["label"]



In [7]:
#train/test split:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


#for pytorch...
X_tensor_train = torch.tensor(X_train_scaled, dtype=torch.float32).unsqueeze(1)  # shape: [batch, seq, feature]
y_tensor_train = torch.tensor(y_train.values, dtype=torch.long)
X_tensor_test = torch.tensor(X_test_scaled, dtype=torch.float32).unsqueeze(1)
y_tensor_test = torch.tensor(y_test.values, dtype=torch.long)


In [8]:
print("Unique labels:", len(np.unique(y)))
print("Top labels:\n", y.value_counts().head(10))

Unique labels: 10
Top labels:
 label
2975    97836
3145    65313
1101    44491
2145    41034
2892    39729
965     37467
865     37328
767     33766
3018    33515
778     32509
Name: count, dtype: int64


In [9]:
lr = LogisticRegression(max_iter=500)
lr.fit(X_train_scaled, y_train)
preds_lr = lr.predict(X_test_scaled)
print("Logistic Regression Accuracy:", accuracy_score(y_test, preds_lr))


Logistic Regression Accuracy: 0.20942136979200415


In [10]:
# en = SGDClassifier(loss="log", penalty="elasticnet", l1_ratio=0.5)
en = SGDClassifier(loss="log_loss", penalty="elasticnet", l1_ratio=0.5)
en.fit(X_train_scaled, y_train)
preds_en = en.predict(X_test_scaled)
print("Elastic Net Accuracy:", accuracy_score(y_test, preds_en))


Elastic Net Accuracy: 0.20942136979200415


In [11]:
# Filter top 10 labels and reset them to 0–9
top_labels = df_diag["label"].value_counts().nlargest(10).index
df_filtered = df_diag[df_diag["label"].isin(top_labels)].copy()

# Remap labels to 0–9
label_map = {old: new for new, old in enumerate(top_labels)}
df_filtered["label"] = df_filtered["label"].map(label_map)

# Train/test split
X = df_filtered[["diagnosispriority_encoded"]]
y = df_filtered["label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to tensors
X_tensor = torch.tensor(X_train_scaled, dtype=torch.float32).unsqueeze(1)
y_tensor = torch.tensor(y_train.values, dtype=torch.long)

# Rebuild LSTM
num_classes = len(np.unique(y_train))

class LSTMModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.lstm = nn.LSTM(input_size=1, hidden_size=16, batch_first=True)
        self.fc = nn.Linear(16, num_classes)

    def forward(self, x):
        _, (hn, _) = self.lstm(x)
        return self.fc(hn[-1])

model = LSTMModel()
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Train
for epoch in range(10):
    optimizer.zero_grad()
    output = model(X_tensor)
    loss = loss_fn(output, y_tensor)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")




Epoch 1, Loss: 2.3220
Epoch 2, Loss: 2.3144
Epoch 3, Loss: 2.3071
Epoch 4, Loss: 2.3000
Epoch 5, Loss: 2.2930
Epoch 6, Loss: 2.2862
Epoch 7, Loss: 2.2794
Epoch 8, Loss: 2.2727
Epoch 9, Loss: 2.2663
Epoch 10, Loss: 2.2600


In [12]:
from lightgbm import LGBMClassifier
print("LightGBM installed and working ")


lgbm = LGBMClassifier()
lgbm.fit(X_train, y_train)
preds_lgbm = lgbm.predict(X_test)
print("LightGBM Accuracy:", accuracy_score(y_test, preds_lgbm))


LightGBM installed and working 
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001422 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3
[LightGBM] [Info] Number of data points in the train set: 370390, number of used features: 1
[LightGBM] [Info] Start training from score -1.552172
[LightGBM] [Info] Start training from score -1.958497
[LightGBM] [Info] Start training from score -2.346574
[LightGBM] [Info] Start training from score -2.426356
[LightGBM] [Info] Start training from score -2.452452
[LightGBM] [Info] Start training from score -2.513192
[LightGBM] [Info] Start training from score -2.523005
[LightGBM] [Info] Start training from score -2.614027
[LightGBM] [Info] Start training from score -2.626677
[LightGBM] [Info] Start training from score -2.656114
LightGBM Accuracy: 0.21337393896196463
