In [None]:
!pip install numpy opencv-python

In [3]:
import pandas as pd

csv_path = '/kaggle/input/report-dataset/full_report_data.csv'
full_data = pd.read_csv(csv_path)
print(full_data.head())

  subject_id   study_id                                        examination  \
0  p18726783  s54132939  Evaluation of the patient after ICD placement ...   
1  p18726783  s54838151                                                NaN   
2  p18726783  s55647339    Evaluation of the patient with nausea, dysuria.   
3  p18726783  s55239487                                                NaN   
4  p18726783  s53461826    Evaluation of the patient with congestive heart   

                                          indication  \
0                                                NaN   
1  NSTEMI, residual shortness of breath, changes ...   
2                                                NaN   
3                                                NaN   
4                                                NaN   

                            technique                           comparison  \
0                                 NaN                                  NaN   
1                                 NaN 

In [None]:
# # Replace all NaN values with 0
# mimic_data.fillna(0, inplace=True)


In [11]:
mimic_data

Unnamed: 0,subject_id,study_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices
0,10000032,50414267,,,,,,,,,1.0,,,,,
1,10000032,53189527,,,,,,,,,1.0,,,,,
2,10000032,53911762,,,,,,,,,1.0,,,,,
3,10000032,56699142,,,,,,,,,1.0,,,,,
4,10000764,57375967,,,1.0,,,,,,,,,-1.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227822,19999442,58708861,,,,,,,,,1.0,,,,,1.0
227823,19999733,57132437,,,,,,,,,1.0,,,,,
227824,19999987,55368167,1.0,-1.0,,,,,0.0,,,0.0,,,0.0,
227825,19999987,58621812,1.0,,,,,,,,,,,,,1.0


In [6]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, roc_curve
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class TextDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = embeddings
        self.labels = labels

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        return self.embeddings[idx], self.labels[idx]

class MultilabelFocalLoss(nn.Module):
    def __init__(self, gamma=2, weights=None, epsilon=1e-7):
        super().__init__()
        self.gamma = gamma
        self.weights = weights
        self.epsilon = epsilon

    def forward(self, inputs, targets):
        inputs = torch.clamp(inputs, self.epsilon, 1 - self.epsilon)
        loss = -targets * torch.log(inputs) - (1 - targets) * torch.log(1 - inputs)
        pt = torch.exp(-loss)
        focal_loss = (1-pt)**self.gamma * loss
        if self.weights is not None:
            focal_loss = focal_loss * self.weights.unsqueeze(0)
        return focal_loss.mean()

class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=2):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True, dropout=0.3)
        self.attention = nn.Sequential(
            nn.Linear(hidden_dim * 2, 1),
            nn.Softmax(dim=1)
        )
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        x = x.unsqueeze(1)
        lstm_out, _ = self.lstm(x)
        attention_weights = self.attention(lstm_out)
        context = torch.sum(attention_weights * lstm_out, dim=1)
        return torch.sigmoid(self.fc(context))

def process_in_batches(X, tokenizer, model, batch_size=32):
    embeddings = []
    model.eval()
    for i in tqdm(range(0, len(X), batch_size), desc="Generating embeddings"):
        batch = X[i:i + batch_size].tolist()
        tokens = tokenizer(batch, padding=True, truncation=True, max_length=512, return_tensors="pt")
        tokens = {k: v.to(device) for k, v in tokens.items()}
        with torch.no_grad():
            outputs = model(**tokens)
            batch_embeddings = outputs.last_hidden_state.mean(dim=1)
            embeddings.append(batch_embeddings.cpu().numpy())
    return np.vstack(embeddings)

def calculate_class_weights(y):
    weights = []
    for col in y.columns:
        counts = y[col].value_counts()
        neg = len(y[y[col] <= 0])
        pos = len(y[y[col] > 0])
        weights.append(neg/pos if pos > 0 else 1.0)
    return torch.FloatTensor(weights).to(device)

def train_and_evaluate_lstm(X_train, X_test, y_train, y_test, input_dim, hidden_dim, output_dim,
                          tokenizer, bert_model, conditions, batch_size=32, epochs=15, learning_rate=0.001):
    print("Processing embeddings...")
    X_train_emb = process_in_batches(X_train, tokenizer, bert_model, batch_size)
    X_test_emb = process_in_batches(X_test, tokenizer, bert_model, batch_size)

    train_dataset = TextDataset(torch.tensor(X_train_emb, dtype=torch.float32),
                              torch.tensor(y_train.values, dtype=torch.float32))
    test_dataset = TextDataset(torch.tensor(X_test_emb, dtype=torch.float32),
                             torch.tensor(y_test.values, dtype=torch.float32))

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    model = LSTMClassifier(input_dim=input_dim, hidden_dim=hidden_dim,
                          output_dim=output_dim).to(device)

    class_weights = calculate_class_weights(y_train)
    criterion = MultilabelFocalLoss(gamma=4, weights=class_weights)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=3)

    best_val_f1 = 0.0
    best_model_state = None
    train_losses = []

    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        for batch_emb, batch_labels in train_loader:
            batch_emb, batch_labels = batch_emb.to(device), batch_labels.to(device)
            optimizer.zero_grad()
            outputs = model(batch_emb)
            loss = criterion(outputs, batch_labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            epoch_loss += loss.item()

        train_losses.append(epoch_loss)

        # Validation phase
        model.eval()
        val_preds, val_true = [], []
        with torch.no_grad():
            for batch_emb, batch_labels in test_loader:
                batch_emb, batch_labels = batch_emb.to(device), batch_labels.to(device)
                outputs = model(batch_emb)
                val_preds.extend(outputs.cpu().numpy())
                val_true.extend(batch_labels.cpu().numpy())

        val_preds = np.array(val_preds)
        val_true = np.array(val_true)
        val_preds_binary = (val_preds > 0.5).astype(int)

        # Calculate overall metrics
        overall_accuracy = accuracy_score(val_true, val_preds_binary)
        overall_precision, overall_recall, overall_f1, _ = precision_recall_fscore_support(
            val_true, val_preds_binary, average='macro', zero_division=0)

        print(f"\nEpoch {epoch + 1}/{epochs}")
        print(f"Loss: {epoch_loss:.4f}")
        print("\nOverall Metrics:")
        print(f"  Accuracy: {overall_accuracy:.4f}")
        print(f"  Precision: {overall_precision:.4f}")
        print(f"  Recall: {overall_recall:.4f}")
        print(f"  F1: {overall_f1:.4f}")

        # Per-condition metrics
        print("\nPer-condition Metrics:")
        for i, condition in enumerate(conditions):
            accuracy = accuracy_score(val_true[:, i], val_preds_binary[:, i])
            precision, recall, f1, _ = precision_recall_fscore_support(
                val_true[:, i], val_preds_binary[:, i], average='binary', zero_division=0)

            if len(np.unique(val_true[:, i])) > 1:
                roc_auc = roc_auc_score(val_true[:, i], val_preds[:, i])
            else:
                roc_auc = 0

            print(f"{condition}:")
            print(f"  Accuracy: {accuracy:.4f}")
            print(f"  Precision: {precision:.4f}")
            print(f"  Recall: {recall:.4f}")
            print(f"  F1: {f1:.4f}")
            print(f"  ROC AUC: {roc_auc:.4f}")

        if overall_f1 > best_val_f1:
            best_val_f1 = overall_f1
            best_model_state = model.state_dict()

        scheduler.step(overall_f1)

    # Plot loss curve
    plt.figure(figsize=(10, 5))
    plt.plot(train_losses)
    plt.title('Training Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.savefig('loss_curve.png')
    plt.close()

    # Final evaluation with best model
    model.load_state_dict(best_model_state)
    model.eval()
    final_preds, final_true = [], []

    with torch.no_grad():
        for batch_emb, batch_labels in test_loader:
            batch_emb = batch_emb.to(device)
            outputs = model(batch_emb)
            final_preds.extend(outputs.cpu().numpy())
            final_true.extend(batch_labels.cpu().numpy())

    final_preds = np.array(final_preds)
    final_true = np.array(final_true)
    final_preds_binary = (final_preds > 0.5).astype(int)

    # Save ROC curves
    plt.figure(figsize=(10, 8))
    for i, condition in enumerate(conditions):
        if len(np.unique(final_true[:, i])) > 1:
            fpr, tpr, _ = roc_curve(final_true[:, i], final_preds[:, i])
            roc_auc = roc_auc_score(final_true[:, i], final_preds[:, i])
            plt.plot(fpr, tpr, label=f'{condition} (AUC = {roc_auc:.2f})')

    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves')
    plt.legend()
    plt.savefig('roc_curves.png')
    plt.close()

    return model, best_val_f1

def main():
    print("Loading data...")
    csv_path_full = '/kaggle/input/report-dataset/full_report_data.csv'
    csv_path_mimic = '/kaggle/input/dlproject/mimic-cxr-2.0.0-chexpert.csv'

    full_data = pd.read_csv(csv_path_full)
    conditions = ['Atelectasis', 'Pneumonia', 'Edema']
    mimic_data = pd.read_csv(csv_path_mimic, usecols=['subject_id'] + conditions)

    full_data['subject_id'] = full_data['subject_id'].str.lstrip('p').astype(int)
    mimic_data['subject_id'] = mimic_data['subject_id'].astype(int)

    valid_indices = ~mimic_data[conditions].isin([-1]).any(axis=1)
    mimic_data = mimic_data[valid_indices]

    required_columns = ['examination', 'indication', 'findings', 'impression']
    filtered_data = full_data.dropna(subset=required_columns, how='any')
    filtered_data = filtered_data[filtered_data[required_columns].ne('').all(axis=1)]

    merged_data = pd.merge(filtered_data, mimic_data, on='subject_id', how='inner')
    merged_data = merged_data[merged_data['subject_id'].astype(str).str.match(r'10|11')]

    merged_data['combined_text'] = merged_data[required_columns].agg(' '.join, axis=1)

    print(f"Final dataset size: {len(merged_data)}")
    print("Label distribution:")
    for condition in conditions:
        print(f"{condition}:\n{merged_data[condition].value_counts()}\n")

    X = merged_data['combined_text']
    y = merged_data[conditions].fillna(0)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext")
    bert_model = AutoModel.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext").to(device)

    input_dim = 768
    hidden_dim = 256
    output_dim = len(conditions)
    batch_size = 32

    model, best_f1 = train_and_evaluate_lstm(X_train, X_test, y_train, y_test, input_dim, hidden_dim, output_dim,
                                           tokenizer, bert_model, conditions, batch_size=batch_size, epochs=25)

    print(f"\nBest F1 Score: {best_f1:.4f}")
    torch.save(model.state_dict(), 'best_model.pt')

if __name__ == "__main__":
    main()

Loading data...
Final dataset size: 78460
Label distribution:
Atelectasis:
Atelectasis
1.0    17897
0.0      574
Name: count, dtype: int64

Pneumonia:
Pneumonia
0.0    7170
1.0    5969
Name: count, dtype: int64

Edema:
Edema
1.0    13073
0.0     9297
Name: count, dtype: int64

Processing embeddings...


Generating embeddings:   0%|          | 0/1962 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/491 [00:00<?, ?it/s]


Epoch 1/25
Loss: 329.1147

Overall Metrics:
  Accuracy: 0.6029
  Precision: 0.0000
  Recall: 0.0000
  F1: 0.0000

Per-condition Metrics:
Atelectasis:
  Accuracy: 0.7721
  Precision: 0.0000
  Recall: 0.0000
  F1: 0.0000
  ROC AUC: 0.6457
Pneumonia:
  Accuracy: 0.9213
  Precision: 0.0000
  Recall: 0.0000
  F1: 0.0000
  ROC AUC: 0.6286
Edema:
  Accuracy: 0.8318
  Precision: 0.0000
  Recall: 0.0000
  F1: 0.0000
  ROC AUC: 0.7152

Epoch 2/25
Loss: 322.1419

Overall Metrics:
  Accuracy: 0.6036
  Precision: 0.1770
  Recall: 0.0076
  F1: 0.0145

Per-condition Metrics:
Atelectasis:
  Accuracy: 0.7722
  Precision: 0.0000
  Recall: 0.0000
  F1: 0.0000
  ROC AUC: 0.6649
Pneumonia:
  Accuracy: 0.9213
  Precision: 0.0000
  Recall: 0.0000
  F1: 0.0000
  ROC AUC: 0.6325
Edema:
  Accuracy: 0.8322
  Precision: 0.5310
  Recall: 0.0227
  F1: 0.0436
  ROC AUC: 0.7176

Epoch 3/25
Loss: 319.9713

Overall Metrics:
  Accuracy: 0.6027
  Precision: 0.1616
  Recall: 0.0075
  F1: 0.0143

Per-condition Metrics:
At