In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount = True)

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
import os
import matplotlib.gridspec as gridspec
import matplotlib.ticker as ticker
sns.set_style('whitegrid')
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from keras.layers import Input
from keras.models import Model
from keras.layers import Dense
from keras.optimizers import Adam
#from generator import DataGenerator
import keras

In [None]:
# Metadata table: one row per scan. Rows whose recorded age is 100 or more are discarded.
data = pd.read_csv('../input/Data_Entry_2017.csv')
age_is_plausible = data['Patient Age'] < 100
data = data[age_is_plausible]
# Lookup from PNG file name to its location under ../input/images*/.
png_pattern = os.path.join('..', 'input', 'images*', '*', '*.png')
data_image_paths = dict((os.path.basename(png), png) for png in glob(png_pattern))
print('Scans found:', len(data_image_paths), ', Total Headers', data.shape[0])
# Attach the file location to each row and store ages as plain Python ints.
data['path'] = data['Image Index'].map(data_image_paths.get)
data['Patient Age'] = data['Patient Age'].map(int)
data.sample(3)

In [None]:
from itertools import chain

# Blank out the 'No Finding' placeholder so scans without pathology carry no label text.
data['Finding Labels'] = data['Finding Labels'].map(lambda x: x.replace('No Finding', ''))
# Every distinct, non-empty label occurring in the '|'-separated findings.
findings_per_scan = data['Finding Labels'].map(lambda x: x.split('|')).tolist()
all_labels = [name for name in np.unique(list(chain(*findings_per_scan))) if len(name)>0]
print('All Labels ({}): {}'.format(len(all_labels), all_labels))
# One indicator column per label: 1.0 when the label text occurs in the findings, else 0.
for c_label in all_labels:
    if len(c_label) <= 1:
        continue  # labels of a single character get no column
    data[c_label] = data['Finding Labels'].map(lambda finding: 1.0 if c_label in finding else 0)
data.sample(3)

In [None]:
# NOTE(review): this cell repeats the label-encoding step of the previous cell.
# Running it again replaces 'No Finding' (already removed), rebuilds all_labels from
# the findings text and overwrites the same indicator columns; it looks like a
# leftover copy and is a candidate for deletion.
data['Finding Labels'] = data['Finding Labels'].map(lambda x: x.replace('No Finding', ''))
from itertools import chain
# Distinct tokens of the '|'-separated findings, with empty strings filtered out below.
all_labels = np.unique(list(chain(*data['Finding Labels'].map(lambda x: x.split('|')).tolist())))
all_labels = [x for x in all_labels if len(x)>0]
print('All Labels ({}): {}'.format(len(all_labels), all_labels))
for c_label in all_labels:
    if len(c_label)>1: # leave out empty labels
        # Substring test: 1.0 when the label text appears in the findings string, else 0.
        data[c_label] = data['Finding Labels'].map(lambda finding: 1.0 if c_label in finding else 0)
data.sample(3)

In [None]:
# Keep only labels with more than MIN_CASES positive scans.
MIN_CASES = 1000
positives_per_label = [(c_label, data[c_label].sum()) for c_label in all_labels]
all_labels = [c_label for c_label, n_positive in positives_per_label if n_positive>MIN_CASES]
clean_counts = [(c_label,int(data[c_label].sum())) for c_label in all_labels]
print('Clean Labels ({})'.format(len(all_labels)), clean_counts)

In [None]:
# creating vector of diseases
# Row by row (axis 1), take the values of the all_labels columns and store them as one
# array per row in the new 'disease_vec' column. The array is wrapped in a one-element
# list and unwrapped again by .map(x[0]) -- presumably so that apply() keeps it as a
# single cell instead of expanding it into columns (TODO confirm).
data['disease_vec'] = data.apply(lambda x: [x[all_labels].values], 1).map(lambda x: x[0])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the scans for testing, stratified on the first four characters
# of the findings string.
test_strata = data['Finding Labels'].map(lambda x: x[:4])
train_df, test_df = train_test_split(data, test_size = 0.20, random_state = 2018, stratify = test_strata)
print('train', train_df.shape[0], 'test', test_df.shape[0])

In [None]:
# Carve 10% of the remaining training rows off as a validation set, using the same
# four-character stratification key.
valid_strata = train_df['Finding Labels'].map(lambda x: x[:4])
train_df, valid_df = train_test_split(train_df, test_size = 0.10, random_state = 2018, stratify = valid_strata)
print('train', train_df.shape[0], 'valid', valid_df.shape[0])

In [None]:
def flow_from_dataframe(img_data_gen, in_df, path_col, y_col, **dflow_args):
    """Create a Keras directory iterator and re-point it at the rows of a DataFrame.

    The iterator is first built by ``img_data_gen.flow_from_directory`` on the folder
    of the first path in ``in_df``; its file list, targets and sample counts are then
    overwritten with values taken from ``in_df``.

    Args:
        img_data_gen: object providing ``flow_from_directory`` (an ImageDataGenerator
            in this notebook).
        in_df: DataFrame holding one row per image.
        path_col: name of the column containing full image paths.
        y_col: name of the column containing the per-image target vector.
        **dflow_args: forwarded unchanged to ``flow_from_directory``.

    Returns:
        The patched iterator.
    """
    # Any existing directory works for the initial construction; use the first image's folder.
    base_dir = os.path.dirname(in_df[path_col].values[0])
    print('## Ignore next message from keras, values are replaced anyways')
    df_gen = img_data_gen.flow_from_directory(base_dir, 
                                     class_mode = 'sparse',
                                    **dflow_args)
    # Replace what the directory scan found with the DataFrame's content.
    df_gen.filenames = in_df[path_col].values
    df_gen.classes = np.stack(in_df[y_col].values)
    df_gen.samples = in_df.shape[0]
    df_gen.n = in_df.shape[0]
    # NOTE(review): private Keras API; presumably rebuilds the index array for the new
    # sample count -- confirm it exists in the installed Keras version.
    df_gen._set_index_array()
    df_gen.directory = '' # since we have the full path
    print('Reinserting dataframe: {} images'.format(in_df.shape[0]))
    return df_gen

In [None]:
from keras.applications.densenet import DenseNet121, preprocess_input


In [None]:
# Obtain the t-SNE plot for the data.
# plot_TSNE and `device` are not defined in the cells above, and `train_loader` only
# appears further down, so an unconditional call raises NameError on a fresh
# top-to-bottom run and stops "Run All". Call the helper only when every name it
# needs already exists; otherwise report what is missing and carry on.
tsne_requirements = ('plot_TSNE', 'train_loader', 'device', 'model')
tsne_missing = [name for name in tsne_requirements if name not in globals()]
if tsne_missing:
    print('Skipping t-SNE plot, not defined yet:', ', '.join(tsne_missing))
else:
    plot_TSNE(train_loader, device, model)

In [None]:
from keras.preprocessing.image import ImageDataGenerator
from keras.applications.densenet import preprocess_input
from PIL import Image
IMG_SIZE = (224, 224) # default ImageNet input resolution for DenseNet121
# The ImageNet weights loaded later were trained on inputs passed through DenseNet's
# preprocess_input; without it the network would receive raw 0-255 pixel values.
core_idg_dense = ImageDataGenerator(preprocessing_function = preprocess_input)

In [None]:
# Settings shared by the training, validation and test generators.
flow_kwargs = dict(path_col = 'path',
                   y_col = 'disease_vec',
                   target_size = IMG_SIZE,
                   color_mode = 'rgb')

train_gen = flow_from_dataframe(core_idg_dense, train_df, batch_size = 16, **flow_kwargs)

# we can use much larger batches for evaluation
valid_gen = flow_from_dataframe(core_idg_dense, valid_df, batch_size = 32, **flow_kwargs)

# Fixed evaluation set: draw one big batch from the test rows and keep it in memory.
test_gen = flow_from_dataframe(core_idg_dense, test_df, batch_size = 8000, **flow_kwargs)
test_X, test_Y = next(test_gen)

In [None]:
# Preview one training batch on a 4x4 grid; each title lists the labels scored above 0.5.
t_x, t_y = next(train_gen)
fig, m_axs = plt.subplots(4, 4, figsize = (16, 16))
for c_ax, c_x, c_y in zip(m_axs.flatten(), t_x, t_y):
    positive_names = [n_class for n_class, n_score in zip(all_labels, c_y) if n_score>0.5]
    c_ax.imshow(c_x[:,:,0])  # first channel only
    c_ax.set_title(', '.join(positive_names))
    c_ax.axis('off')

In [None]:
# DenseNet121 backbone (ImageNet weights, no classification head, global average
# pooling) followed by one sigmoid unit per label.
input_shape = t_x.shape[1:]
img_in = Input(input_shape)
backbone = DenseNet121(include_top = False,
                       weights = 'imagenet',
                       input_tensor = img_in,
                       input_shape = input_shape,
                       pooling = 'avg')

predictions = Dense(len(all_labels), activation="sigmoid", name="predictions")(backbone.output)
model = Model(inputs=img_in, outputs=predictions)

In [None]:
# Binary cross-entropy scores every label as an independent yes/no decision.
optimizer = Adam(lr=0.001)
model.compile(loss="binary_crossentropy",
              optimizer=optimizer,
              metrics=[keras.metrics.binary_accuracy])

In [None]:
# 20 epochs of 100 training batches each, with valid_gen as validation data.
history = model.fit_generator(train_gen,
                              epochs = 20,
                              steps_per_epoch=100,
                              validation_data = valid_gen)

In [None]:
# Model outputs for the fixed test batch; the output layer has one sigmoid unit per entry of all_labels.
y_pred = model.predict(test_X)

In [None]:
# Per label: share of positive test scans versus the mean predicted score, both in percent.
predicted_pct = 100*np.mean(y_pred,0)
actual_pct = 100*np.mean(test_Y,0)
for c_label, p_count, t_count in zip(all_labels, predicted_pct, actual_pct):
    print('%s: actual: %2.2f%%, predicted: %2.2f%%' % (c_label, t_count, p_count))

In [None]:
from sklearn.metrics import roc_curve, auc

# One ROC curve per label on a shared axis, saved to trained_net.png.
fig, c_ax = plt.subplots(1,1, figsize = (9, 9))
for idx, c_label in enumerate(all_labels):
    truth = test_Y[:,idx].astype(int)
    scores = y_pred[:,idx]
    fpr, tpr, thresholds = roc_curve(truth, scores)
    curve_label = '%s (AUC:%0.2f)'  % (c_label, auc(fpr, tpr))
    c_ax.plot(fpr, tpr, label = curve_label)
c_ax.legend()
c_ax.set_xlabel('False Positive Rate')
c_ax.set_ylabel('True Positive Rate')
fig.savefig('trained_net.png')

In [None]:
from sklearn.metrics import roc_auc_score
# Single summary ROC AUC over all label columns, using roc_auc_score's default averaging.
roc_auc_score(test_Y.astype(int), y_pred)

In [None]:
# NOTE(review): second, identical training call on the same `model` object. It
# overwrites `history` from the first run; presumably meant to train for another
# 20 epochs on top of the weights reached so far -- confirm this is intended.
history = model.fit_generator(train_gen, 
                                  steps_per_epoch=100,
                                  validation_data = valid_gen, 
                                  epochs = 20)

In [None]:
# Recompute the test-batch outputs after the additional training call; overwrites the earlier y_pred.
y_pred = model.predict(test_X)

In [None]:
# look at how often the algorithm predicts certain diagnoses 
# Same report as the earlier prevalence cell, now on the refreshed y_pred: column means
# of the labels and of the predicted scores, printed as percentages.
for c_label, p_count, t_count in zip(all_labels, 
                                     100*np.mean(y_pred,0), 
                                     100*np.mean(test_Y,0)):
    print('%s: actual: %2.2f%%, predicted: %2.2f%%' % (c_label, t_count, p_count))

In [None]:
from sklearn.metrics import roc_curve, auc
# Repeat of the ROC evaluation on the refreshed y_pred: one curve per label on a shared axis.
fig, c_ax = plt.subplots(1,1, figsize = (9, 9))
for (idx, c_label) in enumerate(all_labels):
    fpr, tpr, thresholds = roc_curve(test_Y[:,idx].astype(int), y_pred[:,idx])
    c_ax.plot(fpr, tpr, label = '%s (AUC:%0.2f)'  % (c_label, auc(fpr, tpr)))
c_ax.legend()
c_ax.set_xlabel('False Positive Rate')
c_ax.set_ylabel('True Positive Rate')
# NOTE(review): same file name as the earlier ROC cell, so that figure is overwritten.
fig.savefig('trained_net.png')

from sklearn.metrics import roc_auc_score
# Single summary ROC AUC over all label columns (default averaging).
roc_auc_score(test_Y.astype(int), y_pred)

In [5]:
## PyTorch Implementation

In [None]:
import pandas as pd
import os
from glob import glob
import numpy as np
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from torchvision.models import densenet121
import torch.nn as nn
import torch.optim as optim

# Load and filter data
data = pd.read_csv('../input/Data_Entry_2017.csv')
age_ok = data['Patient Age'] < 100  # ages of 100 and above are treated as invalid
data = data[age_ok]
png_pattern = os.path.join('..', 'input', 'images*', '*', '*.png')
data_image_paths = dict((os.path.basename(png), png) for png in glob(png_pattern))
data['path'] = data['Image Index'].map(data_image_paths.get)
data['Patient Age'] = data['Patient Age'].astype(int)
data['Finding Labels'] = data['Finding Labels'].map(lambda x: x.replace('No Finding', ''))

# Process labels: distinct non-empty tokens of the '|'-separated findings, then one
# indicator column per label (substring test against the findings text).
from itertools import chain
findings_per_scan = data['Finding Labels'].map(lambda x: x.split('|')).tolist()
all_labels = [x for x in np.unique(list(chain(*findings_per_scan))) if x]
for label in all_labels:
    if len(label) <= 1:
        continue  # single-character labels get no column
    data[label] = data['Finding Labels'].map(lambda findings: 1.0 if label in findings else 0)

# Filter labels to keep: only those with more than MIN_CASES positive scans
MIN_CASES = 1000
all_labels = [label for label in all_labels if data[label].sum() > MIN_CASES]
data['disease_vec'] = data[all_labels].values.tolist()

# Print information
label_counts = [(label, int(data[label].sum())) for label in all_labels]
print(f"Clean Labels ({len(all_labels)}): {label_counts}")
print('Scans found:', len(data_image_paths), ', Total Headers:', data.shape[0])

In [None]:
class ChestXrayDataset(Dataset):
    """Multi-label image dataset backed by a DataFrame.

    Args:
        dataframe: table with a 'path' column (image file path) and one numeric
            column per entry of ``labels``.
        image_paths: kept on the instance for callers; not read by this class.
        labels: column names that make up the target vector, in output order.
        transform: optional callable applied to the loaded RGB PIL image.

    Each item is ``(image, label)`` where ``label`` is a float32 tensor with one
    value per entry of ``labels``.
    """

    def __init__(self, dataframe, image_paths, labels, transform=None):
        self.dataframe = dataframe
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform
        # Pull paths and targets out of the frame once instead of doing two row
        # lookups per item. float32 matches the model output dtype; BCELoss
        # rejects float64 targets against float32 predictions.
        self._paths = dataframe['path'].tolist()
        self._targets = torch.as_tensor(dataframe[list(labels)].to_numpy(dtype='float32'))

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        # convert() returns a fully loaded copy, so the file can be closed right away.
        with Image.open(self._paths[idx]) as raw_image:
            image = raw_image.convert('RGB')
        label = self._targets[idx]

        if self.transform:
            image = self.transform(image)

        return image, label

# Define transforms: resize to 224x224, convert to a tensor, normalise with the given
# per-channel mean and standard deviation.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# Split data: 20% test, then 10% of the remainder for validation, both stratified on
# the first four characters of the findings string.
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(data, test_size=0.20, random_state=2018, stratify=data['Finding Labels'].str[:4])
train_df, valid_df = train_test_split(train_df, test_size=0.10, random_state=2018, stratify=train_df['Finding Labels'].str[:4])

# Create DataLoaders
train_dataset = ChestXrayDataset(train_df, data_image_paths, all_labels, transform=transform)
valid_dataset = ChestXrayDataset(valid_df, data_image_paths, all_labels, transform=transform)
test_dataset = ChestXrayDataset(test_df, data_image_paths, all_labels, transform=transform)

# Evaluation loaders share one batch size. The test loader previously used
# batch_size=1, so its first batch held a single image while the visualisation
# further down indexes four images of that batch.
EVAL_BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=EVAL_BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=EVAL_BATCH_SIZE, shuffle=False)

In [None]:
class DenseNetModel(nn.Module):
    """DenseNet121 feature extractor with a sigmoid multi-label head.

    Args:
        num_classes: number of independent outputs (one per label).

    ``forward`` returns a ``(batch, num_classes)`` tensor of values in (0, 1).
    """

    def __init__(self, num_classes):
        super(DenseNetModel, self).__init__()
        original_model = densenet121(pretrained=True)
        self.features = original_model.features
        # Size the new head from the replaced classifier instead of hard-coding 1024.
        self.classifier = nn.Linear(original_model.classifier.in_features, num_classes)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        features = self.features(x)
        # torch.relu() takes no `inplace` argument (TypeError); the functional form does.
        out = nn.functional.relu(features, inplace=True)
        # Global average pooling, then flatten everything except the batch dimension.
        out = nn.functional.adaptive_avg_pool2d(out, (1, 1))
        out = torch.flatten(out, 1)
        out = self.classifier(out)
        return self.sigmoid(out)

# The training and evaluation code sends each batch to the GPU, so the model must
# live on the same device; use the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = DenseNetModel(len(all_labels)).to(device)
# Create the optimizer after the move so it references the parameters on `device`.
optimizer = optim.Adam(model.parameters(), lr=0.001)
# BCELoss expects probabilities, which the model's final sigmoid provides.
criterion = nn.BCELoss()

In [None]:
# Training and Validation Loop
def train_and_validate(model, train_loader, valid_loader, criterion, optimizer, num_epochs=20):
    """Train ``model`` and report the mean training/validation loss after every epoch.

    Args:
        model: network to optimise; batches are moved to the device of its parameters.
        train_loader: iterable of ``(images, labels)`` batches used for optimisation.
        valid_loader: iterable of ``(images, labels)`` batches used for validation only.
        criterion: loss callable taking ``(outputs, labels)``.
        optimizer: optimizer bound to ``model``'s parameters.
        num_epochs: number of passes over ``train_loader``.

    The model is left in eval mode after the last validation pass.
    """
    # Follow the model's device instead of assuming CUDA, so model and data always agree.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    for epoch in range(num_epochs):
        # Training Phase
        model.train()
        total_train_loss = 0.0
        for images, labels in train_loader:
            images = images.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            # Match the targets to the output dtype; BCELoss rejects e.g. float64
            # targets against float32 predictions.
            labels = labels.to(device=device, dtype=outputs.dtype)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()

        # max(..., 1) keeps an empty loader from dividing by zero.
        avg_train_loss = total_train_loss / max(len(train_loader), 1)

        # Validation Phase
        model.eval()
        total_valid_loss = 0.0
        with torch.no_grad():
            for images, labels in valid_loader:
                images = images.to(device)
                outputs = model(images)
                labels = labels.to(device=device, dtype=outputs.dtype)
                total_valid_loss += criterion(outputs, labels).item()

        avg_valid_loss = total_valid_loss / max(len(valid_loader), 1)

        print(f'Epoch [{epoch+1}/{num_epochs}], '
              f'Train Loss: {avg_train_loss:.4f}, '
              f'Validation Loss: {avg_valid_loss:.4f}')

# Continue with model training
# Runs 20 epochs with the model, loaders, loss and optimizer created in the cells above.
train_and_validate(model, train_loader, valid_loader, criterion, optimizer, num_epochs=20)


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Function to compute the ROC AUC score
def compute_roc_auc(model, data_loader, num_classes):
    """Plot one ROC curve per label and return the per-label AUC values.

    Args:
        model: network producing one score per label; batches are moved to the
            device of its parameters.
        data_loader: iterable of ``(images, labels)`` batches.
        num_classes: accepted for interface compatibility; not read. A curve is
            drawn for every entry of the module-level ``all_labels``.

    Returns:
        dict mapping each label name to its AUC.
    """
    model.eval()
    # Follow the model's device instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    # Collect per-batch results on the CPU and concatenate once at the end;
    # concatenating inside the loop re-copies everything gathered so far.
    true_batches, pred_batches = [], []
    with torch.no_grad():
        for images, labels in data_loader:
            outputs = model(images.to(device))
            true_batches.append(labels.cpu())
            pred_batches.append(outputs.cpu())
    y_true = torch.cat(true_batches, 0).numpy()
    y_pred = torch.cat(pred_batches, 0).numpy()

    roc_auc_dict = {}
    for i, label in enumerate(all_labels):
        fpr, tpr, _ = roc_curve(y_true[:, i], y_pred[:, i])
        roc_auc_dict[label] = auc(fpr, tpr)
        plt.plot(fpr, tpr, label=f'{label} (AUC = {roc_auc_dict[label]:.2f})')

    plt.title('Receiver Operating Characteristic')
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

    return roc_auc_dict

# Compute and plot ROC AUC
# Evaluates on test_loader; the result is a dict of per-label AUC values.
roc_auc_scores = compute_roc_auc(model, test_loader, len(all_labels))

# Visualization of Predictions
def visualize_predictions(model, data_loader, num_images=4):
    """Show the first images of ``data_loader`` titled with the labels predicted above 0.5.

    Args:
        model: network producing one score per entry of the module-level ``all_labels``.
        data_loader: iterable of ``(images, labels)`` batches; only the images are used.
        num_images: maximum number of images to draw. Batches are consumed until
            this many images are collected, so any batch size works.
    """
    if num_images <= 0:
        return
    model.eval()
    # Follow the model's device instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    shown_images, shown_hits = [], []
    remaining = num_images
    with torch.no_grad():
        for images, _ in data_loader:
            images = images[:remaining]
            hits = model(images.to(device)) > 0.5  # Threshold predictions
            shown_images.extend(images.cpu())
            shown_hits.extend(hits.cpu())
            remaining -= images.size(0)
            if remaining <= 0:
                break
    if not shown_images:
        return

    # squeeze=False keeps `axs` two-dimensional even for a single image.
    fig, axs = plt.subplots(1, len(shown_images), figsize=(15, 10), squeeze=False)
    for ax, image, hit_row in zip(axs[0], shown_images, shown_hits):
        picture = image.permute(1, 2, 0)  # channels-first -> channels-last for imshow
        # Stretch to [0, 1] for display; normalised pixels fall outside that range
        # and would otherwise be clipped.
        low, high = picture.min(), picture.max()
        picture = (picture - low) / (high - low).clamp_min(1e-8)
        ax.imshow(picture.numpy())
        ax.axis('off')
        disease_labels = ', '.join(name for name, hit in zip(all_labels, hit_row.tolist()) if hit)
        ax.set_title(disease_labels)

    plt.show()

# Call visualization function
# Draws four images taken from test_loader with their thresholded predictions as titles.
visualize_predictions(model, test_loader, num_images=4)
