In [11]:
import os
import cv2
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle

import tensorflow as tf
from tensorflow.keras import layers, models
import joblib

In [12]:
df = pd.read_csv('../Data/sample_labels.csv')
df = df[['Image Index', 'Finding Labels']]
df['Diseases'] = df['Finding Labels'].apply(lambda x: x.split('|'))
#label_encoder = LabelEncoder()
#df['Labels'] = label_encoder.fit_transform(df['Finding Labels'])
#joblib.dump(label_encoder, 'label_encoder.joblib')


In [13]:
disease_columns = set([disease for diseases in df['Diseases'] for disease in diseases])
for disease in disease_columns:
    df[disease] = df['Diseases'].apply(lambda diseases: int(disease in diseases))


In [15]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)


In [16]:
def load_and_preprocess_images(image_paths, labels):
    images = []
    for path, label in zip(image_paths, labels):
        img = cv2.imread(path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = cv2.resize(img, (224, 224))
        img = img / 255.0
        images.append(img)
    return np.array(images), labels


In [17]:
def construct_image_paths(image_folder, image_indices):
    return [os.path.join(image_folder, img) for img in image_indices]


In [18]:
image_folder = '../Data/images'

train_image_paths = construct_image_paths(image_folder, train_df['Image Index'])
test_image_paths = construct_image_paths(image_folder, test_df['Image Index'])


In [19]:
train_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale=1./255,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    validation_split=0.2
)


In [22]:
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=image_folder,
    x_col="Image Index",
    y_col=list(disease_columns),
    target_size=(224, 224),
    batch_size=10, 
    class_mode="raw",
    subset="training"
)

Found 3588 validated image filenames.


In [23]:
validation_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=image_folder,
    x_col="Image Index",
    y_col=list(disease_columns),
    target_size=(224, 224),
    batch_size=10,  
    class_mode="raw",
    subset="validation"
)

Found 896 validated image filenames.


In [25]:
base_model = tf.keras.applications.VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))


In [26]:
for layer in base_model.layers:
    layer.trainable = False


In [27]:
model = models.Sequential([
    base_model,
    layers.Flatten(),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(len(disease_columns), activation='sigmoid')
])


In [28]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [29]:
history = model.fit(
    train_generator,
    epochs=5,  
    validation_data=validation_generator
)

Epoch 1/5


2023-12-12 05:01:36.600972: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8904
2023-12-12 05:01:38.310293: I external/local_xla/xla/service/service.cc:168] XLA service 0x7fe1d5e53a10 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-12-12 05:01:38.310315: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 4070, Compute Capability 8.9
2023-12-12 05:01:38.318883: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1702337498.398380    7432 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [35]:
predictions = model.predict(test_generator)
binary_predictions = (predictions > 0.5).astype(int)

correct_predictions = np.sum(binary_predictions == test_df[list(disease_columns)].values)

accuracy = correct_predictions / (len(test_df) * len(disease_columns))

print(f"Test Accuracy: {accuracy * 100:.2f}%")



Test Accuracy: 92.22%


In [36]:
from tensorflow.keras.models import save_model
model.save('aayush_xray_classif_binary.h5')

  saving_api.save_model(


In [37]:
import cv2
import numpy as np

# Load and preprocess a single image
def preprocess_single_image(image_path):
    img = cv2.imread(image_path)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (224, 224))
    img = img / 255.0
    img = np.expand_dims(img, axis=0) 
    return img


image_path = '../presentation/00000030_001.png' 
processed_image = preprocess_single_image(image_path)


predictions = model.predict(processed_image)


binary_predictions = (predictions > 0.5).astype(int)


print("Raw Probabilities:", predictions)
print("Binary Predictions:", binary_predictions)


Raw Probabilities: [[3.4135528e-02 1.7226899e-02 1.0381831e-02 4.3075107e-02 5.8828201e-02
  5.3020085e-05 1.5780412e-01 1.4980943e-02 6.7475036e-02 6.1270541e-01
  6.8680264e-02 1.1307931e-02 6.7990646e-03 1.4539596e-02 5.6821056e-02]]
Binary Predictions: [[0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]]


In [38]:

disease_mapping = {
    0: 'Atelectasis',
    1: 'Cardiomegaly',
    2: 'Effusion',
    3: 'Infiltration',
    4: 'Mass',
    5: 'Nodule',
    6: 'Pneumonia',
    7: 'Pneumothorax',
    8: 'Consolidation',
    9: 'Edema',
    10: 'Emphysema',
    11: 'Fibrosis',
    12: 'Pleural_Thickening',
    13: 'Hernia'
}


predicted_diseases = [disease_mapping[i] for i, pred in enumerate(binary_predictions[0]) if pred == 1]


print("Predicted Diseases:", predicted_diseases)


Predicted Diseases: ['Edema']
