<a href="https://colab.research.google.com/github/amanupg/Imageclef-2024/blob/main/Classification_clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import pathlib
from sklearn.cluster import KMeans
from tensorflow.keras.layers import Conv2D, BatchNormalization, Activation, MaxPooling2D, Conv2DTranspose, Concatenate, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger, ReduceLROnPlateau, EarlyStopping, TensorBoard

In [None]:
# Mount Google Drive at /content/drive; the dataset and model paths used in
# this notebook all live under that mount point.
from google.colab import drive
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
# Loader options shared by both splits: integer class labels inferred by the
# loader, batches of 16, images resized to 256x256 and read as grayscale.
_loader_options = dict(
    labels='inferred',
    label_mode='int',
    batch_size=16,
    image_size=(256, 256),
    color_mode='grayscale',
)

train_ds = keras.utils.image_dataset_from_directory(
    '/content/drive/MyDrive/Machine_Learning/Research/Train',
    **_loader_options,
)
validation_ds = keras.utils.image_dataset_from_directory(
    '/content/drive/MyDrive/Machine_Learning/Research/Test',
    **_loader_options,
)

Found 540 files belonging to 3 classes.
Found 60 files belonging to 3 classes.


## Image processing and model building

In [None]:
def process(image, label):
    """Rescale image values and one-hot encode the labels.

    Args:
        image: Image tensor; its values are divided by 255 and cast to float32.
        label: Integer class labels; encoded as one-hot vectors of depth 3.

    Returns:
        A ``(image, label)`` tuple holding the rescaled images and the
        one-hot labels.
    """
    scaled = tf.cast(image / 255, tf.float32)
    encoded = tf.one_hot(label, depth=3)  # three classes
    return scaled, encoded


# Apply `process` to both splits. NOTE: the names are rebound, so re-running
# this cell without re-creating the datasets would apply `process` twice.
train_ds, validation_ds = (split.map(process) for split in (train_ds, validation_ds))

In [None]:
# Shape of one model input: 256x256 pixels with a single (grayscale) channel,
# matching image_size/color_mode used when the datasets were loaded.
input_shape = (256, 256, 1)  # Adjust input shape for grayscale images


def conv_block(input, num_filters):
    """Run the input through two Conv2D -> BatchNormalization -> ReLU stages.

    Args:
        input: Input tensor of the block.
        num_filters: Number of filters used by both 3x3 "same" convolutions.

    Returns:
        The output tensor of the second ReLU activation.
    """
    features = input
    for _ in range(2):
        features = Conv2D(num_filters, 3, padding="same")(features)
        features = BatchNormalization()(features)
        features = Activation("relu")(features)
    return features


def encoder_block(input, num_filters):
    """Apply a conv block followed by 2x2 max pooling.

    Args:
        input: Input tensor of the block.
        num_filters: Number of filters passed to ``conv_block``.

    Returns:
        A ``(features, pooled)`` tuple: the conv block output before pooling
        and the same tensor after 2x2 max pooling.
    """
    skip = conv_block(input, num_filters)
    pooled = MaxPooling2D(pool_size=(2, 2))(skip)
    return skip, pooled


def decoder_block(input, skip_features, num_filters):
    """Upsample, concatenate with skip features, then apply a conv block.

    Args:
        input: Tensor to upsample with a stride-2 transposed convolution.
        skip_features: Tensor concatenated with the upsampled result.
        num_filters: Filter count for the transposed convolution and the
            conv block.

    Returns:
        The output tensor of the conv block.
    """
    upsampled = Conv2DTranspose(
        num_filters, kernel_size=(2, 2), strides=2, padding="same"
    )(input)
    merged = Concatenate()([upsampled, skip_features])
    return conv_block(merged, num_filters)

# def build_unet(input_shape, num_classes):
#     inputs = Input(input_shape)

#     s1, p1 = encoder_block(inputs, 64)
#     s2, p2 = encoder_block(p1, 128)
#     s3, p3 = encoder_block(p2, 256)
#     s4, p4 = encoder_block(p3, 512)

#     b1 = conv_block(p4, 1024)

#     d1 = decoder_block(b1, s4, 512)
#     d2 = decoder_block(d1, s3, 256)
#     d3 = decoder_block(d2, s2, 128)
#     d4 = decoder_block(d3, s1, 64)

#     outputs = Conv2D(3, 1, padding="same", activation="softmax")(d4)

#     model = Model(inputs, outputs, name="U-Net")
#     return model

def build_unet(input_shape, num_classes):
    """Build an image classifier from the U-Net encoder path and a dense head.

    Four encoder blocks (64, 128, 256, 512 filters) feed a 1024-filter
    bottleneck conv block; its output is flattened and passed through dense
    layers of 256 and 64 units before a softmax layer. No decoder is built
    and the encoder skip outputs are not used.

    Args:
        input_shape: Shape of one input sample, without the batch dimension.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A Keras ``Model`` named "U-Net".
    """
    inputs = Input(input_shape)

    # Encoder path; only the pooled output of each block is carried forward.
    x = inputs
    for width in (64, 128, 256, 512):
        _, x = encoder_block(x, width)

    bottleneck = conv_block(x, 1024)

    # Classification head on the flattened bottleneck features.
    head = keras.layers.Flatten()(bottleneck)
    for units in (256, 64):
        head = keras.layers.Dense(units, activation='relu')(head)

    outputs = keras.layers.Dense(num_classes, activation='softmax')(head)

    return Model(inputs, outputs, name="U-Net")



In [None]:
# Instantiate the three-class classifier and print its layer table.
model = build_unet(input_shape=input_shape, num_classes=3)
model.summary()

Model: "U-Net"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 256, 256, 1)]     0         
                                                                 
 conv2d (Conv2D)             (None, 256, 256, 64)      640       
                                                                 
 batch_normalization (Batch  (None, 256, 256, 64)      256       
 Normalization)                                                  
                                                                 
 activation (Activation)     (None, 256, 256, 64)      0         
                                                                 
 conv2d_1 (Conv2D)           (None, 256, 256, 64)      36928     
                                                                 
 batch_normalization_1 (Bat  (None, 256, 256, 64)      256       
 chNormalization)                                            

In [None]:
# Training hyper-parameters.
batch_size, lr, num_epochs = 16, 1e-4, 200

# Labels are one-hot encoded, hence categorical cross-entropy.
model.compile(
    optimizer=Adam(learning_rate=lr),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Where the best checkpoint and the per-epoch training log are written.
model_path = "/content/drive/MyDrive/Machine_Learning/Research/Model/model.h5"
csv_path = "/content/drive/MyDrive/Machine_Learning/Research/Model/training.log"

# Keep only the best model on disk, log every epoch to CSV and TensorBoard,
# and stop once val_loss has not improved for 50 epochs.
checkpoint_cb = ModelCheckpoint(model_path, verbose=1, save_best_only=True)
csv_logger_cb = CSVLogger(csv_path)
tensorboard_cb = TensorBoard()
early_stopping_cb = EarlyStopping(
    monitor='val_loss', patience=50, restore_best_weights=False
)
callbacks = [checkpoint_cb, csv_logger_cb, tensorboard_cb, early_stopping_cb]

model.fit(
    train_ds,
    validation_data=validation_ds,
    epochs=num_epochs,
    callbacks=callbacks,
    shuffle=False,
)

Epoch 1/200
Epoch 1: val_loss improved from inf to 0.00000, saving model to /content/drive/MyDrive/Machine_Learning/Research/Model/model.h5
Epoch 2/200
Epoch 2: val_loss did not improve from 0.00000
Epoch 3/200
Epoch 3: val_loss did not improve from 0.00000
Epoch 4/200
Epoch 4: val_loss did not improve from 0.00000
Epoch 5/200
Epoch 5: val_loss did not improve from 0.00000
Epoch 6/200
Epoch 6: val_loss did not improve from 0.00000
Epoch 7/200
Epoch 7: val_loss did not improve from 0.00000
Epoch 8/200
Epoch 8: val_loss did not improve from 0.00000
Epoch 9/200
Epoch 9: val_loss did not improve from 0.00000
Epoch 10/200
Epoch 10: val_loss did not improve from 0.00000
Epoch 11/200
Epoch 11: val_loss did not improve from 0.00000
Epoch 12/200
Epoch 12: val_loss did not improve from 0.00000
Epoch 13/200
Epoch 13: val_loss did not improve from 0.00000
Epoch 14/200
Epoch 14: val_loss did not improve from 0.00000
Epoch 15/200
Epoch 15: val_loss did not improve from 0.00000
Epoch 16/200
Epoch 16:

<keras.src.callbacks.History at 0x7cd8e04a6920>

## Generating clusters on the validation set

In [None]:
# Reload the model from the path the ModelCheckpoint callback saved to.
model = keras.models.load_model(
    '/content/drive/MyDrive/Machine_Learning/Research/Model/model.h5'
)

In [None]:
# Remove the last layer of the model
feature_extractor = Model(inputs=model.input, outputs=model.layers[-2].output)

# Extract features from the test dataset
test_features = feature_extractor.predict(validation_ds)

num_clusters = 4  # specify the number of clusters
kmeans = KMeans(n_clusters=num_clusters)
clusters = kmeans.fit_predict(test_features)

# clusters variable now contains the cluster labels for each image in the test dataset

kmeans.labels_

## Generating clusters on a small subset of the test dataset

In [None]:
# Directory holding the small subset of test images, as a Path object.
data_dir = pathlib.Path('/content/drive/MyDrive/Machine_Learning/Research/small')

In [None]:
# Number of files in the directory that have an extension.
image_count = len(list(data_dir.glob('*.*')))

# Create a dataset of image file paths. The listing is kept in its
# deterministic (shuffle=False) order and is NOT shuffled afterwards: the
# cluster labels computed later are positional, so a random order would make
# it impossible to tell which label belongs to which file.
list_ds = tf.data.Dataset.list_files(str(data_dir/'*'), shuffle=False)

In [None]:
def process_path(file_path):
    """Load one PNG file as a normalized 256x256 single-channel image.

    Args:
        file_path: Scalar string tensor holding the path of the image file.

    Returns:
        A float tensor of shape (256, 256, 1) with values divided by 255.
    """
    img = tf.io.read_file(file_path)
    # channels=1 already yields a (height, width, 1) tensor, so no extra
    # channel axis is added afterwards. The previous tf.expand_dims call
    # produced (256, 256, 1, 1), which only worked because Keras silently
    # squeezes a trailing singleton axis.
    img = tf.image.decode_png(img, channels=1)  # Use decode_png if images are png format
    img = tf.image.resize(img, [256, 256])
    img = (img / 255.0)  # Normalize pixels to 0,1
    return img


In [None]:
# Decode and preprocess every listed file; the parallelism level is left to
# tf.data (AUTOTUNE).
new_dataset = list_ds.map(
    process_path,
    num_parallel_calls=tf.data.AUTOTUNE,
)


In [None]:
# Remove the last (softmax) layer: the output of the second-to-last layer is
# used as the feature vector of each image.
feature_extractor = Model(inputs=model.input, outputs=model.layers[-2].output)

# Batch the dataset under a separate name so that re-running this cell does
# not batch an already batched dataset.
batched_dataset = new_dataset.batch(16)  # Replace 16 with your desired batch size

# Extract features from the new dataset
new_features = feature_extractor.predict(batched_dataset)

# Perform clustering on the extracted features
num_clusters = 4  # specify the number of clusters
# A fixed random_state makes the clustering reproducible between runs, and an
# explicit n_init keeps the result independent of the scikit-learn default,
# which differs between releases.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=0)
new_clusters = kmeans.fit_predict(new_features)

# new_clusters variable now contains the cluster labels for each image in the new dataset
print(kmeans.labels_)






[3 3 3 1 1 2 2 1 1 3 3 0 0 0 1 3 0 1 0 3 2 3 3 0 3 0 1 1 1 3 1 3 1 1 1 1 2
 3 0 1 1 3 1 2 1 3 2 1 1 2 1 3 2 1 2 1 2 2 0 0 3 3 3 0 1 1 1 0 0 0 2 1 0 0
 0 3 1 1 1 1 1 1 2 3 0 0 2 1 1 3 2 1 3 3 2 3 1 1 3 1 0 1 1 3 1 3 2 3 3 2 3
 3 3 3 2 1 2 3 0 0 0 0 3 1 3 3 1 2 0 1 1 1 0 3 1 1 1 1 0 3 1 0 1 2 3 3 2 0
 1 3 2 3 1 1 1 0 2 0 0 1 2 3 3 3 1 3 2 3 0 3 1 3 2 3 0 1 0 2 3 0 3 3 0 1 1
 1 3 3 2 0 0 3 2 0 1 2 0 0 2 0 1 3 0 3 0 2 1 3 2 2 0 3 1 1 0 0 0 1 2 0 1 0
 1 0 3 0 2 1 1 2 1 2 1 2 0 2 2 1 1 3 1 1 3 3 0 1 1 1 0 0 0 3 3 1 0 2 1 1 3
 0 1 2 1 3 0 3 2 1 1 0 0 3 0 0 0 0 3 3 2 0 1 1 1 3 1 1 1 3 1 0 1 1 2 3 3 1
 0 3 3 1]


## Generating clusters on the actual test dataset and writing the results, with their cluster labels, to a CSV file

In [None]:
# Directory holding the generated images of the Task 2 test set, as a Path object.
data_dir = pathlib.Path(
    '/content/drive/MyDrive/Machine_Learning/Research/ImageCLEFmedical  GANs 2024 - Task2 test/generated_images'
)

In [None]:
# Number of files in the directory that have an extension.
image_count = len(list(data_dir.glob('*.*')))

# Create a dataset of image file paths. The listing is kept in its
# deterministic (shuffle=False) order and is NOT shuffled afterwards: the
# cell that writes run.csv pairs cluster labels with figure IDs by position,
# so a random file order would attach each label to an arbitrary figure.
# NOTE(review): confirm that this listing order matches the order of the
# figure IDs in run.csv.
list_ds = tf.data.Dataset.list_files(str(data_dir/'*'), shuffle=False)

In [None]:
def process_path(file_path):
    """Load one PNG file as a normalized 256x256 single-channel image.

    Args:
        file_path: Scalar string tensor holding the path of the image file.

    Returns:
        A float tensor of shape (256, 256, 1) with values divided by 255.
    """
    img = tf.io.read_file(file_path)
    # channels=1 already yields a (height, width, 1) tensor, so no extra
    # channel axis is added afterwards. The previous tf.expand_dims call
    # produced (256, 256, 1, 1), which only worked because Keras silently
    # squeezes a trailing singleton axis.
    img = tf.image.decode_png(img, channels=1)  # Use decode_png if images are png format
    img = tf.image.resize(img, [256, 256])
    img = (img / 255.0)  # Normalize pixels to 0,1
    return img


In [None]:
# Decode and preprocess every listed file; the parallelism level is left to
# tf.data (AUTOTUNE).
new_dataset = list_ds.map(
    process_path,
    num_parallel_calls=tf.data.AUTOTUNE,
)


In [None]:
# Remove the last (softmax) layer: the output of the second-to-last layer is
# used as the feature vector of each image.
feature_extractor = Model(inputs=model.input, outputs=model.layers[-2].output)

# Batch the dataset under a separate name so that re-running this cell does
# not batch an already batched dataset.
batched_dataset = new_dataset.batch(16)  # Replace 16 with your desired batch size

# Extract features from the new dataset
new_features = feature_extractor.predict(batched_dataset)

# Perform clustering on the extracted features
num_clusters = 4  # specify the number of clusters
# A fixed random_state makes the submitted clustering reproducible, and an
# explicit n_init keeps the result independent of the scikit-learn default,
# which differs between releases.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=0)
new_clusters = kmeans.fit_predict(new_features)

# new_clusters variable now contains the cluster labels for each image in the new dataset
print(kmeans.labels_)






[1 3 3 ... 1 1 1]


In [None]:
# Quick sanity check: show the first 20 entries of the fitted KMeans labels.
print(kmeans.labels_[:20])

[1 3 3 1 1 3 2 1 0 3 0 1 2 0 1 0 0 0 0 1]


In [None]:
# Path of the run file shipped with the task. It is read for the figure IDs
# and then overwritten with "figure_id,cluster" lines.
provided_run_path = '/content/drive/MyDrive/Machine_Learning/Research/ImageCLEFmedical  GANs 2024 - Task2 test/run.csv'  # Update with the actual path

# Read the figure IDs: the first field of every non-empty line. The provided
# file is tab-separated, while a file already rewritten by this cell is
# comma-separated (and may be quoted), so the first field is taken for either
# layout. That keeps the cell safe to re-run.
with open(provided_run_path) as run_file:
    figure_ids = [
        line.rstrip('\r\n').strip('"').replace(',', '\t').split('\t')[0]
        for line in run_file
        if line.strip()
    ]

# zip() would silently drop the surplus entries on a mismatch, so fail loudly.
if len(figure_ids) != len(new_clusters):
    raise ValueError(
        f"run file lists {len(figure_ids)} figure IDs but "
        f"{len(new_clusters)} cluster labels were computed"
    )

# Map figure IDs to cluster labels.
# NOTE(review): the pairing is positional. It is only correct if the images
# were fed to the feature extractor in the same order as the figure IDs
# appear in the run file - verify that the file listing is not shuffled.
cluster_mapping = dict(zip(figure_ids, new_clusters + 1))  # Adding 1 to clusters to match the required labels [1, 2, 3, 4]

# Use two real columns. A single column whose value contains a comma would be
# quoted by to_csv, producing lines like "id,2" with the quotes included.
provided_run_df = pd.DataFrame({
    'figure_id': figure_ids,
    'cluster': [cluster_mapping[figure_id] for figure_id in figure_ids],
})

# Save the updated DataFrame to the same CSV file (overwrite the existing file)
provided_run_df.to_csv(provided_run_path, index=False, header=False)
