In [None]:
# !pip install kaggle kagglehub -q

In [2]:
import os
import shutil
import kagglehub

def safe_load_image_file_dataset():
    target_path = "/content/sample_data/paultimothymooney/breast-histopathology-images/versions/1"
    cache_path = "/root/.cache/kagglehub/datasets/paultimothymooney/breast-histopathology-images/versions/1"

    # Check if dataset already exists in the target path
    if os.path.exists(target_path):
        print("Dataset already exists in the desired path.")
        return target_path

    # Check if dataset exists in the cache and move it
    if os.path.exists(cache_path):
        shutil.move(cache_path, target_path)
        print("Dataset moved to the desired path.")
        return target_path

    # Download dataset and move it
    print("Downloading dataset...")
    kagglehub.dataset_download("paultimothymooney/breast-histopathology-images")

    # Ensure cache path exists after download
    if os.path.exists(cache_path):
        shutil.move(cache_path, target_path)
        print("Dataset downloaded and moved to the desired path.")
        return target_path
    else:
        print("Download failed or path not found.")
        return None

safe_load_image_file_dataset()

Downloading dataset...
Downloading from https://www.kaggle.com/api/v1/datasets/download/paultimothymooney/breast-histopathology-images?dataset_version_number=1...


100%|██████████| 3.10G/3.10G [00:13<00:00, 255MB/s]

Extracting files...





Dataset downloaded and moved to the desired path.


'/content/sample_data/paultimothymooney/breast-histopathology-images/versions/1'

In [3]:
# Removes duplicates

path = "/content/sample_data/paultimothymooney/breast-histopathology-images/versions/1"

# Track seen file names
seen_files = set()
duplicates = []

# Traverse through all files in the dataset
for root, _, files in os.walk(path):
    for file in files:
        file_path = os.path.join(root, file)

        if file in seen_files:
            duplicates.append(file_path)  # Record duplicate files
        else:
            seen_files.add(file)

# Remove duplicate files
print(f"Found {len(duplicates)} duplicate files. Removing them...")
for duplicate_file in duplicates:
    os.remove(duplicate_file)

print("Duplicate files removed successfully!")


Found 277524 duplicate files. Removing them...
Duplicate files removed successfully!


In [4]:
import pandas as pd
import re
import glob
import os
from datetime import datetime

def load_image_data_to_dataframe(path):
    image_files = glob.glob(f"{path}/**/*.png", recursive=True)

    data = []
    pattern = re.compile(r"(\d+)_idx\d+_x(\d+)_y(\d+)_class(\d+).png")

    for file in image_files:
        match = pattern.search(file)
        if match:
            patient_id, x_coord, y_coord, class_label = match.groups()

            # Get file metadata
            file_size = os.path.getsize(file)  # File size in bytes
            created_time = os.path.getctime(file)  # File creation time (epoch)
            modified_time = os.path.getmtime(file)  # Last modified time (epoch)

            data.append({
                "patient_id": int(patient_id),
                "x_coordinate": int(x_coord),
                "y_coordinate": int(y_coord),
                "file_path": file,
                "class_label": int(class_label),
                "file_size_bytes": file_size,
                "created_at": datetime.fromtimestamp(created_time).strftime('%Y-%m-%d %H:%M:%S'),
                "last_modified_at": datetime.fromtimestamp(modified_time).strftime('%Y-%m-%d %H:%M:%S'),
            })

    return pd.DataFrame(data)

# Usage
path = "/content/sample_data/paultimothymooney/breast-histopathology-images"
df = load_image_data_to_dataframe(path)
df.head()

Unnamed: 0,patient_id,x_coordinate,y_coordinate,file_path,class_label,file_size_bytes,created_at,last_modified_at
0,9175,1801,201,/content/sample_data/paultimothymooney/breast-...,1,6416,2025-02-16 14:13:39,2025-02-16 14:13:05
1,9175,1801,151,/content/sample_data/paultimothymooney/breast-...,1,6294,2025-02-16 14:13:39,2025-02-16 14:13:05
2,9175,1901,201,/content/sample_data/paultimothymooney/breast-...,1,4615,2025-02-16 14:13:39,2025-02-16 14:13:05
3,9175,1751,251,/content/sample_data/paultimothymooney/breast-...,1,6498,2025-02-16 14:13:39,2025-02-16 14:13:05
4,9175,1801,251,/content/sample_data/paultimothymooney/breast-...,1,5505,2025-02-16 14:13:39,2025-02-16 14:13:05


In [5]:
from sklearn.model_selection import train_test_split

# Convert DataFrame columns to lists
file_paths = df["file_path"].values
labels = df["class_label"].values

# Split into training and testing sets
train_paths, test_paths, train_labels, test_labels = train_test_split(
    file_paths, labels, test_size=0.2, random_state=42, shuffle=True
)


In [6]:

import tensorflow as tf
import pandas as pd
import os


# Function to load and preprocess images
def load_and_preprocess_image(file_path, target_size=(224, 224)):
    image = tf.io.read_file(file_path)  # Read the image file
    image = tf.image.decode_png(image, channels=3)  # Decode PNG image (RGB format)
    image = tf.image.resize(image, target_size)  # Resize to target size
    image = image / 255.0  # Normalize to [0, 1]
    return image

# Convert the labels to categorical (one-hot encoded)
train_labels = tf.keras.utils.to_categorical(train_labels)
test_labels = tf.keras.utils.to_categorical(test_labels)

# Create tf.data.Dataset from file paths and labels
def create_tf_dataset(file_paths, labels, batch_size=32, target_size=(224, 224)):
    # Create a TensorFlow Dataset from the file paths and labels
    dataset = tf.data.Dataset.from_tensor_slices((file_paths, labels))

    # Map the image paths to the preprocessing function
    dataset = dataset.map(lambda x, y: (load_and_preprocess_image(x, target_size), y))

    # Shuffle, batch, and prefetch the dataset for better performance
    dataset = dataset.shuffle(buffer_size=len(file_paths))
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    return dataset

# Create train and test datasets
train_dataset = create_tf_dataset(train_paths, train_labels, batch_size=32, target_size=(224, 224))
test_dataset = create_tf_dataset(test_paths, test_labels, batch_size=32, target_size=(224, 224))

# Check the size of the first batch from the training dataset
for x_batch, y_batch in train_dataset.take(1):
    print(f"Train batch shape: {x_batch.shape}")  # Expected: (batch_size, 224, 224, 3)

# Check the size of the first batch from the test dataset
for x_batch_test, y_batch_test in test_dataset.take(1):
    print(f"Test batch shape: {x_batch_test.shape}")  # Expected: (batch_size, 224, 224, 3)


Train batch shape: (32, 224, 224, 3)
Test batch shape: (32, 224, 224, 3)


In [7]:

import numpy as np
import cv2
import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

base_model = VGG16(weights="imagenet", include_top=False, input_shape=(224, 224, 3))
base_model.trainable = False  # Freeze pretrained layers

x = Flatten()(base_model.output)
x = Dense(256, activation="relu")(x)
x = Dropout(0.5)(x)  # Dropout to prevent overfitting
x = Dense(2, activation="softmax")(x)  # 2 classes (0 or 1)

model = Model(inputs=base_model.input, outputs=x)


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m58889256/58889256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [8]:

# Compile the model
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# Calculate steps per epoch and validation steps based on dataset length
steps_per_epoch = len(train_paths) // 32  # Assuming a batch size of 32
validation_steps = len(test_paths) // 32  # Assuming a batch size of 32

# Train the model
model.fit(
    train_dataset,  # Use tf.data.Dataset for training
    validation_data=test_dataset,  # Use tf.data.Dataset for validation
    epochs=10,
    steps_per_epoch=steps_per_epoch,
    validation_steps=validation_steps
)


Epoch 1/10


NotFoundError: Graph execution error:

Detected at node StatefulPartitionedCall defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "/usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py", line 37, in <module>

  File "/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py", line 992, in launch_instance

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py", line 619, in start

  File "/usr/local/lib/python3.11/dist-packages/tornado/platform/asyncio.py", line 205, in start

  File "/usr/lib/python3.11/asyncio/base_events.py", line 608, in run_forever

  File "/usr/lib/python3.11/asyncio/base_events.py", line 1936, in _run_once

  File "/usr/lib/python3.11/asyncio/events.py", line 84, in _run

  File "/usr/local/lib/python3.11/dist-packages/tornado/ioloop.py", line 699, in <lambda>

  File "/usr/local/lib/python3.11/dist-packages/tornado/ioloop.py", line 750, in _run_callback

  File "/usr/local/lib/python3.11/dist-packages/tornado/gen.py", line 824, in inner

  File "/usr/local/lib/python3.11/dist-packages/tornado/gen.py", line 785, in run

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 361, in process_one

  File "/usr/local/lib/python3.11/dist-packages/tornado/gen.py", line 233, in wrapper

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 261, in dispatch_shell

  File "/usr/local/lib/python3.11/dist-packages/tornado/gen.py", line 233, in wrapper

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 539, in execute_request

  File "/usr/local/lib/python3.11/dist-packages/tornado/gen.py", line 233, in wrapper

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/ipkernel.py", line 302, in do_execute

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/zmqshell.py", line 539, in run_cell

  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 2975, in run_cell

  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3030, in _run_cell

  File "/usr/local/lib/python3.11/dist-packages/IPython/core/async_helpers.py", line 78, in _pseudo_sync_runner

  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3257, in run_cell_async

  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3473, in run_ast_nodes

  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code

  File "<ipython-input-8-73848f90e55c>", line 20, in <cell line: 0>

  File "/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/usr/local/lib/python3.11/dist-packages/keras/src/backend/tensorflow/trainer.py", line 371, in fit

  File "/usr/local/lib/python3.11/dist-packages/keras/src/backend/tensorflow/trainer.py", line 219, in function

  File "/usr/local/lib/python3.11/dist-packages/keras/src/backend/tensorflow/trainer.py", line 132, in multi_step_on_iterator

could not find registered transfer manager for platform Host -- check target linkage
	 [[{{node StatefulPartitionedCall}}]] [Op:__inference_multi_step_on_iterator_2048]

In [None]:
model.summary()


In [None]:

# test_loss, test_acc = model.evaluate(X_test, y_test)
test_loss, test_acc = model.evaluate(test_generator)
print(f"Test Accuracy: {test_acc:.4f}")

# predictions = model.predict(X_test)
predictions = model.predict(test_generator)
predicted_classes = np.argmax(predictions, axis=1)


  self._warn_if_super_not_called()


NotFoundError: Graph execution error:

Detected at node StatefulPartitionedCall defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "/usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py", line 37, in <module>

  File "/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py", line 992, in launch_instance

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py", line 619, in start

  File "/usr/local/lib/python3.11/dist-packages/tornado/platform/asyncio.py", line 205, in start

  File "/usr/lib/python3.11/asyncio/base_events.py", line 608, in run_forever

  File "/usr/lib/python3.11/asyncio/base_events.py", line 1936, in _run_once

  File "/usr/lib/python3.11/asyncio/events.py", line 84, in _run

  File "/usr/local/lib/python3.11/dist-packages/tornado/ioloop.py", line 699, in <lambda>

  File "/usr/local/lib/python3.11/dist-packages/tornado/ioloop.py", line 750, in _run_callback

  File "/usr/local/lib/python3.11/dist-packages/tornado/gen.py", line 824, in inner

  File "/usr/local/lib/python3.11/dist-packages/tornado/gen.py", line 785, in run

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 361, in process_one

  File "/usr/local/lib/python3.11/dist-packages/tornado/gen.py", line 233, in wrapper

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 261, in dispatch_shell

  File "/usr/local/lib/python3.11/dist-packages/tornado/gen.py", line 233, in wrapper

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 539, in execute_request

  File "/usr/local/lib/python3.11/dist-packages/tornado/gen.py", line 233, in wrapper

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/ipkernel.py", line 302, in do_execute

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/zmqshell.py", line 539, in run_cell

  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 2975, in run_cell

  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3030, in _run_cell

  File "/usr/local/lib/python3.11/dist-packages/IPython/core/async_helpers.py", line 78, in _pseudo_sync_runner

  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3257, in run_cell_async

  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3473, in run_ast_nodes

  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code

  File "<ipython-input-35-1c29066b9cfc>", line 2, in <cell line: 0>

  File "/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/usr/local/lib/python3.11/dist-packages/keras/src/backend/tensorflow/trainer.py", line 484, in evaluate

  File "/usr/local/lib/python3.11/dist-packages/keras/src/backend/tensorflow/trainer.py", line 219, in function

  File "/usr/local/lib/python3.11/dist-packages/keras/src/backend/tensorflow/trainer.py", line 132, in multi_step_on_iterator

could not find registered transfer manager for platform Host -- check target linkage
	 [[{{node StatefulPartitionedCall}}]] [Op:__inference_multi_step_on_iterator_7716]