In [1]:
import os
import fnmatch
import scipy.io as sio
import glob 
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import pandas as pd
import xml.etree.ElementTree as ET
import json


In [2]:
dataset_path = '/kaggle/input/ddti-thyroid-ultrasound-images'

In [3]:
xml_file_paths = glob.glob(os.path.join(dataset_path, '*.xml'))

In [4]:
len(xml_file_paths)

390

In [5]:
rows = []

for file_path in xml_file_paths:
    # Load the XML file
    tree = ET.parse(file_path)
    root = tree.getroot()
    
    data = {}
    for child in root:
        if child.tag in ["number", "tirads"]:
            data[child.tag] = child.text
            # print(child.tag, child.attrib, child.text)
    if data["tirads"] == None:
        continue
    rows.append(data)

# Correct capitalization of pd.DataFrame
df = pd.DataFrame(rows)
df

Unnamed: 0,number,tirads
0,128,4c
1,346,3
2,142,4c
3,127,2
4,338,4c
...,...,...
293,263,4b
294,269,4b
295,295,4a
296,301,4a


In [6]:
ids = []
image_arrays = []
labels = []
for idx, row  in df.iterrows():
    file_path = f"/kaggle/input/ddti-thyroid-ultrasound-images/{row['number']}_1.jpg"

    if not os.path.exists(file_path):
        print(f"Skipping: {file_path} (file not found)")
        continue
    # Do your processing here
    print(f"Processing: {file_path} with triad score: {row['tirads']}")
    # Load image as PIL Image
    img = Image.open(file_path).convert('L')  # Convert to grayscale
    
    # Convert to NumPy array
    img_array = np.array(img).astype(np.float32)
    
    # Normalize to [0, 255] safely
    img_min = img_array.min()
    img_max = img_array.max()
    
    if img_max > img_min:
        img_normalized = (img_array - img_min) / (img_max - img_min)
    else:
        img_normalized = np.zeros_like(img_array)
    
    img_normalized = np.clip(img_normalized * 255, 0, 255).astype(np.uint8)
    
    # Convert back to PIL Image for classification
    img_pil = Image.fromarray(img_normalized)
    
    # # Ask for classification
    # result = ask_image(img_pil, "Classify the disease in this image according to triads score: ")
    
    # Plot the image and classification result
    # plt.figure(figsize=(6, 6))
    # plt.imshow(img_normalized, cmap='gray')
    # plt.title(row['tirads'])
    # plt.axis('off')
    # plt.show()
    # break
    ids.append(row['number'])
    image_arrays.append(img_normalized)
    labels.append(row['tirads'])

# Convert lists to arrays
patient_ids = np.array(ids)
image_arrays = np.array(image_arrays)
labels = np.array(labels)


Processing: /kaggle/input/ddti-thyroid-ultrasound-images/128_1.jpg with triad score: 4c
Processing: /kaggle/input/ddti-thyroid-ultrasound-images/346_1.jpg with triad score: 3
Processing: /kaggle/input/ddti-thyroid-ultrasound-images/142_1.jpg with triad score: 4c
Processing: /kaggle/input/ddti-thyroid-ultrasound-images/127_1.jpg with triad score: 2
Processing: /kaggle/input/ddti-thyroid-ultrasound-images/338_1.jpg with triad score: 4c
Processing: /kaggle/input/ddti-thyroid-ultrasound-images/164_1.jpg with triad score: 4b
Processing: /kaggle/input/ddti-thyroid-ultrasound-images/110_1.jpg with triad score: 4a
Processing: /kaggle/input/ddti-thyroid-ultrasound-images/166_1.jpg with triad score: 4a
Processing: /kaggle/input/ddti-thyroid-ultrasound-images/109_1.jpg with triad score: 4b
Processing: /kaggle/input/ddti-thyroid-ultrasound-images/279_1.jpg with triad score: 4b
Processing: /kaggle/input/ddti-thyroid-ultrasound-images/391_1.jpg with triad score: 2
Processing: /kaggle/input/ddti-thyr

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Get unique patient IDs and split
unique_ids = np.unique(patient_ids)
train_ids, temp_ids = train_test_split(unique_ids, test_size=0.3, random_state=42)
val_ids, test_ids = train_test_split(temp_ids, test_size=0.5, random_state=42)

# Create masks
train_mask = np.isin(patient_ids, train_ids)
val_mask = np.isin(patient_ids, val_ids)
test_mask = np.isin(patient_ids, test_ids)

# Apply masks
train_images, train_labels = images[train_mask], labels[train_mask]
val_images, val_labels = images[val_mask], labels[val_mask]
test_images, test_labels = images[test_mask], labels[test_mask]


In [14]:
print(train_images.shape)
print(test_images.shape)
print(val_images.shape)

(207, 360, 560)
(45, 360, 560)
(45, 360, 560)


In [22]:
train_images1 = train_images[:, np.newaxis, :, :]      # (207, 1, 360, 560)
train_images3 = np.repeat(train_images1, 3, axis=1)      # (207, 3, 360, 560)

val_images1 = val_images[:, np.newaxis, :, :]
val_images3 = np.repeat(val_images1, 3, axis=1)

test_images1 = test_images[:, np.newaxis, :, :]
test_images3 = np.repeat(test_images1, 3, axis=1)

In [28]:
print(train_images3.shape, train_ids.shape,train_labels.shape)
print(val_images3.shape, val_ids.shape,val_labels.shape)
print(test_images3.shape, test_ids.shape,test_labels.shape)

(207, 3, 360, 560) (207,) (207,)
(45, 3, 360, 560) (45,) (45,)
(45, 3, 360, 560) (45,) (45,)


In [36]:
from torchvision import transforms
import torch

# Define your augmentation pipeline
augment = transforms.Compose([
    transforms.ToPILImage(),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.RandomResizedCrop((360, 560), scale=(0.9, 1.1)),
    transforms.ToTensor()
])

aug_images = []
aug_labels = []
aug_ids = []
# 
n_augments = 4  # augment each image 4 times

for i in range(len(train_images3)):
    orig_img = train_images3[i]
    label = train_labels[i]
    id_ = train_ids[i]

    img_tensor = torch.tensor(orig_img)  # (3, 360, 560)
    
    # Original image (optional: include or skip)
    aug_images.append(img_tensor)
    aug_labels.append(label)
    aug_ids.append(id_)

    # Augmented versions
    for _ in range(n_augments):
        aug = augment(img_tensor)
        aug_images.append(aug)
        aug_labels.append(label)
        aug_ids.append(id_)

# Stack into final dataset
aug_images = torch.stack(aug_images)             # shape: (207 * (1+n_augments), 3, 360, 560)
# aug_labels = torch.tensor(aug_labels)            # shape: (207 * (1+n_augments),)
# aug_ids = torch.tensor(aug_ids)                  # shape: (207 * (1+n_augments),)

In [38]:
torch.save({
    'train_images': aug_images,
    'train_labels': aug_labels,
    'train_ids': aug_ids,
    'test_images': test_images3,
    'test_labels': test_labels,
    'test_ids': test_ids,
    'val_images': val_images3,
    'val_labels': val_labels,
    'val_ids': val_ids,
}, 'augmented_dataset.pt')

In [41]:
# Load the saved dataset
data = torch.load('augmented_dataset.pt', weights_only=False)

# Print sizes
print("Train images:", data['train_images'].shape)
print("Train labels:", len(data['train_labels']))
print("Train IDs:", len(data['train_ids']))  # list of strings

print("Val images:", data['val_images'].shape)
print("Val labels:", len(data['val_labels']))
print("Val IDs:", len(data['val_ids']))

print("Test images:", data['test_images'].shape)
print("Test labels:", len(data['test_labels']))
print("Test IDs:", len(data['test_ids']))

Train images: torch.Size([1035, 3, 360, 560])
Train labels: 1035
Train IDs: 1035
Val images: (45, 3, 360, 560)
Val labels: 45
Val IDs: 45
Test images: (45, 3, 360, 560)
Test labels: 45
Test IDs: 45


In [18]:
train_images.shape

(207, 360, 560)

In [16]:
augmented_images = []
original_images = torch.from_numpy(train_images).float() / 255.0  # Normalize to [0,1]

# Generate N augmentations per image
n_augments = 4

for img in original_images:
    pil_img = to_pil(img)
    augmented_images.append(img)  # original
    for _ in range(n_augments):
        aug = augment(pil_img)
        augmented_images.append(aug)

augmented_images = torch.stack(augmented_images)  # shape: (207 * 5, 3, 360, 560)

RuntimeError: stack expects each tensor to be equal size, but got [360, 560] at entry 0 and [1, 360, 560] at entry 1

In [7]:
# Save to file
np.savez_compressed("thyroid_images_and_labels.npz", patient_ids=patient_ids, images=image_arrays, labels=labels)
print("Saved images and labels to thyroid_images_and_labels.npz")

Saved images and labels to thyroid_images_and_labels.npz


In [8]:
data = np.load("/kaggle/working/thyroid_images_and_labels.npz")
images = data["images"]
labels = data["labels"]
patient_ids = data["patient_ids"]

In [9]:
len(patient_ids)

297