In [1]:
import os
import numpy as np
import cv2
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split

In [11]:
image_dir = "C:\\Users\\sejal\\Downloads\\diabetic_retinopathy\\colored_images"
csv_file = "C:\\Users\\sejal\\Downloads\\diabetic_retinopathy\\train.csv"
output_dir = "C:\\Users\\sejal\\Downloads\\diabetic_retinopathy\\preprocessed_images_with_socks"

In [3]:
labels_df = pd.read_csv(csv_file)
label_map = dict(zip(labels_df['id_code'], labels_df['diagnosis']))

In [4]:
def get_filtered_file_paths(directory):
    file_paths = []
    for root, _, files in os.walk(directory):
        for file in files:
            if not file.startswith("._"):  # Excluding '._' files
                file_paths.append(os.path.join(root, file))
    return file_paths

In [5]:
def calculate_mad(data):
   #median abs deviation
    median = np.median(data)
    mad = np.median(np.abs(data - median))
    return mad

def flag_outliers(image, kernel_size, threshold):

    padded_image = np.pad(image, kernel_size // 2, mode='reflect')
    flagged_image = np.zeros_like(image, dtype=np.float32)

    for i in range(image.shape[0]):
        for j in range(image.shape[1]):
            kernel = padded_image[i:i + kernel_size, j:j + kernel_size]
            median = np.median(kernel)
            mad = calculate_mad(kernel)
            z_score = (image[i, j] - median) / (mad if mad > 0 else 1)
            if abs(z_score) >= threshold:
                flagged_image[i, j] = np.nan  

    return flagged_image

def curate_image(image, flagged_image, kernel_size):
    padded_image = np.pad(image, kernel_size // 2, mode='reflect')
    curated_image = image.copy()

    for i in range(flagged_image.shape[0]):
        for j in range(flagged_image.shape[1]):
            if np.isnan(flagged_image[i, j]):
                kernel = padded_image[i:i + kernel_size, j:j + kernel_size]
                non_nan_values = kernel[~np.isnan(kernel)]
                curated_image[i, j] = np.mean(non_nan_values) if non_nan_values.size > 0 else 0

    return curated_image

def apply_socks(image, kernel_size=5, threshold=2.5, max_iterations=10, stop_threshold=75):
    for iteration in range(max_iterations):
        flagged_image = flag_outliers(image, kernel_size, threshold)
        num_outliers = np.sum(np.isnan(flagged_image))
        print(f"Iteration {iteration + 1}: Flagged {num_outliers} outliers.")

        if num_outliers == 0 or num_outliers < stop_threshold:
            print("Convergence criterion met.")
            break

        image = curate_image(image, flagged_image, kernel_size)

    return image

In [6]:
def preprocess_image_with_socks(image_path, kernel_size=5, threshold=2.5, max_iterations=10):
    img = tf.io.read_file(image_path)
    img = tf.image.decode_image(img, channels=3).numpy()

    gray_img = tf.image.rgb_to_grayscale(img).numpy().squeeze()

    denoised_img = apply_socks(gray_img, kernel_size, threshold, max_iterations)

    resized_img = cv2.resize(denoised_img, (224, 224))
    normalized_img = resized_img / 255.0

    rgb_img = np.stack([normalized_img] * 3, axis=-1)

    return rgb_img

In [7]:
def preprocess_and_save_images_with_socks(image_dir, output_dir, kernel_size=5, threshold=2.5, max_iterations=10, max_images=200):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    processed_count = 0
    for file_path in get_filtered_file_paths(image_dir):
        if processed_count >= max_images:
            break
        try:
            filename = os.path.basename(file_path)
            preprocessed_img = preprocess_image_with_socks(file_path, kernel_size, threshold, max_iterations)
            save_path = os.path.join(output_dir, filename)
            cv2.imwrite(save_path, (preprocessed_img * 255).astype(np.uint8))
            processed_count += 1
        except Exception as e:
            print(f"Error processing {file_path}: {e}")

    print(f"Processed {processed_count} images.")
            
preprocess_and_save_images_with_socks(image_dir, output_dir)

Iteration 1: Flagged 4061 outliers.
Iteration 2: Flagged 1250 outliers.
Iteration 3: Flagged 495 outliers.
Iteration 4: Flagged 252 outliers.
Iteration 5: Flagged 130 outliers.
Iteration 6: Flagged 93 outliers.
Iteration 7: Flagged 80 outliers.
Iteration 8: Flagged 64 outliers.
Convergence criterion met.
Iteration 1: Flagged 3811 outliers.
Iteration 2: Flagged 1432 outliers.
Iteration 3: Flagged 667 outliers.
Iteration 4: Flagged 331 outliers.
Iteration 5: Flagged 198 outliers.
Iteration 6: Flagged 130 outliers.
Iteration 7: Flagged 87 outliers.
Iteration 8: Flagged 74 outliers.
Convergence criterion met.
Iteration 1: Flagged 3122 outliers.
Iteration 2: Flagged 1661 outliers.
Iteration 3: Flagged 1069 outliers.
Iteration 4: Flagged 696 outliers.
Iteration 5: Flagged 441 outliers.
Iteration 6: Flagged 312 outliers.
Iteration 7: Flagged 212 outliers.
Iteration 8: Flagged 149 outliers.
Iteration 9: Flagged 121 outliers.
Iteration 10: Flagged 91 outliers.
Iteration 1: Flagged 3240 outliers

In [27]:
import shutil
import pandas as pd

csv_path = csv_file
images_dir = "C:\\Users\\sejal\\Downloads\\diabetic_retinopathy\\preprocessed_images_with_socks" 
output_dir = "C:\\Users\\sejal\\Downloads\\diabetic_retinopathy\\output_socks"  
os.makedirs(output_dir, exist_ok=True)

df = pd.read_csv(csv_path)

for image_file in os.listdir(images_dir):

    id_code = os.path.splitext(image_file)[0]
    row = df[df['id_code'] == id_code]
    
    if not row.empty:
        class_label = row['diagnosis'].values[0]

        class_dir = os.path.join(output_dir, class_label)
        os.makedirs(class_dir, exist_ok=True)

        src_path = os.path.join(images_dir, image_file)
        dst_path = os.path.join(class_dir, image_file)
        
        shutil.move(src_path, dst_path)
    else:
        print(f"ID code not found in CSV: {id_code}")

print("Images have been successfully organized into class folders.")

Images have been successfully organized into class folders.


In [28]:
datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale=1./255,
    validation_split=0.2
)

train_data = datagen.flow_from_directory(
    directory=output_dir,
    target_size=(224, 224),
    batch_size=32,
    class_mode='categorical',
    subset='training'
)

val_data = datagen.flow_from_directory(
    directory=output_dir,
    target_size=(224, 224),
    batch_size=32,
    class_mode='categorical',
    subset='validation'
)

Found 160 images belonging to 5 classes.
Found 40 images belonging to 5 classes.


In [29]:
images, labels = train_data.next()

print("Image batch shape:", images.shape)  
print("Label batch shape:", labels.shape)

Image batch shape: (32, 224, 224, 3)
Label batch shape: (32, 5)


In [32]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 3)),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(5, activation='softmax')  # 5 classes
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [33]:
history = model.fit(train_data, validation_data=val_data, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
