In [None]:
# Install required libraries
!pip install transformers torch torchvision pillow scikit-learn pandas numpy imbalanced-learn apex

Collecting apex
  Downloading apex-0.9.10dev.tar.gz (36 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting cryptacular (from apex)
  Downloading cryptacular-1.6.2.tar.gz (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.8/75.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting zope.sqlalchemy (from apex)
  Downloading zope.sqlalchemy-3.1-py3-none-any.whl.metadata (18 kB)
Collecting velruse>=1.0.3 (from apex)
  Downloading velruse-1.1.1.tar.gz (709 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m709.8/709.8 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyramid>1.1.2 (from apex)
  Downloading pyramid-2.0.2-py3-none-any.whl.metadata (20 kB)
Collecting pyramid_mailer (from apex)


In [None]:
# Import libraries
import os
import numpy as np
import pandas as pd
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from transformers import AutoImageProcessor, EfficientNetForImageClassification
import torch
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from tensorflow.keras.preprocessing.image import ImageDataGenerator, img_to_array, array_to_img, load_img


In [None]:
# Source folder of the original training images and target folder that will
# receive the augmented copies (created on first run).
train_dir = '/content/drive/My Drive/AJL Team 15/Equitable AI Dermatology 2025/train/train/'
augmented_dir = "/content/drive/My Drive/AJL Team 15/Equitable AI Dermatology 2025/train/train_augmented/"
os.makedirs(augmented_dir, exist_ok=True)

# Random transformations applied when generating augmented images.
# `rescale` maps pixel values into [0, 1]; the augmentation loop scales them
# back before saving.
augmentation_options = dict(
    rescale=1.0 / 255,
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
    fill_mode='nearest',
)
train_datagen = ImageDataGenerator(**augmentation_options)

# Load the training metadata and derive each image's path relative to train_dir
# (images are stored as <label>/<md5hash>.jpg).
train_df = pd.read_csv('/content/drive/My Drive/AJL Team 15/Equitable AI Dermatology 2025/train.csv')
train_df['md5hash'] = train_df['md5hash'].astype(str) + '.jpg'
train_df['file_path'] = train_df['label'] + '/' + train_df['md5hash']

# Keep only rows whose quality-control flag is "1 Diagnostic".
# .copy() detaches the filtered frame from the original one, so the column
# assignment below operates on an independent DataFrame instead of a slice
# (avoids pandas' SettingWithCopyWarning).
train_df = train_df[train_df['qc'] == "1 Diagnostic"].copy()

# Encode the string labels as consecutive integers for the classifier
label_encoder = LabelEncoder()
train_df['encoded_label'] = label_encoder.fit_transform(train_df['label'])

# Hold out 20% of the rows for validation (fixed seed for a repeatable split)
train_data, val_data = train_test_split(train_df, test_size=0.2, random_state=42)

# Offline augmentation: for every training row, copy the original image into
# augmented_dir/<label>/ and write `num_augmented_images` randomly transformed
# variants next to it, recording one metadata row per saved file.
num_augmented_images = 5  # augmented variants generated per original image

# Metadata columns copied unchanged from the source row to every saved image
_COPIED_COLUMNS = (
    "label",
    "encoded_label",
    "md5hash",
    "fitzpatrick_scale",
    "fitzpatrick_centaur",
    "nine_partition_label",
    "three_partition_label",
    "qc",
    "ddi_scale",
)


def _augmented_record(row, saved_path):
    """Build the metadata record for one image saved under augmented_dir.

    Args:
        row: The train_data row the saved image was derived from.
        saved_path: Full path of the saved image file.

    Returns:
        dict with "file_path" (relative to augmented_dir) followed by the
        columns listed in _COPIED_COLUMNS, taken from `row`.
    """
    record = {"file_path": os.path.relpath(saved_path, augmented_dir)}
    record.update({column: row[column] for column in _COPIED_COLUMNS})
    return record


augmented_data = []
for _, row in train_data.iterrows():
    img = load_img(os.path.join(train_dir, row["file_path"]))
    # flow() expects a batch, so add a leading batch axis
    img_batch = np.expand_dims(img_to_array(img), axis=0)

    # splitext drops only the extension, even if the name contains other dots
    base_filename = os.path.splitext(os.path.basename(row["file_path"]))[0]
    class_dir = os.path.join(augmented_dir, row["label"])
    os.makedirs(class_dir, exist_ok=True)

    # Save the original image alongside its augmented variants
    original_copy_path = os.path.join(class_dir, f"{base_filename}.jpg")
    img.save(original_copy_path)
    augmented_data.append(_augmented_record(row, original_copy_path))

    # Generate augmented images
    aug_iter = train_datagen.flow(img_batch, batch_size=1)
    for i in range(num_augmented_images):
        aug_img = next(aug_iter)[0]
        # train_datagen rescales by 1/255, so map back to 0-255; round and clip
        # instead of truncating so values are not biased downwards.
        aug_img = np.clip(np.rint(aug_img * 255), 0, 255).astype("uint8")
        # scale=False: keep the pixel values as they are. The default
        # (scale=True) would min-max stretch every image to the full 0-255
        # range, altering brightness/contrast relative to the originals.
        aug_pil_img = array_to_img(aug_img, scale=False)
        aug_file_path = os.path.join(class_dir, f"{base_filename}_aug{i}.jpg")
        aug_pil_img.save(aug_file_path)
        augmented_data.append(_augmented_record(row, aug_file_path))

# Save augmented metadata
augmented_df = pd.DataFrame(augmented_data)
augmented_df.to_csv(os.path.join(augmented_dir, "train_augmented_metadata.csv"), index=False)
print(f"✅ Augmentation complete. Total images in dataset: {len(augmented_df)}")

✅ Augmentation complete. Total images in dataset: 360


In [None]:
# Final two steps shared by both pipelines: convert to a tensor, then apply
# the same per-channel mean/std normalization.
_to_normalized_tensor = [
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]

# Training pipeline: random 224-pixel crop and random horizontal flip
train_transform = transforms.Compose(
    [transforms.RandomResizedCrop(224), transforms.RandomHorizontalFlip()]
    + _to_normalized_tensor
)

# Validation pipeline: fixed resize to 224x224, no random steps
val_transform = transforms.Compose(
    [transforms.Resize((224, 224))] + _to_normalized_tensor
)

In [None]:
# Custom Dataset class
class DermatologyDataset(Dataset):
    """Dataset that loads images from disk by path, with optional labels.

    Args:
        image_paths: Indexable sequence of image file paths.
        labels: Optional indexable sequence of labels aligned with
            `image_paths`. When None (e.g. for an unlabeled test set), every
            item is returned with the placeholder label -1.
        transform: Optional callable applied to the RGB PIL image before it
            is returned.
    """

    def __init__(self, image_paths, labels=None, transform=None):
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        """Return the number of images in the dataset."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return `(image, label)` for the item at position `idx`."""
        # Open inside a context manager so the underlying file is always
        # closed once the pixel data has been converted; convert() returns a
        # new, fully loaded image that stays valid after the file is closed.
        with Image.open(self.image_paths[idx]) as raw_image:
            image = raw_image.convert("RGB")
        label = self.labels[idx] if self.labels is not None else -1  # Handle None case
        if self.transform:
            image = self.transform(image)
        return image, label


In [None]:
# Disabled code: the whole helper below is wrapped in a string literal, so
# nothing in this cell is executed and `generate_embeddings` is never defined.
# The cell only evaluates to (and displays) the string itself.
'''# Function to generate embeddings with data augmentation
def generate_embeddings(image_paths, transform, labels=None):
    embeddings = []
    dataset = DermatologyDataset(image_paths, labels=labels, transform=transform)
    dataloader = DataLoader(dataset, batch_size=32, shuffle=False)

    for batch in dataloader:
        images, _ = batch
        with torch.no_grad():
            outputs = model(images)
        embeddings.append(outputs.logits.numpy())

    return np.vstack(embeddings)'''


'# Function to generate embeddings with data augmentation\ndef generate_embeddings(image_paths, transform, labels=None):\n    embeddings = []\n    dataset = DermatologyDataset(image_paths, labels=labels, transform=transform)\n    dataloader = DataLoader(dataset, batch_size=32, shuffle=False)\n\n    for batch in dataloader:\n        images, _ = batch\n        with torch.no_grad():\n            outputs = model(images)\n        embeddings.append(outputs.logits.numpy())\n    \n    return np.vstack(embeddings)'

In [None]:
# Training set: the images written by the augmentation cell (each original
# plus its augmented variants). `file_path` in augmented_df is relative to
# augmented_dir. Previously the loader pointed at the un-augmented originals
# in train_dir, so the augmented images were generated but never trained on.
train_image_paths = [
    os.path.join(augmented_dir, rel_path) for rel_path in augmented_df["file_path"]
]
# Explicit int64 so the labels collate into integer class-index tensors
train_labels = augmented_df["encoded_label"].to_numpy(dtype="int64")

# Validation set: untouched originals from train_dir only, so no image derived
# from a validation row is ever seen during training.
val_image_paths = [
    os.path.join(train_dir, rel_path) for rel_path in val_data["file_path"]
]
val_labels = val_data['encoded_label'].values

# Create DataLoader for training and validation
train_dataset = DermatologyDataset(train_image_paths, labels=train_labels, transform=train_transform)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)  # Reduced batch size

val_dataset = DermatologyDataset(val_image_paths, labels=val_labels, transform=val_transform)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [None]:
# Seed torch's RNG so the newly initialized classifier head and the shuffling
# of train_loader are repeatable across runs.
torch.manual_seed(42)

# Fine-tune EfficientNet-B0 (smaller model)
model = EfficientNetForImageClassification.from_pretrained(
    "google/efficientnet-b0",
    num_labels=len(label_encoder.classes_),
    # The checkpoint's classifier has a different number of outputs; let it be
    # re-initialized for our label count instead of raising a size-mismatch error.
    ignore_mismatched_sizes=True
)
optimizer = Adam(model.parameters(), lr=1e-4)
criterion = CrossEntropyLoss()

# Training loop
num_epochs = 5  # Adjust number of epochs
model.train()
for epoch in range(num_epochs):
    running_loss = 0.0
    samples_seen = 0
    for images, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(images).logits
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Weight each batch by its size so a smaller final batch does not skew the mean
        batch_count = labels.size(0)
        running_loss += loss.item() * batch_count
        samples_seen += batch_count

    # Report the mean loss over the whole epoch rather than only the last
    # batch's loss (which is noisy, and undefined if the loader is empty).
    epoch_loss = running_loss / max(samples_seen, 1)
    print(f"Epoch {epoch + 1}, Loss: {epoch_loss}")

# Evaluate on validation set.
# Reuses `val_loader` built in the data-loading cell; re-creating an identical
# dataset here with a different batch size does not change the predictions.
model.eval()
val_preds, val_true = [], []
with torch.no_grad():
    for images, labels in val_loader:
        outputs = model(images).logits
        preds = torch.argmax(outputs, dim=1)
        val_preds.extend(preds.cpu().numpy())
        val_true.extend(labels.cpu().numpy())

# Only classes that actually occur in the validation split get a report row
unique_labels = np.unique(val_true)

# Calculate accuracy
accuracy = accuracy_score(val_true, val_preds)
print("Validation Accuracy:", accuracy)

# Generate classification report with labels argument
print("Classification Report:\n", classification_report(
    val_true, val_preds,
    target_names=label_encoder.classes_[unique_labels],  # Use unique labels to get target names
    labels=unique_labels,  # Specify labels for the report
    # Classes that are never predicted have undefined precision; report 0.0
    # for them explicitly instead of emitting an UndefinedMetricWarning per metric.
    zero_division=0
))

Some weights of EfficientNetForImageClassification were not initialized from the model checkpoint at google/efficientnet-b0 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([1000, 1280]) in the checkpoint and torch.Size([19, 1280]) in the model instantiated
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([19]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 2.92204213142395
Epoch 2, Loss: 2.823044776916504
Epoch 3, Loss: 2.735992670059204
Epoch 4, Loss: 2.701565742492676
Epoch 5, Loss: 2.637279748916626
Validation Accuracy: 0.1875
Classification Report:
                                   precision    recall  f1-score   support

               actinic-keratosis       0.00      0.00      0.00         2
            basal-cell-carcinoma       0.50      0.75      0.60         4
basal-cell-carcinoma-morpheiform       0.00      0.00      0.00         1
                          eczema       0.00      0.00      0.00         1
                  kaposi-sarcoma       0.00      0.00      0.00         1
                          keloid       0.00      0.00      0.00         2
                        melanoma       0.00      0.00      0.00         2
               mycosis-fungoides       0.00      0.00      0.00         1
               prurigo-nodularis       0.00      0.00      0.00         1
         squamous-cell-carcinoma       0.00

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Read the test metadata and turn every md5hash into an image file name
test_image_dir = '/content/drive/My Drive/AJL Team 15/Equitable AI Dermatology 2025/test/test/'
test_df = pd.read_csv('/content/drive/My Drive/AJL Team 15/Equitable AI Dermatology 2025/test.csv')
test_df['md5hash'] = test_df['md5hash'].astype(str) + '.jpg'
test_image_paths = [
    os.path.join(test_image_dir, file_name) for file_name in test_df['md5hash']
]

# The test set has no labels, so the dataset yields the placeholder label -1;
# the validation transform is reused so inference has no random steps.
test_dataset = DermatologyDataset(test_image_paths, transform=val_transform)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Predict one class index per test image
model.eval()
test_preds = []
with torch.no_grad():
    for images, _ in test_loader:
        batch_preds = model(images).logits.argmax(dim=1)
        test_preds.extend(batch_preds.cpu().numpy())

# Map the integer predictions back to the original class names
predicted_labels = label_encoder.inverse_transform(test_preds)

# Submission format: md5hash without the '.jpg' suffix, plus the predicted label
submission_ids = test_df['md5hash'].str.replace('.jpg', '', regex=False)
submission_df = pd.DataFrame({'md5hash': submission_ids, 'label': predicted_labels})

# Write the submission file to the working directory
submission_df.to_csv('submission.csv', index=False)
print("Submission file saved!")

Submission file saved!


In [None]:
# Trigger a browser download of the submission file (Google Colab only)
from google.colab import files

submission_filename = 'submission.csv'
files.download(submission_filename)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>