In [1]:
# =====================================================================================
# FINAL SCRIPT - V6 (Improved with Augmentation and Hyperparameter Tuning)
# =====================================================================================

# Cell 1: Setup, Imports, and Login
# --------------------------------------------------------------------------
print("--- Installing and Importing Libraries ---")
!pip install -U transformers datasets accelerate evaluate scikit-learn --quiet

import os
import pandas as pd
import numpy as np
import torch
from PIL import Image as PILImage
from datasets import Dataset, DatasetDict
from transformers import ViTImageProcessor, ViTForImageClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, balanced_accuracy_score
from sklearn.utils.class_weight import compute_class_weight
from kaggle_secrets import UserSecretsClient
from huggingface_hub import login

## NEW: Import torchvision for data augmentation
from torchvision.transforms import (
    Compose,
    Normalize,
    RandomHorizontalFlip,
    RandomResizedCrop,
    RandomRotation,
    ColorJitter,
    ToTensor,
    Resize,
    CenterCrop,
)

# Securely login to Hugging Face
try:
    user_secrets = UserSecretsClient()
    hf_token = user_secrets.get_secret("HUGGINGFACE_TOKEN")
    login(token=hf_token)
    print("Hugging Face login successful.")
except Exception:
    print("Hugging Face token not found. Model will not be pushed to the Hub.")


# Cell 2: Data Loading and Full, Upfront Preprocessing (Correct & Verified)
# --------------------------------------------------------------------------
print("\n--- Starting Data Loading and Preprocessing ---")

# Step 1: Load the metadata
metadata_path = "/kaggle/input/skin-cancer/metadata.csv"
df = pd.read_csv(metadata_path)

# Step 2: Create a dictionary of all image file paths
all_image_paths = {}
path_dirs = [
    '/kaggle/input/skin-cancer/imgs_part_1/imgs_part_1/',
    '/kaggle/input/skin-cancer/imgs_part_2/imgs_part_2/',
    '/kaggle/input/skin-cancer/imgs_part_3/imgs_part_3/'
]
for path_dir in path_dirs:
    if os.path.exists(path_dir):
        for fname in os.listdir(path_dir):
            all_image_paths[fname] = os.path.join(path_dir, fname)

# Step 3: Prepare the final DataFrame
df['path'] = df['img_id'].map(all_image_paths.get)
df['label'] = df['diagnostic']
df_clean = df[['path', 'label']].dropna().copy()
print(f"Found {len(df_clean)} matching image files and labels.")

# Step 4: Create the Hugging Face Dataset from pandas
raw_dataset = Dataset.from_pandas(df_clean)

# Step 5: Encode string labels into integers
raw_dataset = raw_dataset.class_encode_column("label")

# Step 6: Process all images into pixel values (This is the upfront processing)
model_name = "Anwarkh1/Skin_Cancer-Image_Classification"
processor = ViTImageProcessor.from_pretrained(model_name)

def preprocess_images(examples):
    # This function USES the 'path' column
    images = [PILImage.open(path).convert("RGB") for path in examples["path"]]
    processed_inputs = processor(images, return_tensors="pt")
    examples['pixel_values'] = processed_inputs['pixel_values']
    return examples

# Apply the function. The resulting dataset has 'label', 'path', and 'pixel_values'
processed_dataset = raw_dataset.map(preprocess_images, batched=True, batch_size=100)

# NOW, we remove the 'path' column because 'pixel_values' has replaced it.
processed_dataset = processed_dataset.remove_columns(['path'])

# Step 7: Create Train/Test Splits
train_test_split_dataset = processed_dataset.train_test_split(test_size=0.2, stratify_by_column="label")
final_datasets = DatasetDict({
    'train': train_test_split_dataset['train'],
    'test': train_test_split_dataset['test']
})

# The trainer will now receive data with 'pixel_values' and 'labels' - no 'path'.
# This is correct.
final_datasets.set_format('torch')

print("\n--- Preprocessing Complete. Final Dataset Info: ---")
print(final_datasets)
print("\nClass Names:", final_datasets['train'].features['label'].names)


# Cell 3: Metrics, Model, and Advanced Training Setup
# --------------------------------------------------------------------------
print("\n--- Setting up for Training ---")

# Step 1: Define metrics function (no change)
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted', zero_division=0)
    acc = accuracy_score(labels, predictions)
    bacc = balanced_accuracy_score(labels, predictions)
    return {'accuracy': acc, 'balanced_accuracy': bacc, 'f1': f1, 'precision': precision, 'recall': recall}

# Step 2: Load the model and configure it
labels = final_datasets['train'].features['label'].names
id2label = {i: label for i, label in enumerate(labels)}
label2id = {label: i for i, label in id2label.items()}

model = ViTForImageClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)
print("Model loaded and configured for new labels.")

# Step 3: Calculate class weights on the CPU first
# We need to access the original dataset here since the transformed one doesn't have the 'label' column directly accessible
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(np.array(train_test_split_dataset['train']['label'])),
    y=np.array(train_test_split_dataset['train']['label'])
)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)
print(f"Calculated Class Weights: {class_weights}")

# Step 4: Create a custom Trainer to use the weighted loss (no change)
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        weights = self.class_weights.to(logits.device)
        loss_fct = torch.nn.CrossEntropyLoss(weight=weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Step 5: Define training arguments
## CHANGED: Updated hyperparameters for better performance
hub_model_id = "bnmbanhmi/seekwell_skincancer_v2" # new model name for the hub

training_args = TrainingArguments(
  output_dir="/kaggle/working/seekwell_model_v6_improved",
  num_train_epochs=15, # With augmentation, you might need more epochs, but 15 is a good start.
  learning_rate=2e-5, # Increased learning rate for more effective fine-tuning.
  per_device_train_batch_size=32,
  weight_decay=0.01,
  eval_strategy="epoch",
  save_strategy="epoch",
  fp16=True,
  load_best_model_at_end=True,
  metric_for_best_model="balanced_accuracy",
  push_to_hub=True,
  hub_model_id=hub_model_id,
  hub_strategy="every_save",
  report_to="none",
  logging_strategy="epoch", # NEW: Log training loss at each epoch.
  lr_scheduler_type="cosine", # NEW: Use a cosine learning rate scheduler.
  warmup_ratio=0.1, # NEW: Use 10% of training steps for a warmup period.
)

# Step 6: Instantiate the CustomTrainer
# Note: Since transforms are set, we use our final_datasets dict
trainer = CustomTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=final_datasets["train"],
    eval_dataset=final_datasets["test"],
)
trainer.class_weights = class_weights_tensor


# Cell 4: Start Training and Evaluate
# --------------------------------------------------------------------------
print("\n--- Starting Training ---")
train_results = trainer.train()

print("\n--- Final Evaluation on Test Set ---")
# The trainer.evaluate() will use the best model loaded at the end of training
metrics = trainer.evaluate()
print("\n--- Final Performance Metrics (from best model) ---")
print(metrics)

# Save the final model and metrics
trainer.save_model()
trainer.save_state()
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)
print(f"\n--- Model saved to {training_args.output_dir} ---")

--- Installing and Importing Libraries ---
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m81.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.1/362.1 kB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m79.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.2 MB/s[0m eta [36m0:00

2025-06-13 10:37:32.557836: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749811052.785207      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749811052.847368      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Hugging Face login successful.

--- Starting Data Loading and Preprocessing ---
Found 2298 matching image files and labels.


Casting to class labels:   0%|          | 0/2298 [00:00<?, ? examples/s]

preprocessor_config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

Map:   0%|          | 0/2298 [00:00<?, ? examples/s]


--- Preprocessing Complete. Final Dataset Info: ---
DatasetDict({
    train: Dataset({
        features: ['label', 'pixel_values'],
        num_rows: 1838
    })
    test: Dataset({
        features: ['label', 'pixel_values'],
        num_rows: 460
    })
})

Class Names: ['ACK', 'BCC', 'MEL', 'NEV', 'SCC', 'SEK']

--- Setting up for Training ---


config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/343M [00:00<?, ?B/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at Anwarkh1/Skin_Cancer-Image_Classification and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([7]) in the checkpoint and torch.Size([6]) in the model instantiated
- classifier.weight: found shape torch.Size([7, 768]) in the checkpoint and torch.Size([6, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded and configured for new labels.
Calculated Class Weights: [0.52454338 0.45315582 7.29365079 1.57094017 2.00217865 1.62943262]

--- Starting Training ---




Epoch,Training Loss,Validation Loss,Accuracy,Balanced Accuracy,F1,Precision,Recall
1,1.7728,1.714302,0.306522,0.329356,0.312051,0.495556,0.306522
2,1.6131,1.515187,0.573913,0.467659,0.557796,0.627743,0.573913
3,1.3651,1.31496,0.643478,0.601927,0.651471,0.678216,0.643478
4,1.1182,1.141272,0.671739,0.63433,0.675,0.688442,0.671739
5,0.9072,1.031509,0.669565,0.682024,0.680681,0.709539,0.669565
6,0.7353,0.950721,0.691304,0.700088,0.698156,0.712173,0.691304
7,0.5956,0.904186,0.684783,0.694795,0.690806,0.707099,0.684783
8,0.4968,0.885653,0.667391,0.691676,0.680017,0.7041,0.667391
9,0.4207,0.868665,0.667391,0.686002,0.679902,0.706813,0.667391
10,0.3632,0.876684,0.691304,0.671619,0.694378,0.704114,0.691304





--- Final Evaluation on Test Set ---





--- Final Performance Metrics (from best model) ---
{'eval_loss': 0.9507212042808533, 'eval_accuracy': 0.691304347826087, 'eval_balanced_accuracy': 0.7000876715332182, 'eval_f1': 0.6981557244021414, 'eval_precision': 0.7121734937952037, 'eval_recall': 0.691304347826087, 'eval_runtime': 8.1852, 'eval_samples_per_second': 56.199, 'eval_steps_per_second': 3.543, 'epoch': 15.0}
***** eval metrics *****
  epoch                   =       15.0
  eval_accuracy           =     0.6913
  eval_balanced_accuracy  =     0.7001
  eval_f1                 =     0.6982
  eval_loss               =     0.9507
  eval_precision          =     0.7122
  eval_recall             =     0.6913
  eval_runtime            = 0:00:08.18
  eval_samples_per_second =     56.199
  eval_steps_per_second   =      3.543

--- Model saved to /kaggle/working/seekwell_model_v6_improved ---
