<a href="https://colab.research.google.com/github/acesur/Machine-Learning-/blob/main/02_feature_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import cv2
from glob import glob
from skimage.feature import graycomatrix, graycoprops
from skimage.measure import regionprops, label
from skimage.color import rgb2gray
from sklearn.preprocessing import StandardScaler
import kagglehub

# Download dataset
path = kagglehub.dataset_download("kmader/skin-cancer-mnist-ham10000")
print("Path to dataset files:", path)

IMAGES_PATH_PART1 = "/kaggle/input/skin-cancer-mnist-ham10000/HAM10000_images_part_1"
IMAGES_PATH_PART2 = "/kaggle/input/skin-cancer-mnist-ham10000/HAM10000_images_part_2"
METADATA_PATH = "/kaggle/input/skin-cancer-mnist-ham10000/HAM10000_metadata.csv"

print(f"Path 1 exists: {os.path.exists(IMAGES_PATH_PART1)}")
print(f"Path 2 exists: {os.path.exists(IMAGES_PATH_PART2)}")
print(f"Metadata exists: {os.path.exists(METADATA_PATH)}")

metadata = pd.read_csv(METADATA_PATH)
print(f"Metadata shape: {metadata.shape}")
print(f"Metadata columns: {metadata.columns.tolist()}")

def find_image_path_improved(image_id, part1_path, part2_path):
    if image_id.startswith('ISIC_'):
        base_id = image_id
        numeric_id = image_id.replace('ISIC_', '')
    else:
        base_id = f"ISIC_{image_id}"
        numeric_id = image_id

    patterns = [
        f"{base_id}.jpg",
        f"{numeric_id}.jpg",
        f"{base_id.lower()}.jpg",
        f"{numeric_id.lower()}.jpg"
    ]

    for pattern in patterns:
        for directory in [part1_path, part2_path]:
            path = os.path.join(directory, pattern)
            if os.path.exists(path):
                return path

    for directory in [part1_path, part2_path]:
        possible_files = glob(os.path.join(directory, f"*{base_id}*"))
        if possible_files:
            return possible_files[0]
        possible_files = glob(os.path.join(directory, f"*{numeric_id}*"))
        if possible_files:
            return possible_files[0]

    return None

def extract_abcd_features_robust(image_path):
    try:
        img = cv2.imread(image_path)
        if img is None:
            return None

        height, width, _ = img.shape
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

        _, thresh_otsu = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
        thresh_adaptive = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                                cv2.THRESH_BINARY_INV, 11, 2)

        for thresh in [thresh_otsu, thresh_adaptive]:
            contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
            if not contours:
                continue

            largest_contour = max(contours, key=cv2.contourArea)
            contour_area = cv2.contourArea(largest_contour)
            if contour_area < 100:
                continue

            mask = np.zeros_like(gray)
            cv2.drawContours(mask, [largest_contour], 0, 255, -1)

            h, w = mask.shape
            flipped_mask = cv2.flip(mask, 1)
            asymmetry = np.sum(np.abs(mask - flipped_mask)) / (h * w * 255)

            perimeter = cv2.arcLength(largest_contour, True)
            border_irregularity = perimeter**2 / (4 * np.pi * contour_area) if contour_area > 0 else 0

            r, g, b = cv2.split(img)
            mask_bool = mask > 0

            color_variance_r = np.var(r[mask_bool]) if np.sum(mask_bool) > 0 else 0
            color_variance_g = np.var(g[mask_bool]) if np.sum(mask_bool) > 0 else 0
            color_variance_b = np.var(b[mask_bool]) if np.sum(mask_bool) > 0 else 0

            x, y, w, h = cv2.boundingRect(largest_contour)
            diameter = max(w, h)

            return {
                'asymmetry': float(asymmetry),
                'border_irregularity': float(border_irregularity),
                'color_variance_r': float(color_variance_r),
                'color_variance_g': float(color_variance_g),
                'color_variance_b': float(color_variance_b),
                'diameter': float(diameter)
            }

        return None

    except Exception as e:
        print(f"Error in feature extraction: {str(e)}")
        return None

# Extract features using a diverse stratified sample
sample_size = 1000
metadata_sampled = metadata.groupby('dx', group_keys=False).apply(lambda x: x.sample(min(len(x), sample_size // 7), random_state=42))

print("Sampled metadata distribution:")
print(metadata_sampled['dx'].value_counts())

feature_results = []
success_count = 0
failure_count = 0

for i, row in metadata_sampled.iterrows():
    img_id = row['image_id']
    img_path = find_image_path_improved(img_id, IMAGES_PATH_PART1, IMAGES_PATH_PART2)
    if img_path:
        features = extract_abcd_features_robust(img_path)
        if features:
            features['image_id'] = img_id
            features['dx'] = row['dx']
            feature_results.append(features)
            success_count += 1
        else:
            failure_count += 1
    else:
        failure_count += 1

    if (i + 1) % 50 == 0:
        print(f"Processed {i + 1}/{len(metadata_sampled)} images, Success: {success_count}, Failures: {failure_count}")

print(f"Final count - Success: {success_count}, Failures: {failure_count}")

if feature_results:
    feature_df = pd.DataFrame(feature_results)
    print("Feature DataFrame shape:", feature_df.shape)
    print("Class distribution:")
    print(feature_df['dx'].value_counts())
    from google.colab import drive
    drive.mount('/content/drive/')
    feature_df.to_csv('/content/drive/My Drive/abcd_features.csv', index=False)
else:
    print("No features were successfully extracted.")


Path to dataset files: /kaggle/input/skin-cancer-mnist-ham10000
Path 1 exists: True
Path 2 exists: True
Metadata exists: True
Metadata shape: (10015, 7)
Metadata columns: ['lesion_id', 'image_id', 'dx', 'dx_type', 'age', 'sex', 'localization']
Sampled metadata distribution:
dx
akiec    142
bcc      142
bkl      142
mel      142
nv       142
vasc     142
df       115
Name: count, dtype: int64


  metadata_sampled = metadata.groupby('dx', group_keys=False).apply(lambda x: x.sample(min(len(x), sample_size // 7), random_state=42))


Processed 9950/967 images, Success: 136, Failures: 0
Processed 10000/967 images, Success: 141, Failures: 0
Processed 2750/967 images, Success: 200, Failures: 0
Processed 2600/967 images, Success: 202, Failures: 0
Processed 2700/967 images, Success: 214, Failures: 0
Processed 2900/967 images, Success: 261, Failures: 0
Processed 950/967 images, Success: 332, Failures: 0
Processed 200/967 images, Success: 363, Failures: 0
Processed 400/967 images, Success: 386, Failures: 0
Processed 600/967 images, Success: 389, Failures: 0
Processed 300/967 images, Success: 405, Failures: 0
Processed 50/967 images, Success: 408, Failures: 0
Processed 1100/967 images, Success: 428, Failures: 0
Processed 1200/967 images, Success: 479, Failures: 0
Processed 1150/967 images, Success: 499, Failures: 0
Processed 1300/967 images, Success: 553, Failures: 0
Processed 1600/967 images, Success: 602, Failures: 0
Processed 1350/967 images, Success: 660, Failures: 0
Processed 4150/967 images, Success: 724, Failures: 0